Beispiel #1
0
    def arange(self,
               start_in,
               shape,
               block_shape,
               step=1,
               dtype=None) -> BlockArray:
        """Create a block-partitioned range of consecutive values.

        Args:
            start_in: First value of the range.
            shape: Shape of the resulting array; ``shape[0]`` is its length.
            block_shape: Shape of each block; ``block_shape[0]`` is the
                number of elements per full block.
            step: Must be 1 (asserted).
            dtype: Output type. When None it is inferred with
                ``np.result_type`` from the first and one-past-last values.

        Returns:
            A BlockArray whose blocks hold object ids produced by
            ``self.cm.arange``.
        """
        assert step == 1
        if dtype is None:
            inferred = np.result_type(start_in, shape[0] + start_in)
            dtype = getattr(np, str(inferred))

        # One remote arange call per block, each covering its own sub-range.
        grid = ArrayGrid(shape, block_shape, dtype.__name__)
        result = BlockArray(grid, self.cm)
        for entry in grid.get_entry_iterator():
            first = start_in + block_shape[0] * entry[0]
            # The last block may be shorter than block_shape[0].
            last = first + grid.get_block_shape(entry)[0]
            result.blocks[entry].oid = self.cm.arange(
                first,
                last,
                step,
                dtype,
                syskwargs={
                    "grid_entry": entry,
                    "grid_shape": grid.grid_shape
                })
        return result
Beispiel #2
0
 def from_oid(cls, oid, shape, dtype, cm):
     """Wrap an existing object id in a BlockArray made of a single block.

     Args:
         oid: Object id to attach to the block.
         shape: Shape of the array; also used as the block shape.
         dtype: Type object whose ``__name__`` is given to the grid.
         cm: Compute manager passed to the BlockArray.

     Returns:
         A BlockArray whose only block refers to ``oid``.
     """
     # Block shape equals the array shape, so the grid has one entry.
     grid = ArrayGrid(shape, shape, dtype.__name__)
     wrapped = BlockArray(grid, cm)
     for position, entry in enumerate(grid.get_entry_iterator()):
         assert position == 0
         wrapped.blocks[entry].oid = oid
     return wrapped
Beispiel #3
0
 def from_np(cls, arr, block_shape, copy, cm):
     """Partition a NumPy array into blocks and store each via ``cm.put``.

     Args:
         arr: Source NumPy array.
         block_shape: Shape of each block in the result.
         copy: When truthy, each slice is copied with ``np.copy`` before
             being stored.
         cm: Compute manager used to store the blocks.

     Returns:
         A BlockArray whose blocks refer to the stored slices of ``arr``.
     """
     dtype_name = str(arr.dtype)
     grid = ArrayGrid(arr.shape, block_shape, dtype_name)
     result = BlockArray(grid, cm)
     for entry in grid.get_entry_iterator():
         piece = arr[grid.get_slice(entry)]
         payload = np.copy(piece) if copy else piece
         target = result.blocks[entry]
         target.oid = cm.put(payload)
         target.dtype = getattr(np, dtype_name)
     return result
Beispiel #4
0
    def diag(self, X: BlockArray) -> BlockArray:
        """Build a diagonal matrix from a vector, or extract a diagonal.

        For 1-D ``X`` the result is a square BlockArray: blocks on the grid
        diagonal come from ``self.cm.diag`` applied to the matching block of
        ``X`` and every other block is created with the "zeros" op.

        For 2-D ``X`` the blocks reported by
        ``array_utils.find_diag_output_blocks`` are each passed to
        ``self.cm.diag`` with their offset, and the pieces are concatenated
        along axis 0 when there is more than one.

        Raises:
            ValueError: If ``X`` has neither 1 nor 2 axes.
        """
        num_axes = len(X.shape)

        if num_axes == 1:
            side, block_side = X.shape[0], X.block_shape[0]
            grid = ArrayGrid((side, side), (block_side, block_side),
                             X.dtype.__name__)
            grid_meta = grid.to_meta()
            result = BlockArray(grid, self.cm)
            for entry in grid.get_entry_iterator():
                syskwargs = {"grid_entry": entry, "grid_shape": grid.grid_shape}
                on_diagonal = np.all(np.diff(entry) == 0)
                if on_diagonal:
                    oid = self.cm.diag(
                        X.blocks[entry[0]].oid, 0, syskwargs=syskwargs
                    )
                else:
                    oid = self.cm.new_block(
                        "zeros", entry, grid_meta, syskwargs=syskwargs
                    )
                result.blocks[entry].oid = oid
            return result

        if num_axes != 2:
            raise ValueError("X must have 1 or 2 axes.")

        diag_length = min(X.shape)
        diag_block_length = min(X.block_shape)
        # Each item: (block indices, diagonal offset, element count).
        diag_meta = array_utils.find_diag_output_blocks(X.blocks, diag_length)
        out_grid_shape = (len(diag_meta),)
        pieces = []
        for position, (block_indices, offset, total_elements) in enumerate(
            diag_meta
        ):
            syskwargs = {"grid_entry": (position,), "grid_shape": out_grid_shape}
            piece_shape = (total_elements,)
            piece_grid = ArrayGrid(
                piece_shape,
                piece_shape,
                X.blocks[block_indices].dtype.__name__,
            )
            piece = BlockArray(piece_grid, self.cm)
            piece.blocks[0].oid = self.cm.diag(
                X.blocks[block_indices].oid, offset, syskwargs=syskwargs
            )
            pieces.append(piece)

        if len(pieces) > 1:
            # Several diagonal pieces: stitch them into one 1-D array.
            return self.concatenate(
                pieces, axis=0, axis_block_size=diag_block_length
            )
        return pieces[0]
Beispiel #5
0
 def read_csv(self,
              filename,
              dtype=float,
              delimiter=",",
              has_header=False,
              num_workers=4):
     """Read a CSV file as a list of single-block BlockArrays.

     The file's byte range is divided into ``num_workers`` batches and a
     "read_csv_block" call is issued per batch, each returning a block id
     and a shape id. Batches whose shape has a zero first dimension are
     dropped.

     Args:
         filename: Path of the file to read.
         dtype: Type passed to the block reader; its ``__name__`` names
             the grid dtype.
         delimiter: Field delimiter passed to the block reader.
         has_header: Header flag passed to the block reader.
         num_workers: Number of byte-range batches to request.

     Returns:
         A list of BlockArrays, one per non-empty batch.
     """
     total_bytes = storage_utils.get_file_size(filename)
     byte_batches = storage_utils.Batch.from_num_batches(
         total_bytes, num_workers)

     block_oids = []
     shape_oids = []
     for index, (file_start, file_end) in enumerate(byte_batches.batches):
         block_oid, shape_oid = self.cm.call(
             "read_csv_block",
             filename,
             file_start,
             file_end,
             dtype,
             delimiter,
             has_header,
             syskwargs={
                 "grid_entry": (index, ),
                 "grid_shape": (num_workers, ),
                 "options": {
                     "num_returns": 2
                 },
             },
         )
         block_oids.append(block_oid)
         shape_oids.append(shape_oid)

     # Resolve the shapes so empty batches can be skipped.
     resolved_shapes = self.cm.get(shape_oids)
     arrays = []
     for index, shape in enumerate(resolved_shapes):
         if shape[0] == 0:
             continue
         block_oid = block_oids[index]
         grid = ArrayGrid(shape=shape,
                          block_shape=shape,
                          dtype=dtype.__name__)
         arr = BlockArray(grid, self.cm)
         for position, entry in enumerate(grid.get_entry_iterator()):
             # shape == block_shape, so only one entry is expected.
             assert position == 0
             arr.blocks[entry].oid = block_oid
         arrays.append(arr)
     return arrays
Beispiel #6
0
 def empty(cls, shape, block_shape, dtype, cm: ComputeManager):
     """Create a BlockArray whose blocks are produced by ``cm.empty``.

     Args:
         shape: Shape of the array.
         block_shape: Shape of each block.
         dtype: Type object whose ``__name__`` is given to the grid.
         cm: Compute manager that creates the blocks.

     Returns:
         The new BlockArray.
     """
     grid = ArrayGrid(shape=shape,
                      block_shape=block_shape,
                      dtype=dtype.__name__)
     grid_meta = grid.to_meta()
     result = BlockArray(grid, cm)
     for entry in grid.get_entry_iterator():
         syskwargs = {"grid_entry": entry, "grid_shape": grid.grid_shape}
         result.blocks[entry].oid = cm.empty(entry,
                                             grid_meta,
                                             syskwargs=syskwargs)
     return result
Beispiel #7
0
 def _new_array(
     self, op_name: str, shape: tuple, block_shape: tuple, dtype: np.dtype = None
 ):
     """Create a BlockArray by calling ``self.cm.new_block`` for each block.

     Args:
         op_name: Name of the block-creation op passed to ``new_block``.
         shape: Shape of the array.
         block_shape: Shape of each block; must have as many axes as
             ``shape`` (asserted).
         dtype: Type object for the grid; ``np.float64`` when None.

     Returns:
         The new BlockArray.
     """
     assert len(shape) == len(block_shape)
     dtype = np.float64 if dtype is None else dtype
     grid = ArrayGrid(shape, block_shape, dtype.__name__)
     grid_meta = grid.to_meta()
     result = BlockArray(grid, self.cm)
     for entry in grid.get_entry_iterator():
         syskwargs = {"grid_entry": entry, "grid_shape": grid.grid_shape}
         result.blocks[entry].oid = self.cm.new_block(
             op_name, entry, grid_meta, syskwargs=syskwargs
         )
     return result
Beispiel #8
0
    def map_uop(
        self,
        op_name: str,
        arr: BlockArray,
        out: BlockArray = None,
        where=True,
        args=None,
        kwargs=None,
    ) -> BlockArray:
        """Apply a unary element-wise operator to every block of an array.

        Args:
            op_name: Name of the element-wise unary operator.
            arr: The input BlockArray.
            out: Optional BlockArray receiving the result; it must have the
                same shape and block shape as ``arr`` (asserted).
            where: Must be True; any other value is unsupported.
            args: Positional args forwarded to the operator.
            kwargs: Keyword args forwarded to the operator.

        Returns:
            ``out`` when given, otherwise a newly created BlockArray.

        Raises:
            NotImplementedError: If ``where`` is not True.
        """
        if where is not True:
            raise NotImplementedError("'where' argument is not yet supported.")
        if args is None:
            args = ()
        if kwargs is None:
            kwargs = {}

        shape, block_shape = arr.shape, arr.block_shape
        dtype = array_utils.get_uop_output_type(op_name, arr.dtype)
        assert len(shape) == len(block_shape)

        if out is not None:
            result = out
            grid = result.grid
            assert result.shape == arr.shape and result.block_shape == arr.block_shape
        else:
            grid = ArrayGrid(shape, block_shape, dtype.__name__)
            result = BlockArray(grid, self.cm)

        for entry in grid.get_entry_iterator():
            # TODO(hme): Faster to create ndarray first,
            #  and instantiate block array on return
            #  to avoid instantiating blocks on BlockArray initialization.
            mapped = arr.blocks[entry].uop_map(op_name, args=args, kwargs=kwargs)
            result.blocks[entry] = mapped
        return result
Beispiel #9
0
 def _group_index_lists_by_block(self, dst_slice_tuples,
                                 src_grid: ArrayGrid, dst_index_list,
                                 src_index_list):
     """Group (destination, source) index pairs by the source block they hit.

     Args:
         dst_slice_tuples: Per-axis (start, stop) pairs of the destination
             selection.
         src_grid: Grid of the source array.
         dst_index_list: Destination indices, aligned with
             ``src_index_list``.
         src_index_list: Source indices.

     Returns:
         A dict mapping each source grid entry that contains at least one
         source index to a list of ``(dst_offset, src_offset)`` pairs. The
         offsets are relative to the destination slice start and the source
         block start, cast to the chosen unsigned type.

     Raises:
         Exception: If no candidate unsigned type is large enough.
     """
     # TODO(hme): Keep this function here until it's needed for greater support of
     #  selection/assignment operations.
     dst_bounds = np.array(dst_slice_tuples).T
     dst_indices = np.array(dst_index_list)
     src_indices = np.array(src_index_list)

     # Choose the narrowest unsigned type that can hold the indices, since
     # they may be sent over the network.
     candidates = (
         (2**8, np.uint8),
         (2**16, np.uint16),
         (2**32, np.uint32),
         (2**64, np.uint64),
     )
     for bound, candidate in candidates:
         if np.all(np.array(src_grid.block_shape) < bound) and np.all(
                 dst_bounds[1] < bound):
             index_type = candidate
             break
     else:
         raise Exception(
             "Unable to encode block indices, blocks are too large.")

     grouped = {}
     for entry in src_grid.get_entry_iterator():
         src_bounds = np.array(src_grid.get_slice_tuples(entry)).T
         pairs = []
         for i in range(src_indices.shape[0]):
             src_index = src_indices[i]
             dst_index = dst_indices[i]
             inside = (src_bounds[0] <= src_index) & (src_index < src_bounds[1])
             if not np.all(inside):
                 continue
             pairs.append((
                 (dst_index - dst_bounds[0]).astype(index_type),
                 (src_index - src_bounds[0]).astype(index_type),
             ))
         if pairs:
             grouped[entry] = pairs
     return grouped
Beispiel #10
0
 def eye(self, shape: tuple, block_shape: tuple, dtype: np.dtype = None):
     """Create a 2-D BlockArray from "eye" and "zeros" blocks.

     Blocks whose grid coordinates are equal are created with the "eye"
     op; all other blocks are created with the "zeros" op.

     Args:
         shape: Shape of the array; must have two axes (asserted).
         block_shape: Shape of each block; must have two axes (asserted).
         dtype: Type object for the grid; ``np.float64`` when None.

     Returns:
         The new BlockArray.
     """
     assert len(shape) == len(block_shape) == 2
     dtype = np.float64 if dtype is None else dtype
     grid = ArrayGrid(shape, block_shape, dtype.__name__)
     grid_meta = grid.to_meta()
     result = BlockArray(grid, self.cm)
     for entry in grid.get_entry_iterator():
         syskwargs = {"grid_entry": entry, "grid_shape": grid.grid_shape}
         # Equal grid coordinates mean the block lies on the diagonal.
         op_name = "eye" if np.all(np.diff(entry) == 0) else "zeros"
         result.blocks[entry].oid = self.cm.new_block(
             op_name, entry, grid_meta, syskwargs=syskwargs
         )
     return result
Beispiel #11
0
 def diag(self, X: BlockArray) -> BlockArray:
     """Build a diagonal matrix from a vector, or extract a square diagonal.

     For 1-D ``X`` the result is a square BlockArray: blocks on the grid
     diagonal come from ``self.cm.diag`` applied to the matching block of
     ``X`` and every other block is created with the "zeros" op.

     For 2-D ``X`` both the array and its blocks must be square
     (asserted); each diagonal block of ``X`` is passed to
     ``self.cm.diag`` and stored at the matching position of a 1-D result.

     Raises:
         ValueError: If ``X`` has neither 1 nor 2 axes.
     """
     num_axes = len(X.shape)

     if num_axes == 1:
         side, block_side = X.shape[0], X.block_shape[0]
         grid = ArrayGrid((side, side), (block_side, block_side),
                          X.dtype.__name__)
         grid_meta = grid.to_meta()
         result = BlockArray(grid, self.cm)
         for entry in grid.get_entry_iterator():
             syskwargs = {"grid_entry": entry, "grid_shape": grid.grid_shape}
             if np.all(np.diff(entry) == 0):
                 # Diagonal block: built from the matching block of X.
                 oid = self.cm.diag(X.blocks[entry[0]].oid,
                                    syskwargs=syskwargs)
             else:
                 oid = self.cm.new_block("zeros", entry, grid_meta,
                                         syskwargs=syskwargs)
             result.blocks[entry].oid = oid
         return result

     if num_axes != 2:
         raise ValueError("X must have 1 or 2 axes.")

     assert X.shape[0] == X.shape[1], "X must be a square array."
     assert X.block_shape[0] == X.block_shape[
         1], "block_shape must be square."
     grid = ArrayGrid((X.shape[0], ), (X.block_shape[0], ), X.dtype.__name__)
     result = BlockArray(grid, self.cm)
     for src_entry in X.grid.get_entry_iterator():
         if not np.all(np.diff(src_entry) == 0):
             # Off-diagonal blocks contribute nothing to the output.
             continue
         out_entry = src_entry[:1]
         syskwargs = {
             "grid_entry": out_entry,
             "grid_shape": grid.grid_shape[:1]
         }
         result.blocks[out_entry].oid = self.cm.diag(
             X.blocks[src_entry].oid, syskwargs=syskwargs)
     return result
Beispiel #12
0
 def from_blocks(cls, arr: np.ndarray, result_shape, cm):
     """Assemble a BlockArray from an array of Block objects, or one Block.

     Args:
         arr: Either an ndarray of Block objects or a single Block.
         result_shape: Shape of the result. Replaced by ``()`` when ``arr``
             is a single Block; computed with
             ``array_utils.shape_from_block_array`` when None.
         cm: Compute manager passed to the BlockArray.

     Returns:
         A BlockArray whose blocks are the given Block objects. The block
         shape and dtype are taken from the first block.
     """
     origin = tuple(0 for _ in arr.shape)
     single_block = isinstance(arr, Block)
     if single_block:
         sample = arr
         result_shape = ()
     else:
         sample = arr[origin]
         if result_shape is None:
             result_shape = array_utils.shape_from_block_array(arr)

     grid = ArrayGrid(shape=result_shape,
                      block_shape=sample.shape,
                      dtype=sample.dtype.__name__)
     assert arr.shape == grid.grid_shape
     result = BlockArray(grid, cm)
     for entry in grid.get_entry_iterator():
         result.blocks[entry] = arr if single_block else arr[entry]
     return result
Beispiel #13
0
    def _simple_reshape(self, arr, shape, block_shape):
        """Reshape an array by reshaping its grid of blocks.

        Intended for shape changes that only add or remove axes of size 1
        while keeping the order of the other axes. The assertion checks
        that both shapes have the same number of axes after
        ``self._strip_ones``.

        Args:
            arr: Source BlockArray.
            shape: Target array shape.
            block_shape: Target block shape.

        Returns:
            A new BlockArray; each block is the result of ``arr.cm.reshape``
            on the corresponding source block.
        """
        assert len(self._strip_ones(arr.shape)) == len(self._strip_ones(shape))

        grid = ArrayGrid(shape, block_shape, dtype=arr.dtype.__name__)
        # View the source blocks on the target grid so that the same entry
        # addresses both source and destination.
        reshaped_sources = arr.blocks.reshape(grid.grid_shape)
        result = BlockArray(grid, arr.cm)
        for entry in grid.get_entry_iterator():
            source: Block = reshaped_sources[entry]
            target: Block = result.blocks[entry]
            target.oid = arr.cm.reshape(
                source.oid,
                target.shape,
                syskwargs={
                    "grid_entry": entry,
                    "grid_shape": grid.grid_shape
                })
        return result
Beispiel #14
0
 def loadtxt(self,
             fname,
             dtype=float,
             comments='# ',
             delimiter=' ',
             converters=None,
             skiprows=0,
             usecols=None,
             unpack=False,
             ndmin=0,
             encoding='bytes',
             max_rows=None,
             num_workers=4) -> BlockArray:
     """Load a fixed-width text file into a row-partitioned BlockArray.

     The number of data rows is derived from the file size, the row width
     reported by ``storage_utils.get_np_txt_info`` and the comment and
     trailing-newline characters reported by
     ``storage_utils.get_np_comments``. The rows are divided into
     ``num_workers`` batches and each batch is loaded by
     ``self.loadtxt_block``.

     Args:
         fname: Path of the text file.
         dtype: Element type; ``float`` maps to a ``float64`` grid dtype.
         comments: Comment marker, forwarded to the helpers and the block
             loader.
         delimiter: Column delimiter, forwarded likewise.
         converters: Forwarded to the block loader.
         skiprows: Number of leading rows to skip.
         usecols: Forwarded to the block loader.
         unpack: Forwarded to the block loader.
         ndmin: Forwarded to the block loader.
         encoding: Forwarded to the block loader.
         max_rows: Optional upper bound on the number of rows loaded after
             ``skiprows``.
         num_workers: Number of row batches (blocks along axis 0).

     Returns:
         A BlockArray of shape ``(rows, num_cols)``.
     """
     # pylint: disable=unused-variable
     bytes_per_char, bytes_per_row, bytes_per_col, num_cols = storage_utils.get_np_txt_info(
         fname, comments, delimiter)
     chars_per_row = bytes_per_row // bytes_per_char
     assert np.allclose(float(chars_per_row),
                        bytes_per_row / bytes_per_char)
     comment_lines, trailing_newlines = storage_utils.get_np_comments(
         fname, comments)
     # Characters that do not belong to any data row.
     nonrow_chars = trailing_newlines + sum(
         len(line) for line in comment_lines)
     file_size = storage_utils.get_file_size(fname)
     file_chars = file_size // bytes_per_char
     assert np.allclose(float(file_chars), file_size / bytes_per_char)
     row_chars = file_chars - nonrow_chars
     num_rows = row_chars // chars_per_row
     assert np.allclose(float(num_rows), float(row_chars / chars_per_row))
     num_rows_final = num_rows - skiprows
     if max_rows is not None:
         # Cap the row count at max_rows. It must stay an integer because
         # it is used below as a batch total and as a shape dimension.
         num_rows_final = min(num_rows_final, max_rows)
     row_batches: storage_utils.Batch = storage_utils.Batch.from_num_batches(
         num_rows_final, num_workers)
     grid = ArrayGrid(
         shape=(num_rows_final, num_cols),
         block_shape=(row_batches.batch_size, num_cols),
         dtype=np.float64.__name__ if dtype is float else dtype.__name__)
     result: BlockArray = BlockArray(grid, cm=self.cm)
     for i, grid_entry in enumerate(grid.get_entry_iterator()):
         row_start, row_end = row_batches.batches[i]
         # Each block skips the global offset plus the rows of earlier blocks.
         batch_skiprows = skiprows + row_start
         batch_max_rows = grid.get_block_shape(grid_entry)[0]
         assert batch_max_rows == row_end - row_start
         result.blocks[grid_entry].oid = self.loadtxt_block(
             fname,
             dtype=dtype,
             comments=comments,
             delimiter=delimiter,
             converters=converters,
             skiprows=batch_skiprows,
             usecols=usecols,
             unpack=unpack,
             ndmin=ndmin,
             encoding=encoding,
             max_rows=batch_max_rows,
             syskwargs={
                 "grid_entry": grid_entry,
                 "grid_shape": grid.grid_shape
             })
     return result
Beispiel #15
0
    def diag(self, X: BlockArray) -> BlockArray:
        """Build a diagonal matrix from a vector, or extract a diagonal.

        For 1-D ``X`` the result is a square BlockArray: blocks on the grid
        diagonal come from ``self.cm.diag`` applied to the matching block of
        ``X`` and every other block is created with the "zeros" op.

        For 2-D ``X`` the blocks crossed by the main diagonal are located by
        walking the block grid. Each is passed to ``self.cm.diag`` with its
        offset, and the pieces are concatenated along axis 0 when there is
        more than one.

        Raises:
            ValueError: If ``X`` has neither 1 nor 2 axes.
        """

        def find_diag_output_blocks(X: BlockArray, total_elements: int):
            # Walk the main diagonal and return a list of
            # ((block_row, block_col), offset, element_count), one item per
            # block visited.
            grid_row, grid_col = 0, 0
            # Position of the next diagonal element inside the current block.
            local_row, local_col = 0, 0
            collected = 0
            current = X.blocks[(0, 0)]
            visited = []

            while collected < total_elements:
                # Move to the next block once the previous one is exhausted
                # along an axis.
                if local_row >= current.shape[0]:
                    grid_row += 1
                    local_row = 0
                if local_col >= current.shape[1]:
                    grid_col += 1
                    local_col = 0

                current = X.blocks[(grid_row, grid_col)]
                num_rows, num_cols = current.shape[0], current.shape[1]
                # Negative offset when entering below the block's own
                # diagonal, non-negative otherwise.
                if local_row > local_col:
                    offset = -local_row
                else:
                    offset = local_col
                # Elements available before either axis of the block ends.
                taken = min(num_rows - local_row, num_cols - local_col)
                visited.append(((grid_row, grid_col), offset, taken))
                collected += taken
                local_row += taken
                local_col += taken
            return visited

        num_axes = len(X.shape)

        if num_axes == 1:
            side, block_side = X.shape[0], X.block_shape[0]
            grid = ArrayGrid((side, side), (block_side, block_side),
                             X.dtype.__name__)
            grid_meta = grid.to_meta()
            result = BlockArray(grid, self.cm)
            for entry in grid.get_entry_iterator():
                syskwargs = {"grid_entry": entry, "grid_shape": grid.grid_shape}
                if np.all(np.diff(entry) == 0):
                    # Diagonal block: built from the matching block of X.
                    oid = self.cm.diag(X.blocks[entry[0]].oid, 0,
                                       syskwargs=syskwargs)
                else:
                    oid = self.cm.new_block("zeros", entry, grid_meta,
                                            syskwargs=syskwargs)
                result.blocks[entry].oid = oid
            return result

        if num_axes != 2:
            raise ValueError("X must have 1 or 2 axes.")

        diag_length = min(X.shape)
        diag_block_length = min(X.block_shape)
        diag_meta = find_diag_output_blocks(X, diag_length)
        out_grid_shape = (len(diag_meta), )
        pieces = []
        for position, (block_indices, offset,
                       total_elements) in enumerate(diag_meta):
            syskwargs = {
                "grid_entry": (position, ),
                "grid_shape": out_grid_shape
            }
            piece_shape = (total_elements, )
            piece_grid = ArrayGrid(
                piece_shape,
                piece_shape,
                X.blocks[block_indices].dtype.__name__,
            )
            piece = BlockArray(piece_grid, self.cm)
            piece.blocks[0].oid = self.cm.diag(X.blocks[block_indices].oid,
                                               offset,
                                               syskwargs=syskwargs)
            pieces.append(piece)

        if len(pieces) > 1:
            # Several diagonal pieces: stitch them into one 1-D array.
            return self.concatenate(pieces,
                                    axis=0,
                                    axis_block_size=diag_block_length)
        return pieces[0]
Beispiel #16
0
    def reduce_axis(self, op_name, axis, keepdims=False):
        """Reduce this array along one axis, or over all axes.

        Every block is first reduced on its own with
        ``self.cm.reduce_axis``. The per-block results are then combined
        with ``self._tree_reduce``: all of them into one block when ``axis``
        is None, otherwise those that share an output grid entry.

        Args:
            op_name: Name of the reduction operator.
            axis: Integer axis to reduce, or None to reduce every axis.
            keepdims: When truthy, reduced axes are kept with size 1.

        Returns:
            A BlockArray holding the reduced result.

        Raises:
            NotImplementedError: If ``axis`` is neither None nor an integer.
        """
        axis_supported = axis is None or isinstance(
            axis, (int, np.int32, np.int64))
        if not axis_supported:
            raise NotImplementedError(
                "Only integer axis is currently supported.")

        # Stage 1: reduce each block independently.
        partials = np.empty_like(self.blocks, dtype=tuple)
        for entry in self.grid.get_entry_iterator():
            source = self.blocks[entry]
            partial_oid = self.cm.reduce_axis(
                op_name=op_name,
                arr=source.oid,
                axis=axis,
                keepdims=keepdims,
                transposed=source.transposed,
                syskwargs={
                    "grid_entry": source.grid_entry,
                    "grid_shape": source.grid_shape,
                },
            )
            partials[entry] = (
                partial_oid,
                source.grid_entry,
                source.grid_shape,
                False,
            )

        # Work out the shape and block shape of the output.
        kept_sizes = []
        kept_block_sizes = []
        for curr_axis, axis_size in enumerate(self.shape):
            axis_block_size = self.block_shape[curr_axis]
            is_reduced = axis is None or curr_axis == axis
            if is_reduced:
                if not keepdims:
                    continue
                axis_size, axis_block_size = 1, 1
            kept_sizes.append(axis_size)
            kept_block_sizes.append(axis_block_size)
        result_dtype = array_utils.get_reduce_output_type(op_name, self.dtype)
        result_grid = ArrayGrid(
            shape=tuple(kept_sizes),
            block_shape=tuple(kept_block_sizes),
            dtype=result_dtype.__name__,
        )
        result = BlockArray(result_grid, self.cm)

        # Stage 2: combine the per-block results.
        if axis is None:
            if result.shape == ():
                target: Block = result.blocks[()]
            else:
                target: Block = result.blocks[:].item()
            target.oid = self._tree_reduce(
                op_name,
                partials.flatten().tolist(),
                target.grid_entry,
                target.grid_shape,
            )
            return result

        for result_entry in result_grid.get_entry_iterator():
            along_axis = []
            for position in range(self.grid.grid_shape[axis]):
                source_entry = list(result_entry)
                if keepdims:
                    # The reduced axis still exists in the output entry.
                    source_entry[axis] = position
                else:
                    # The reduced axis was dropped; put it back.
                    source_entry.insert(axis, position)
                along_axis.append(partials[tuple(source_entry)])
            target: Block = result.blocks[result_entry]
            target.oid = self._tree_reduce(
                op_name,
                along_axis,
                target.grid_entry,
                target.grid_shape,
            )
        return result