Example #1
def write_block_s3(block: Any, filename: AnyStr, grid_entry: Tuple,
                   grid_meta: Dict):
    return np.array(
        StoredArrayS3(filename,
                      ArrayGrid.from_meta(grid_meta)).put(grid_entry, block),
        dtype=dict,
    )
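
Here, dtype=dict resolves to NumPy's object dtype, so the S3 put() response is wrapped in a zero-dimensional object array that can be handled like any other block result. A minimal, self-contained sketch of that wrapping (the response dict below is hypothetical, not an actual S3 reply):

import numpy as np

# Hypothetical put() response; a real S3 response carries more fields.
response = {"ETag": "abc123", "VersionId": None}

wrapped = np.array(response, dtype=dict)  # dict maps to the object dtype
print(wrapped.shape)   # () -- a zero-dimensional object array
print(wrapped.item())  # the original dict, recoverable via .item()
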
Example #2
def get_grid(self) -> ArrayGrid:
    try:
        response = self.client.get_object(Bucket=self.container_name,
                                          Key=self.get_meta_key())
        meta_dict = pickle.loads(response["Body"].read())
        return ArrayGrid.from_meta(meta_dict)
    except Exception as _:
        return None
Example #3
def new_block(self, op_name, grid_entry, grid_meta):
    op_func = np.__getattribute__(op_name)
    grid = ArrayGrid.from_meta(grid_meta)
    block_shape = grid.get_block_shape(grid_entry)
    if op_name == "eye":
        # Identity blocks only exist on the main diagonal of the grid.
        assert np.all(np.diff(grid_entry) == 0)
        # np.eye takes each dimension as a separate positional argument.
        return op_func(*block_shape, dtype=grid.dtype)
    else:
        # Constructors like np.zeros and np.ones take a single shape tuple.
        return op_func(block_shape, dtype=grid.dtype)
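
The dispatch by operation name leans on NumPy's own constructors, which differ in calling convention: np.eye takes each dimension as a separate argument, whereas np.zeros and np.ones take a single shape tuple. A standalone illustration of both paths (the block shape is chosen arbitrarily):

import numpy as np

block_shape = (2, 3)  # arbitrary block shape for illustration

# Shape-tuple constructors: np.zeros, np.ones, np.empty, ...
zeros_block = np.__getattribute__("zeros")(block_shape, dtype=float)

# np.eye takes the dimensions as separate positional arguments.
eye_block = np.__getattribute__("eye")(*block_shape, dtype=float)

print(zeros_block.shape, eye_block.shape)  # (2, 3) (2, 3)
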
Example #4
def read_fs(self, filename: str):
    meta = self._fs.read_meta_fs(filename)
    addresses = meta["addresses"]
    grid_meta = meta["grid_meta"]
    grid = ArrayGrid.from_meta(grid_meta)
    ba: BlockArray = BlockArray(grid, self.cm)
    for grid_entry in addresses:
        device_id: DeviceID = DeviceID.from_str(addresses[grid_entry])
        ba.blocks[grid_entry].oid = self._fs.read_block_fs(
            filename,
            grid_entry,
            grid_meta,
            syskwargs={"device_id": device_id})
    return ba
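
From the lookups above, the file metadata appears to map each grid entry to the string form of the device that holds the corresponding block. A hedged sketch of what such a meta dict might look like (the keys, dtype value, and device strings are illustrative, not the actual serialized format):

# Hypothetical layout inferred from read_fs; not the actual on-disk format.
meta = {
    "grid_meta": {"shape": (4, 4), "block_shape": (2, 2), "dtype": "float64"},
    "addresses": {
        (0, 0): "node:0",  # illustrative device-id strings
        (0, 1): "node:1",
    },
}
for grid_entry, device_str in meta["addresses"].items():
    print(grid_entry, "->", device_str)  # each block is read on its owning device
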
Example #5
def __getattr__(self, item):
    if item == "__array_priority__" or item == "__array_struct__":
        # This is triggered by a numpy array on the LHS.
        raise ValueError("Unable to convert numpy array to block array.")
    elif item == "ndim":
        return len(self.shape)
    elif item == "T":
        metaT = self.grid.to_meta()
        metaT["shape"] = tuple(reversed(metaT["shape"]))
        metaT["block_shape"] = tuple(reversed(metaT["block_shape"]))
        gridT = ArrayGrid.from_meta(metaT)
        rarrT = BlockArray(gridT, self.cm)
        rarrT.blocks = np.copy(self.blocks.T)
        for grid_entry in rarrT.grid.get_entry_iterator():
            rarrT.blocks[grid_entry] = rarrT.blocks[grid_entry].transpose()
        return rarrT
    else:
        raise NotImplementedError(item)
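
The T branch performs a two-level transpose: the object array holding the blocks is transposed, and then every block is itself transposed. A NumPy-only sketch of that idea, independent of BlockArray:

import numpy as np

# A 2x3 grid of blocks, each block 4x5.
blocks = np.empty((2, 3), dtype=object)
for i in range(2):
    for j in range(3):
        blocks[i, j] = np.full((4, 5), 10 * i + j)

# Level 1: transpose the grid of blocks. Level 2: transpose each block.
blocks_t = np.copy(blocks.T)
for idx in np.ndindex(blocks_t.shape):
    blocks_t[idx] = blocks_t[idx].T

print(blocks_t.shape, blocks_t[0, 0].shape)  # (3, 2) (5, 4)
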
Example #6
def delete_fs(self, filename: str):
    meta = self._fs.read_meta_fs(filename)
    addresses = meta["addresses"]
    grid_meta = meta["grid_meta"]
    grid = ArrayGrid.from_meta(grid_meta)
    result_grid = ArrayGrid(grid.grid_shape,
                            tuple(np.ones_like(grid.shape, dtype=int)),
                            dtype=dict.__name__)
    rarr = BlockArray(result_grid, self.cm)
    for grid_entry in addresses:
        device_id: DeviceID = DeviceID.from_str(addresses[grid_entry])
        rarr.blocks[grid_entry].oid = self._fs.delete_block_fs(
            filename,
            grid_entry,
            grid_meta,
            syskwargs={"device_id": device_id})
    self._fs.delete_meta_fs(filename)
    return rarr
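
The result grid reuses the grid shape of the deleted array but sets a block shape of all ones, so each deletion response occupies a single-element block. A quick illustration of that block-shape computation (the array shape is arbitrary):

import numpy as np

shape = (8, 6)  # arbitrary array shape
result_block_shape = tuple(np.ones_like(shape, dtype=int))
print(result_block_shape)  # (1, 1) -- one single-element block per grid entry
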
Example #7
    def swapaxes(self, axis1, axis2):
        meta_swap = self.grid.to_meta()
        shape = list(meta_swap["shape"])
        block_shape = list(meta_swap["block_shape"])
        dim = len(shape)
        if axis1 >= dim or axis2 >= dim:
            raise ValueError("axis is larger than the array dimension")
        shape[axis1], shape[axis2] = shape[axis2], shape[axis1]
        block_shape[axis1], block_shape[axis2] = block_shape[
            axis2], block_shape[axis1]
        meta_swap["shape"] = tuple(shape)
        meta_swap["block_shape"] = tuple(block_shape)
        grid_swap = ArrayGrid.from_meta(meta_swap)
        rarr_src = np.ndarray(self.blocks.shape, dtype="O")

        for grid_entry in self.grid.get_entry_iterator():
            rarr_src[grid_entry] = self.blocks[grid_entry].swapaxes(
                axis1, axis2)
        rarr_src = rarr_src.swapaxes(axis1, axis2)

        rarr_swap = BlockArray(grid_swap, self.cm, rarr_src)
        return rarr_swap
Example #8
def get_parts_fs(filename: AnyStr, grid_meta: Dict):
    base: pathlib.Path = pathlib.Path(filename)
    if not base.is_dir():
        return None
    results = []
    grid: ArrayGrid = ArrayGrid.from_meta(grid_meta)
    # This is a multi-dimensional array of blocks, so entries should be relatively small.
    assert np.all(np.array(grid.block_shape) < 2**32)
    contains_all = True
    for grid_entry in grid.get_entry_iterator():
        entry_name = "_".join(list(map(str,
                                       grid_entry))) + "." + ARRAY_FILETYPE
        entry_filename = settings.pj(filename, entry_name)
        if pathlib.Path(entry_filename).is_file():
            results.append(grid_entry)
        else:
            contains_all = False
    if contains_all:
        return "all"
    else:
        if len(results) == 0:
            return None
        else:
            return np.array(results, dtype=np.uint32)
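
Each block lives in its own file under the array's directory, with the grid entry joined by underscores in the file name. A small sketch of that naming convention (the extension is hypothetical; the real one comes from ARRAY_FILETYPE in settings):

import pathlib

ARRAY_FILETYPE = "blk"  # hypothetical extension standing in for the settings value
grid_entry = (1, 0, 2)

entry_name = "_".join(map(str, grid_entry)) + "." + ARRAY_FILETYPE
entry_filename = pathlib.Path("/tmp/myarray") / entry_name
print(entry_filename)  # /tmp/myarray/1_0_2.blk
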
Example #9
def delete_block_s3(filename: AnyStr, grid_entry: Tuple, grid_meta: Dict):
    return np.array(
        StoredArrayS3(filename, ArrayGrid.from_meta(grid_meta)).delete(grid_entry),
        dtype=dict,
    )
Example #10
def read_block_s3(filename: AnyStr, grid_entry: Tuple, grid_meta: Dict):
    return StoredArrayS3(filename,
                         ArrayGrid.from_meta(grid_meta)).get(grid_entry)
Example #11
def write_meta_s3(filename: AnyStr, grid_meta: Dict):
    sa: StoredArrayS3 = StoredArrayS3(filename, ArrayGrid.from_meta(grid_meta))
    return np.array(sa.put_grid(sa.grid), dtype=dict)
Example #12
    def read_array_fs(self, filename: AnyStr):
        file_meta: dict = self.read_meta_fs(filename)
        # Currently, file_meta contains only ArrayGrid.
        grid: ArrayGrid = ArrayGrid.from_meta(file_meta["grid"])
        # First, let's identify which nodes actually contain the data we need.
        result_oids = []
        for device_id in self.cm.devices():
            oid = self.cm.call(
                "get_parts_fs",
                filename,
                file_meta["grid"],
                syskwargs={"device_id": device_id},
            )
            result_oids.append(oid)
        file_results = self.cm.system.get(result_oids)

        # Check if all the nodes have all the data.
        all_has_all = True
        for result in file_results:
            if not (isinstance(result, str) and result == "all"):
                all_has_all = False
                break
        if all_has_all:
            # This is likely a single machine or virtual FS.
            # Load via device grid ordering.
            ba: BlockArray = BlockArray(grid, self.cm)
            for grid_entry in grid.get_entry_iterator():
                device_id: DeviceID = self.cm.device_grid.get_device_id(
                    grid_entry, grid.grid_shape)
                ba.blocks[grid_entry].oid = self.read_block_fs(
                    filename,
                    grid_entry,
                    file_meta["grid"],
                    syskwargs={"device_id": device_id},
                )
            return ba

        # Organize data for reads.
        grid_entry_sets = {}
        for i in range(len(file_results)):
            node_grid_entries: Union[None, np.ndarray] = file_results[i]
            if node_grid_entries is None:
                continue
            device_id = self.cm.devices()[i]
            grid_entry_sets[device_id] = set(
                map(tuple, node_grid_entries.tolist()))

        # The data may be partitioned according to the grid layout for this cluster.
        # Test this and load accordingly if it is.
        aligned = True
        for grid_entry in grid.get_entry_iterator():
            device_id = self.cm.device_grid.get_device_id(
                grid_entry, grid.grid_shape)
            if not (device_id in grid_entry_sets
                    and grid_entry in grid_entry_sets[device_id]):
                aligned = False
                break
        if aligned:
            # If the data partitioning is aligned, just load it using device grid ordering.
            ba: BlockArray = BlockArray(grid, self.cm)
            for grid_entry in grid.get_entry_iterator():
                device_id: DeviceID = self.cm.device_grid.get_device_id(
                    grid_entry, grid.grid_shape)
                ba.blocks[grid_entry].oid = self.read_block_fs(
                    filename,
                    grid_entry,
                    file_meta["grid"],
                    syskwargs={"device_id": device_id},
                )
            return ba

        # This is the worst-case scenario. Make sure we have all blocks.
        grid_entry_to_devices = {}
        for grid_entry in grid.get_entry_iterator():
            grid_entry_to_devices[grid_entry] = []
            for device_id in grid_entry_sets:
                if grid_entry in grid_entry_sets[device_id]:
                    grid_entry_to_devices[grid_entry].append(device_id)
            if len(grid_entry_to_devices[grid_entry]) == 0:
                raise Exception("Unable to find all blocks for %s." % filename)

        warnings.warn(("Loading %s with no data layout guarantee. " %
                       filename) + "This may negatively impact performance. " +
                      "To fix this, rewrite this block array to disk.")
        ba: BlockArray = BlockArray(grid, self.cm)
        for grid_entry in grid_entry_to_devices:
            # Distribute load of read operations randomly over available nodes.
            device_id = np.random.choice(grid_entry_to_devices[grid_entry])
            # Schedule the load operation.
            ba.blocks[grid_entry].oid = self.read_block_fs(
                filename,
                grid_entry,
                file_meta["grid"],
                syskwargs={"device_id": device_id},
            )
        # The blocks are likely not distributed properly,
        # but any operations performed on this block array
        # will force the appropriate placement.
        # We could alternatively invoke touch() on the array,
        # but this would block until the data is loaded.
        return ba
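
The alignment test boils down to a membership check: for every grid entry, the device chosen by the device grid must already report that entry on disk. A stripped-down sketch of that check with plain dicts and sets (device names and the placement function are made up for illustration):

import itertools

grid_shape = (2, 2)
devices = ["dev0", "dev1"]

def placed_device(grid_entry):
    # Hypothetical round-robin placement standing in for device_grid.get_device_id.
    return devices[sum(grid_entry) % len(devices)]

# Which entries each device reports on disk (what get_parts_fs would return).
grid_entry_sets = {
    "dev0": {(0, 0), (1, 1)},
    "dev1": {(0, 1), (1, 0)},
}

aligned = all(
    grid_entry in grid_entry_sets.get(placed_device(grid_entry), set())
    for grid_entry in itertools.product(*map(range, grid_shape))
)
print(aligned)  # True for this layout
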
Example #13
def empty(self, grid_entry, grid_meta):
    grid = ArrayGrid.from_meta(grid_meta)
    block_shape = grid.get_block_shape(grid_entry)
    return np.empty(block_shape, dtype=grid.dtype)