def __init__(self, filename: str, mode: str = 'r+') -> None:
    """
    Establish a connection to a .loom file.

    Args:
        filename: Name of the .loom file to open
        mode: read/write mode, accepts 'r+' (read/write) or 'r' (read-only).
            Defaults to 'r+' when omitted; any other value falls back to 'r'
            and a warning is logged.

    Returns:
        Nothing.

    Raises:
        Exceptions raised while opening the file propagate directly.
        Exceptions raised while setting up the managers are re-raised after
        ``self.close()`` has been called.
    """
    # Make sure a valid mode was passed; if not, default to read-only,
    # because the caller is probably doing something unintended.
    if mode not in ('r+', 'r'):
        logging.warning("Wrong mode passed to LoomConnection, using read-only")
        mode = 'r'

    self.mode = mode
    self.filename = filename
    self._file = h5py.File(filename, mode)
    self._closed = False

    try:
        # A file without a main matrix is treated as empty
        if "matrix" in self._file:
            self.shape = self._file["/matrix"].shape
        else:
            self.shape = (0, 0)

        self.layers = loompy.LayerManager(self)
        self.view = loompy.ViewManager(self)
        self.ra = loompy.AttributeManager(self, axis=0)
        self.ca = loompy.AttributeManager(self, axis=1)
        self.attrs = loompy.FileAttributeManager(self._file)
        self.row_graphs = loompy.GraphManager(self, axis=0)
        self.col_graphs = loompy.GraphManager(self, axis=1)

        # Compatibility aliases
        self.layer = self.layers
        self.row_attrs = self.ra
        self.col_attrs = self.ca
    except Exception:
        logging.warning("initialising LoomConnection to %s failed, closing file connection", filename)
        self.close()
        # Bare raise keeps the original exception and traceback intact
        raise
def scan(self, *, items: np.ndarray = None, axis: int = None, layers: Iterable = None, key: str = None, batch_size: int = 8 * 64) -> Iterable[Tuple[int, np.ndarray, loompy.LoomView]]:
    """
    Scan across one axis and return batches of rows (columns) as LoomView objects

    Args
    ----
    items: np.ndarray
        the indexes [0, 2, 13, ... ,973] of the rows/cols to include along the axis
        OR: boolean mask array giving the rows/cols to include.
        Any sequence accepted by ``np.asarray`` may be given.
    axis: int
        0:rows or 1:cols
    batch_size: int
        the number of rows/cols loaded from the file for each chunk (must be >= 1)
    layers: iterable
        if specified it will batch scan only across some of the layers of the loom file
        if layers == None, all layers will be scanned
        if layers == [""] or "", only the default layer will be scanned
        a single layer name given as a plain string is scanned as one layer
    key: str
        Name of an attribute on the *other* axis (a row attribute when axis=1,
        a column attribute when axis=0). If specified, the other axis of every
        view is ordered by ``np.argsort`` of that attribute.

    Returns
    -------
    Iterable that yields triplets (ix, indexes, view)
    ix: int
        offset along the axis of the first row/col of the current chunk
    indexes: np.ndarray[int]
        the selected indexes in the chunk, with the same numbering as the input
        (i.e. positions in np.arange(ds.shape[axis])); this is ix + selection
    view: LoomView
        a view corresponding to the current chunk

    Raises
    ------
    ValueError
        if axis is missing or not 0/1, or if batch_size is smaller than 1.
        Because this is a generator, the error surfaces on the first iteration.
    """
    if axis is None:
        raise ValueError("Axis must be given (0 = rows, 1 = cols)")
    if axis != 0 and axis != 1:
        raise ValueError("axis must be 0 or 1")
    # A non-positive batch size would never advance the scan position
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    if layers is None:
        layers = self.layers.keys()
    elif isinstance(layers, str):
        # A bare string (including "" for the default layer) names one layer
        layers = [layers]
    # Materialise so that one-shot iterables can be reused for every chunk
    layers = list(layers)

    if items is not None:
        items = np.asarray(items)
        if np.issubdtype(items.dtype, np.bool_):
            # Convert a boolean mask into integer positions
            items = np.where(items)[0]

    scan_cols = axis == 1
    if scan_cols:
        n_along, n_other, other_attrs = self.shape[1], self.shape[0], self.ra
    else:
        n_along, n_other, other_attrs = self.shape[0], self.shape[1], self.ca

    ordering: np.ndarray = None
    if key is not None:
        ordering = np.argsort(other_attrs[key])
    else:
        ordering = np.arange(n_other)
    if items is None:
        items = np.arange(n_along)

    ix = 0
    while ix < n_along:
        chunk_size = min(n_along - ix, batch_size)

        # Pick out the items that fall inside this chunk, as chunk-relative offsets
        selection = items - ix
        selection = selection[(selection >= 0) & (selection < chunk_size)]
        if selection.shape[0] == 0:
            ix += chunk_size
            continue

        # Load the whole chunk from the file, then extract genes and cells using fancy indexing
        chunk_layers: Dict[str, loompy.MemoryLoomLayer] = {}
        for layer in layers:
            if scan_cols:
                temp = self.layers[layer][:, ix:ix + chunk_size]
                temp = temp[ordering, :]
                temp = temp[:, selection]
            else:
                temp = self.layers[layer][ix:ix + chunk_size, :]
                temp = temp[:, ordering]
                temp = temp[selection, :]
            chunk_layers[layer] = loompy.MemoryLoomLayer(layer, temp)

        lm = loompy.LayerManager(None)
        for name, mem_layer in chunk_layers.items():
            # NOTE(review): this wraps an already-wrapped MemoryLoomLayer; kept
            # unchanged to preserve existing behaviour - confirm it is intended.
            lm[name] = loompy.MemoryLoomLayer(name, mem_layer)

        indexes = ix + selection
        if scan_cols:
            row_ix, col_ix = ordering, indexes
        else:
            row_ix, col_ix = indexes, ordering
        view = loompy.LoomView(lm, self.ra[row_ix], self.ca[col_ix], self.row_graphs[row_ix], self.col_graphs[col_ix], filename=self.filename, file_attrs=self.attrs)
        yield (ix, indexes, view)
        ix += chunk_size