def read_loom(filename: PathLike, sparse: bool = False) -> AnnData:
    """Read ``.loom``-formatted hdf5 file.

    This reads the whole file into memory.

    Beware that you have to explicitly state when you want to read the file
    as sparse data.

    Parameters
    ----------
    filename
        The filename.
    sparse
        Whether to read the data matrix as sparse.

    Returns
    -------
    An :class:`AnnData` whose ``X`` is the transposed loom matrix, with the
    loom column attributes as ``obs`` and the loom row attributes as ``var``.
    """
    filename = fspath(filename)  # allow passing pathlib.Path objects
    from loompy import connect

    if not sparse:
        # Dense read goes straight through h5py, as before.
        with h5py.File(filename, 'r') as f:
            X = f['matrix'][()]

    # A single loompy connection serves both the sparse matrix (if requested)
    # and the attributes; the context manager closes it, so no explicit
    # ``close()`` call is needed.
    with connect(filename, 'r') as lc:
        if sparse:
            X = lc.sparse()
        adata = AnnData(
            X.T,
            obs=dict(lc.col_attrs),  # not ideal: make the generator a dict...
            var=dict(lc.row_attrs))
    return adata
def get_attr_index(loom_file,
                   attr=None,
                   columns=False,
                   as_bool=True,
                   inverse=False):
    """
    Builds a row or column index from a loom file attribute

    Args:
        loom_file (str): Path to loom file
        attr (str): Optional attribute that restricts the index
            If None (or otherwise falsy), every element is included
        columns (boolean): Which axis the index describes
            True: column attributes
            False: row attributes
        as_bool (bool): If True return a boolean mask,
            if False return the positions of the True entries
        inverse (bool): If True, the mask is logically negated
            before it is returned/converted

    Returns:
        idx (1D array): Index of attributes to use
            boolean if as_bool, numerical if not as_bool

    Raises:
        ValueError: If a numerical index is requested but the mask
            is not one dimensional

    Assumptions:
        attr specifies a boolean array attribute in loom_file
    """
    with loompy.connect(filename=loom_file, mode='r') as ds:
        if attr:
            source = ds.ca if columns else ds.ra
            idx = source[attr].astype(bool)
        else:
            n_elements = ds.shape[1] if columns else ds.shape[0]
            idx = np.ones((n_elements, ), dtype=bool)
    if inverse:
        idx = np.logical_not(idx)
    if not as_bool:
        # Numerical positions only make sense for a flat mask
        if idx.ndim != 1:
            raise ValueError('idx must be one dimensional')
        idx = np.where(idx)[0]
    return idx
def load_exp_matrix_as_loom(
        fname,
        attribute_name_cell_id: str = ATTRIBUTE_NAME_CELL_IDENTIFIER,
        attribute_name_gene: str = ATTRIBUTE_NAME_GENE) -> pd.DataFrame:
    """
    Read the main matrix of a loom file into a dataframe.

    :param fname: The name of the loom file to load.
    :param attribute_name_cell_id: Column attribute used to label the cells.
    :param attribute_name_gene: Row attribute used to label the genes.
    :return: A 2-dimensional dataframe (rows = cells x columns = genes).
    """
    with lp.connect(fname, mode='r', validate=False) as ds:
        # Loom files are stored genes (rows) x cells (columns), so the frame
        # is built in that orientation first and transposed on the way out.
        genes_by_cells = pd.DataFrame(data=ds[:, :],
                                      index=ds.ra[attribute_name_gene],
                                      columns=ds.ca[attribute_name_cell_id])
        return genes_by_cells.T
def load_loom_file(input_loom: str, genome: str, ngene: int = None) -> "MemData":
    """Load count matrix from a LOOM file. Currently only support HCA DCP Loom spec.

    Parameters
    ----------

    input_loom : `str`
        The LOOM file, containing the count matrix.
    genome : `str`
        The genome reference.
    ngene : `int`, optional (default: None)
        Minimum number of genes to keep a barcode. Default is to keep all barcodes.

    Returns
    -------

    An MemData object containing a genome-Array2D pair.

    Examples
    --------
    >>> io.load_loom_file('example.loom', genome = 'GRCh38', ngene = 200)
    """
    import loompy

    # Loom attribute names that are renamed on import; any attribute not
    # listed here keeps its original name.
    col_trans = {"CellID": "barcodekey"}
    row_trans = {"Accession": "featurekey", "Gene": "featurename"}

    data = MemData()
    with loompy.connect(input_loom) as ds:
        # The loom matrix is transposed before being stored as CSR.
        mat = csr_matrix(ds.sparse().T)
        barcode_metadata = {
            col_trans.get(name, name): values
            for name, values in ds.col_attrs.items()
        }
        feature_metadata = {
            row_trans.get(name, name): values
            for name, values in ds.row_attrs.items()
        }

    array2d = Array2D(barcode_metadata, feature_metadata, mat)
    array2d.filter(ngene=ngene)
    data.addData(genome, array2d)
    return data
def _load_loom(path_to_file: str, gene_names_attribute_name: str = "Gene") -> AnnData:
    """Load a loom file into an AnnData object, dropping empty cells.

    Parameters
    ----------
    path_to_file
        Path of the loom file to read.
    gene_names_attribute_name
        Row attribute used for ``var_names``. Every other row attribute is
        stored in ``var``.

    Returns
    -------
    AnnData with the loom matrix transposed (loom columns become
    observations). Columns of the loom matrix whose sum is not positive are
    removed. Column attributes go to ``obs`` (or ``obsm`` when they have more
    than one value per cell) and global attributes go to ``uns``.
    """
    import loompy

    var_dict, obs_dict, uns_dict, obsm_dict = {}, {}, {}, {}
    gene_names = None

    # The context manager guarantees the file is closed even if reading fails.
    with loompy.connect(path_to_file) as dataset:
        # Read the matrix once; it is needed both for the mask and for X.
        matrix = dataset[:, :]
        select = matrix.sum(axis=0) > 0  # Take out cells that don't express any gene
        if not select.all():
            warnings.warn("Removing empty cells")

        for row_key in dataset.ra:
            if row_key == gene_names_attribute_name:
                gene_names = dataset.ra[gene_names_attribute_name].astype(str)
            else:
                values = dataset.ra[row_key]
                if isinstance(values, np.ndarray):
                    values = values.ravel()
                var_dict[row_key] = values

        for column_key in dataset.ca:
            values = dataset.ca[column_key][select]
            if isinstance(values, np.ndarray):
                if len(values) == len(values.ravel()):
                    # One value per cell: a regular obs column.
                    obs_dict[column_key] = values.ravel()
                else:
                    # Several values per cell: store as obsm instead.
                    obsm_dict[column_key] = values
            else:
                obs_dict[column_key] = values

        for global_key in dataset.attrs:
            values = dataset.attrs[global_key]
            if isinstance(values, np.ndarray):
                values = values.ravel()
            uns_dict[global_key] = values

    # Change matrix to cells by genes and apply the same mask that was applied
    # to obs/obsm above, so that all per-cell containers have matching length.
    data = matrix.T[select]

    adata = AnnData(X=data, obs=obs_dict, var=var_dict, uns=uns_dict, obsm=obsm_dict)
    if gene_names is not None:
        adata.var_names = gene_names
    return adata
def main():
    """Copy a loom file and add SRA-derived column attributes to the copy.

    Reads the input and output paths from ``sys.argv[1]`` and
    ``sys.argv[2]``. The input file is copied to the output path and the copy
    is annotated with the ``insdc_run_accessions`` and ``file_name`` column
    attributes returned by ``get_sra``.
    """
    loom_in_file = sys.argv[1]
    loom_out_file = sys.argv[2]

    copy2(loom_in_file, loom_out_file)

    # The context manager closes the file even if get_sra or an attribute
    # assignment raises.
    with lp.connect(loom_out_file) as loom_out:
        # get sra ids
        new_features = get_sra(loom_out)

        # adds them to file
        loom_out.ca.insdc_run_accessions = new_features["insdc_run_accessions"]
        loom_out.ca.file_name = new_features["file_name"]
def run_mcmc(loomfile, model, hapcode, start, end, outfile):
    """Fit the ASE and TGX models for the selected genes of a loom file.

    Args:
        loomfile: Path of the loom file, opened read-only.
        model: Pair of model names; ``'<name>.pkl'`` is resolved through
            ``get_data`` for the ASE (index 0) and TGX (index 1) model.
        hapcode: Pair of layer names (maternal, paternal).
        start: First row index to process (0-based).
        end: Row index to stop at (exclusive); ``None`` means all rows.
        outfile: Path of the ``.npz`` output; ``None`` derives a name from
            ``start`` and ``end``.

    For every row in ``[start, end)`` whose ``Selected`` row attribute is
    truthy, the fit summaries are collected under the row's ``GeneID`` and
    the collection is written with ``np.savez_compressed``.
    """
    LOG.warn('Quantifying allele-specific expression in each cell')
    LOG.info('Level-1 verbose is on')
    LOG.debug('Level-2 verbose is also on')

    # NOTE(review): pickle.load can execute arbitrary code; presumably these
    # model files ship with the package -- confirm they are never user-supplied.
    model_path_ase = get_data('%s.pkl' % model[0])
    LOG.warn('ASE model file: %s' % model_path_ase)
    with open(model_path_ase, 'rb') as fh:
        stan_model_ase = pickle.load(fh)
    LOG.debug('ASE model code\n%s' % stan_model_ase.model_code)

    model_path_tgx = get_data('%s.pkl' % model[1])
    LOG.warn('TGX model file: %s' % model_path_tgx)
    with open(model_path_tgx, 'rb') as fh:
        stan_model_tgx = pickle.load(fh)
    LOG.debug('TGX model code\n%s' % stan_model_tgx.model_code)

    param = dict()
    processed = 0
    mat_layer, pat_layer = hapcode

    # The context manager closes the loom file even when a fit raises.
    with loompy.connect(loomfile, 'r') as ds:
        if end is None:
            end = ds.shape[0]
        LOG.warn('Genes from %d to %d (0-based indexing)' % (start, end))
        # Per-cell size relative to the median size.
        c = ds.ca.Size / np.median(ds.ca.Size)
        LOG.debug('c: %s' % '\t'.join(c[:6].astype(str)))

        for g in range(start, end):
            if ds.ra.Selected[g]:
                LOG.warn('Loading data for Gene %s' % ds.ra['GeneID'][g])
                x = ds.layers[mat_layer][g]
                y = ds.layers[pat_layer][g]
                n = x + y  # total is the sum of both haplotype layers
                LOG.debug('x: %s ...' % '\t'.join(x[:6].astype(int).astype(str)))
                LOG.debug('n: %s ...' % '\t'.join(n[:6].astype(int).astype(str)))
                cur_param = dict()
                LOG.warn('Fitting ASE with %s model' % model[0])
                cur_param['ase'] = __mcmc4ase(x, n, stan_model_ase).summary()['summary']
                LOG.warn('Fitting TGX with %s model' % model[1])
                cur_param['tgx'] = __mcmc4tgx(n, c, stan_model_tgx).summary()['summary']
                param[ds.row_attrs['GeneID'][g]] = cur_param
                processed += 1

    LOG.info("All {:,d} genes have been processed.".format(processed))
    if outfile is None:
        outfile = '_scbase.%05d-%05d.param.npz' % (start, end)
    np.savez_compressed(outfile, **param)
def high_mem_get_data(loom_file,
                      layer,
                      feat_attr,
                      cell_attr,
                      valid_ra,
                      valid_ca,
                      remove_version,
                      verbose):
    """
    Loads the selected part of a loom layer into a dense dataframe

    Args:
        loom_file (str): Path to loom file
        layer (str): Layer in loom_file containing counts
        feat_attr (str): Row attribute containing unique feature IDs
        cell_attr (str): Column attribute containing unique cell IDs
        valid_ra (str/None): Row attribute specifying rows to include
        valid_ca (str/None): Column attribute specifying columns to include
        remove_version (bool): If True, remove GENCODE version ID
        verbose (bool): If true, print logging messages

    Returns:
        dat (dataframe): Selected counts, transposed so that rows are
            labelled by cell_attr and columns by feat_attr
    """
    if verbose:
        int_log.info('Obtaining counts from layer {0} in {1}'.format(layer,
                                                                     loom_file))
    # Numerical indices of the rows/columns to keep
    shared_options = dict(loom_file=loom_file, as_bool=False, inverse=False)
    row_idx = utils.get_attr_index(attr=valid_ra,
                                   columns=False,
                                   **shared_options)
    col_idx = utils.get_attr_index(attr=valid_ca,
                                   columns=True,
                                   **shared_options)
    # Pull the counts together with their labels
    with loompy.connect(loom_file) as ds:
        counts = ds.layers[layer].sparse(row_idx, col_idx).todense()
        feature_ids = ds.ra[feat_attr][row_idx]
        cell_ids = ds.ca[cell_attr][col_idx]
        dat = pd.DataFrame(counts, index=feature_ids, columns=cell_ids)
    # Version suffixes are stripped while features are still the index
    if remove_version:
        dat.index = utils.remove_gene_version(dat.index.values)
    return dat.T
def load_loom_file(self, file_path: Path, abs_file_path: Path, mode: str = "r") -> Optional[Loom]:
    """Connect to a loom file and register it through ``add_loom``.

    The connection is opened on ``abs_file_path`` without validation. If
    connecting or registering raises ``KeyError``, the error is logged, the
    file at ``file_path`` is deleted and ``None`` is returned.
    """
    try:
        connection = lp.connect(abs_file_path.as_posix(), mode=mode, validate=False)
        loom = self.add_loom(
            file_path=file_path,
            abs_file_path=abs_file_path,
            loom_connection=connection,
        )
    except KeyError as err:
        logger.error(err)
        os.remove(file_path)
        logger.warning(f"Deleting malformed loom {file_path}")
        return None
    return loom
def __init__(self, loom, schema=None, cell_type_fields=None, validate_loom=True):
    """Set up access to the semantic map stored in a loom file.

    loom: path to loom file
    schema: path to JSON schema file. If schema not specified,
       attempts to use package or repo version.
    cell_type_fields: optionally specify one or more fields used to record cell type
    validate_loom: passed to loompy.connect as ``validate``
    """
    if not schema:
        # Candidate locations, tried in order: package installation first,
        # then running from the repo.
        candidates = (
            "json_schema/expression_matrix_semantic_map.json",
            "../json_schema/expression_matrix_semantic_map.json",
        )
        for relative_path in candidates:
            try:
                schema = pkg_resources.resource_filename(
                    "matrix_semantic_map", relative_path)
            except Exception:
                # Best-effort lookup: a failure to resolve one candidate
                # must not prevent trying the next one.
                continue
            if os.path.isfile(schema):
                break
        else:
            warnings.warn(
                "Schema file (expression_matrix_semantic_map.json) "
                "not found in expected default location for package"
                " installation or running from repo. Please specify"
                " location via schema argument.")
    self.loom = loom  # Connect and close when used.
    self.validate_loom = validate_loom
    self.semantic_map = {"semantic_map": []}
    with loompy.connect(loom, validate=self.validate_loom) as lc:
        # An existing map in the file replaces the empty default.
        if 'semantic_map' in lc.attrs.keys():
            self.semantic_map = json.loads(lc.attrs.semantic_map)
    self.validator = get_validator(schema)
    if cell_type_fields:
        for f in cell_type_fields:
            self.map_cell_type_field(f)
    self.ols = OLSQueryWrapper()
def _load_loom_data_set(paths):
    """Read values and annotations from the loom file at paths["all"]["full"].

    Returns a dictionary with the transposed main matrix ("values") and, where
    available, labels, example names, feature names and batch indices.
    Missing example/feature names are replaced by generated "Cell i"/"Gene i"
    names; missing labels and batch indices stay ``None``.
    """
    labels = None
    batch_indices = None

    with loompy.connect(paths["all"]["full"]) as data_file:
        values = data_file[:, :].T
        n_examples, n_features = values.shape

        if "ClusterID" in data_file.ca:
            cluster_ids = data_file.ca["ClusterID"].flatten()
            if "CellTypes" in data_file.attrs:
                class_names = numpy.array(data_file.attrs["CellTypes"])

                def _class_name(class_id):
                    # Cluster IDs index into the global list of cell types.
                    return class_names[int(class_id)]

                labels = numpy.vectorize(_class_name)(cluster_ids)
            else:
                labels = cluster_ids

        if "Cell" in data_file.ca:
            example_names = data_file.ca["Cell"].flatten()
        else:
            example_names = numpy.array(
                ["Cell {}".format(j + 1) for j in range(n_examples)])

        if "Gene" in data_file.ra:
            feature_names = data_file.ra["Gene"].flatten()
        else:
            feature_names = numpy.array(
                ["Gene {}".format(j + 1) for j in range(n_features)])

        if "BatchID" in data_file.ca:
            batch_indices = data_file.ca["BatchID"].flatten()

    return {
        "values": values,
        "labels": labels,
        "example names": example_names,
        "feature names": feature_names,
        "batch indices": batch_indices
    }
def load_loom_file(self, partial_md5_hash, file_path, abs_file_path, rw=False):
    """Open a loom file in 'r+' mode and register it through ``add_loom``.

    If connecting raises ``KeyError`` the error is printed, the file at
    ``file_path`` is deleted and ``None`` is returned.

    NOTE(review): ``rw`` is accepted but not used; the connection is always
    opened in 'r+' mode.
    """
    try:
        connection = lp.connect(abs_file_path, mode='r+')
    except KeyError as err:
        print(err)
        os.remove(file_path)
        return None
    registration = dict(partial_md5_hash=partial_md5_hash,
                        file_path=file_path,
                        abs_file_path=abs_file_path,
                        loom_connection=connection)
    return self.add_loom(**registration)
def calculate_ss2_metrics_loom(loom_url):
    """Calculate metrics for a loom file.

    The file at ``loom_url`` is downloaded into a temporary directory that is
    removed again before returning.

    Returns a dict with the sum of the main matrix ("expression_sum"), its
    number of non-zero entries ("expression_nonzero") and its number of
    columns ("cell_count").
    """
    with tempfile.TemporaryDirectory(suffix="loom_test") as temp_dir:
        local_loom_path = os.path.join(temp_dir, os.path.basename(loom_url))
        response = requests.get(loom_url, stream=True)
        with open(local_loom_path, "wb") as local_loom_file:
            shutil.copyfileobj(response.raw, local_loom_file)

        # Close the connection before the temporary directory is cleaned up,
        # and read the matrix only once for both metrics.
        with loompy.connect(local_loom_path) as ds:
            matrix = ds[:, :]
            cell_count = ds.shape[1]

    return {
        "expression_sum": numpy.sum(matrix),
        "expression_nonzero": numpy.count_nonzero(matrix),
        "cell_count": cell_count
    }
def test_loom(self, mock_upload_method):
    """Test the loom output.

    Runs the converter with a patched RequestTracker and ``os.remove``, then
    compares every cell of the produced ``test.loom`` with
    ``self.direct_expression``.
    """
    args = argparse.Namespace(request_id="test_id",
                              expression_manifest_key=EXPRESSION_MANIFEST,
                              cell_metadata_manifest_key=CELL_MANIFEST,
                              gene_metadata_manifest_key=GENE_MANIFEST,
                              target_path="test.loom",
                              format="loom",
                              working_dir=".")
    with mock.patch("matrix.docker.matrix_converter.RequestTracker") as mock_request_tracker, \
            mock.patch("os.remove"):
        matrix_converter = MatrixConverter(args)
        matrix_converter.FS = s3fs.S3FileSystem(anon=True)
        mock_request_tracker.return_value.creation_date = "1983-10-11T000000.00Z"
        matrix_converter.run()

    # The context manager closes the loom file even when an assertion fails.
    with loompy.connect("test.loom") as test_loom:
        cell_ids = test_loom.ca["CellID"]
        accessions = test_loom.ra["Accession"]
        self.assertListEqual(cell_ids.tolist(),
                             list(self.direct_expression.keys()))

        for col, cellkey in enumerate(cell_ids):
            # Only non-zero expression values are compared.
            loom_cell_expr = {
                k: v
                for k, v in zip(accessions, test_loom[:, col]) if v != 0
            }
            direct_cell_expr = self.direct_expression[cellkey]
            self.assertListEqual(list(loom_cell_expr.keys()),
                                 list(direct_cell_expr.keys()))
            for gene in loom_cell_expr:
                self.assertAlmostEqual(loom_cell_expr[gene],
                                       direct_cell_expr[gene],
                                       places=2)
def gene_signature_wizard_main(loomfile=None, signaturefile=None):
    """Interactively add a gene signature to a loom file as a row attribute.

    Parameters
    ----------
    loomfile :
        Path of the loom file to augment; prompted for when None or when the
        path is not an existing ``.loom`` file. (Default value = None)
    signaturefile :
        Headerless, single-column file of gene names; prompted for when None.
        (Default value = None)

    Returns
    -------
    None. On confirmation a boolean row attribute, named by the user, is
    written to the loom file; it marks the rows whose ``gene`` attribute is in
    the signature.
    """
    print(loomfile)
    if loomfile is None:
        loomfile = click.prompt(
            "Loom file that you would like to augment with a gene signature: ")
    while not (os.path.isfile(loomfile) and loomfile.endswith('.loom')):
        loomfile = click.prompt(
            "Not a loom file.  Please select loom file that you would like to augment with a gene signature: "
        )
    if signaturefile is None:
        signaturefile = click.prompt(
            "Gene list that you would like to add as a gene signature (headerless file, single column): "
        )
    # genfromtxt returns a 0-d array for a one-line file; atleast_1d keeps
    # len() and the set operations below working for single-gene signatures.
    signature = np.atleast_1d(np.genfromtxt(signaturefile, dtype=str))
    with loompy.connect(loomfile, validate=False) as loom:
        genes = loom.ra['gene']
        missing = np.setdiff1d(signature, genes)
        present = np.intersect1d(signature, genes)
        proceed = 'y'
        # Ask only when genes are really missing; duplicated entries in the
        # signature alone are not a reason to prompt.
        if missing.size > 0:
            proceed = click.prompt(
                "The following genes ({} in total) in the given signature\n{}\nare not in the loom file.  Would you like to proceed with those that are ({} genes in total)?"
                .format(len(missing), ", ".join(missing), len(present)),
                type=click.Choice(['n', 'y']),
                default='y')
        if proceed == 'y':
            # Default name: signature file name up to its first dot.
            signature_name = click.prompt(
                "What would you like to name this signature?",
                default=os.path.basename(signaturefile).split('.')[0])
            loom.ra[signature_name] = np.isin(genes, signature)
def read_loom(filename: str, tag: str = None):
    """Load a loom file fully into memory and wrap it in a SCopeLoom.

    The main matrix is returned as a cells x genes dataframe, and the column,
    row and global attributes are copied into plain dictionaries. The
    ``MetaData`` global attribute is decoded (compressed or plain JSON).
    """
    with lp.connect(filename, mode="r", validate=False) as loom:
        # Main matrix, labelled by Gene/CellID and flipped to cells x genes
        ex_mtx = pd.DataFrame(loom[:, :],
                              index=loom.ra.Gene,
                              columns=loom.ca.CellID).T
        # Plain-dict copies of the three attribute collections
        col_attrs = dict(loom.ca.items())
        row_attrs = dict(loom.ra.items())
        global_attrs = dict(loom.attrs.items())
        # MetaData is either compressed+encoded or stored as plain JSON
        raw_meta_data = global_attrs["MetaData"]
        try:
            meta_data = SCopeLoom.decompress_decode(value=raw_meta_data)
        except Exception:
            # MetaData is uncompressed
            meta_data = json.loads(global_attrs["MetaData"])
        global_attrs["MetaData"] = meta_data

    scope_loom = SCopeLoom(
        filename=filename,
        ex_mtx=ex_mtx,
        col_attrs=col_attrs,
        row_attrs=row_attrs,
        global_attrs=global_attrs,
        tag=tag,
    )
    if "embeddings" in scope_loom.get_meta_data():
        scope_loom.convert_loom_embeddings_repr_to_internal_repr()

    # Multi-runs mode carries two extra regulon settings
    if scope_loom.has_scenic_multi_runs_data():
        scope_loom.set_scenic_min_genes_regulon(
            min_genes_regulon=global_attrs["MetaData"]["regulonSettings"]
            ["min_genes_regulon"])
        scope_loom.set_scenic_min_regulon_gene_occurrence(
            min_regulon_gene_occurrence=global_attrs["MetaData"]
            ["regulonSettings"]["min_regulon_gene_occurrence"])
    return scope_loom
def load_loom_file(self, partial_md5_hash: str, file_path: str, abs_file_path: str, mode: str = "r"):
    """Connect to a loom file (no validation) and register it via ``add_loom``.

    If connecting raises ``KeyError`` the error is logged, the file at
    ``file_path`` is deleted and ``None`` is returned.
    """
    try:
        connection = lp.connect(abs_file_path, mode=mode, validate=False)
    except KeyError as err:
        logger.error(err)
        os.remove(file_path)
        logger.warning(f"Deleting malformed loom {file_path}")
        return None

    registration = dict(
        partial_md5_hash=partial_md5_hash,
        file_path=file_path,
        abs_file_path=abs_file_path,
        loom_connection=connection,
    )
    return self.add_loom(**registration)
def preprocess(self):
    """Load the smFISH loom file and drop cells without any expression.

    Returns the tuple ``(data, labels, cell_types, x_coord, y_coord)``:
    ``data`` is the main matrix restricted to columns with a positive sum and
    transposed; the other four are the ``ClusterID``, ``ClusterName``, ``X``
    and ``Y`` column attributes as column vectors restricted the same way.
    """
    print("Preprocessing smFISH dataset")

    # The context manager closes the file even if an attribute is missing.
    with loompy.connect(self.save_path + self.download_name) as ds:
        # Read the matrix once; it provides both the mask and the data.
        matrix = ds[:, :]
        labels = np.array(ds.ca['ClusterID'])
        cell_types = np.array(ds.ca['ClusterName'])
        x_coord = np.array(ds.ca['X'])
        y_coord = np.array(ds.ca['Y'])

    select = matrix.sum(axis=0) > 0  # Take out cells that doesn't express any gene

    labels = np.reshape(labels, (labels.shape[0], 1))[select]
    cell_types = np.reshape(cell_types, (cell_types.shape[0], 1))[select]
    x_coord = np.reshape(x_coord, (x_coord.shape[0], 1))[select]
    y_coord = np.reshape(y_coord, (y_coord.shape[0], 1))[select]

    data = matrix[:, select].T  # change matrix to cells by genes

    print("Finished preprocessing smFISH dataset")
    return data, labels, cell_types, x_coord, y_coord
def get_pct(loom_file, num_val, axis=0):
    """
    Expresses a number as a percentage of the size of a loom axis

    Args:
        loom_file (str): Path to loom file
        num_val (int): Number to calculate percentage with
        axis (int): Axis to calculate percentage with
            0: rows
            1: columns

    Returns:
        pct (float): Percentage of num_val/axis * 100

    Raises:
        ValueError: If axis is neither 0 nor 1
    """
    # Reject bad axes before the file is touched
    if axis != 0 and axis != 1:
        raise ValueError('Axis must be 0 or 1')
    with loompy.connect(filename=loom_file, mode='r') as ds:
        axis_size = ds.shape[axis]
        pct = num_val / axis_size * 100
    return pct
def get_loom(self, loom_file_path: Path, mode: str = "r") -> Loom:
    """Return the Loom object for ``loom_file_path``, loading or reconnecting it as needed.

    The work happens inside ``self.file_locks[abs_loom_file_path]``.

    * If the file is already in ``self.active_looms`` and its connection mode
      equals ``mode``, the cached object is returned.
    * If it is cached but the mode differs, or reading the mode raises
      ``AttributeError``, a new connection is opened in ``mode`` and assigned
      to the cached object's ``loom_connection``.
    * Otherwise the file is loaded through ``self.load_loom_file``.

    Raises:
        ValueError: if the absolute path does not exist.
    """
    abs_loom_file_path = self.get_loom_absolute_file_path(loom_file_path)
    # presumably a per-file lock serialising access to this loom -- confirm
    with self.file_locks[abs_loom_file_path]:
        if not abs_loom_file_path.exists():
            logger.error(f"The file {loom_file_path} does not exists.")
            raise ValueError(
                f"The file located at {abs_loom_file_path} does not exist."
            )
        if abs_loom_file_path in self.active_looms:
            logger.debug("Should be preloaded")
            loom = self.active_looms[abs_loom_file_path]
            try:
                logger.debug(
                    f"Current mode: {self.active_looms[abs_loom_file_path].get_connection().mode}, wanted mode {mode}"
                )
                if self.active_looms[abs_loom_file_path].get_connection().mode == mode:
                    logger.debug(
                        f"Returning pre-loaded loom file {loom_file_path}. Object {id(loom)}"
                    )
                    return loom
                else:
                    # Mode mismatch: logged, then execution falls through to
                    # the reconnect below.
                    logger.error(
                        f"Mode {mode} was requested for {loom_file_path}, but mode is currently {self.active_looms[abs_loom_file_path].get_connection().mode}"
                    )
            except AttributeError:
                # Reading the mode failed; the code treats this as a closed
                # connection and reconnects below.
                logger.error("Loom was previously closed")
            # NOTE(review): on a mode mismatch the previous connection is
            # replaced here without an explicit close -- confirm intended.
            loom.loom_connection = lp.connect(
                abs_loom_file_path.as_posix(), mode=mode, validate=False)
        else:
            loom = self.load_loom_file(mode=mode,
                                       file_path=loom_file_path,
                                       abs_file_path=abs_loom_file_path)
        # NOTE(review): if load_loom_file can return None, the message below
        # raises AttributeError -- confirm against its implementation.
        logger.debug(
            f"Returning newly loaded loom file {loom_file_path}. Object {id(loom)}, mode {loom.get_connection().mode}"
        )
        return loom
def validate(self, path: str, strictness: str = "speconly") -> bool:
    """
    Validate a file for conformance to the Loom specification

    Args:
        path: Full path to the file to be validated
        strictness: "speconly" or "conventions"

    Returns:
        True if the file passed every check that was run, False otherwise.
        Help messages for failed checks are appended to ``self.errors``.

    Raises:
        ValueError: if ``self.backend`` is neither "hdf5" nor "zarr".

    Remarks:
        In "speconly" mode, conformance is assessed relative to the file format specification
        at http://linnarssonlab.org/loompy/format/. In "conventions" mode, conformance is additionally
        assessed relative to attribute name and data type conventions given at http://linnarssonlab.org/loompy/conventions/.
    """
    if self.backend == "hdf5":
        open_func = h5py.File
    elif self.backend == "zarr":
        open_func = zarr.open_group
    else:
        # Fail with a clear message instead of an unbound-variable error.
        raise ValueError("Unsupported backend: {!r}".format(self.backend))

    f = open_func(path, mode="r")
    try:
        if self.version is None:
            self.version = get_loom_spec_version(f)
        valid1 = self.validate_spec(f)
        if not valid1:
            self.errors.append(
                "For help, see http://linnarssonlab.org/loompy/format/")
    finally:
        # Only the HDF5 handle is closed explicitly, also when a check raises.
        if self.backend == "hdf5":
            f.close()

    valid2 = True
    if strictness == "conventions":
        with loompy.connect(path, mode="r") as ds:
            valid2 = self.validate_conventions(ds)
            if not valid2:
                self.errors.append(
                    "For help, see http://linnarssonlab.org/loompy/conventions/"
                )
    return valid1 and valid2
def __init__(self,
             project: str,
             filename: str,
             file_path: str,
             callback_on_close: Callable = None,
             close_connection_on_exit: bool = True) -> None:
    """Store the file identity and open the loom connection in 'r+' mode.

    If connecting fails, a warning is logged, the object is marked closed,
    ``callback_on_close`` (when given) is called with this object, and the
    original exception is re-raised.
    """
    self.project = project
    self.filename = filename
    self.file_path = file_path
    self.callback_on_close = callback_on_close
    self.close_connection_on_exit = close_connection_on_exit

    self.ds = None
    self._closed = False
    try:
        # TODO: when loompy library is updated with a default
        # Unix timestamp for missing time fields, this should
        # be set back to 'r' for safety reasons
        self.ds = loompy.connect(file_path, 'r+')
    except Exception as err:
        logging.warning("Could not open loom file at %s, closing LoomExpand object", file_path)
        connection = self.ds
        if connection is not None:
            connection.close(True)
        self.ds = None
        # The callback runs before the object is flagged as closed.
        if self.callback_on_close is not None:
            self.callback_on_close(self)
        self._closed = True
        raise err
def create_subsetted_loom_with_genemask(loom, output_loom, cellmask, genemask):
    """Deprecated. Writes a row/column subset of ``loom`` to ``output_loom``.

    Parameters
    ----------
    loom :
        Open loom connection to subset; must contain the '' layer.
    output_loom :
        Path of the loom file to create.
    cellmask :
        Boolean mask over the columns of ``loom``.
    genemask :
        Boolean mask over the rows of ``loom``.

    Returns
    -------
    None. The new file holds the masked '' layer, the masked row/column
    metadata and the masked version of every other layer.
    """
    print("THIS FUNCTION IS DEPRECATED, USE loompy.new INSTEAD!!!")
    import loompy
    from panopticon.utilities import recover_meta

    if '' not in loom.layers.keys():
        raise Exception("Expecting '' layer, yet none found")
    rowmeta, colmeta = recover_meta(loom)
    masks_fit = (len(genemask) == loom.shape[0]
                 and len(cellmask) == loom.shape[1])
    if not masks_fit:
        raise Exception(
            "genemask and cellmask must be boolean masks with length equal to the number of rows and columns of loom, respectively"
        )

    # Positional indices of the rows/columns that are kept
    gene_idx = genemask.nonzero()[0]
    cell_idx = cellmask.nonzero()[0]

    main_matrix = loom[''][gene_idx, :][:, cell_idx]
    loompy.create(output_loom, main_matrix,
                  rowmeta[genemask].to_dict("list"),
                  colmeta[cellmask].to_dict("list"))

    extra_layers = list(filter(lambda name: name != '', loom.layer.keys()))
    with loompy.connect(output_loom) as smallerloom:
        for layer in extra_layers:
            smallerloom[layer] = loom[layer][:, cell_idx][gene_idx, :]
def check_pca_batches(loom_file, n_pca=50, batch_size=512, verbose=False):
    """
    Checks and adjusts batch size for PCA

    Args:
        loom_file (str): Path to loom file
        n_pca (int): Number of components for PCA
        batch_size (int): Size of chunks
        verbose (bool): Print logging messages

    Returns:
        batch_size (int): Updated batch size to work with PCA

    Raises:
        ValueError: If the loom file has fewer columns than n_pca
    """
    # Get the number of cells
    with loompy.connect(loom_file) as ds:
        num_total = ds.shape[1]
    # Check if batch_size and PCA are even reasonable
    if num_total < n_pca:
        err_msg = 'More PCA components {0} than samples {1}'.format(
            n_pca, num_total)
        if verbose:
            decomp_log.error(err_msg)
        # Raise with the message itself, not with the logger object
        raise ValueError(err_msg)
    if batch_size < n_pca:
        batch_size = n_pca
    # Adjust based on expected size: the remainder chunk must not be
    # smaller than the number of components
    mod_total = num_total % batch_size
    adjusted_batch = False
    if mod_total < n_pca:
        adjusted_batch = True
        batch_size = batch_size - n_pca + mod_total
    if batch_size < n_pca:
        batch_size = num_total
    # Report to user
    if verbose and adjusted_batch:
        decomp_log.info(
            'Adjusted batch size to {0} for PCA'.format(batch_size))
    # Return value
    return batch_size
def load_loom(filename):
    """Read a loom file into a sparse matrix plus attribute tables

    From github.com/simslab/scHPF

    Parameters
    ----------
    filename: str
        file to load

    Returns
    -------
    coo : coo_matrix
        transposed sparse matrix of the loom file (cell x gene)
    genes : Dataframe
        Dataframe of the row (gene) attributes. `Accession` and `Gene` come
        first when present, followed by the remaining attributes as returned
        by `columns.difference`
    cells : Dataframe
        Dataframe of the column (cell) attributes
    """
    import loompy

    # pull everything that is needed out of the file
    with loompy.connect(filename) as ds:
        loom_genes = pd.DataFrame(dict(ds.ra.items()))
        loom_cells = pd.DataFrame(dict(ds.ca.items()))
        loom_coo = ds.sparse().T

    # put the identifying gene attributes in front of all the others
    preferred = ['Accession', 'Gene']
    first_cols = [name for name in preferred if name in loom_genes.columns]
    rest_cols = loom_genes.columns.difference(first_cols).tolist()
    column_order = first_cols + rest_cols
    loom_genes = loom_genes[column_order]

    return loom_coo, loom_genes, loom_cells
def test_ops(self):
    """Request a filtered loom matrix and check the number of returned cells."""
    # Filter should return four of the five test bundles
    self.request_ids = self._post_matrix_service_request(filter_={
        "op": "and",
        "value": [{
            "op": "=",
            "field": "library_preparation_protocol.library_construction_method.ontology",
            "value": "EFO:0008931"
        }, {
            "op": "!=",
            "field": "derived_organ_label",
            "value": "decidua"
        }, {
            "op": "in",
            "field": "dss_bundle_fqid",
            "value": INPUT_BUNDLE_IDS[self.dss_env]
        }]
    }, format_="loom")
    WaitFor(self._poll_all_requests_in_status, self.request_ids, MatrixRequestStatus.COMPLETE.value)\
        .to_return_value(True, timeout_seconds=1200)
    matrix_location = self._retrieve_matrix_location(
        self.request_ids[GenusSpecies.HUMAN.value])

    # The download lives in a temporary directory that is removed afterwards;
    # the loom connection is closed before the directory is cleaned up.
    with tempfile.TemporaryDirectory(suffix="loom_ops_test") as temp_dir:
        local_loom_path = os.path.join(temp_dir,
                                       os.path.basename(matrix_location))
        response = requests.get(matrix_location, stream=True)
        with open(local_loom_path, "wb") as local_loom_file:
            shutil.copyfileobj(response.raw, local_loom_file)
        with loompy.connect(local_loom_path) as ds:
            cell_count = ds.shape[1]

    self.assertEqual(cell_count, 4)
def continuous_loom(input_file):
    """Continuous matrix attribute handler for loom files

    Maps the loom-specific attributes of a file onto the general attribute
    names expected by the content testing functions, so those functions do
    not depend on the file format.

    Arguments:
        input_file (str): input loom file

    Returns:
        (dict): attribute handler, consistent structure regardless of file type
    """
    # The connection is deliberately left open: it is handed back to the
    # caller both as the value source and as the file handle.
    handle = loompy.connect(input_file)
    handler = {
        "Track": handle.ra.tracks,
        "Position": handle.ca.position,
        "Value": handle,
        "FH": handle  # loom file handle, should be closed after content testing
    }
    return handler
def _load_data(self, skip_row=None, skip_col=None, **kwargs):
    """Read the transposed loom matrix, filtered by the optional skip callbacks.

    ``skip_row(i)`` is asked about every index along the second axis of the
    transposed matrix and ``skip_col(i)`` about every index along the first
    axis; indices for which the callback is truthy are dropped. The resulting
    masks are stored on ``self._use_rows_mask`` / ``self._use_cols_mask``.

    Returns the tuple ``(attrs, X, meta_df, meta_df.index)`` where ``attrs``
    are continuous variables named after the kept ``Gene`` row attribute
    values (empty if the file has none) and ``meta_df`` holds the kept
    column attributes.
    """
    with lp.connect(self._file_name) as ds:
        X = ds[:, :].T

        if skip_row is None:
            self._use_rows_mask = np.ones(X.shape[1], dtype=bool)
        else:
            self._use_rows_mask = np.array(
                [not skip_row(i) for i in range(X.shape[1])])

        if skip_col is None:
            self._use_cols_mask = np.ones(X.shape[0], dtype=bool)
        else:
            self._use_cols_mask = np.array(
                [not skip_col(i) for i in range(X.shape[0])])

        # First axis is filtered with the "cols" mask, second with "rows".
        X = X[self._use_cols_mask, :][:, self._use_rows_mask]

        if hasattr(ds.ra, "Gene"):
            gene_names = ds.ra.Gene[self._use_rows_mask]
        else:
            gene_names = []
        attrs = [ContinuousVariable.make(str(name)) for name in gene_names]

        meta_columns = {}
        for key in ds.ca.keys():
            meta_columns[key] = ds.ca[key][self._use_cols_mask]
        meta_df = pd.DataFrame(meta_columns)

    return attrs, X, meta_df, meta_df.index
def calculate_ss2_metrics_loom(loom_url):
    """Calculate metrics for a loom file.

    The zip archive at ``loom_url`` is downloaded into a temporary directory,
    the first member ending in ".loom" is extracted there, and the directory
    is removed again before returning.

    Returns a dict with the sum of the main matrix ("expression_sum"), its
    number of non-zero entries ("expression_nonzero") and its number of
    columns ("cell_count").
    """
    with tempfile.TemporaryDirectory(suffix="loom_zip_test") as temp_dir:
        local_loom_zip_path = os.path.join(temp_dir,
                                           os.path.basename(loom_url))
        response = requests.get(loom_url, stream=True)
        with open(local_loom_zip_path, "wb") as local_loom_zip_file:
            shutil.copyfileobj(response.raw, local_loom_zip_file)

        # Extract only the loom member, into its own sub-directory, so that
        # neither the working directory nor the downloaded archive is
        # overwritten.
        extract_dir = os.path.join(temp_dir, "extracted")
        with zipfile.ZipFile(local_loom_zip_path) as loom_zip:
            loom_name = [
                n for n in loom_zip.namelist() if n.endswith(".loom")
            ][0]
            local_loom_path = loom_zip.extract(loom_name, path=extract_dir)

        # Read the matrix once and close the file before cleanup.
        with loompy.connect(local_loom_path) as ds:
            matrix = ds[:, :]
            cell_count = ds.shape[1]

    return {
        "expression_sum": numpy.sum(matrix),
        "expression_nonzero": numpy.count_nonzero(matrix),
        "cell_count": cell_count
    }
def __init__(self, loom_file_path, total_clusters=6):
    """Load the spliced/unspliced/ambiguous layers and cluster labels of a loom file.

    Each layer is read as float and transposed. ``self.cells`` stacks the
    three transposed layers, moves the layer axis to position 1 and passes
    the result through ``sphere_data``. The connection stays open on
    ``self.ds``.
    """
    self.total_clusters = total_clusters
    self.ds = loompy.connect(loom_file_path)

    float_type = np.dtype(float)
    layers = self.ds.layer
    spliced = layers["spliced"][:, :].astype(float_type)
    unspliced = layers["unspliced"][:, :].astype(float_type)
    ambiguous = layers["ambiguous"][:, :].astype(float_type)

    self.spliced = np.transpose(spliced)
    self.unspliced = np.transpose(unspliced)
    self.ambig = np.transpose(ambiguous)
    self.cells = np.stack((self.spliced, self.unspliced, self.ambig))

    column_attributes = dict(self.ds.col_attrs.items())
    self.clusters = column_attributes["Clusters"][:]

    print(self.unspliced.shape)
    print(self.cells.shape)

    # Move the stacked layer axis from position 0 to position 1
    self.cells = np.transpose(self.cells, (1, 0, 2))
    self.cells = sphere_data(self.cells)
    print(self.cells.shape)

    print("len cells", len(self.cells))
    print("shape cells[0]", self.cells[0].shape)
def read_loom(
    file_path: str,
    mode_type: str = "rna",
    force_conversion=None,
) -> LoomX:
    """Read a SCope-style loom file into a LoomX object.

    :param file_path: Path of the loom file, opened read-only without validation.
    :param mode_type: Value of a ``ModeType`` member.
    :param force_conversion: Conversion flags passed on to ``_read_scope_loom``.
        Defaults to ``{"annotations": False, "metrics": False}``.
    :return: The object built by ``_read_scope_loom``.
    :raises Exception: If ``mode_type`` is not a valid mode type, or if the
        file has none of the global attributes in ``GLOBAL_ATTRIBUTE_KEY_VX``.
    """
    # A fresh dict per call avoids sharing one mutable default between calls.
    if force_conversion is None:
        force_conversion = {"annotations": False, "metrics": False}

    try:
        _mode_type = ModeType(mode_type)
    except ValueError as err:
        mode_types = [w.value for w in ModeType if w.value != "_"]
        raise Exception(
            f"The given mode type '{mode_type}' does not exist. Choose one of: {', '.join(mode_types)}."
        ) from err

    with lp.connect(filename=file_path, mode="r", validate=False) as loom_connection:
        if any(key in loom_connection.attrs for key in GLOBAL_ATTRIBUTE_KEY_VX):
            return _read_scope_loom(
                loom_connection=loom_connection,
                mode_type=_mode_type,
                force_conversion=force_conversion,
            )
    raise Exception(f"Unable to read the loom at {file_path}")