def binarize(self):
    """
    Binarize the AUCell matrix stored on this object.

    ``self.auc_mtx`` is handed to ``pyscenic.binarization.binarize`` and only
    the first element of the returned pair (the binarized matrix) is kept;
    the second element (the thresholds) is discarded.

    :return: The binarized counterpart of ``self.auc_mtx``.
    """
    # Imported locally and aliased so the pySCENIC function cannot be
    # confused with this method, which carries the same name.
    from pyscenic.binarization import binarize as binarize_auc

    binarized, _ = binarize_auc(self.auc_mtx)
    return binarized
auc_mtx = aucell(data, cortest_passed_regulons, num_workers=n_cores) auc_mtx.to_csv(final_regulons_aucell_fname, sep=',', header=True, index=True, compression='gzip') ## Calculate mean score per regulon from multiple iterations auc_mtx = auc_mtx.T auc_mtx['regulon'] = [re.sub('[.].*', '', i) for i in auc_mtx.index] auc_mtx_mean = auc_mtx.groupby('regulon').mean().T auc_mtx_mean.to_csv(final_regulons_aucell_means_fname, sep=',', header=True, index=True, compression='gzip') ## Binarize mean scores bin_mtx, _ = binarize(auc_mtx_mean, num_workers=n_cores) bin_mtx.to_csv(final_regulons_aucell_means_bin_fname, sep=',', header=True, index=True, compression='gzip') ## Calculate Regulon Specificity Scores (RSS) (doi: 10.1016/j.celrep.2018.10.045) rss = regulon_specificity_scores(auc_mtx_mean, metadata['celltype']) rss.to_csv(final_regulons_aucell_means_rss_fname, sep=',', header=True, index=True, compression='gzip') ## Plot RSSs cts = rss.index.sort_values() fig, axs = plt.subplots(4, 4, figsize=[15, 15]) for i in range(0, 4): for j in range(0, 4):
def append_auc_mtx(
    fname: str,
    ex_mtx: pd.DataFrame,
    auc_mtx: pd.DataFrame,
    regulons: Sequence[Type[GeneSignature]],
    seed=None,
    num_workers=1,
) -> None:
    """
    Append AUC matrix to loom file.

    Three things are written to the loom file: the AUC values (column
    attribute), a 0/1 gene-by-regulon membership table (row attribute) and
    the per-regulon binarization thresholds (inside the metadata attribute).

    :param fname: The name of loom file to be append to.
    :param ex_mtx: The expression matrix; only its columns are read here and
        they are matched against the genes of each regulon.
    :param auc_mtx: The matrix that contains the AUC values.
    :param regulons: Collection of regulons that were used for calculation of
        the AUC values.
    :param seed: Forwarded to ``binarize``.
    :param num_workers: Forwarded to ``binarize``.
    """

    # Fetch sequence logo from regulon's context.
    def fetch_logo(context):
        return next((elem for elem in context if elem.endswith('.png')), "")

    # Pick the scalar threshold: either the value itself or the first entry
    # when the threshold is not a plain float.
    def default_threshold(threshold):
        return threshold if isinstance(threshold, float) else threshold[0]

    try:
        name2logo = {reg.name: fetch_logo(reg.context) for reg in regulons}
    except AttributeError:
        # Best effort: without usable name/context attributes no logos are
        # attached.
        name2logo = {}

    # Binarize matrix for AUC thresholds.
    _, auc_thresholds = binarize(auc_mtx, seed=seed, num_workers=num_workers)

    # ``Series.items()`` is used because ``iteritems()`` no longer exists in
    # pandas >= 2.0.
    regulon_thresholds = []
    for name, threshold in auc_thresholds.items():
        value = default_threshold(threshold)
        regulon_thresholds.append({
            "regulon": name,
            "defaultThresholdValue": value,
            "defaultThresholdName": "gaussian_mixture_split",
            "allThresholds": {
                "gaussian_mixture_split": value
            },
            "motifData": name2logo.get(name, ""),
        })

    # Encode genes in regulons as "binary" membership matrix.
    genes = np.array(ex_mtx.columns)
    data = np.zeros(shape=(len(genes), len(regulons)), dtype=int)
    for idx, regulon in enumerate(regulons):
        data[:, idx] = np.isin(genes, regulon.genes).astype(int)
    regulon_assignment = pd.DataFrame(
        data=data,
        index=ex_mtx.columns,
        columns=[regulon.name for regulon in regulons],
    )

    # Create meta-data structure.
    def create_structure_array(df):
        # Create a numpy structured array with one field per DataFrame column.
        return np.array(
            [tuple(row) for row in df.values],
            dtype=np.dtype(list(zip(df.columns, df.dtypes))),
        )

    with lp.connect(fname, validate=False) as ds:
        # The orientation of the loom file is always:
        # - Columns represent cells or aggregates of cells
        # - Rows represent genes
        ds.ca[ATTRIBUTE_NAME_REGULONS_AUC] = create_structure_array(auc_mtx)
        ds.ra[ATTRIBUTE_NAME_REGULONS] = create_structure_array(
            regulon_assignment)

        if ATTRIBUTE_NAME_METADATA in ds.attrs:
            # Existing metadata is either plain JSON or, failing that, handed
            # to ``decompress_meta``.
            try:
                meta_data = json.loads(ds.attrs[ATTRIBUTE_NAME_METADATA])
            except json.decoder.JSONDecodeError:
                meta_data = decompress_meta(ds.attrs[ATTRIBUTE_NAME_METADATA])
        else:
            meta_data = {}
        meta_data["regulonThresholds"] = regulon_thresholds
        ds.attrs[ATTRIBUTE_NAME_METADATA] = compress_meta(meta_data)
# Load the regulons from the YAML file and report the type that came back.
regulons = load_from_yaml(REGULONS_BIN_FNAME)
print("LOADED regulons, type:")
print(type(regulons))

# ------------- Phase III: Cellular regulon enrichment matrix (aka AUCell) -------------
print("STARTING PHASE III")
# Per the original author's note, ex_matrix was already transposed earlier in
# the script, so it is passed through unchanged. The worker count goes in via
# `num_workers` (an earlier, disabled variant of this call used `num_cores`).
auc_mtx = aucell(
    ex_matrix,
    regulons,
    num_workers=nCores,
)
auc_mtx.to_csv(AUC_FNAME, sep="\t")
print("DEFINED auc_mtx")
# Disabled in the original: re-reading AUC_FNAME with pandas and saving a
# seaborn clustermap of auc_mtx to CLUSTERMAP_FNAME.

# ------------- Phase IV: Binarization -------------
# binarize yields a pair: the binarized matrix first, the thresholds second.
auc_binary, auc_thresholds = binarize(auc_mtx)
print(auc_binary)

# Both results are written as tab-separated files.
auc_binary.to_csv(BINARYAUC_FNAME, sep="\t")
auc_thresholds.to_csv(BINARYTHR_FNAME, sep="\t")
print("FINISHED!")
# from dask.diagnostics import ProgressBar
# from arboreto.utils import load_tf_names
# from arboreto.algo import grnboost2
from pyscenic.rnkdb import FeatherRankingDatabase as RankingDatabase
from pyscenic.utils import modules_from_adjacencies, load_motifs
from pyscenic.prune import prune2df, df2regulons
from pyscenic.aucell import aucell
from pyscenic.binarization import binarize

# Snakemake step: load pickled regulons and an expression table, score the
# regulons with AUCell, binarize the scores and write the result.

# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# this is only safe while snakemake.input[0] is produced by a trusted step of
# the same pipeline.
with open(snakemake.input[0], "rb") as f:
    regulons = pickle.load(f)

# The tab-separated table is transposed right after loading.
# NOTE(review): presumably the file is genes x cells and aucell wants
# cells x genes - confirm against the upstream rule.
ex_matrix = pd.read_csv(snakemake.input[1], sep='\t', header=0, index_col=0).T
print("mtx print")

auc_mtx = aucell(ex_matrix, regulons)

# binarize returns the pair (binarized matrix, thresholds), in that order.
bin_mtx, thresholds = binarize(auc_mtx)
print("binarize done")

print("binarise save")
# The first element of the pair - the binarized matrix - is what gets written.
# NOTE(review): if the thresholds were the intended output, write
# `thresholds` here instead.
bin_mtx.to_csv(snakemake.output[0])
def binarize_regulon_enrichment(self):
    """
    Return the AUC thresholds obtained by binarizing ``self.auc_mtx``.

    ``binarize`` yields a pair; only its second element is handed back to the
    caller, the first one is ignored.

    :return: The second element of the pair returned by ``binarize``.
    """
    outcome = binarize(self.auc_mtx)
    _binarized, thresholds = outcome
    return thresholds