def load_signatures(fname: str) -> Sequence[Type[GeneSignature]]:
    """
    Load gene signatures from disk.

    Supported file formats are GMT, DAT (pickled), YAML or CSV (enriched motifs).

    :param fname: The name of the file that contains the signatures.
    :return: A list of gene signatures.
    """
    extension = PurePath(fname).suffixes
    if is_valid_suffix(extension, 'ctx'):
        # CSV/TSV of enriched motifs.
        return df2regulons(load_motifs(fname, sep=suffixes_to_separator(extension)))
    elif is_valid_suffix(extension, 'ctx_yaml'):
        return load_from_yaml(fname)
    elif '.gmt' in extension:
        sep = guess_separator(fname)
        return GeneSignature.from_gmt(fname, field_separator=sep, gene_separator=sep)
    elif '.dat' in extension:
        with openfile(fname, 'rb') as f:
            return pickle.load(f)
    else:
        raise ValueError("Unknown file format \"{}\".".format(fname))
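# Hypothetical usage sketch for load_signatures (not part of the original source);
# the file names below are assumptions, any GMT, DAT, YAML or CSV path would work.
regulons = load_signatures("motifs.csv")        # enriched-motif table -> regulons
signatures = load_signatures("signatures.gmt")  # plain GMT gene sets
print("{} signatures loaded".format(len(signatures)))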
def save_enriched_motifs(df, fname: str) -> None:
    """
    Save enriched motifs.

    Supported file formats are CSV, TSV, GMT, DAT (pickle), JSON or YAML.

    :param df: The dataframe with enriched motifs.
    :param fname: The name of the file to write to; the extension selects the format.
    """
    extension = PurePath(fname).suffixes
    if is_valid_suffix(extension, 'ctx'):
        df.to_csv(fname, sep=suffixes_to_separator(extension))
    else:
        regulons = df2regulons(df)
        if '.json' in extension:
            name2targets = {r.name: list(r.gene2weight.keys()) for r in regulons}
            with openfile(fname, 'w') as f:
                f.write(json.dumps(name2targets))
        elif '.dat' in extension:
            with openfile(fname, 'wb') as f:
                pickle.dump(regulons, f)
        elif '.gmt' in extension:
            GeneSignature.to_gmt(fname, regulons)
        elif is_valid_suffix(extension, 'ctx_yaml'):
            save_to_yaml(regulons, fname)
        else:
            raise ValueError("Unknown file format \"{}\".".format(fname))
def save_enriched_motifs(df, fname: str) -> None:
    """
    Save enriched motifs.

    Supported file formats are CSV, TSV, GMT, DAT (pickle), JSON or YAML.

    :param df: The dataframe with enriched motifs.
    :param fname: The name of the file to write to; the extension selects the format.
    """
    extension = os.path.splitext(fname)[1].lower()
    if extension in FILE_EXTENSION2SEPARATOR:
        df.to_csv(fname, sep=FILE_EXTENSION2SEPARATOR[extension])
    else:
        regulons = df2regulons(df)
        if extension == '.json':
            name2targets = {r.name: list(r.gene2weight.keys()) for r in regulons}
            with open(fname, 'w') as f:
                f.write(json.dumps(name2targets))
        elif extension == '.dat':
            with open(fname, 'wb') as f:
                pickle.dump(regulons, f)
        elif extension == '.gmt':
            GeneSignature.to_gmt(fname, regulons)
        elif extension in {'.yaml', '.yml'}:
            save_to_yaml(regulons, fname)
        else:
            raise ValueError("Unknown file format \"{}\".".format(fname))
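# A minimal round-trip sketch (an assumption for illustration: `df` is a
# motif-enrichment dataframe as produced by prune2df; the file names are hypothetical).
save_enriched_motifs(df, "regulons.dat")  # pickles the regulons derived from df
save_enriched_motifs(df, "motifs.csv")    # writes the raw enrichment table instead
loaded = load_signatures("regulons.dat")  # read the regulons back in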
def load_signatures(fname: str) -> Sequence[Type[GeneSignature]]:
    """
    Load gene signatures from disk.

    Supported file formats are GMT, DAT (pickled), YAML or CSV (enriched motifs).

    :param fname: The name of the file that contains the signatures.
    :return: A list of gene signatures.
    """
    extension = os.path.splitext(fname)[1].lower()
    if extension in FILE_EXTENSION2SEPARATOR:
        return df2regulons(load_motifs(fname, sep=FILE_EXTENSION2SEPARATOR[extension]))
    elif extension in {'.yaml', '.yml'}:
        return load_from_yaml(fname)
    elif extension.endswith('.gmt'):
        sep = guess_separator(fname)
        return GeneSignature.from_gmt(fname, field_separator=sep, gene_separator=sep)
    elif extension == '.dat':
        with open(fname, 'rb') as f:
            return pickle.load(f)
    else:
        raise ValueError("Unknown file format \"{}\".".format(fname))
def prune_targets_command(args):
    """
    Prune targets/find enriched features.
    """
    # Loading from YAML is extremely slow; potential improvements are switching to
    # JSON or using a CLoader (https://stackoverflow.com/questions/27743711/can-i-speedup-yaml).
    # The alternative ultimately opted for is binary pickling.
    if any(args.module_fname.name.endswith(ext) for ext in FILE_EXTENSION2SEPARATOR):
        LOGGER.info("Creating modules.")
        modules = _df2modules(args)
    else:
        LOGGER.info("Loading modules.")
        modules = _load_modules(args.module_fname.name)

    if len(modules) == 0:
        LOGGER.error("Not a single module loaded.")
        sys.exit(1)

    LOGGER.info("Loading databases.")
    dbs = _load_dbs(args.database_fname)

    LOGGER.info("Calculating regulons.")
    motif_annotations_fname = args.annotations_fname.name
    calc_func = find_features if args.no_pruning == "yes" else prune2df
    with ProgressBar() if args.mode == "dask_multiprocessing" else NoProgressBar():
        out = calc_func(dbs, modules, motif_annotations_fname,
                        rank_threshold=args.rank_threshold,
                        auc_threshold=args.auc_threshold,
                        nes_threshold=args.nes_threshold,
                        client_or_address=args.mode,
                        module_chunksize=args.chunk_size,
                        num_workers=args.num_workers)

    LOGGER.info("Writing results to file.")
    if args.output_type == 'csv':
        out.to_csv(args.output)
    else:
        name2targets = {r.name: list(r.gene2weight.keys()) for r in df2regulons(out)}
        args.output.write(json.dumps(name2targets))
        args.output.close()
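# Hypothetical driver for prune_targets_command; every value and file name below is
# an assumption for illustration only (the real CLI builds `args` via argparse, and
# the exact argument types may differ).
import argparse
args = argparse.Namespace(
    module_fname=open("modules.dat", "rb"),
    database_fname=[open("mm10.rankings.feather", "rb")],
    annotations_fname=open("motifs.tbl"),
    no_pruning="no", mode="dask_multiprocessing",
    rank_threshold=5000, auc_threshold=0.05, nes_threshold=3.0,
    chunk_size=100, num_workers=4,
    output_type="csv", output=open("motifs.csv", "w"),
)
prune_targets_command(args)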
    motif_enrichment_table = pd.read_pickle(path=f)
elif f.endswith('.csv') or f.endswith('.csv.gz'):
    motif_enrichment_table = utils.read_feature_enrichment_table(
        fname=args.motif_enrichment_table_fname.name, sep=",")
else:
    raise Exception(
        "VSN ERROR: The aggregated feature enrichment table is in the wrong format. "
        "Expecting .pickle or .csv formats.")
print(f"... took {time.time() - start} seconds to run.", flush=True)

print("Making the regulons...", flush=True)
start = time.time()
regulons = transform.df2regulons(
    df=motif_enrichment_table,
    save_columns=[
        COLUMN_NAME_NES,
        COLUMN_NAME_ORTHOLOGOUS_IDENTITY,
        COLUMN_NAME_MOTIF_SIMILARITY_QVALUE,
        COLUMN_NAME_ANNOTATION,
    ])
print(f"{len(regulons)} regulons from df2regulons.")

# Read the signatures saved in out/multi_runs_regulons_[mtf|trk].
# Keep all regulons and targets (so that all can be visualized in SCope).
signatures = utils.read_signatures_from_tsv_dir(
    dpath=args.signatures_fname,
    noweights=False,
    weight_threshold=0,
    min_genes=0)
print(f"{len(signatures)} regulons in total from out/multi_runs_regulons_[mtf|trk].")

# Filter regulons (regulons from the motif enrichment table) by the filtered signatures.
sc.settings.verbosity = 3
plt.rcParams["axes.grid"] = False
# Run time for the demo data: several minutes.

DATA_FOLDER = './Data/'
exp_matrix = pd.read_csv(os.path.join(DATA_FOLDER, 'DC_exp.csv'), index_col=0)
exp_meta = pd.read_csv(os.path.join(DATA_FOLDER, 'DC_meta.txt'), sep='\t', index_col=0)

## Generate the regulons
Motifs_NAME = 'exp_matrix'
motifs = load_motifs(os.path.join(DATA_FOLDER, '{}.motifs.csv'.format(Motifs_NAME)))
regulons = df2regulons(motifs)

## Make a meta matrix for the regulons
reg_num = []
reg_target = []
reg_tf = []
for i in regulons:
    reg_tf.append(i.transcription_factor)
    reg_target.append(list(i.gene2weight.keys()))
    reg_num.append(len(i.gene2weight))
reg_meta = pd.DataFrame([reg_num, reg_target]).T
reg_meta.index = reg_tf
reg_meta.columns = ['n_targets', 'targets']

## Calculate the AUCell scores
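# The AUCell computation itself is not shown above; a minimal sketch using
# pyscenic's aucell function (an assumption here: exp_matrix is oriented with
# cells as rows and genes as columns, which is what aucell expects).
from pyscenic.aucell import aucell
auc_mtx = aucell(exp_matrix, regulons, num_workers=4)  # cells x regulons matrix of AUC scores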
        # Earlier variants (commented out here) ran prune2df without a ProgressBar
        # and with client_or_address="dask_multiprocessing" (originally "local").
        from dask.diagnostics import ProgressBar
        with ProgressBar():
            df = prune2df(dbs, modules, MOTIF_ANNOTATIONS_FNAME,
                          client_or_address=client)  # originally "local"
        print("DEFINED df, type:")
        print(type(df))
        regulons = df2regulons(df, NOMENCLATURE)
    else:
        from dask.diagnostics import ProgressBar
        with ProgressBar():
            # Earlier: client_or_address="dask_multiprocessing" (originally "local").
            regulons = prune(dbs, modules, MOTIF_ANNOTATIONS_FNAME,
                             client_or_address=client)
        print("DEFINED regulons")
        print(type(regulons))
else:
    if calcRegulonsWithIntermediateDf:
        df = prune2df(dbs,
DATABASES_GLOB = os.path.join("resources/network_analysis", "mm10_*.mc9nr.feather")
MOTIF_ANNOTATIONS_FNAME = os.path.join("resources/network_analysis", "motifs-v9-nr.mgi-m0.001-o0.0.tbl")

db_fnames = glob.glob(DATABASES_GLOB)

def name(fname):
    return os.path.splitext(os.path.basename(fname))[0]

dbs = [RankingDatabase(fname=fname, name=name(fname)) for fname in db_fnames]
print(dbs)

modules = list(modules_from_adjacencies(adjacencies, ex_matrix))

df = prune2df(dbs, modules, MOTIF_ANNOTATIONS_FNAME, num_workers=4)
print("prune2df done, now saving")
with open(snakemake.output[0], "wb") as f:
    pickle.dump(df, f)

print("Running df2regulons")
regulons = df2regulons(df)
print("Pruning done, now saving")
with open(snakemake.output[1], "wb") as f:
    pickle.dump(regulons, f)
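# A short sketch of reading the pickled regulons back in a downstream Snakemake
# rule (the snakemake.input index is an assumption for illustration):
import pickle
with open(snakemake.input[1], "rb") as f:
    regulons = pickle.load(f)
print("{} regulons loaded".format(len(regulons)))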