def load_signatures(fname: str) -> Sequence[Type[GeneSignature]]:
    """
    Load gene signatures from disk.

    Supported file formats are GMT, DAT (pickled), YAML or CSV (enriched motifs).

    :param fname: The name of the file that contains the signatures.
    :return: A list of gene signatures.
    :raises ValueError: If the file extension is not recognized.
    """
    suffixes = PurePath(fname).suffixes
    # Enriched-motif tables (csv/tsv) are first loaded as a dataframe and then
    # converted into regulons.
    if is_valid_suffix(suffixes, 'ctx'):
        motifs = load_motifs(fname, sep=suffixes_to_separator(suffixes))
        return df2regulons(motifs)
    if is_valid_suffix(suffixes, 'ctx_yaml'):
        return load_from_yaml(fname)
    if '.gmt' in suffixes:
        # GMT files use the same separator between fields and between genes.
        separator = guess_separator(fname)
        return GeneSignature.from_gmt(fname,
                                      field_separator=separator,
                                      gene_separator=separator)
    if '.dat' in suffixes:
        # NOTE(review): unpickling is only safe on trusted files.
        with openfile(fname, 'rb') as handle:
            return pickle.load(handle)
    raise ValueError("Unknown file format \"{}\".".format(fname))
def load_signatures(fname: str) -> Sequence[Type[GeneSignature]]:
    """
    Load gene signatures from disk.

    Supported file formats are GMT, DAT (pickled), YAML or CSV (enriched motifs).

    :param fname: The name of the file that contains the signatures.
    :return: A list of gene signatures.
    :raises ValueError: If the file extension is not recognized.
    """
    # splitext yields at most one suffix (e.g. '.gmt'), lower-cased for
    # case-insensitive matching.
    extension = os.path.splitext(fname)[1].lower()
    if extension in FILE_EXTENSION2SEPARATOR:
        # Enriched-motif tables (csv/tsv) are converted into regulons.
        return df2regulons(
            load_motifs(fname, sep=FILE_EXTENSION2SEPARATOR[extension]))
    elif extension in {'.yaml', '.yml'}:
        return load_from_yaml(fname)
    elif extension == '.gmt':
        # Since extension is a single suffix, an equality test is the precise
        # check (the previous endswith() was an indirect way to say the same).
        sep = guess_separator(fname)
        return GeneSignature.from_gmt(fname,
                                      field_separator=sep,
                                      gene_separator=sep)
    elif extension == '.dat':
        # NOTE(review): unpickling is only safe on trusted files.
        with open(fname, 'rb') as f:
            return pickle.load(f)
    else:
        raise ValueError("Unknown file format \"{}\".".format(fname))
def _load_modules(fname: str) -> Sequence[Type[GeneSignature]]:
    """
    Load co-expression modules from a YAML or pickled DAT file.

    Loading from YAML is extremely slow; binary pickling was chosen as the
    faster alternative (other options would be JSON or a YAML CLoader, see
    https://stackoverflow.com/questions/27743711/can-i-speedup-yaml).

    Exits the process with status 1 on an unrecognized file format.
    """
    if fname.endswith(('.yaml', '.yml')):
        return load_from_yaml(fname)
    if fname.endswith('.dat'):
        with open(fname, 'rb') as handle:
            return pickle.load(handle)
    LOGGER.error("Unknown file format for \"{}\".".format(fname))
    sys.exit(1)
def run(args):
    """
    CLI entry point: derive regulons from co-expression modules by pruning
    them against ranking databases (cisTarget step) and write the resulting
    motif dataframe to CSV.

    :param args: Parsed command-line arguments; reads ``config_filename``,
        ``input``, ``output`` and ``num_workers``.
    """
    # Set logging level.
    # NOTE(review): the debug flag is hard-coded to False yet the level is
    # set to DEBUG unconditionally — presumably intentional, but verify.
    logging_debug_opt = False
    LOGGER.addHandler(create_logging_handler(logging_debug_opt))
    LOGGER.setLevel(logging.DEBUG)
    LOGGER.info("Using configuration {}.".format(args.config_filename))
    cfg = ConfigParser()
    cfg.read(args.config_filename)
    # The command-line input overrides the module filename from the config.
    in_fname = cfg['data']['modules'] if not args.input else args.input
    LOGGER.info("Loading modules from {}.".format(in_fname))
    # Loading from YAML is extremely slow. Therefore this is a potential performance improvement.
    # Potential improvements are switching to JSON or to use a CLoader:
    # https://stackoverflow.com/questions/27743711/can-i-speedup-yaml
    if in_fname.endswith('.yaml'):
        modules = load_from_yaml(in_fname)
    else:
        # Anything that is not YAML is assumed to be a pickled DAT file.
        with open(in_fname, 'rb') as f:
            modules = pickle.load(f)
    # Filter out modules with too few genes.
    min_genes = int(cfg['parameters']['min_genes'])
    modules = list(filter(lambda m: len(m) >= min_genes, modules))
    LOGGER.info("Loading databases.")

    def name(fname):
        # Database name = basename without extension.
        return os.path.splitext(os.path.basename(fname))[0]

    # The config lists one or more glob patterns separated by ';'.
    db_fnames = list(mapcat(glob.glob, cfg['data']['databases'].split(";")))
    dbs = [
        RankingDatabase(fname=fname, name=name(fname))
        for fname in db_fnames
    ]
    LOGGER.info("Calculating regulons.")
    motif_annotations_fname = cfg['data']['motif_annotations']
    mode = cfg['parameters']['mode']
    # A dask progress bar is only shown for the multiprocessing scheduler.
    with ProgressBar() if mode == "dask_multiprocessing" else NoProgressBar():
        df = prune2df(dbs, modules, motif_annotations_fname,
                      rank_threshold=int(cfg['parameters']['rank_threshold']),
                      auc_threshold=float(cfg['parameters']['auc_threshold']),
                      nes_threshold=float(cfg['parameters']['nes_threshold']),
                      client_or_address=mode,
                      # NOTE(review): chunk_size is passed as a raw string from
                      # the config (not cast to int like the thresholds) —
                      # confirm prune2df accepts that.
                      module_chunksize=cfg['parameters']['chunk_size'],
                      num_workers=args.num_workers)
    LOGGER.info("Writing results to file.")
    # The command-line output overrides the output filename from the config.
    df.to_csv(cfg['parameters']['output'] if not args.output else args.output)
def load_modules(fname: str) -> Sequence[Type[GeneSignature]]:
    """
    Load co-expression modules from a YAML, pickled DAT or GMT file.

    Loading from YAML is extremely slow; binary pickling was chosen as the
    faster alternative (other options would be JSON or a YAML CLoader, see
    https://stackoverflow.com/questions/27743711/can-i-speedup-yaml).

    :raises ValueError: If the file extension is not recognized.
    """
    suffixes = PurePath(fname).suffixes
    if is_valid_suffix(suffixes, 'ctx_yaml'):
        return load_from_yaml(fname)
    if '.dat' in suffixes:
        with openfile(fname, 'rb') as handle:
            return pickle.load(handle)
    if '.gmt' in suffixes:
        return GeneSignature.from_gmt(fname)
    raise ValueError("Unknown file format for \"{}\".".format(fname))
N_SAMPLES = ex_matrix.shape[0] # Full dataset print(N_SAMPLES) #read in modules with open(MODULES_BIN_FNAME, "rb") as f: modules = pickle.load(f) print("LOADED modules, type:") print(type(modules)) #print(modules) #long list #-------------Phase II: Prune modules for targets with cis regulatory footprints (aka RcisTarget)---- print("STARTING PHASE II") regulons = load_from_yaml(REGULONS_BIN_FNAME) print("LOADED regulons, type:") print(type(regulons)) #print(regulons) #-------------Phase III: Cellular regulon enrichment matrix (aka AUCell)---------------- print("STARTING PHASE III") #auc_mtx = aucell(ex_matrix, regulons, num_cores=nCores) #don't transpose ex_matrix again as it was already transposed above #originally num_workers, but it should be num_cores auc_mtx = aucell( ex_matrix, regulons, num_workers=nCores ) #don't transpose ex_matrix again as it was already transposed above #originally num_workers, but it should be num_cores auc_mtx.to_csv(AUC_FNAME, sep='\t') print("DEFINED auc_mtx")