def calc_modules(adjacencies, exp_mtx, name, rho_dichotomize, rho_threshold=None, mask_dropouts=None):
    """Derive modules from adjacencies and cache them as a pickle file.

    The output file lives in ``RESOURCES_FOLDER`` and its name encodes the
    configuration:

    * ``<name>.all.<MODULES_EXT>`` when ``rho_dichotomize`` is falsy;
    * ``<name>.<rho_threshold>.<MODULES_EXT>`` when dichotomizing with
      dropout masking;
    * ``<name>.<rho_threshold>.no_mask.<MODULES_EXT>`` when dichotomizing
      without dropout masking.

    If that file already exists nothing is computed and the function returns
    immediately.

    :param adjacencies: Adjacencies handed to ``modules_from_adjacencies``.
    :param exp_mtx: Expression matrix handed to ``modules_from_adjacencies``.
    :param name: Label used in the progress message and the file name.
    :param rho_dichotomize: Forwarded as ``rho_dichotomize``; also selects
        the naming scheme of the output file.
    :param rho_threshold: Forwarded as ``rho_threshold``.
    :param mask_dropouts: Forwarded as ``rho_mask_dropouts``.
    :return: ``None``; the result is written to disk.
    """
    if not rho_dichotomize:
        print('{} - all'.format(name))
        basename = "{}.all.{}".format(name, MODULES_EXT)
    else:
        print('{} - {} masking - rho threshold {}'.format(
            name, "with" if mask_dropouts else "without", rho_threshold))
        if mask_dropouts:
            basename = "{}.{}.{}".format(name, rho_threshold, MODULES_EXT)
        else:
            basename = "{}.{}.no_mask.{}".format(
                name, rho_threshold, MODULES_EXT)
    out_fname = os.path.join(RESOURCES_FOLDER, basename)

    # A previous run already produced this file: do not recompute.
    if os.path.isfile(out_fname):
        return

    module_iter = modules_from_adjacencies(
        adjacencies,
        exp_mtx,
        rho_dichotomize=rho_dichotomize,
        rho_threshold=rho_threshold,
        rho_mask_dropouts=mask_dropouts,
    )
    modules = list(module_iter)
    print(len(modules))

    with open(out_fname, 'wb') as f:
        pickle.dump(modules, f)
def adjacencies2modules(args):
    """Load adjacencies and an expression matrix, then derive modules.

    Both inputs are located through ``args`` (``module_fname`` and
    ``expression_mtx_fname`` are expected to expose a ``.name`` path).  A
    ``ValueError`` raised by either loader is logged through ``LOGGER`` and
    terminates the process with exit status 1.

    :param args: Parsed command-line namespace providing the input files and
        the module-creation options read below.
    :return: Whatever ``modules_from_adjacencies`` returns for these inputs.
    """
    try:
        adjacencies = load_adjacencies(args.module_fname.name)
    except ValueError as err:
        LOGGER.error(err)
        sys.exit(1)

    LOGGER.info("Loading expression matrix.")
    try:
        ex_mtx = load_exp_matrix(
            args.expression_mtx_fname.name,
            (args.transpose == 'yes'),
            False,  # sparse loading is disabled here for now
            args.cell_id_attribute,
            args.gene_attribute,
        )
    except ValueError as err:
        LOGGER.error(err)
        sys.exit(1)

    # Options are gathered in the same order the call site used to list them.
    module_options = dict(
        thresholds=args.thresholds,
        top_n_targets=args.top_n_targets,
        top_n_regulators=args.top_n_regulators,
        min_genes=args.min_genes,
        rho_mask_dropouts=args.mask_dropouts,
        # Anything other than an explicit "yes" keeps activating modules only.
        keep_only_activating=(args.all_modules != "yes"),
    )
    return modules_from_adjacencies(adjacencies, ex_mtx, **module_options)
def _df2modules(args):
    """Build modules from an adjacency table referenced by ``args``.

    The column separator of the adjacency file is looked up in
    ``FILE_EXTENSION2SEPARATOR`` by the file's extension; the expression
    matrix is obtained from ``_load_expression_matrix(args)``.

    :param args: Parsed command-line namespace.
    :return: Whatever ``modules_from_adjacencies`` returns for these inputs.
    """
    adjacencies_path = args.module_fname.name
    _, ext = os.path.splitext(adjacencies_path)
    adjacencies = pd.read_csv(adjacencies_path,
                              sep=FILE_EXTENSION2SEPARATOR[ext])
    ex_mtx = _load_expression_matrix(args)

    module_options = dict(
        thresholds=args.thresholds,
        top_n_targets=args.top_n_targets,
        top_n_regulators=args.top_n_regulators,
        min_genes=args.min_genes,
        # Anything other than an explicit "yes" keeps activating modules only.
        keep_only_activating=(args.all_modules != "yes"),
    )
    return modules_from_adjacencies(adjacencies, ex_mtx, **module_options)
def _df2modules(args):
    """Build modules from an adjacency table and a tab-separated matrix.

    The column separator of the adjacency file is looked up in
    ``FILE_EXTENSION2SEPARATOR`` by the file's extension.  The expression
    matrix is read as tab-separated text whose first row is the header and
    whose first column is the index.

    :param args: Parsed command-line namespace.
    :return: Whatever ``modules_from_adjacencies`` returns for these inputs.
    """
    adjacencies_path = args.module_fname.name
    _, ext = os.path.splitext(adjacencies_path)
    adjacencies = pd.read_csv(adjacencies_path,
                              sep=FILE_EXTENSION2SEPARATOR[ext])
    ex_mtx = pd.read_csv(args.expression_mtx_fname,
                         sep='\t',
                         header=0,
                         index_col=0)

    module_options = dict(
        nomenclature=args.nomenclature,
        thresholds=args.thresholds,
        top_n_targets=args.top_n_targets,
        top_n_regulators=args.top_n_regulators,
        min_genes=args.min_genes,
    )
    return modules_from_adjacencies(adjacencies, ex_mtx, **module_options)
def run_regression(self):
    """Compute adjacencies and, optionally, prune them with SCENIC.

    The adjacency routine is selected from ``ADJ_METHODS`` by
    ``self.adjacency_method`` and run on ``self.data.to_df()``.  When
    ``self.do_scenic`` is falsy the adjacencies are converted directly into
    results; otherwise they are turned into modules, pruned against the
    ranking database in ``self._feather_rank_file`` and converted from the
    pruned table.

    :return: The value produced by ``reprocess_adj_to_inferelator_results``
        or ``reprocess_scenic_output_to_inferelator_results``.
    """
    data_df = self.data.to_df()

    utils.Debug.vprint(
        "Calculating {m} adjacencies".format(m=self.adjacency_method),
        level=0)

    # Resolve the adjacency routine and where it should execute.
    adj_method = ADJ_METHODS[self.adjacency_method]
    if MPControl.is_dask:
        client_or_address = MPControl.client.client
        MPControl.client.check_cluster_state()
    else:
        client_or_address = 'local'

    adjacencies = adj_method(
        data_df,
        tf_names=self.tf_names,
        verbose=True,
        client_or_address=client_or_address,
        seed=self.random_seed,
    )

    # Without SCENIC the raw adjacencies are the final product.
    if not self.do_scenic:
        return self.reprocess_adj_to_inferelator_results(adjacencies)

    # Adjacencies -> modules.
    modules = list(modules_from_adjacencies(adjacencies, data_df))

    # Feather (rank) database used for pruning.
    ranking_db = RankingDatabase(fname=self._feather_rank_file,
                                 name="RANKING_PRIOR")
    dbs = [ranking_db]

    utils.Debug.vprint("Pruning adjacencies with SCENIC", level=0)

    # Modules -> pruned data frame.
    df = prune2df(dbs,
                  modules,
                  self._motif_link_table_file,
                  client_or_address=client_or_address)

    return self.reprocess_scenic_output_to_inferelator_results(
        df, self.priors_data)
def calcTFs(
        expr,
        tf_names,
        db,
        prefix,
        motif_path='../data/pySCENIC/ref/motifs-v9-nr.hgnc-m0.001-o0.0.tbl',
        out_path='../data/pySCENIC',
        ppn=8):
    """Computes motifs, regulons and transcription factor activity using pySCENIC.

    Arguments
    ---------
    expr: `pandas DataFrame`
        cell X gene raw counts; FPKM; not TPM as coexpression will be
        calculated
    tf_names: `list` (`str`)
        curated human transcription factors downloaded from github:
        pySCENIC/ref/hs_hgnc_curated_tfs.txt
    db: `list` (`FeatherRankingDatabase()`)
        feather files, ranking genome
        [FeatherRankingDatabase(name="hg38__refseq-r80__10kb_up_and_down_tss")]
    prefix: `str`
        Required. Name used to build the output file names (eg, cell line
        names)
    motif_path: `str`
        Motif annotation table handed to `prune2df`
    out_path: `str`
        Directory the two output files are written to
    ppn: `int` (default: 8)
        Passed as `num_workers` to both `prune2df` and `aucell`

    Returns
    -------
    Do not return but write files (the calc takes too long...):
    `<out_path>/<prefix>_motifs.csv` (pickled regulons) and
    `<out_path>/<prefix>_auc_mtx.csv` (AUC matrix as CSV).
    """
    # Inference of co-expression modules
    adjacencies = grnboost2(expr, tf_names=tf_names, verbose=True)
    modules = list(modules_from_adjacencies(adjacencies, expr))

    # Calculate a list of enriched motifs and the corresponding target genes
    # for all modules.
    with ProgressBar():
        df = prune2df(db, modules, motif_path, num_workers=ppn)

    # Create regulons from this table of enriched motifs.
    regulons = df2regulons(df)

    # Save the discovered regulons to disk (the motif table itself is not
    # written).
    # NOTE(review): despite the ".csv" suffix this file is a pickle; the name
    # is kept unchanged so existing consumers of the file keep working.
    with open('{}/{}_motifs.csv'.format(out_path, prefix), "wb") as f:
        pickle.dump(regulons, f)

    auc_mtx = aucell(expr, regulons, num_workers=ppn)
    auc_mtx.to_csv('{}/{}_auc_mtx.csv'.format(out_path, prefix))
    print('finished calculation for %s' % (prefix))
os.chdir(data_folder_iter) ## Run GRNBoost2 (faster equivalent of GENIE3) from arboreto to infer co-expression modules if not os.path.isfile(network_fname): adjacencies = grnboost2(data_train, tf_names=tf_names, verbose=True, client_or_address=custom_client, seed=i) adjacencies.to_csv(network_fname, sep=',', header=True, index=False, compression='gzip') else: adjacencies = pd.read_csv(network_fname) ## Derive potential regulons from co-expression modules if not os.path.isfile(modules_fname): modules = list(modules_from_adjacencies(adjacencies, data_train, keep_only_activating=False)) pickle.dump(modules, open(modules_fname, 'wb')) else: modules = pickle.load(open(modules_fname, 'rb')) del adjacencies ## Prune modules for targets with cis regulatory footprints (aka RcisTarget) ### Calculate a list of enriched motifs and the corresponding target genes for all modules. if not os.path.isfile(motifs_fname): df = prune2df(dbs, modules, motif_annotations, num_workers=n_cores) df.to_csv(motifs_fname) else:
# Locate the cisTarget ranking databases (mm10 feather files in the current
# working directory).
databases_glob = "mm10__*.feather"
db_fnames = glob.glob(databases_glob)


def name(fname):
    """Return the file's base name up to (not including) its first dot."""
    return os.path.basename(fname).partition(".")[0]


# One ranking database per feather file found above.
dbs = [
    RankingDatabase(fname=fname, name=name(fname))
    for fname in db_fnames
]

# Co-expression step: GRNBoost2 adjacencies, then modules derived from them.
adjacencies = grnboost2(ex_matrix, tf_names=tf_names, verbose=True)
modules = list(modules_from_adjacencies(adjacencies, ex_matrix))

# Persist the GRNBoost2 output so it does not have to be recomputed, then
# read it straight back from disk.
adjacencies.to_csv("grnboost_output.csv")
adjacencies = pd.read_csv("grnboost_output.csv", index_col=0)

# cisTarget step: enriched motifs and their target genes for every module.
with ProgressBar():
    df = prune2df(dbs, modules, "motifs-v9-nr-mgi.txt")

# Regulons are built from the table of enriched motifs.
regulons = df2regulons(df)

# save the discovered motifs and regulons
RankingDatabase(fname=fname, name=name(fname)) for fname in db_fnames ] print(dbs) print("running grnboost") print("tf_names head") print(tf_names[1:5]) #print("gene names head") #print(ex_matrix.iloc[1:5,1:5]) adjacencies = grnboost2(ex_matrix, tf_names=tf_names, verbose=True) adjacencies.head() print("identify modules") adjacencies.to_csv(out_file, sep='\t', index=False, header=False) print("grnboost done") modules = list( modules_from_adjacencies(adjacencies, ex_matrix, rho_mask_dropouts=True)) #print("writing modules") #with open(MODULES_FNAME, 'wb') as f: # pickle.dump(modules, f) print("Finding Enriched modules") # Calculate a list of enriched motifs and the corresponding target genes for all modules. with ProgressBar(): df = prune2df(dbs, modules, MOTIF_ANNOTATIONS_FNAME) df.head() # Create regulons from this table of enriched motifs. print("creating regulons")
client_or_address=client) print("DEFINED adjacencies, type and head:") adjacencies.to_csv(ADJACENCIES_FNAME, sep='\t') #load adjacencies adjacencies = pd.read_csv(ADJACENCIES_FNAME, sep='\t', header=0, index_col=0) print("READ IN adjacencies, type and head:") print(type(adjacencies)) print(adjacencies.head()) modules = list(modules_from_adjacencies(adjacencies, ex_matrix)) print("DEFINED modules, type:") #write modules in a file as binary and as text with open(MODULES_BIN_FNAME, "wb") as f: pickle.dump(modules, f) modules_txt = open(MODULES_FNAME, "w") modules_txt.write(str(modules)) modules_txt.close() #read in modules with open(MODULES_BIN_FNAME, "rb") as f: modules = pickle.load(f) print("LOADED modules, type:")
# Ranking databases: every *.feather file under fd_db (only 2 mm10 dbs).
l_fname = list(Path(fd_db).glob('*.feather'))
l_db = [
    RankingDatabase(fname=fname, name=name(fname))
    for fname in l_fname
]

# Pipeline entry point.
if __name__ == '__main__':
    # Step 1 (disabled): inference of co-expression modules. The adjacency
    # table read in step 2 was produced by:
    #     print('Inference...')
    #     df_adj=grnboost2(df_cnt, tf_names=tf_name, verbose=True)
    #     df_adj.to_csv(f'{fd_out}/adj.csv', index=False)

    # Step 2: prune.
    # Original author's note on reloading from disk:
    # "if missing, always stuck at 98%".
    df_adj = pd.read_csv(f'{fd_out}/adj.csv')
    print('Prune...')
    l_mod = list(modules_from_adjacencies(df_adj, df_cnt))
    with ProgressBar():
        df_prune = prune2df(l_db, l_mod, f_motif)
    df_prune.to_csv(f'{fd_out}/prune.csv')

    # Step 3: create regulons from the pruned table.
    print('Regulon...')
    regulon = df2regulons(df_prune)

    # Step 4: pickle the discovered regulons.
    with open(f'{fd_out}/regulon.pkl', "wb") as f:
        pickle.dump(regulon, f)

    # Step 5: AUC.
    print('AUC...')