def find_adjacencies_command(args):
    """
    Infer co-expression modules.

    Loads the expression matrix and the transcription factor (TF) list from
    the CLI arguments, runs GRNBoost2 on a (possibly remote) Dask client and
    writes the resulting adjacency table as a tab-separated file.
    """
    LOGGER.info("Loading expression matrix.")
    ex_mtx = _load_expression_matrix(args)
    tf_names = load_tf_names(args.tfs_fname.name)

    n_total_genes = len(ex_mtx.columns)
    # Fix: Index.isin() returns a boolean mask with one entry per column, so
    # len(mask) always equalled n_total_genes and the 80% warning below could
    # never fire. Count the True entries instead.
    n_matching_genes = int(ex_mtx.columns.isin(tf_names).sum())
    if n_total_genes == 0:
        # Fix: "sepatration" -> "separation" in the user-facing error message.
        LOGGER.error("The expression matrix supplied does not contain any genes. Make sure the extension of the file matches the format (tab separation for TSV and comma separation for CSV).")
        sys.exit(1)
    # Warn when fewer than 80% of the *supplied TFs* have expression data
    # (the message talks about TF coverage, so the denominator is the TF
    # list, not the gene count). Guard against an empty TF list.
    if tf_names and float(n_matching_genes) / len(tf_names) < 0.80:
        LOGGER.warning("Expression data is available for less than 80% of the supplied transcription factors.")

    LOGGER.info("Inferring regulatory networks.")
    client, shutdown_callback = _prepare_client(args.client_or_address, num_workers=args.num_workers)
    try:
        network = grnboost2(expression_data=ex_mtx, tf_names=tf_names, verbose=True, client_or_address=client)
    finally:
        # Always release the client/cluster, even when inference fails.
        shutdown_callback(False)
    LOGGER.info("Writing results to file.")
    network.to_csv(args.output, index=False, sep='\t')
def find_adjacencies_command(args):
    """
    Infer co-expression modules.

    Loads the expression matrix (dense DataFrame, or a sparse
    (matrix, gene_names) pair when --sparse is given), reads the
    transcription factor (TF) list, runs the selected inference algorithm
    (grnboost2 or genie3) on a Dask client and writes the adjacency table
    to the output file, picking the separator from the file extension.
    """
    LOGGER.info("Loading expression matrix.")
    try:
        ex_mtx = load_exp_matrix(
            args.expression_mtx_fname.name,
            (args.transpose == 'yes'),
            args.sparse,
            args.cell_id_attribute,
            args.gene_attribute,
        )
    except ValueError as e:
        LOGGER.error(e)
        sys.exit(1)
    tf_names = load_tf_names(args.tfs_fname.name)

    # In sparse mode ex_mtx is a (matrix, gene_names) tuple; in dense mode
    # it is a DataFrame with genes as columns.
    gene_index = ex_mtx[1] if args.sparse else ex_mtx.columns
    n_total_genes = len(gene_index)
    # Fix: isin() returns a boolean mask with one entry per gene, so
    # len(mask) always equalled n_total_genes and the 80% warning below
    # could never trigger. Count the True entries instead.
    n_matching_genes = int(gene_index.isin(tf_names).sum())
    if n_total_genes == 0:
        # Fix: "sepatration" -> "separation" in the user-facing error message.
        LOGGER.error(
            "The expression matrix supplied does not contain any genes. "
            "Make sure the extension of the file matches the format (tab separation for TSV and "
            "comma separation for CSV)."
        )
        sys.exit(1)
    # Warn when expression data covers fewer than 80% of the supplied TFs
    # (denominator is the TF list, matching the message). Guard against an
    # empty TF list.
    if tf_names and float(n_matching_genes) / len(tf_names) < 0.80:
        LOGGER.warning("Expression data is available for less than 80% of the supplied transcription factors.")

    LOGGER.info("Inferring regulatory networks.")
    client, shutdown_callback = _prepare_client(args.client_or_address, num_workers=args.num_workers)
    method = grnboost2 if args.method == 'grnboost2' else genie3
    try:
        if args.sparse:
            network = method(
                expression_data=ex_mtx[0],
                gene_names=ex_mtx[1],
                tf_names=tf_names,
                verbose=True,
                client_or_address=client,
                seed=args.seed,
            )
        else:
            network = method(
                expression_data=ex_mtx, tf_names=tf_names, verbose=True, client_or_address=client, seed=args.seed
            )
    finally:
        # Always release the client/cluster, even when inference fails.
        shutdown_callback(False)
    LOGGER.info("Writing results to file.")
    # Derive the field separator from the output file's extension(s).
    extension = PurePath(args.output.name).suffixes
    network.to_csv(args.output.name, index=False, sep=suffixes_to_separator(extension))
def run(cfg_fname):
    """Run GRNboost for every expression matrix listed in the config file.

    Matrices that already have a results file are skipped; failures on one
    matrix are logged and do not stop the remaining ones.
    """
    # Parse the configuration file.
    cfg = ConfigParser()
    cfg.read(cfg_fname)

    # Configure logging; the handler decides the effective verbosity.
    debug_enabled = cfg["params"]["debug"].lower().strip() in {"yes", "true", "y"}
    LOGGER.addHandler(create_logging_handler(debug_enabled))
    LOGGER.setLevel(logging.DEBUG)

    # Collect the input matrices and the TF list.
    mtx_fnames = glob.glob(cfg['data']['mtx_fnames'])
    tfs = load_tf_names(cfg['data']['tfs_fname'])

    # Either spin up a local Dask cluster or address a remote scheduler.
    use_local_cluster = 'scheduler_ip' not in cfg['params']
    if use_local_cluster:
        local_cluster = LocalCluster(n_workers=int(cfg['params']['num_cores']), threads_per_worker=1)
        client = Client(local_cluster)
    else:
        # Stand-in with a no-op close() so the shutdown path is uniform.
        class DummyClient:
            def close(self):
                pass
        local_cluster = DummyClient()
        client = cfg['params']['scheduler_ip']

    out_folder = cfg['data']['out_folder']

    def derive_out_fname(fname):
        # Map an input matrix file name onto its results file name.
        basename = os.path.basename(fname).split('.')[0]
        return os.path.join(out_folder, "{}.net.csv".format(basename))

    for in_fname in mtx_fnames:
        out_fname = derive_out_fname(in_fname)
        # Skip matrices that already have a corresponding results file.
        if os.path.exists(out_fname):
            continue
        LOGGER.info("Running GRNboost for {}.".format(in_fname))
        try:
            process(in_fname, tfs, out_fname, client)
        except ValueError as e:
            LOGGER.error(
                "Unable to process {} because of \"{}\". Stacktrace:".format(
                    in_fname, str(e)))
            LOGGER.error(traceback.format_exc())

    # Tear down the local cluster if we started one.
    if use_local_cluster:
        client.close()
        local_cluster.close()
    print("{} - Done.".format(datetime.datetime.now()))
def run_algo(client, algo_name, seed_value):
    """
    Run one inference algorithm over all configured datasets.

    :param client: Dask client (or scheduler address) forwarded to the
        inference algorithm.
    :param algo_name: either 'genie3' or 'grnboost2'.
    :param seed_value: random seed forwarded to the algorithm; also embedded
        in the log messages and the output file name.
    :raises ValueError: for an unknown ``algo_name``.
    """
    if algo_name == 'genie3':
        inf_algo = genie3
    elif algo_name == 'grnboost2':
        inf_algo = grnboost2
    else:
        raise ValueError('Houston, we have a problem between desk and chair.. ({})'.format(algo_name))

    scaler = StandardScaler()
    for network_name, exp_path, tfs_path in datasets:
        start_time = time.time()
        # Fix: the original referenced an undefined name `seed` in the three
        # print statements and in the output path; the parameter is
        # `seed_value`.
        print('inferring {0} with seed {1}'.format(network_name, seed_value))
        exp_matrix = pd.read_csv(exp_path, sep='\t')
        # Standardize each gene column before inference; rebuild a DataFrame
        # because fit_transform returns a bare ndarray.
        scaled_values = scaler.fit_transform(exp_matrix)
        exp_matrix_scaled = pd.DataFrame(scaled_values, columns=exp_matrix.columns)
        tf_names = load_tf_names(tfs_path)
        network_df = inf_algo(client_or_address=client,
                              expression_data=exp_matrix_scaled,
                              tf_names=tf_names,
                              seed=seed_value,
                              limit=100000)
        inf_time = time.time()
        delta_time = inf_time - start_time
        print('inferred {0} with seed {1} in {2} seconds'.format(network_name, seed_value, str(delta_time)))
        network_out_path = '{0}{1}.seed_{2}.csv'.format(out_dir, network_name, seed_value)
        network_df.to_csv(network_out_path, sep='\t', index=None, header=None)
        print('{0} with seed {1} written to {2}'.format(network_name, seed_value, network_out_path))
# Locations of the ranking databases, motif annotations and derived
# result files for this SCENIC run.
DATABASE_FOLDER = "databases"
DATABASES_GLOB = os.path.join(DATABASE_FOLDER, "hg19*.mc9nr.feather")
MOTIF_ANNOTATIONS_FNAME = os.path.join(DATABASE_FOLDER, "motifs-v9-nr.hgnc-m0.001-o0.0.tbl")
MM_TFS_FNAME = os.path.join(RESOURCES_FOLDER, 'hs_hgnc_curated_tfs.txt')
SC_EXP_FNAME = os.path.join(DATA_FOLDER, (inputFilename + ".tsv"))
MODULES_FNAME = os.path.join(RESULT_FOLDER, ("modules_" + inputFilename + ".p"))
REGULONS_FNAME = os.path.join(RESULT_FOLDER, "regulons_" + inputFilename + ".p")
MOTIFS_FNAME = os.path.join(RESULT_FOLDER, "motifs_" + inputFilename + ".csv")

# The matrix is stored genes x cells; transpose to cells x genes.
ex_matrix = pd.read_csv(SC_EXP_FNAME, sep='\t', header=0, index_col=0).T
print(ex_matrix.shape)
tf_names = load_tf_names(MM_TFS_FNAME)
print("tf names loaded")
out_file = os.path.join(RESULT_FOLDER, "grn_output_" + inputFilename + ".tsv")
db_fnames = glob.glob(DATABASES_GLOB)


def name(fname):
    # Database name = file name without directory and extension.
    return os.path.splitext(os.path.basename(fname))[0]


dbs = [RankingDatabase(fname=fname, name=name(fname)) for fname in db_fnames]
print(dbs)
print("running grnboost")
print("tf_names head")
# Fix: tf_names[1:5] silently skipped the first TF; print the first five.
print(tf_names[:5])
# Output file locations for the binarised AUCell results.
BINARYAUC_FNAME = os.path.join(OUT_FOLDER, (ASSAYID + "_binary_AUC.csv"))
BINARYTHR_FNAME = os.path.join(OUT_FOLDER, (ASSAYID + "_binary_thresholds.csv"))
# Gene nomenclature label; presumably matches the ranking databases — TODO confirm.
NOMENCLATURE = "HGNC"
print("FINISHED DECLARING CONSTANTS")
#-----------load data-----------------------------------------------------------------
#Load filtered expression matrix
ex_matrix = pd.read_csv(SC_EXP_FILT_FNAME, sep='\t', header=0, index_col=0)
# Transpose so that genes become columns (cells x genes).
ex_matrix = ex_matrix.T
print("LOADED ex_matrix")
#load TF names
tf_names = load_tf_names(HG_TFS_FNAME)
print("FIRST 10 TF NAMES:")
print(tf_names[0:10])
#load ranking databases
db_fnames = glob.glob(FEATHER_GLOB)
#print("DATABASE FILE NAMES:")
print(db_fnames)


def name(fname):
    # Database name = base file name up to the first '.'.
    return os.path.basename(fname).split(".")[0]


#dbs = [RankingDatabase(fname=fname, name=name(fname), nomenclature=NOMENCLATURE) for fname in db_fnames]
# NOTE(review): this statement is truncated in this chunk; it continues
# beyond the visible source.
dbs = [
    RankingDatabase(fname=fname, name=name(fname)) for fname in db_fnames
# ---- parameters ----
cell_use = '_Interneuron_classA_0.1'
input_file = f'{cell_use}_matrix.csv'
output_grnboost2 = f'{cell_use}_grnboost2.csv'
output_genie3 = f'{cell_use}_genie3.csv'
output_grnboost2_txt = f'{cell_use}_grnboost2.txt'
output_genie3_txt = f'{cell_use}_genie3.txt'

# ---- load data ----
# Transpose the matrix so that genes become columns.
ex_matrix = pd.read_csv(input_file, sep=',')
ex_matrix = ex_matrix.T
tf_names = load_tf_names('mm_mgi_tfs.txt')

# ---- infer the gene regulatory network with both algorithms ----
network_n = grnboost2(ex_matrix, tf_names=tf_names, verbose=True)
network_g = genie3(ex_matrix, tf_names=tf_names, verbose=True)

# Tables with header and index for the downstream igraph analysis.
network_n.to_csv(output_grnboost2, sep='\t')
network_g.to_csv(output_genie3, sep='\t')
# Plain txt files (no header, no index) for the FAC calculation.
network_n.to_csv(output_grnboost2_txt, sep='\t', header=False, index=False)
network_g.to_csv(output_genie3_txt, sep='\t', header=False, index=False)
print('reading expression matrix from "{}"'.format(args.i))
# Tab-separated expression matrix; columns are gene names (see below).
expression_matrix = pd.read_csv(args.i, sep='\t')
print('expression matrix shape: {}'.format(str(expression_matrix.shape)))
em_time = time.time()
# start_time is set earlier, outside this chunk.
print('expression matrix read in {} seconds\n'.format(em_time - start_time))
# -------------------------- #
# READ TRANSCRIPTION FACTORS #
# -------------------------- #
print('reading transcription factors from "{}"'.format(args.tf))
tf_names = load_tf_names(args.tf)
gene_names = expression_matrix.columns
# Report how many of the supplied TFs actually occur in the matrix.
tfs_in_matrix = set(tf_names).intersection(set(gene_names))
print('{} transcription factors in common with expression matrix\n'.format(
    str(len(tfs_in_matrix))))
# ------------- #
# INFER NETWORK #
# ------------- #
# Select the inference algorithm from the CLI flag.
# NOTE(review): the else branch is truncated in this chunk and continues
# beyond the visible source.
if args.genie3:
    inf_algo = genie3
    inf_algo_name = 'GENIE3'
else:
    inf_algo = grnboost2
## Load randing databases db_fnames = glob.glob(db_folder) def name(fname): return os.path.basename(fname).split(".")[0] dbs = [RankingDatabase(fname=fname, name=name(fname)) for fname in db_fnames] dbs ## Initialize cluster local_cluster = LocalCluster(n_workers=n_cores, threads_per_worker=1, processes=False, memory_limit=memory_limit) custom_client = Client(local_cluster) ## Load TFs tf_names = load_tf_names(TFs_file) ## Collect here regulons passing correlation filter cortest_passed_regulons = [] for i in range(0, iterations): ## Split to train and test data[grouping_variable] = metadata[grouping_variable] data_sampled = data.groupby(grouping_variable).apply(lambda x: x.sample(n=min(n_cells, len(x)), random_state=i)) data_sampled.index = data_sampled.index.get_level_values(1) data_train = data_sampled.groupby(grouping_variable).apply(lambda x: x.sample(n=round(len(x) * train_pct), random_state=i)) data_train.index = data_train.index.get_level_values(1) data_test = data_sampled[~data_sampled.index.isin(data_train.index)]
import os
import numpy as np
from arboreto.utils import load_tf_names
from arboreto.algo import grnboost2

# Input locations for the expression matrix and the TF list.
data_dir = '/home/brad/data2/rstudio/birds/scRNA/devin_combined/finch_cells/grn/export_to_numpy_glut'
expr_fname = os.path.join(data_dir, '1.1_exprMatrix_filtered_t.txt')
tf_fname = os.path.join(data_dir, '1.1_inputTFs.txt')

if __name__ == '__main__':
    # The matrix is loaded as a plain ndarray, which carries no column
    # labels, so the gene names are recovered separately from the header.
    ex_matrix = np.genfromtxt(expr_fname, delimiter='\t', skip_header=1)
    with open(expr_fname) as handle:
        gene_names = [gene.strip() for gene in handle.readline().split('\t')]
    # Sanity check: exactly one matrix column per gene name.
    assert ex_matrix.shape[1] == len(gene_names)

    # tf_names is read using a utility function included in Arboreto.
    tf_names = load_tf_names(tf_fname)

    network = grnboost2(expression_data=ex_matrix,
                        gene_names=gene_names,
                        tf_names=tf_names)
    network.to_csv('output.tsv', sep='\t', index=False, header=False)
# NOTE(review): this chunk opens mid-way through a LocalCluster(...) call;
# the leading arguments are outside the visible source.
    threads_per_worker=1,
    memory_limit=8e9)
custom_client = Client(local_cluster)
print(custom_client)

#### get expression matrix
ex_path = indir + 'log_CPM_matrix.tsv'
print('reading expression data: ' + ex_path)
# The first line is skipped and rows are read without a header, then the
# matrix is transposed (presumably to cells x genes — TODO confirm against
# the input file layout).
ex_matrix = pd.read_csv(ex_path, index_col=0, sep='\t', skiprows=1, header=None).T

#### get tfs
tf_path = '/ddn1/vol1/staging/leuven/stg_00002/lcb/kspan/analyses/ThreeLines10xSCENIC2/hg19_allTFs.lst'
tf_names = load_tf_names(tf_path)
# Keep only TFs that are actually present in the expression matrix.
tf_names = list(set(tf_names).intersection(ex_matrix.columns))
print(len(tf_names))

#run grnboost2
outfile = indir + 'grnboost2.tsv'
print('grnboost2 results will be printed to: ' + outfile)
start_time = time.time()
network = grnboost2(expression_data=ex_matrix, tf_names=tf_names, client_or_address=custom_client, verbose=True)
print(time.time() - start_time, "seconds")
print(network.head())
network.to_csv(outfile, sep='\t', index=False, header=False)
# Restore the original stdout; saveout was captured earlier, outside this chunk.
sys.stdout = saveout
if __name__ == "__main__":
    import os
    import glob
    import pickle
    import pandas as pd
    import numpy as np
    from arboreto.utils import load_tf_names
    from arboreto.algo import grnboost2

    # The matrix is stored genes x cells; transpose to cells x genes.
    expression = pd.read_csv(snakemake.input[0], sep='\t', header=0, index_col=0).T
    tfs = load_tf_names("resources/tfs.txt")

    # Infer the co-expression network and persist it for the next rule.
    adjacencies = grnboost2(expression, tf_names=tfs, verbose=True)
    with open(snakemake.output[0], "wb") as handle:
        pickle.dump(adjacencies, handle)
#get df count adata=sc.read(f_cnt) df_cnt=pd.DataFrame(adata.X.toarray(), index=adata.obs.index, columns=adata.var.index) #---------------functions----------------- def name(fname): return os.path.splitext(os.path.basename(fname))[0] ################################################################# ##1. load df (already loaded in setup) #df_cnt=pd.read_csv(f_in, index_col=0) #2. tf genes tf_name=load_tf_names(f_tf) #3. ranking databases (only 2 mm10 dbs) l_fname=list(Path(fd_db).glob('*.feather')) l_db=[RankingDatabase(fname=fname, name=name(fname)) for fname in l_fname] #3. run if __name__ =='__main__': # #1. Inference of co-expression modules # print('Inference...') # df_adj=grnboost2(df_cnt, tf_names=tf_name, verbose=True) # df_adj.to_csv(f'{fd_out}/adj.csv', index=False) #2. prune df_adj=pd.read_csv(f'{fd_out}/adj.csv') #if missing, always stuck at 98% print('Prune...')
# NOTE(review): this chunk opens mid-way through a statement; the line below
# is the tail of a column selection started outside the visible source.
    ]].drop_duplicates().dropna()  # cleans raw TF annotations
tfs["ID"] = list(map(int, tfs["Gene ID"]))  # lists the genes by ID
conv_tfs = pd.read_csv(
    "TF_conversion.txt",
    delimiter="\t")  # imports in MGI gene names from DAVID


def extract_symbol(name):  # extracts the abbreviated ID from full name
    # The symbol sits between the last '(' and the last ')' of the full name.
    s_idx = name.rfind('(')
    e_idx = name.rfind(')')
    return name[s_idx + 1:e_idx]


conv_tfs["Gene Name"].apply(extract_symbol).to_csv(
    TF_list_filename, index=False)  # turns list into CSV for future import

# this cell loads TF list, expression matrix, and databases for downstream SCENIC analysis
tf_names = load_tf_names(
    TF_list_filename)  # if TF list has been made, this imports it
ex_matrix = pd.read_csv(
    "GENIE3_import.csv", header=0,
    index_col=0).T  # loads expression matrix, make sure you transpose back
databases_glob = os.path.join(
    "mm10__*.feather")  # loads cisTarget databases into memory
db_fnames = glob.glob(databases_glob)


def name(fname):
    # Database name = base file name up to the first '.'.
    return os.path.basename(fname).split(".")[0]


dbs = [
    RankingDatabase(fname=fname, name=name(fname)) for fname in db_fnames
]
# GENIE3 process: returns co-expression modules
# Working directory holding the intermediate SCENIC files for this sample.
wd = f'/home/pezoldt/NAS2/pezoldt/Analysis/scRNAseq/scenic/{sample_ID}/{cell_type}/int'

# Paths for the expression matrix and the TF list.
net1_ex_path = f'{wd}/1.1_exprMatrix_filtered_t.txt'
net1_tf_path = f'{wd}/1.2_inputTFs.txt'

# Load the expression matrix and take a quick look at it.
ex_matrix = pd.read_csv(net1_ex_path, sep='\t')
ex_matrix.shape
ex_matrix.head()

# Load the TF list from file and inspect the first entries.
tf_names = load_tf_names(net1_tf_path)
tf_names[:5]
len(tf_names)

# Set up the local compute environment.
# Observation: fewer assertion errors when run with less people on the cluster.
from distributed import LocalCluster, Client
local_cluster = LocalCluster(n_workers=6, threads_per_worker=1)
custom_client = Client(local_cluster)
custom_client

# Kick off the inference job.
network = grnboost2(expression_data=ex_matrix,
                    tf_names=tf_names,
                    client_or_address=custom_client)
# Spin up a local Dask cluster for distributed inference.
# Fix: "Dusk" -> "Dask" in the user-facing status message.
sys.stderr.write("\nStarting Dask cluster...")
local_cluster = LocalCluster(n_workers=32, threads_per_worker=1, memory_limit=8e10)
custom_client = Client(local_cluster)
sys.stderr.write("done.\n")

# ex_matrix is a DataFrame with gene names as column names.
sys.stderr.write("\nReading count matrix...")
ex_matrix = pd.read_csv(in_file, sep='\t', index_col=0, header=None).T
sys.stderr.write("done.\n")

# tf_names is read using a utility function included in Arboreto.
sys.stderr.write("\nLoading putative transcription factors...")
tf_names = load_tf_names(tf_file)
sys.stderr.write("done.\n")

# Run GRNBoost2 on chunks of 20 TFs at a time, writing one CSV per chunk so
# a failure late in the run does not lose earlier results.
sys.stderr.write("\nPredicting co-expression network in chunks...\n")
# Idiom: enumerate replaces the manual `i = 0; i += 1` counter.
for i, chunk in enumerate(grouper(tf_names, 20)):
    sys.stderr.write("Working on chunk %s\n" % str(i))
    network = grnboost2(expression_data=ex_matrix,
                        tf_names=chunk,
                        client_or_address=custom_client)
    network.to_csv("network_reddien_" + str(i) + ".csv", sep=",", header=False, index=False)
sys.stderr.write("done.\n")
# Fix: pickle.dump is called at the bottom of this script but pickle was
# never imported, which raises NameError at runtime.
import pickle

import pandas as pd
import numpy as np
#from dask.diagnostics import ProgressBar
from arboreto.utils import load_tf_names
from arboreto.algo import grnboost2
#from pyscenic.rnkdb import FeatherRankingDatabase as RankingDatabase
#from pyscenic.utils import modules_from_adjacencies, load_motifs
#from pyscenic.prune import prune2df, df2regulons
#from pyscenic.aucell import aucell
#from pyscenic.binarization import binarize

# The matrix is stored genes x cells; transpose to cells x genes.
ex_matrix = pd.read_csv(snakemake.input[0], sep='\t', header=0, index_col=0).T
tf_names = load_tf_names("resources/network_analysis/mm_mgi_tfs.txt")
print(ex_matrix.shape)
print("finish loading, now do grn")

# Infer the co-expression network and persist it for the next rule.
adjacencies = grnboost2(ex_matrix, tf_names=tf_names, verbose=True)
print("grn_complete, now dumping")
with open(snakemake.output[0], "wb") as f:
    pickle.dump(adjacencies, f)
# Build the expression matrix either as a sparse CSC matrix or as a dense
# DataFrame, depending on the --sparse flag; in both cases the result is
# transposed so rows are cells and columns are genes.
if args.sparse:
    ex_matrix = ds.layers[''].sparse().T.tocsc()
    gene_names = pd.Series(ds.ra[args.gene_attribute])
    #cells = ds.ca[args.cell_id_attribute]
else:
    ex_matrix = pd.DataFrame(data=ds[:, :],
                             index=ds.ra[args.gene_attribute],
                             columns=ds.ca[args.cell_id_attribute]).T
    gene_names = pd.Series(ds.ra[args.gene_attribute])
end_time = time.time()
print(
    f'Loaded expression matrix of {ex_matrix.shape[0]} cells and {ex_matrix.shape[1]} genes in {end_time - start_time} seconds...',
    file=sys.stderr)
tf_names = load_tf_names(args.tfs_fname.name)
print(f'Loaded {len(tf_names)} TFs...', file=sys.stderr)
# Align matrix, gene names and TF list; to_tf_matrix presumably extracts the
# TF submatrix — verify against the helper's definition.
ex_matrix, gene_names, tf_names = _prepare_input(ex_matrix, gene_names, tf_names)
tf_matrix, tf_matrix_gene_names = to_tf_matrix(ex_matrix, gene_names, tf_names)
print(f'starting {args.method} using {args.num_workers} processes...', file=sys.stderr)
start_time = time.time()
# Fan per-target-gene inference out over a multiprocessing pool.
# NOTE(review): this call is truncated in this chunk and continues beyond
# the visible source.
with Pool(args.num_workers) as p:
    adjs = list(
        tqdm.tqdm(p.imap(run_infer_partial_network,
                         target_gene_indices(gene_names,
# NOTE(review): this print is the tail of a function defined above this chunk.
    print('finished calculation for %s' % (prefix))


if __name__ == "__main__":
    wkdir = '/bgfs/alee/chelsea/projects/10X/AL1/codes'
    os.chdir(wkdir)
    # Single hg38 ranking database used downstream.
    db = [
        RankingDatabase(
            fname=
            '../data/pySCENIC/ref/hg38__refseq-r80__10kb_up_and_down_tss.mc9nr.feather',
            name='hg38__refseq-r80__10kb_up_and_down_tss.mc9nr')
    ]
    tf_names = load_tf_names('../data/pySCENIC/ref/hs_hgnc_curated_tfs.txt')
    CellTypes = [
        'MCF7', 'T47D WT', 'T47D KO', 'MM134', 'SUM44', 'BCK4', 'MCF10A', 'HEK293'
    ]
    # Process each cell type; adata_raw is defined outside this chunk.
    for cell in CellTypes:
        tmp = adata_raw[adata_raw.obs['CellType'] == cell]
        # Dense spliced-count matrix for this cell type.
        RawSplicedCts = pd.DataFrame(tmp.layers['spliced'].todense(),
                                     index=tmp.obs.index,
                                     columns=tmp.var.index)  # cell X gene
        print(cell, RawSplicedCts.shape)
        # Skip cell types whose AUC matrix has already been computed.
        if not isfile('{}/{}_auc_mtx.csv'.format('../data/pySCENIC', cell)):
            # NOTE(review): the loop body continues beyond this chunk.