def main(transcriptome_file, regulator_file, species, out_file, n_workers, threads_per_worker):
    """Infer a regulatory network with GRNBoost2 and write it to a TSV file.

    Args:
        transcriptome_file: Tab-separated expression table; the first column
            is used as the index. Rows with zero standard deviation are
            dropped and the table is transposed before inference.
        regulator_file: Tab-separated regulator table with a ``Species``
            column; the index labels of the rows matching ``species`` are
            used as TF names.
        species: Value of the ``Species`` column to select.
        out_file: Output path; written tab-separated without header or index.
        n_workers: Number of Dask workers.
        threads_per_worker: Number of threads per Dask worker.

    Inference or writing errors are printed, not re-raised; the Dask client
    is always closed.
    """
    print('reading data')
    regulators = pd.read_csv(regulator_file, sep='\t', index_col=0)
    species_rows = regulators.loc[regulators['Species'] == species]
    tf_names = list(species_rows.index)

    expression = pd.read_csv(transcriptome_file, sep='\t', index_col=0)
    row_std = expression.std(axis=1)
    num_not_expressed = (row_std == 0).sum()
    print(
        f'removing {num_not_expressed} genes that have zero expression in all samples'
    )
    # Keep only rows whose values vary across samples.
    expression = expression.loc[row_std > 0]

    print('starting scheduler')
    client = Client(
        n_workers=n_workers,
        threads_per_worker=threads_per_worker,
        memory_limit='48GB',
    )
    try:
        network = grnboost2(
            expression_data=expression.T,
            tf_names=tf_names,
            client_or_address=client,
            verbose=True,
        )
        network.to_csv(out_file, sep='\t', header=False, index=False)
    except Exception as err:
        print('Module inference error')
        print(err)
    finally:
        client.close()
def find_adjacencies_command(args):
    """
    Infer co-expression modules.

    Loads the expression matrix and the transcription factor names, warns
    when fewer than 80% of the supplied transcription factors are present
    among the matrix columns, runs GRNBoost2 and writes the resulting
    network to ``args.output`` as a tab-separated file.

    :param args: parsed command line arguments; reads ``tfs_fname``,
        ``client_or_address``, ``num_workers`` and ``output``.
    """
    LOGGER.info("Loading expression matrix.")
    ex_mtx = _load_expression_matrix(args)
    tf_names = load_tf_names(args.tfs_fname.name)

    n_total_genes = len(ex_mtx.columns)
    if n_total_genes == 0:
        LOGGER.error("The expression matrix supplied does not contain any genes. Make sure the extension of the file matches the format (tab separation for TSV and comma sepatration for CSV).")
        sys.exit(1)

    # Count the supplied TFs that actually occur as matrix columns. The former
    # len(columns.isin(tf_names)) measured the length of a boolean mask, which
    # always equals the number of genes, so the warning below could never fire.
    unique_tfs = set(tf_names)
    n_matching_tfs = len(unique_tfs.intersection(ex_mtx.columns))
    # Skip the ratio when no TFs were supplied to avoid dividing by zero.
    if unique_tfs and float(n_matching_tfs) / len(unique_tfs) < 0.80:
        LOGGER.warning("Expression data is available for less than 80% of the supplied transcription factors.")

    LOGGER.info("Inferring regulatory networks.")
    client, shutdown_callback = _prepare_client(args.client_or_address, num_workers=args.num_workers)
    try:
        network = grnboost2(expression_data=ex_mtx, tf_names=tf_names, verbose=True, client_or_address=client)
    finally:
        shutdown_callback(False)

    LOGGER.info("Writing results to file.")
    network.to_csv(args.output, index=False, sep='\t')
def helper_grnboost2(X, theta_true, tf_names=[], BEELINE=False):  #_string
    """Run GRNBoost2 on ``X`` and score the predicted graph against ``theta_true``.

    Columns of ``X`` are labelled ``G<column>``. TF names are passed to
    GRNBoost2 (also prefixed with ``G``) only when the module-level
    ``args.USE_TF_NAMES`` equals ``'yes'`` and ``tf_names`` is non-empty.
    The predicted edges are loaded into an undirected weighted graph whose
    adjacency matrix, plus the identity, is compared with the real part of
    ``theta_true`` via ``report_metrics``.

    ``BEELINE`` is accepted but not used by this function.

    Returns:
        list: the values produced by ``report_metrics``.
    """
    print('Running GRNBoost2 method', X.shape)
    theta_true = theta_true.real
    ex_matrix = pd.DataFrame(X)

    restrict_to_tfs = args.USE_TF_NAMES == 'yes' and len(tf_names) != 0
    tf_names = ['G' + str(n) for n in tf_names] if restrict_to_tfs else None

    gene_names = ['G' + str(c) for c in ex_matrix.columns]
    ex_matrix.columns = gene_names

    network = grnboost2(expression_data=ex_matrix,
                        gene_names=gene_names,
                        tf_names=tf_names)
    pred_edges = np.array(network[['TF', 'target', 'importance']])

    n_genes = len(gene_names)
    G_pred = nx.Graph()
    # Register every gene up front so genes without predicted edges still
    # occupy a row/column of the adjacency matrix.
    G_pred.add_nodes_from(['G' + str(n) for n in range(n_genes)])
    G_pred.add_weighted_edges_from(pred_edges)
    pred_theta = nx.adj_matrix(G_pred).todense() + np.eye(n_genes)

    recovery_metrics = report_metrics(np.array(theta_true), np.array(pred_theta))
    print(
        'GRNBOOST2: FDR, TPR, FPR, SHD, nnz_true, nnz_pred, precision, recall, Fb, aupr, auc'
    )
    print('GRNBOOST2: Recovery of true theta: ', *np.around(recovery_metrics, 3))
    return list(recovery_metrics)
def process(mtx_fname, tfs, net_fname, client):
    """Run GRNBoost2 on one expression file and save the network as CSV.

    The tab-separated matrix at ``mtx_fname`` is read with its first column
    as index and transposed before being passed to GRNBoost2 together with
    the TF names ``tfs`` and the Dask ``client``. The result is written to
    ``net_fname`` without the index.
    """
    expression = pd.read_csv(mtx_fname, sep='\t', index_col=0).T
    network = grnboost2(
        expression_data=expression,
        tf_names=tfs,
        verbose=True,
        client_or_address=client,
    )
    network.to_csv(net_fname, index=False)
def main(args):
    """Run GENIE3 or GRNBoost2 on a tab-separated expression file.

    ``args`` is handed to ``parseArgs``; the resulting options supply
    ``inFile`` (input table, first column as index, first row as header),
    ``algo`` (``'GENIE3'`` or ``'GRNBoost2'``) and ``outFile`` (tab-separated
    output written without the index). An unknown algorithm name prints a
    message and returns without starting a Dask client.
    """
    opts, args = parseArgs(args)
    inDF = pd.read_csv(opts.inFile, sep='\t', index_col=0, header=0)

    algorithms = {'GENIE3': genie3, 'GRNBoost2': grnboost2}
    infer = algorithms.get(opts.algo)
    if infer is None:
        print("Wrong algorithm name. Should either be GENIE3 or GRNBoost2.")
        return

    # Start the client only once the request is known to be valid, and always
    # release it, including when inference or writing fails.
    client = Client(processes=False)
    try:
        network = infer(inDF, client_or_address=client)
        network.to_csv(opts.outFile, index=False, sep='\t')
    finally:
        client.close()
def calcTFs(
        expr,
        tf_names,
        db,
        prefix,
        motif_path='../data/pySCENIC/ref/motifs-v9-nr.hgnc-m0.001-o0.0.tbl',
        out_path='../data/pySCENIC',
        ppn=8):
    """Computes motifs, regulons and trancriptional factor activation using pySCENIC.

    Arguments
    ---------
    expr: `pandas DataFrame`
        cell X gene raw counts; FPKM; not TPM as coexpression will be calculated
    tf_names: `list` (`str`)
        curated human transcriptional factor downloaded from github: pySCENIC/ref/hs_hgnc_curated_tfs.txt
    db: `list` (`FeatherRankingDatabase()`)
        feather files, ranking genome [FeatherRankingDatabase(name="hg38__refseq-r80__10kb_up_and_down_tss")]
    prefix: `str`
        Specify name to save files (eg, cell line names)
    motif_path: `str`
        Motif annotation table handed to `prune2df`
    out_path: `str`
        Directory the two output files are written into
    ppn: `int`
        Number of workers passed to `prune2df` and `aucell`

    Returns
    -------
    Do not return but write files (the calc takes too long...):
    `{out_path}/{prefix}_motifs.csv` (pickled regulons, despite the extension)
    and `{out_path}/{prefix}_auc_mtx.csv` (AUCell matrix).
    """
    # Inference of co-expression modules
    adjacencies = grnboost2(expr, tf_names=tf_names, verbose=True)
    modules = list(modules_from_adjacencies(adjacencies, expr))

    # Calculate a list of enriched motifs and the corresponding target genes for all modules.
    with ProgressBar():
        df = prune2df(db, modules, motif_path, num_workers=ppn)

    # Create regulons from this table of enriched motifs.
    regulons = df2regulons(df)

    # Save the discovered regulons to disk. NOTE: the payload is a pickle even
    # though the file name ends in .csv; the name is kept for existing readers.
    with open('{}/{}_motifs.csv'.format(out_path, prefix), "wb") as f:
        pickle.dump(regulons, f)

    auc_mtx = aucell(expr, regulons, num_workers=ppn)
    auc_mtx.to_csv('{}/{}_auc_mtx.csv'.format(out_path, prefix))
    print('finished calculation for %s' % (prefix))
def inferGRN(filename, libpath, libname, lib_both=True, savedir=None, suffix=None, seed=None):
    """
    Top-level script for inferring gene regulatory network from a given dataset
    using the Arboreto GRNboost2 algorithm.

    :filename: path to CSV file containing gene expression data.
    :libpath: path to directory containing sub-folders for TF-target libraries.
    :libname: string of TF-target library used for inference.
    :lib_both: (optional) Boolean operator determining use of additional library
        (TRANSFACpredicted) for wider TF coverage
    :savedir: (optional) path to directory for saving final CSV.
    :suffix: (optional) value forwarded to saveGRN when savedir is given.
    :seed: (optional) integer for inference algorithm seed

    Returns the refined GRN produced by refineGRN.
    """
    # import cpm + library data
    cpm = importData(filename)
    cpm_array, cpm_genes = processData(cpm)
    tf_all = importTFs(libpath, libname, lib_both)
    tf_names = tf_all["GeneSym"].to_list()

    # setup Dask cluster
    client = Client(LocalCluster())
    try:
        print(client.dashboard_link)

        # infer + refine GRN
        grn = grnboost2(expression_data=cpm_array,
                        gene_names=cpm_genes,
                        tf_names=tf_names,
                        client_or_address=client,
                        seed=seed)
        grn_refined = refineGRN(grn, libname, dir_path=libpath)
        if savedir is not None:
            saveGRN(grn_refined, savedir, suffix=suffix)
    finally:
        # Shut the cluster down even when inference, refinement or saving fails.
        client.shutdown()
    return grn_refined
def main(transcriptome_file, regulator_file, species, out_file_prefix,
         n_random_samples, n_runs, n_workers, threads_per_worker):
    """Run GRNBoost2 on repeated random subsamples of the expression columns.

    Args:
        transcriptome_file: Tab-separated expression table; the first column
            is used as the index and columns are subsampled.
        regulator_file: Tab-separated regulator table with a ``Species``
            column; index labels of rows matching ``species`` are the TF names.
        species: Value of the ``Species`` column to select.
        out_file_prefix: Run ``i`` is written to ``{out_file_prefix}_{i}.tsv``
            (tab-separated, no header, no index).
        n_random_samples: Number of columns drawn per run.
        n_runs: Number of runs; run ``i`` uses ``random_state=i``.
        n_workers: Number of Dask workers.
        threads_per_worker: Number of threads per Dask worker.

    Inference errors of a single run are printed and the loop continues; the
    Dask client is always closed.
    """
    print('reading data')
    tf_info = pd.read_csv(regulator_file, sep='\t', index_col=0)
    tf_names = list(tf_info.loc[tf_info['Species'] == species].index)
    df = pd.read_csv(transcriptome_file, sep='\t', index_col=0)

    print('starting scheduler')
    client = Client(n_workers=n_workers,
                    threads_per_worker=threads_per_worker,
                    memory_limit='128GB')
    try:
        for i in range(n_runs):
            out_file = f'{out_file_prefix}_{i}.tsv'
            subsampled_df = df.sample(n_random_samples, axis=1, random_state=i)

            # Filter genes that are not expressed. The mask must be computed on
            # the subsample itself: a gene that varies across the full matrix
            # can still be constant within the drawn columns.
            sample_std = subsampled_df.std(axis=1)
            num_not_expressed = (sample_std == 0).sum()
            print(
                f'removing {num_not_expressed} genes that have zero',
                'expression in all samples'
            )
            subsampled_df = subsampled_df.loc[sample_std > 0]

            try:
                network = grnboost2(expression_data=subsampled_df.T,
                                    tf_names=tf_names,
                                    client_or_address=client,
                                    verbose=True)
                network.to_csv(out_file, sep='\t', header=False, index=False)
            except Exception as e:
                print('Module inference error')
                print(e)
    finally:
        client.close()
"grn_output_" + inputFilename + ".tsv") db_fnames = glob.glob(DATABASES_GLOB) def name(fname): return os.path.splitext(os.path.basename(fname))[0] dbs = [ RankingDatabase(fname=fname, name=name(fname)) for fname in db_fnames ] print(dbs) print("running grnboost") print("tf_names head") print(tf_names[1:5]) #print("gene names head") #print(ex_matrix.iloc[1:5,1:5]) adjacencies = grnboost2(ex_matrix, tf_names=tf_names, verbose=True) adjacencies.head() print("identify modules") adjacencies.to_csv(out_file, sep='\t', index=False, header=False) print("grnboost done") modules = list( modules_from_adjacencies(adjacencies, ex_matrix, rho_mask_dropouts=True)) #print("writing modules") #with open(MODULES_FNAME, 'wb') as f: # pickle.dump(modules, f) print("Finding Enriched modules") # Calculate a list of enriched motifs and the corresponding target genes for all modules.
import os
import pandas as pd
import argparse
from dask.distributed import Client
from distributed import Client, LocalCluster

# Command line entry point: infer a GRN for one cell line / dataset pair and
# write it next to the input matrix.
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--cell_line', nargs=1, type=str, help='cell line to run on')
    parser.add_argument('--name', nargs=1, type=str, help='name of dataset')
    args = parser.parse_args()

    # nargs=1 yields one-element lists.
    cl = args.cell_line[0]
    name = args.name[0]

    from arboreto.algo import grnboost2, genie3
    from arboreto.utils import load_tf_names

    ex_matrix = pd.read_csv('~/data/spate116/GCN/%s/%s_expression_matrix_imputed.tsv' % (cl, name), sep='\t').transpose()

    # Context managers close the client and then the cluster, also when
    # inference or writing raises.
    with LocalCluster() as cluster, Client(cluster) as client:
        print('here')
        network = grnboost2(expression_data=ex_matrix.to_numpy(),
                            gene_names=ex_matrix.columns,
                            client_or_address=client)
        network.to_csv('~/data/spate116/GCN/%s/%s_GRN.tsv' % (cl, name), sep='\t', header=True, index=False)
# add 'G' for entrezgene id
# Working directory holding the input CSV files; edit before running.
dir = "/path/to/your/file"
#############################################################################
import os, sys, re, gc
import pandas as pd
os.chdir(dir)

from arboreto.algo import grnboost2, genie3
from arboreto.utils import load_tf_names

ex_matrix = pd.read_csv("feed2python.csv", index_col=0)
matrix = ex_matrix.T

# tf_names = load_tf_names("ChIPBaseV2_regNet_geo.csv")
df = pd.read_csv("regNet_tf.csv")
# Index.get_values() was removed in pandas 1.0; to_numpy() is the replacement.
df2 = df.columns.to_numpy()
# Skip the first column label, then swap every "X" for "G".
tf_names = df2[1:].tolist()
tf_names = [re.sub("X", "G", x) for x in tf_names]
# print(matrix.head(3))

network = grnboost2(expression_data=matrix, tf_names=tf_names)
network.to_csv('ex_GRNboost2_network.tsv', sep='\t', header=False, index=False)

# release the memory from python
del ex_matrix, matrix, df, df2, tf_names, network
gc.collect()
genes_to_use.add(target) import numpy as np from arboreto.algo import grnboost2, genie3 from sklearn.decomposition import PCA pca = PCA(n_components=1) if __name__ == '__main__': pcafile = open('pc_fraction_explained.log', 'w') data_to_use = Normal_Data.loc[list(genes_to_use), :].T pca.fit(data_to_use) pca_explained = pca.explained_variance_ratio_[0] pcafile.write(str(pca_explained) + '\n') TFs_to_use = [gene for gene in genes_to_use if gene in TFs] network = grnboost2(expression_data=data_to_use, tf_names=list(TFs_to_use)) print(network.head()) network.to_csv('networkfiles/biologicalnetwork.log', sep='\t', index=False, header=False) original_data = Normal_Data.loc[list(genes_to_use), :] for i in range(100): print(i) data_to_use = original_data.copy() for j in range(len(Normal_Samples)): l = list(original_data.iloc[:, j]) l = list(np.random.permutation(l)) data_to_use[Normal_Samples[j]] = l print(data_to_use.shape)
#------------Phase I: Inference of co-expression modules--------------------------------
#------------GRNBoost-------------------------------------------------------------------
print("STARTING PHASE I")

# Define cluster: one single-threaded worker per core
local_cluster = LocalCluster(n_workers=nCores, threads_per_worker=1)
client = Client(local_cluster)
print(client)

# Full dataset: sampling every row without replacement keeps all rows
N_SAMPLES = ex_matrix.shape[0]
print(N_SAMPLES)

adjacencies = grnboost2(
    expression_data=ex_matrix.sample(n=N_SAMPLES, replace=False),
    tf_names=tf_names,
    seed=123,
    verbose=True,
    client_or_address=client,
)
print("DEFINED adjacencies, type and head:")
adjacencies.to_csv(ADJACENCIES_FNAME, sep='\t')

# Reload the adjacencies from the file that was just written
adjacencies = pd.read_csv(ADJACENCIES_FNAME, sep='\t', header=0, index_col=0)
print("READ IN adjacencies, type and head:")
print(type(adjacencies))
print(adjacencies.head())
def run_grnboost2(Expr, filename='links.txt', gene_names=None, **kwargs):
    """Run GRNBoost2 on the transpose of ``Expr`` and save the links.

    ``Expr`` is transposed and converted to a NumPy matrix before inference
    (presumably it arrives genes x samples; confirm with callers). Extra
    keyword arguments are forwarded to ``grnboost2``. The result is written
    to ``filename`` tab-separated, without header or index.
    """
    transposed = np.asmatrix(Expr.T)
    links = grnboost2(transposed, gene_names=gene_names, **kwargs)
    links.to_csv(filename, sep='\t', index=False, header=False)
def crossvalidateGRN(filename, libpath, libname, k, lib_both=True, savedir=None, suffix=None, seed=None):
    """
    Top-level script for k-fold cross validation of gene regulatory network
    inference using the Arboreto GRNboost2 algorithm.

    :filename: path to CSV file containing gene expression data.
    :libpath: path to directory containing sub-folders for TF-target libraries.
    :libname: string of TF-target library used for inference.
    :k: integer specifying number of folds for CV
    :lib_both: (optional) Boolean operator determining use of additional library
        (TRANSFACpredicted) for wider TF coverage
    :savedir: (optional) path to directory for saving final CSV.
    :suffix: (optional) value forwarded to saveGRN when savedir is given.
    :seed: (optional) integer for inference algorithm seed

    Returns the refined GRNs of all folds stacked into one table with an
    added "fold" column.
    """
    # Local import: this function only needs pandas for the final concat.
    import pandas as pd

    # import cpm + library data
    cpm = importData(filename)
    tf_all = importTFs(libpath, libname, lib_both)
    tf_names = tf_all["GeneSym"].to_list()

    # create and assign CV folds
    folds = gv.makeFolds(cpm, k)
    training, testing = gv.assignFolds(folds)

    # setup Dask cluster
    client = Client(LocalCluster())
    refined_per_fold = []
    try:
        print(client.dashboard_link)

        # infer + refine GRN for each fold
        for fold in range(k):
            cpm_fold = cpm.loc[:, training[fold]]
            cpm_array, cpm_genes = processData(cpm_fold)
            grn = grnboost2(expression_data=cpm_array,
                            gene_names=cpm_genes,
                            tf_names=tf_names,
                            client_or_address=client,
                            seed=seed)
            grn_refined = refineGRN(grn, libname, dir_path=libpath)
            if savedir is not None:
                # Saved before the "fold" column is added, as before.
                saveGRN(grn_refined, savedir, fold=fold, suffix=suffix,
                        trainingset=training, testingset=testing)

            # store all refined GRNs
            grn_refined["fold"] = fold
            refined_per_fold.append(grn_refined)
    finally:
        # Shut the cluster down even when a fold fails.
        client.shutdown()

    # DataFrame.append was removed in pandas 2.0; a single concat also avoids
    # re-copying the accumulated table on every fold.
    grn_all = pd.concat(refined_per_fold)
    return grn_all
def run_boost():
    """Run GRNBoost2 on the module-level ``ex_matrix`` via ``custom_client``.

    The matrix values are passed as a NumPy array and its column labels as
    gene names. Returns whatever ``grnboost2`` returns.
    """
    values = ex_matrix.to_numpy()
    labels = ex_matrix.columns
    return grnboost2(expression_data=values,
                     gene_names=labels,
                     client_or_address=custom_client)
local_cluster = LocalCluster(n_workers=32,
                             threads_per_worker=1,
                             memory_limit=8e10)
custom_client = Client(local_cluster)
sys.stderr.write("done.\n")

# ex_matrix is a DataFrame with gene names as column names
sys.stderr.write("\nReading count matrix...")
ex_matrix = pd.read_csv(in_file, sep='\t', index_col=0, header=None).T
sys.stderr.write("done.\n")

# tf_names is read using a utility function included in Arboreto
sys.stderr.write("\nLoading putative transcription factors...")
tf_names = load_tf_names(tf_file)
sys.stderr.write("done.\n")

# Infer the network 20 transcription factors at a time, one CSV per chunk.
# NOTE(review): grouper is defined elsewhere; if it pads the last chunk with a
# fill value, that value reaches grnboost2 as a TF name; confirm.
sys.stderr.write("\nPredicting co-expression network in chunks...\n")
i = 0
for chunk in grouper(tf_names, 20):
    sys.stderr.write("Working on chunk %s\n" % str(i))
    network = grnboost2(
        expression_data=ex_matrix,
        tf_names=chunk,
        client_or_address=custom_client,
    )
    network.to_csv("network_reddien_" + str(i) + ".csv",
                   sep=",",
                   header=False,
                   index=False)
    i += 1
sys.stderr.write("done.\n")
sys.stderr.write("\n\n# All done\n")
index_col=0, sep='\t', skiprows=1, header=None).T #### get tfs tf_path = '/ddn1/vol1/staging/leuven/stg_00002/lcb/kspan/analyses/ThreeLines10xSCENIC2/hg19_allTFs.lst' tf_names = load_tf_names(tf_path) tf_names = list(set(tf_names).intersection(ex_matrix.columns)) print(len(tf_names)) #run grnboost2 outfile = indir + 'grnboost2.tsv' print('grnboost2 results will be printed to: ' + outfile) start_time = time.time() network = grnboost2(expression_data=ex_matrix, tf_names=tf_names, client_or_address=custom_client, verbose=True) print(time.time() - start_time, "seconds") print(network.head()) network.to_csv(outfile, sep='\t', index=False, header=False) sys.stdout = saveout logs.close() exit() # packages in environment at /data/leuven/306/miniconda3/envs/arboreto: { # Name Version Build Channel #arboreto 0.1.5 py_0 bioconda #blas 1.0 mkl #bokeh 0.13.0 py36_0 #ca-certificates 2018.03.07 0
counts = pd.DataFrame(counts, index=gene_info.index, columns=barcodes)

from arboreto.algo import grnboost2, genie3

# Scale each column to a total of 10000, then log(x + 1)
log_scaled_counts = (
    np.log(counts.divide(counts.sum(axis=0), axis=1) * 10000 + 1))
log_scaled_counts = log_scaled_counts.loc[valid_genes, :]

# About 14 minutes for 5000 genes x 3000 cells
old_dir = os.getcwd()
os.makedirs("/scratch/david.detomaso/temp", exist_ok=True)
# Need this or else the workers time out
os.chdir("/scratch/david.detomaso/temp")
try:
    a = time.time()
    network = grnboost2(log_scaled_counts.T)
    b = time.time()
    print(b - a)
finally:
    # Restore the working directory even when inference fails.
    os.chdir(old_dir)

# Need to convert to long
net_wide = network.pivot(index='TF', columns='target', values='importance')
z = net_wide.fillna(0)
# Add the transpose so the score table is symmetric
z = z + z.T
z.to_csv(out_file_scores, sep="\t", compression="gzip")
def test_launch_grnboost2(self):
    """GRNBoost2 on the shared ``df``/``tfs`` fixtures yields more than 100 rows."""
    inferred = grnboost2(df, tf_names=tfs)
    n_links = len(inferred)
    self.assertGreater(n_links, 100)
# Dense count table: observations as rows, variables as columns
df_cnt = pd.DataFrame(adata.X.toarray(),
                      index=adata.obs.index,
                      columns=adata.var.index)

#2. tf genes
tf_name = load_tf_names(f_tf)

#3. ranking databases (only 2 mm10 dbs)
l_fname = list(Path(fd_db).glob('*.feather'))
l_db = [RankingDatabase(fname=fname, name=name(fname)) for fname in l_fname]

#3. run
if __name__ == '__main__':
    #1. Inference of co-expression modules
    print('Inference...')
    df_adj = grnboost2(df_cnt, tf_names=tf_name, verbose=True)
    df_adj.to_csv(f'{fd_out}/adj_{sample}.csv', index=False)

    #2. prune
    # Reload the adjacencies from disk: if missing, always stuck at 98%
    df_adj = pd.read_csv(f'{fd_out}/adj_{sample}.csv')
    print('Prune...')
    l_mod = list(modules_from_adjacencies(df_adj, df_cnt))
    with ProgressBar():
        df_prune = prune2df(l_db, l_mod, f_motif)
    df_prune.to_csv(f'{fd_out}/prune_{sample}.csv')

    #3. create regulon
    print('Regulon...')
    regulon = df2regulons(df_prune)
import pandas as pd

from distributed import Client, LocalCluster
from arboreto.utils import load_tf_names
from arboreto.algo import grnboost2

if __name__ == '__main__':

    in_file = 'net1_expression_data.tsv'
    tf_file = 'net1_transcription_factors.tsv'
    out_file = 'net1_grn_output.tsv'

    # ex_matrix is a DataFrame with gene names as column names
    ex_matrix = pd.read_csv(in_file, sep='\t')

    # tf_names is read using a utility function included in Arboreto
    tf_names = load_tf_names(tf_file)

    # instantiate a custom Dask distributed Client; the context managers
    # close the client and the cluster even when inference or writing fails
    with LocalCluster() as cluster, Client(cluster) as client:

        # compute the GRN
        network = grnboost2(expression_data=ex_matrix,
                            tf_names=tf_names,
                            client_or_address=client)

        # write the GRN to file
        network.to_csv(out_file, sep='\t', index=False, header=False)
"GENIE3_import.csv", header=0, index_col=0).T # loads expression matrix, make sure you transpose back databases_glob = os.path.join( "mm10__*.feather") # loads cisTarget databases into memory db_fnames = glob.glob(databases_glob) def name(fname): return os.path.basename(fname).split(".")[0] dbs = [ RankingDatabase(fname=fname, name=name(fname)) for fname in db_fnames ] # GENIE3 process: returns co-expression modules adjacencies = grnboost2( ex_matrix, tf_names=tf_names, verbose=True) # runs improved GRNBoost instance of GENIE3 modules = list(modules_from_adjacencies( adjacencies, ex_matrix)) # identifies modules from GENIE3 # save GRNBoost2 product so we don't have to repeat again adjacencies.to_csv("grnboost_output.csv") # load product in case something goes wrong adjacencies = pd.read_csv("grnboost_output.csv", index_col=0) # cisTarget process: IDs cis-regulatory footprints from motifs around the TSS with ProgressBar( ): # calculate a list of enriched motifs and the corresponding target genes for all modules df = prune2df(dbs, modules, "motifs-v9-nr-mgi.txt") regulons = df2regulons(
# File names derived from the cell label
input_file = cell_use + '_matrix.csv'
output_grnboost2 = cell_use + '_grnboost2.csv'
output_genie3 = cell_use + '_genie3.csv'
output_grnboost2_txt = cell_use + '_grnboost2.txt'
output_genie3_txt = cell_use + '_genie3.txt'

#load data
ex_matrix = pd.read_csv(input_file, sep=',')
ex_matrix = ex_matrix.T
tf_names = load_tf_names('mm_mgi_tfs.txt')

#infer the gene regulatory network
network_n = grnboost2(ex_matrix, tf_names=tf_names, verbose=True)
network_g = genie3(ex_matrix, tf_names=tf_names, verbose=True)

#for following igraph analysis
network_n.to_csv(output_grnboost2, sep='\t')
network_g.to_csv(output_genie3, sep='\t')

#txt file with no header and index for FAC calculation
network_n.to_csv(output_grnboost2_txt, sep='\t', header=False, index=False)
network_g.to_csv(output_genie3_txt, sep='\t', header=False, index=False)
####### done!
import os
import numpy as np

from arboreto.utils import load_tf_names
from arboreto.algo import grnboost2

data_dir = '/home/brad/data2/rstudio/birds/scRNA/devin_combined/finch_cells/grn/export_to_numpy_glut'
expr_fname = os.path.join(data_dir, '1.1_exprMatrix_filtered_t.txt')
tf_fname = os.path.join(data_dir, '1.1_inputTFs.txt')

if __name__ == '__main__':
    # ex_matrix is a numpy ndarray, which has no notion of column names
    ex_matrix = np.genfromtxt(expr_fname, delimiter='\t', skip_header=1)

    # we read the gene names from the first line of the file
    with open(expr_fname) as file:
        gene_names = [gene.strip() for gene in file.readline().split('\t')]

    # sanity check to verify the ndarray's nr of columns equals the length of
    # the gene_names list; raised explicitly because assert statements are
    # stripped when Python runs with -O
    if ex_matrix.shape[1] != len(gene_names):
        raise ValueError(
            'expression matrix has %d columns but the header lists %d gene names'
            % (ex_matrix.shape[1], len(gene_names)))

    # tf_names is read using a utility function included in Arboreto
    tf_names = load_tf_names(tf_fname)

    network = grnboost2(expression_data=ex_matrix,
                        gene_names=gene_names,  # specify the gene_names
                        tf_names=tf_names)

    network.to_csv('output.tsv', sep='\t', index=False, header=False)
dt = pd.read_csv(item, index_col=0)
# Force string column labels so they can be matched against the TF symbols.
dt.columns = [str(x) for x in dt.columns]
print(dt)
TFname = [
    'KLF6', 'TCEB3', 'LYL1', 'SMARCC1', 'TCOF1', 'ZNF267', 'ZEB2', 'MNDA',
    'ETS2', 'BAZ2B', 'POU2F2', 'MEF2C', 'KDM5A', 'PDLIM7', 'HDGF', 'ZBTB16',
    'ZNF350', 'STAT3', 'TAF1B', 'HIST2H2BE', 'DHX38', 'TP53', 'SMAD3', 'MXD4',
    'ARID5B', 'USF2', 'KDM2A', 'HIVEP3', 'MYBL1', 'HIST1H1E', 'ZNF593', 'BATF',
    'TAX1BP3', 'TRIM28', 'CBFB', 'CHD4', 'ZBTB38', 'PBX2', 'CTNNBIP1',
    'SERTAD2', 'ZMYND11', 'NCOA4', 'PER1', 'ID3', 'POLR2A', 'CDKN1A', 'TGFB1',
    'ZNF277', 'MAPK1', 'NEAT1', 'SP3', 'MAX', 'SMARCA2', 'REL', 'SIN3A',
    'NR4A1', 'ASCL2', 'JUND', 'TFDP2', 'BHLHE40', 'NFKBIA', 'HTT', 'SOX4',
    'SPI1', 'FOS', 'CITED2', 'CREM', 'PURA', 'HEXIM1', 'PKNOX1', 'CEBPB',
    'HHEX', 'BRD8', 'RUNX3', 'MAFB', 'EOMES', 'SERTAD3', 'ZNF143', 'ZNF467',
    'AKT1', 'ATF6', 'PTTG1', 'TBX21', 'UIMC1', 'IRF5', 'EED', 'ID1', 'IRF8',
    'HOPX', 'SUGP2', 'JUN', 'TAF6L', 'PDLIM1', 'SPIB', 'HIST1H1C', 'RNF19A',
    'CREBBP', 'IRF1', 'SUZ12', 'CHD8', 'HDAC5', 'BLZF1', 'SHPRH', 'CUX1',
    'RELB', 'GTF3C1', 'FOSB', 'MLXIP', 'NFIC', 'IRF7', 'BBC3', 'GTF2I', 'MKL1',
    'POLR1C', 'CEBPD', 'SMARCD2', 'IKZF3', 'SLA2'
]
client = Client(processes=False)
try:
    gene_name = list(dt.columns)
    print(gene_name)
    # Keep only TFs present in the data, de-duplicated and in list order. A
    # plain set intersection has an arbitrary order that can change between
    # runs.
    genes_present = set(gene_name)
    TFname = [tf for tf in dict.fromkeys(TFname) if tf in genes_present]
    print(dt)
    network = grnboost2(dt,
                        client_or_address=client,
                        gene_names=list(dt.columns),
                        tf_names=TFname)
    network.to_csv(item + 'TF_grnboost2.csv')
finally:
    # Release the in-process Dask client so repeated runs do not pile up.
    client.close()
# Output files of this iteration; each step below is skipped when its file
# already exists, so an interrupted run can resume.
network_fname = os.path.join(data_folder_iter, 'network.csv.gz')
modules_fname = os.path.join(data_folder_iter, 'modules.p')
motifs_fname = os.path.join(data_folder_iter, 'motifs.csv')
regulons_fname = os.path.join(data_folder_iter, 'regulons.p')
aucell_train_fname = os.path.join(data_folder_iter, 'aucell_train_scores.csv.gz')
aucell_test_fname = os.path.join(data_folder_iter, 'aucell_test_scores.csv.gz')

# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(data_folder_iter, exist_ok=True)
# NOTE(review): the paths above are joined with data_folder_iter before this
# chdir; if data_folder_iter is relative they resolve inside a nested
# directory afterwards. Confirm it is absolute.
os.chdir(data_folder_iter)

## Run GRNBoost2 (faster equivalent of GENIE3) from arboreto to infer co-expression modules
if not os.path.isfile(network_fname):
    adjacencies = grnboost2(data_train,
                            tf_names=tf_names,
                            verbose=True,
                            client_or_address=custom_client,
                            seed=i)
    adjacencies.to_csv(network_fname, sep=',', header=True, index=False, compression='gzip')
else:
    adjacencies = pd.read_csv(network_fname)

## Derive potential regulons from co-expression modules
if not os.path.isfile(modules_fname):
    modules = list(modules_from_adjacencies(adjacencies, data_train, keep_only_activating=False))
    # Context managers close the pickle files deterministically.
    with open(modules_fname, 'wb') as modules_file:
        pickle.dump(modules, modules_file)
else:
    with open(modules_fname, 'rb') as modules_file:
        modules = pickle.load(modules_file)
del adjacencies
# download and unzip file
url = 'https://tcga.xenahubs.net/download/TCGA.' + canType + '.sampleMap/HiSeqV2.gz'
wget.download(url, 'cancer_data_TCGA.gz')
with gzip.open('cancer_data_TCGA.gz', 'rb') as f_in, open('file.txt', 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)
os.remove("cancer_data_TCGA.gz")
############################################################################
df = pd.read_table("file.txt", index_col=0, sep='\t')
#ex_matrix.columns.get_values()
# Keep only the columns whose label has '03' at positions 13-14.
cols = [c for c in df.columns if c[13:15] == '03']
matrix = df[cols].T
df = df[cols]

network = grnboost2(expression_data=matrix, tf_names=tf_names, verbose=True)
network.to_csv(canType + '_ex_GRNboost2_network.tsv',
               sep='\t',
               header=False,
               index=False)

# release the memory from python
#del ex_matrix, matrix, df, tf_names, network
# Remove the unpacked download and the Dask scratch directory.
os.remove('file.txt')
shutil.rmtree('dask-worker-space')
# end function