def ZmSAM2(Zm5bFGS):
    """Return the "ZmSAM2" COB, building it from the raw table when missing.

    If ``cf.test.force.COB`` is truthy, the "ZmSAM2" Expr dataset is deleted
    first so that it gets rebuilt.

    Parameters
    ----------
    Zm5bFGS
        Passed straight through to ``co.COB.from_table`` as its fourth
        positional argument (presumably the reference genome -- confirm).
    """
    dataset = "ZmSAM2"
    if cf.test.force.COB:
        tools.del_dataset("Expr", dataset, force=True)

    # Already built: just open it.
    if tools.available_datasets("Expr", dataset):
        return co.COB(dataset)

    raw_table = os.path.join(
        cf.options.testdir,
        "raw",
        "Expr",
        "RNASEQ",
        "TranscriptomeProfiling_B73_Atlas_SAM_FGS_LiLin_20140316.txt.gz",
    )
    return co.COB.from_table(
        raw_table,
        dataset,
        "Maize Root Network, but loose",
        Zm5bFGS,
        rawtype="RNASEQ",
        max_gene_missing_data=0.4,
        min_single_sample_expr=1,
        min_expr=0.01,
        quantile=False,
        dry_run=False,
        max_val=250,
    )
def locality(args):
    """Compute network locality for GWAS terms and write the table as TSV.

    Parameters
    ----------
    args
        Parsed CLI namespace. The attributes read here are ``out`` (a path
        prefix or ``sys.stdout``), ``cob``, ``gwas``, ``sig_edge_zscore`` and
        ``terms``. ``args.out`` is rewritten in place to the final
        ``<prefix>_Locality.tsv`` path when it is not ``sys.stdout``.

    Returns
    -------
    None
        Returns early, without computing anything, when the output file
        already exists.
    """
    log = coblog()
    log("\n" "-----------------------\n" " Network Locality \n" "-----------------------\n")
    # Path handling only applies to real files; sys.stdout has no directory
    # to create and nothing to skip.
    if args.out != sys.stdout:
        # Resolve the final output path exactly once. Applying the suffix a
        # second time (as the existence check used to do) would test
        # "<prefix>_Locality_Locality.tsv" and never find the real output.
        args.out = "{}_Locality.tsv".format(args.out.replace(".tsv", ""))
        out_dir = os.path.dirname(args.out)
        if out_dir != "":
            os.makedirs(out_dir, exist_ok=True)
        if os.path.exists(args.out):
            log("{} exists! Skipping!".format(args.out))
            return None
    # Grab the COB object
    cob = co.COB(args.cob)
    gwas = co.GWAS(args.gwas)
    # If there is a different score for 'significant', update the COB object
    if args.sig_edge_zscore is not None:
        cob.set_sig_edge_zscore(args.sig_edge_zscore)
    # If all, grab a generator over every term
    if "all" in args.terms:
        terms = gwas.iter_terms()
    else:
        # Otherwise look each requested term up in the GWAS
        terms = (gwas[x] for x in args.terms)
    # One row per term, as produced by generate_data
    locality = pd.DataFrame([generate_data(cob, x, args) for x in terms])
    locality.to_csv(args.out, sep="\t", index=None)
def AtSeed(AtTair10):
    """Return the 'AtSeed' COB, building it from GSE SOFT families if missing.

    If ``cf.test.force.COB`` is truthy, the 'AtSeed' Expr dataset is deleted
    first so that it gets rebuilt.

    Parameters
    ----------
    AtTair10
        Passed straight through to ``co.COB.from_DataFrame`` as its fourth
        positional argument (presumably the reference genome -- confirm).
    """
    if cf.test.force.COB:
        tools.del_dataset('Expr', 'AtSeed', force=True)

    if tools.available_datasets('Expr', 'AtSeed'):
        return co.COB('AtSeed')

    gse_dir = os.path.join(cf.options.testdir, 'raw', 'GSE')
    # 'GSE30223' is intentionally not part of this list (it was commented out).
    accessions = ['GSE12404', 'GSE1051', 'GSE11852', 'GSE5634']

    families = []
    for accession in accessions:
        soft_path = os.path.join(gse_dir, '{}_family.soft.gz'.format(accession))
        families.append(co.Family.from_file(soft_path))
    # sum() starts from 0, so this relies on co.Family supporting
    # ``0 + Family`` -- TODO confirm against the Family class.
    merged = sum(families)

    # NOTE(review): the keepfile was presumably generated once with
    # merged.to_keepfile("SeedKeep.tsv", keep_hint='seed') -- confirm.
    expr = merged.series_matrix(keepfile=os.path.join(gse_dir, 'SeedKeep.tsv'))
    return co.COB.from_DataFrame(
        expr,
        'AtSeed',
        'Arabidopsis Seed',
        AtTair10,
        rawtype='MICROARRAY',
        quantile=True,
    )
def AtGen(AtTair10):
    """Return the 'AtGen' COB, building it from GSE SOFT families if missing.

    If ``cf.test.force.COB`` is truthy, the 'AtGen' Expr dataset is deleted
    first so that it gets rebuilt.

    Parameters
    ----------
    AtTair10
        Passed straight through to ``co.COB.from_DataFrame`` as its fourth
        positional argument (presumably the reference genome -- confirm).
    """
    if cf.test.force.COB:
        tools.del_dataset('Expr', 'AtGen', force=True)

    if tools.available_datasets('Expr', 'AtGen'):
        return co.COB('AtGen')

    gse_dir = os.path.join(cf.options.testdir, 'raw', 'GSE')
    accessions = [
        'GSE18975', 'GSE39384', 'GSE19271', 'GSE5632', 'GSE39385',
        'GSE5630', 'GSE15617', 'GSE5617', 'GSE5686', 'GSE2473',
        'GSE5633', 'GSE5620', 'GSE5628', 'GSE5624', 'GSE5626',
        'GSE5621', 'GSE5622', 'GSE5623', 'GSE5625', 'GSE5688',
    ]

    families = []
    for accession in accessions:
        soft_path = os.path.join(gse_dir, '{}_family.soft.gz'.format(accession))
        families.append(co.Family.from_file(soft_path))
    # sum() starts from 0, so this relies on co.Family supporting
    # ``0 + Family`` -- TODO confirm against the Family class.
    merged = sum(families)

    # NOTE(review): the keepfile was presumably generated once with
    # merged.to_keepfile("GenKeep.tsv") -- confirm.
    expr = merged.series_matrix(keepfile=os.path.join(gse_dir, 'GenKeep.tsv'))
    return co.COB.from_DataFrame(
        expr,
        'AtGen',
        'Arab General',
        AtTair10,
        rawtype='MICROARRAY',
        quantile=True,
    )
def AtRoot(AtTair10):
    """Return the 'AtRoot' COB, building it from GSE SOFT families if missing.

    If ``cf.test.force.COB`` is truthy, the 'AtRoot' Expr dataset is deleted
    first so that it gets rebuilt.

    Parameters
    ----------
    AtTair10
        Passed straight through to ``co.COB.from_DataFrame`` as its fourth
        positional argument (presumably the reference genome -- confirm).
    """
    if cf.test.force.COB:
        tools.del_dataset('Expr', 'AtRoot', force=True)

    if tools.available_datasets('Expr', 'AtRoot'):
        return co.COB('AtRoot')

    gse_dir = os.path.join(cf.options.testdir, 'raw', 'GSE')
    accessions = [
        'GSE14578', 'GSE46205', 'GSE7631', 'GSE10576', 'GSE42007',
        'GSE34130', 'GSE21611', 'GSE22966', 'GSE7641', 'GSE5620',
        'GSE8934', 'GSE5628', 'GSE30095', 'GSE30097', 'GSE5624',
        'GSE5626', 'GSE5749', 'GSE5621', 'GSE5622', 'GSE5623',
        'GSE5625', 'GSE5688',
    ]

    families = []
    for accession in accessions:
        soft_path = os.path.join(gse_dir, '{}_family.soft.gz'.format(accession))
        families.append(co.Family.from_file(soft_path))
    # sum() starts from 0, so this relies on co.Family supporting
    # ``0 + Family`` -- TODO confirm against the Family class.
    merged = sum(families)

    # NOTE(review): the keepfile was presumably generated once with
    # merged.to_keepfile("RootKeep.tsv", keep_hint='root') -- confirm.
    expr = merged.series_matrix(keepfile=os.path.join(gse_dir, 'RootKeep.tsv'))
    return co.COB.from_DataFrame(
        expr,
        'AtRoot',
        'Arab Root',
        AtTair10,
        rawtype='MICROARRAY',
        quantile=True,
    )
def cistrans(args):
    """Compare cis and trans co-expression score distributions for a COB.

    Writes ``<args.out>.summary.txt`` (interaction counts and the
    Mann-Whitney U p-value) and ``<args.out>.png`` (overlaid kernel density
    estimates), and echoes the same summary lines to stdout.

    Parameters
    ----------
    args
        Parsed CLI namespace. The attributes read here are ``cob``, ``out``
        and ``cis_distance``. When ``args.out`` is ``None`` it is set in place
        to ``"<cob name>_cistrans"``.
    """
    cob = co.COB(args.cob)
    if args.out is None:
        args.out = "{}_cistrans".format(cob.name)
    # The context manager guarantees the summary file is flushed and closed,
    # even if the density fitting or the statistics below raise.
    with open(args.out + ".summary.txt", "w") as out:
        # np.newaxis adds an empty axis in that position of the slice
        # the sklearn module requires the values to be in the rows:
        # http://scikit-learn.org/stable/auto_examples/neighbors/plot_kde_1d.html
        coex = cob._coex_DataFrame(sig_only=False)
        cis = coex.score[coex.distance <= args.cis_distance].values[:, np.newaxis]
        # Pairs with an infinite distance are the ones treated as trans.
        trans = coex.score[np.isinf(coex.distance)].values[:, np.newaxis]
        X_plot = np.linspace(-10, 10, 1000)[:, np.newaxis]
        summary = "Found {:,} cis interactions and {:,} trans interactions".format(
            cis.shape[0], trans.shape[0])
        print(summary)
        print(summary, file=out)
        # Fit the kernel
        kd = KernelDensity(bandwidth=0.2)
        kd.fit(cis)
        cis_kde = np.exp(kd.score_samples(X_plot))
        plt.fill(X_plot, cis_kde, alpha=0.5, label="Cis Interactions")
        # Fit the trans density on the first 50,000 trans scores only
        kd.fit(trans[0:50000])
        trans_kde = np.exp(kd.score_samples(X_plot))
        plt.fill(X_plot, trans_kde, alpha=0.5, label="Trans Interactions")
        plt.legend()
        plt.title("Cis vs Trans Density: {}".format(cob.name))
        # Calculate the mann whitney U test (on all cis and all trans scores)
        u, pval = sp.stats.mannwhitneyu(cis[:, 0], trans[:, 0])
        print("P-val: {}".format(pval))
        print("P-val: {}".format(pval), file=out)
    print("Figure saved: {}".format(args.out + ".png"))
    plt.savefig(args.out + ".png")
def ZmRNASeqTissueAtlas(Zm5bFGS):
    """Return the 'ZmRNASeqTissueAtlas' COB, building it from the raw table if missing.

    If ``cf.test.force.COB`` is truthy, both the 'COB' and 'Expr' datasets of
    that name are deleted first so that the network gets rebuilt.

    Parameters
    ----------
    Zm5bFGS
        Passed straight through to ``co.COB.from_table`` as its fourth
        positional argument (presumably the reference genome -- confirm).
    """
    dataset = 'ZmRNASeqTissueAtlas'
    if cf.test.force.COB:
        print('Rebuilding ZmRNASeqTissueAtlas')
        for dtype in ('COB', 'Expr'):
            tools.del_dataset(dtype, dataset, force=True)

    if tools.available_datasets('Expr', dataset):
        return co.COB(dataset)

    # Build it
    raw_table = os.path.join(
        cf.options.testdir,
        'raw',
        'Expr',
        'RNASEQ',
        'MaizeRNASeqTissue.tsv.bz2',
    )
    # NOTE(review): dry_run=True is what this version passes; confirm that a
    # dry-run build is really what this fixture is meant to return.
    return co.COB.from_table(
        raw_table,
        dataset,
        'Maize RNASeq Tissue Atlas Network, Sekhon 2013, PLoS ONE',
        Zm5bFGS,
        rawtype='RNASEQ',
        max_gene_missing_data=0.3,
        max_accession_missing_data=0.08,
        min_single_sample_expr=1,
        min_expr=0.001,
        quantile=False,
        max_val=300,
        dry_run=True,
    )
def main():
    """Print, per shared gene, the Pearson correlation of its KLS and KSS scores.

    For every gene present in both the "KLS" and "KSS" networks, the
    co-expression scores against every other shared gene are collected from
    each network and correlated. One tab-separated line (row number, gene,
    correlation) is written to both stdout and stderr per gene.
    """
    # Load both the co-expression networks into memory
    KLS = co.COB("KLS")
    KSS = co.COB("KSS")

    # Only genes present in both networks are compared
    shared = np.intersect1d(KLS._expr.index.values, KSS._expr.index.values)
    n_shared = len(shared)

    # One correlation coefficient per shared gene
    pcc = np.zeros(n_shared)

    for row, gene in enumerate(shared):
        # Fresh score vectors for this gene; the self-pair entry stays 0
        kls_row = np.zeros(n_shared)
        kss_row = np.zeros(n_shared)
        for col, partner in enumerate(shared):
            if col != row:
                kls_row[col] = KLS.coexpression_base(gene, partner)["score"]
                kss_row[col] = KSS.coexpression_base(gene, partner)["score"]

        # The p-value returned alongside the coefficient is not used
        pcc[row], _ = st.pearsonr(kls_row, kss_row)

        line = "%s\t%s\t%s" % (row, gene, pcc[row])
        print(line)
        print(line, file=sys.stderr)
def SNP2Gene_breakdown(self, COB=None):
    '''
    Provides a breakdown of SNP to gene mapping parameters for each
    term in the Overlap. Includes the number of initial Loci, the
    number of collapsed Loci (within a window) and the number of
    candidate genes (within a window and up to a flank limit)

    Parameters
    ----------
    COB : str (default: None)
        If specified, only the rows of ``self.results`` whose ``COB``
        column equals this value are used. If None, all rows of
        ``self.results`` are used.

    Returns
    -------
    pandas.DataFrame
        Indexed by Term (plus a 'Total' row), with (Name, WindowSize,
        FlankLimit) columns for 'GWAS SNPs', 'Collapsed Loci' and
        'Candidate Genes', cast to int.
    '''
    # Get some help
    def bp_to_kb(bp):
        return "{}KB".format(int(bp/1000))

    def get_level(df, level):
        '''
        Returns the level values by name
        '''
        level_index = df.columns.names.index(level)
        return df.columns.levels[level_index]

    # Prepare the data frame results
    if COB is None:
        results = self.results
    else:
        results = self.results.query('COB=="{}"'.format(COB))
    # The ontology and reference genome are taken from the first unique
    # Ontology/COB values in the unfiltered results.
    ont = co.GWAS(self.results.Ontology.unique()[0])
    ref = co.COB(self.results.COB.unique()[0])._parent_refgen
    # Make an aggregate term holding the loci of every term in the ontology
    total = co.Term(
        'total',
        loci=set(chain.from_iterable(x.loci for x in ont.terms()))
    )
    # Calculate number of SNPs
    snps = pd.DataFrame(
        pd.pivot_table(results, index="Term", values='TermLoci')
    )
    snps.columns = pd.MultiIndex.from_product(
        [['GWAS SNPs'], ['-'], ['-']],
        names=['Name', 'WindowSize', 'FlankLimit']
    )
    # .loc replaces the removed DataFrame.ix; all keys here are labels
    snps.loc['Total'] = len(total.loci)
    # Calculate number of Candidate Loci
    loci = pd.pivot_table(
        results, index="Term", columns=['WindowSize'],
        values='TermCollapsedLoci'
    )
    for window_size in loci.columns:
        loci.loc['Total', window_size] = len(
            total.effective_loci(window_size)
        )
    loci.columns = pd.MultiIndex.from_product(
        [['Collapsed Loci'], list(map(bp_to_kb, loci.columns)), ['-']],
        names=['Name', 'WindowSize', 'FlankLimit']
    )
    # Calculate number of Candidate Genes (unique genes per cell)
    genes = pd.pivot_table(
        results, index='Term', columns=['WindowSize', 'FlankLimit'],
        values='gene', aggfunc=lambda x: len(set(x))
    )
    for window_size in get_level(genes, 'WindowSize'):
        for flank_limit in get_level(genes, 'FlankLimit'):
            genes.loc['Total', (window_size, flank_limit)] = len(
                ref.candidate_genes(
                    total.effective_loci(window_size=window_size),
                    flank_limit=flank_limit
                )
            )
    genes.columns = pd.MultiIndex.from_product(
        [['Candidate Genes'],
         list(map(bp_to_kb, get_level(genes, "WindowSize"))),
         get_level(genes, 'FlankLimit')
        ],
        names=['Name', 'WindowSize', 'FlankLimit']
    )
    results = snps.join(loci).join(genes)
    return results.astype(int)
def plot_local_vs_cc(term, filename=None, bootstraps=1):
    """Scatter local degree against clustering coefficient for a term.

    Bootstrapped flanking-gene graphs are drawn first (alpha 0.05), then the
    graph of the term's real flanking genes on top. Nodes whose clustering
    coefficient is NaN are left out of each scatter.

    Parameters
    ----------
    term
        Object providing ``flanking_genes``, ``bootstrap_flanking_genes`` and
        ``id``.
    filename : str, optional
        Where to save the figure; defaults to ``"<term.id>_cc.png"``.
    bootstraps : int
        Number of bootstrapped gene sets to plot.
    """
    # Open the network once and reuse it. The empirical graph previously went
    # through a bare ``COB`` name instead of ``co.COB`` like everything else.
    root = co.COB('ROOT')
    RZM = root.refgen  # use root specific for bootstraps

    def scatter_graph(graph, **scatter_kwargs):
        # Plot degree vs. weighted local transitivity, skipping NaN nodes
        degree = np.array(graph.degree())
        cc = np.array(graph.transitivity_local_undirected(weights='weight'))
        nan_mask = np.isnan(cc)
        pylab.scatter(degree[~nan_mask], cc[~nan_mask], **scatter_kwargs)

    pylab.clf()
    for _ in range(bootstraps):
        scatter_graph(
            root.graph(term.bootstrap_flanking_genes(RZM)), alpha=0.05
        )
    # plot empirical
    scatter_graph(root.graph(term.flanking_genes(RZM)))
    pylab.xlabel('Local Degree')
    pylab.ylabel('Clustering Coefficient')
    if filename is None:
        filename = "{}_cc.png".format(term.id)
    pylab.savefig(filename)
def main():
    """Print the degree table of the network named on the command line.

    Emits one tab-separated line per row of the network's ``degree`` table:
    a running row number, the row's index label and its "Degree" value.
    Calls ``parser.error`` when no network name was given.
    """
    options, _ = parser.parse_args()
    if not options.network_name:
        parser.error("Must specify name of network")

    degree_table = co.COB(options.network_name).degree
    for position, (label, record) in enumerate(degree_table.iterrows()):
        print("%s\t%s\t%s" % (position, label, record["Degree"]))
def plot_local_global_degree(term, filename=None, bootstraps=1):
    """Scatter local against global degree for a term's flanking genes.

    Bootstrapped neighborhoods are drawn first (alpha 0.05) and the term's
    real neighborhood is drawn on top in red.

    Parameters
    ----------
    term
        Object providing ``flanking_genes``, ``bootstrap_flanking_genes`` and
        ``id``.
    filename : str, optional
        Where to save the figure; defaults to ``"<term.id>_locality.png"``.
    bootstraps : int
        Number of bootstrapped gene sets to pool into the background.
    """
    root_cob = co.COB("ROOT")
    refgen = root_cob.refgen  # use root specific for bootstraps

    observed = root_cob.neighborhood(term.flanking_genes(refgen))

    resampled = []
    for _ in range(bootstraps):
        resampled.append(
            root_cob.neighborhood(term.bootstrap_flanking_genes(refgen))
        )
    background = pd.concat(resampled)

    pylab.clf()
    pylab.scatter(background['local'], background['global'], alpha=0.05)
    pylab.scatter(observed['local'], observed['global'], c='r')
    pylab.xlabel('Local Degree')
    pylab.ylabel('Global Degree')
    pylab.title('{} Locality'.format(term.id))

    target = "{}_locality.png".format(term.id) if filename is None else filename
    pylab.savefig(target)
def ZmSAM(Zm5bFGS):
    """Return the 'ZmSAM' COB, building it from the raw table when missing.

    If ``cf.test.force.COB`` is truthy, the 'ZmSAM' Expr dataset is deleted
    first so that it gets rebuilt.

    Parameters
    ----------
    Zm5bFGS
        Passed straight through to ``co.COB.from_table`` as its fourth
        positional argument (presumably the reference genome -- confirm).
    """
    dataset = 'ZmSAM'
    if cf.test.force.COB:
        tools.del_dataset('Expr', dataset, force=True)

    if tools.available_datasets('Expr', dataset):
        return co.COB(dataset)

    raw_table = os.path.join(
        cf.options.testdir,
        'raw',
        'Expr',
        'RNASEQ',
        'TranscriptomeProfiling_B73_Atlas_SAM_FGS_LiLin_20140316.txt.gz',
    )
    return co.COB.from_table(
        raw_table,
        dataset,
        'Maize Root Network',
        Zm5bFGS,
        rawtype='RNASEQ',
        max_gene_missing_data=0.4,
        min_expr=0.1,
        quantile=False,
        dry_run=False,
        max_val=250,
    )
def ZmRoot(Zm5bFGS):
    """Return the 'ZmRoot' COB, building it from the raw table when missing.

    If ``cf.test.force.COB`` is truthy, the 'ZmRoot' Expr dataset is deleted
    first so that it gets rebuilt.

    Parameters
    ----------
    Zm5bFGS
        Passed straight through to ``co.COB.from_table`` as its fourth
        positional argument (presumably the reference genome -- confirm).
    """
    dataset = 'ZmRoot'
    if cf.test.force.COB:
        tools.del_dataset('Expr', dataset, force=True)

    if tools.available_datasets('Expr', dataset):
        return co.COB(dataset)

    raw_table = os.path.join(
        cf.options.testdir, 'raw', 'Expr', 'RNASEQ', 'ROOTFPKM.tsv.gz'
    )
    return co.COB.from_table(
        raw_table,
        dataset,
        'Maize Root Network',
        Zm5bFGS,
        rawtype='RNASEQ',
        max_gene_missing_data=0.3,
        max_accession_missing_data=0.08,
        min_single_sample_expr=1,
        min_expr=0.001,
        quantile=False,
        max_val=300,
    )
def ZmPAN(Zm5bFGS):
    """Return the 'ZmPAN' COB, building it from the raw table when missing.

    If ``cf.test.force.COB`` is truthy, the 'ZmPAN' Expr dataset is deleted
    first so that it gets rebuilt. The raw table is read with ``sep=','``.

    Parameters
    ----------
    Zm5bFGS
        Passed straight through to ``co.COB.from_table`` as its fourth
        positional argument (presumably the reference genome -- confirm).
    """
    dataset = 'ZmPAN'
    if cf.test.force.COB:
        tools.del_dataset('Expr', dataset, force=True)

    if tools.available_datasets('Expr', dataset):
        return co.COB(dataset)

    raw_table = os.path.join(
        cf.options.testdir, 'raw', 'Expr', 'RNASEQ', 'PANGenomeFPKM.txt.gz'
    )
    return co.COB.from_table(
        raw_table,
        dataset,
        'Maize Root Network',
        Zm5bFGS,
        rawtype='RNASEQ',
        max_gene_missing_data=0.4,
        min_expr=1,
        quantile=False,
        dry_run=False,
        sep=',',
        max_val=300,
    )
def ZmRoot(Zm5bFGS):
    """Return the "ZmRoot" COB, building it from the raw table when missing.

    If ``cf.test.force.COB`` is truthy, the "ZmRoot" Expr dataset is deleted
    first so that it gets rebuilt.

    Parameters
    ----------
    Zm5bFGS
        Passed straight through to ``co.COB.from_table`` as its fourth
        positional argument (presumably the reference genome -- confirm).
    """
    dataset = "ZmRoot"
    if cf.test.force.COB:
        tools.del_dataset("Expr", dataset, force=True)

    already_built = tools.available_datasets("Expr", dataset)
    if already_built:
        return co.COB(dataset)

    raw_table = os.path.join(
        cf.options.testdir, "raw", "Expr", "RNASEQ", "ROOTFPKM.tsv.gz"
    )
    return co.COB.from_table(
        raw_table,
        dataset,
        "Maize Root Network",
        Zm5bFGS,
        rawtype="RNASEQ",
        max_gene_missing_data=0.3,
        max_accession_missing_data=0.08,
        min_single_sample_expr=1,
        min_expr=0.001,
        quantile=False,
        max_val=300,
    )
def ZmPAN(Zm5bFGS):
    """Return the "ZmPAN" COB, building it from the raw table when missing.

    If ``cf.test.force.COB`` is truthy, the "ZmPAN" Expr dataset is deleted
    first so that it gets rebuilt. The raw table is read with ``sep=","``.

    Parameters
    ----------
    Zm5bFGS
        Passed straight through to ``co.COB.from_table`` as its fourth
        positional argument (presumably the reference genome -- confirm).
    """
    dataset = "ZmPAN"
    if cf.test.force.COB:
        tools.del_dataset("Expr", dataset, force=True)

    already_built = tools.available_datasets("Expr", dataset)
    if already_built:
        return co.COB(dataset)

    raw_table = os.path.join(
        cf.options.testdir, "raw", "Expr", "RNASEQ", "PANGenomeFPKM.txt.gz"
    )
    return co.COB.from_table(
        raw_table,
        dataset,
        "Maize Root Network",
        Zm5bFGS,
        rawtype="RNASEQ",
        max_gene_missing_data=0.4,
        min_expr=1,
        quantile=False,
        dry_run=False,
        sep=",",
        max_val=300,
    )
def AtLeaf(AtTair10):
    """Return the 'AtLeaf' COB, building it from GSE SOFT families if missing.

    If ``cf.test.force.COB`` is truthy, the 'AtLeaf' Expr dataset is deleted
    first so that it gets rebuilt.

    Parameters
    ----------
    AtTair10
        Passed straight through to ``co.COB.from_DataFrame`` as its fourth
        positional argument (presumably the reference genome -- confirm).
    """
    if cf.test.force.COB:
        tools.del_dataset('Expr', 'AtLeaf', force=True)

    if tools.available_datasets('Expr', 'AtLeaf'):
        return co.COB('AtLeaf')

    gse_dir = os.path.join(cf.options.testdir, 'raw', 'GSE')
    # 'GSE26199' is intentionally not part of this list (it was commented out).
    accessions = [
        'GSE14578', 'GSE5630', 'GSE13739', 'GSE5686', 'GSE5615',
        'GSE5620', 'GSE5628', 'GSE5624', 'GSE5626', 'GSE5621',
        'GSE5622', 'GSE5623', 'GSE5625', 'GSE5688',
    ]

    families = []
    for accession in accessions:
        soft_path = os.path.join(gse_dir, '{}_family.soft.gz'.format(accession))
        families.append(co.Family.from_file(soft_path))
    # sum() starts from 0, so this relies on co.Family supporting
    # ``0 + Family`` -- TODO confirm against the Family class.
    merged = sum(families)

    # NOTE(review): the keepfile was presumably generated once with
    # merged.to_keepfile("LeafKeep.tsv", keep_hint="lea") -- confirm.
    expr = merged.series_matrix(keepfile=os.path.join(gse_dir, 'LeafKeep.tsv'))
    return co.COB.from_DataFrame(
        expr,
        'AtLeaf',
        'Arabidopsis Leaf',
        AtTair10,
        rawtype='MICROARRAY',
        max_gene_missing_data=0.3,
        min_expr=0.01,
        quantile=True,
    )
def ZmRNASeqTissueAtlas(Zm5bFGS):
    """Return the "ZmRNASeqTissueAtlas" COB, building it from the raw table if missing.

    If ``cf.test.force.COB`` is truthy, both the "COB" and "Expr" datasets of
    that name are deleted first so that the network gets rebuilt.

    Parameters
    ----------
    Zm5bFGS
        Passed straight through to ``co.COB.from_table`` as its fourth
        positional argument (presumably the reference genome -- confirm).
    """
    dataset = "ZmRNASeqTissueAtlas"
    if cf.test.force.COB:
        print("Rebuilding ZmRNASeqTissueAtlas")
        for dtype in ("COB", "Expr"):
            tools.del_dataset(dtype, dataset, force=True)

    if tools.available_datasets("Expr", dataset):
        return co.COB(dataset)

    # Build it
    raw_table = os.path.join(
        cf.options.testdir, "raw", "Expr", "RNASEQ", "MaizeRNASeqTissue.tsv.bz2"
    )
    return co.COB.from_table(
        raw_table,
        dataset,
        "Maize RNASeq Tissue Atlas Network, Sekhon 2013, PLoS ONE",
        Zm5bFGS,
        rawtype="RNASEQ",
        max_gene_missing_data=0.3,
        max_accession_missing_data=0.08,
        min_single_sample_expr=1,
        min_expr=0.001,
        quantile=False,
        max_val=300,
        dry_run=False,
    )
def from_CLI(cls, args):
    """
    Implements an interface for the CLI to perform overlap Analysis

    The overlap source is picked from the first of ``args.genes``,
    ``args.go``, ``args.gwas`` and ``args.ontology`` that was supplied.
    For every selected term the overlap, bootstraps, z-scores and FDR are
    computed. Unless ``args.dry_run`` is set, the collated results are
    written to ``self.args.out`` as TSV and appended to the "overlap"
    table of the overlap object's SQLite database.

    Parameters
    ----------
    args
        Parsed CLI namespace. ``candidate_window_size``,
        ``candidate_flank_limit`` and ``genes`` may be rewritten in place.

    Returns
    -------
    None
        Also returns early (silently) when ``generate_output_name``
        raises ``ValueError``.

    Raises
    ------
    ValueError
        If none of the supported sources was supplied.
    """
    # Decide on the source up front. Without the final else, a missing
    # source left ``source`` unbound and crashed before the intended
    # ValueError could ever be raised.
    if args.genes != [None]:
        source = "genes"
    elif args.go is not None:
        source = "go"
    elif args.gwas is not None:
        source = "gwas"
    elif args.ontology is not None:
        source = 'ontology'
    else:
        raise ValueError(
            "Please provide a valid overlap source (--genes, --go or --gwas)"
        )
    self = cls.create(source+'_CLI', description="CLI Overlap")
    self.source = source
    self.args = args
    # Build base camoco objects
    self.cob = co.COB(args.cob)
    # Generate the ontology of terms that we are going to look
    # at the overlap of
    if source == "genes":
        # Be smart about this: accept comma, semicolon or space separators
        import re
        args.genes = list(chain(*[re.split("[,; ]", x) for x in args.genes]))
        self.ont = pd.DataFrame()
        self.ont.name = "GeneList"
        args.candidate_window_size = 1
        args.candidate_flank_limit = 0
    elif source == "go":
        self.ont = co.GOnt(args.go)
        args.candidate_window_size = 1
        args.candidate_flank_limit = 0
    elif source == "gwas":
        self.ont = co.GWAS(args.gwas)
    else:
        # source == 'ontology' is the only remaining possibility
        self.ont = co.Ontology(args.ontology)
    try:
        self.generate_output_name()
    except ValueError:
        # NOTE(review): presumably raised when the output already exists;
        # the run is abandoned quietly, as before -- confirm.
        return
    # Save strongest description arguments if applicable
    if "strongest" in self.args.snp2gene:
        if not (self.ont._global("strongest_attr") == args.strongest_attr):
            self.ont.set_strongest(attr=args.strongest_attr)
        if not (
            bool(int(self.ont._global("strongest_higher")))
            == bool(args.strongest_higher)
        ):
            self.ont.set_strongest(higher=args.strongest_higher)
    # Generate a terms iterable
    if self.source == "genes":
        # make a single term
        loci = self.cob.refgen.from_ids(self.args.genes)
        if len(loci) < len(self.args.genes):
            self.cob.log("Some input genes not in network")
        terms = [Term("CustomTerm", desc="Custom from CLI", loci=loci)]
    else:
        # Generate terms from the ontology
        if "all" in self.args.terms:
            terms = list(self.ont.iter_terms())
        else:
            terms = [self.ont[term] for term in self.args.terms]
    results = []
    num_total_terms = len(terms)
    # Iterate through terms and calculate
    for i, term in enumerate(terms):
        self.cob.log(
            " ---------- Calculating overlap for {} of {} Terms",
            i, num_total_terms
        )
        if term.id in self.args.skip_terms:
            self.cob.log("Skipping {} since it was in --skip-terms", term.id)
            # Actually skip it; previously the term was still processed.
            continue
        self.cob.log("Generating SNP-to-gene mapping")
        # If appropriate, generate SNP2Gene Loci
        if self.args.candidate_flank_limit > 0:
            loci = self.snp2gene(term, self.ont)
        else:
            loci = list(term.loci)
            for x in loci:
                x.window = 1
        # Filter out terms with insufficient or too many genes
        if len(loci) < 2 or len(loci) < args.min_term_size:
            self.cob.log("Not enough genes to perform overlap")
            continue
        if args.max_term_size is not None and len(loci) > args.max_term_size:
            self.cob.log("Too many genes to perform overlap")
            continue
        # Send some output to the terminal
        self.cob.log(
            "Calculating Overlap for {} of {} in {} with window:{} and flank:{} ({} Loci)",
            term.id,
            self.ont.name,
            self.cob.name,
            self.args.candidate_window_size,
            self.args.candidate_flank_limit,
            len(loci),
        )
        if args.dry_run:
            continue
        # Do the dirty
        try:
            overlap = self.overlap(loci)
        except DataError:
            continue
        self.cob.log("Generating bootstraps")
        bootstraps = self.generate_bootstraps(loci, overlap)
        # Mean of the per-iteration means / standard deviations
        bs_mean = bootstraps.groupby("iter").score.apply(np.mean).mean()
        bs_std = bootstraps.groupby("iter").score.apply(np.std).mean()
        # Calculate z scores for density
        self.cob.log("Calculating Z-Scores")
        if bs_std != 0:
            overlap["zscore"] = (overlap.score - bs_mean) / bs_std
            bootstraps["zscore"] = (bootstraps.score - bs_mean) / bs_std
        else:
            # If there is no variation, make all Z-scores 0
            overlap["zscore"] = bootstraps["zscore"] = 0
        # Calculate FDR
        self.cob.log("Calculating FDR")
        overlap["fdr"] = np.nan
        max_zscore = int(overlap.zscore.max()) + 1
        for zscore in np.arange(0, max_zscore, 0.25):
            # Average number of bootstrap genes at or above this cutoff
            num_random = (
                bootstraps.groupby("iter")
                .apply(lambda df: sum(df.zscore >= zscore))
                .mean()
            )
            num_real = sum(overlap.zscore >= zscore)
            # Calculate FDR
            if num_real != 0 and num_random != 0:
                fdr = num_random / num_real
            elif num_real != 0 and num_random == 0:
                fdr = 0
            else:
                fdr = 1
            # Rows passing higher cutoffs are overwritten on later passes
            passing = overlap.zscore >= zscore
            overlap.loc[passing, "fdr"] = fdr
            overlap.loc[passing, "num_real"] = num_real
            overlap.loc[passing, "num_random"] = num_random
            overlap.loc[passing, "bs_mean"] = bs_mean
            overlap.loc[passing, "bs_std"] = bs_std
            overlap.sort_values(by=["zscore"], ascending=False, inplace=True)
        # Fraction of bootstrap iterations whose mean score is at least
        # the observed mean score
        overlap_pval = (
            sum(
                bootstraps.groupby("iter").apply(lambda x: x.score.mean())
                >= overlap.score.mean()
            )
        ) / len(bootstraps.iter.unique())
        # This gets collated into the results below
        overlap["COB"] = self.cob.name
        overlap["Ontology"] = self.ont.name
        overlap["Term"] = term.id
        overlap["WindowSize"] = self.args.candidate_window_size
        overlap["FlankLimit"] = self.args.candidate_flank_limit
        overlap["TermLoci"] = len(term.loci)
        overlap["TermCollapsedLoci"] = len(loci)
        overlap["TermPValue"] = overlap_pval
        overlap["NumBootstraps"] = len(bootstraps.iter.unique())
        overlap["Method"] = self.args.method
        overlap["SNP2Gene"] = self.args.snp2gene
        results.append(overlap.reset_index())
        # Summarize results
        if self.args.method == "density":
            overlap_score = np.nanmean(overlap.score) / (
                1 / np.sqrt(overlap.num_trans_edges.mean())
            )
        elif self.args.method == "locality":
            overlap_score = np.nanmean(overlap.score)
        self.cob.log(
            "Overlap Score ({}): {} (p<{})".format(
                self.args.method, overlap_score, overlap_pval
            )
        )
    if not args.dry_run:
        # Consolidate results and output to files
        self.results = pd.concat(results)
        self.results.to_csv(self.args.out, sep="\t", index=None)
        # Make an actual results object if not exists
        overlap_object = cls.create(self.ont)
        # Save the results to the SQLite table; close the connection
        # explicitly so the handle is not leaked.
        connection = sqlite3.connect(overlap_object.db.filename)
        try:
            self.results.to_sql(
                "overlap",
                connection,
                if_exists="append",
                index=False,
            )
        finally:
            connection.close()
def from_CLI(cls, args):
    '''
    Implements an interface for the CLI to perform overlap Analysis

    The overlap source is picked from the first of ``args.genes``,
    ``args.go`` and ``args.gwas`` that was supplied. For every selected
    term the overlap, bootstraps, z-scores and FDR are computed. Unless
    ``args.dry_run`` is set, the collated results are written to
    ``self.args.out`` as TSV and appended to the 'overlap' table of the
    overlap object's SQLite database.

    Parameters
    ----------
    args
        Parsed CLI namespace. ``candidate_window_size``,
        ``candidate_flank_limit`` and ``genes`` may be rewritten in place.

    Returns
    -------
    None
        Also returns early (silently) when ``generate_output_name``
        raises ``ValueError``.

    Raises
    ------
    ValueError
        If none of the supported sources was supplied.
    '''
    # Decide on the source up front. Without the final else, a missing
    # source left ``source`` unbound and crashed before the intended
    # ValueError could ever be raised.
    if args.genes != [None]:
        source = 'genes'
    elif args.go is not None:
        source = 'go'
    elif args.gwas is not None:
        source = 'gwas'
    else:
        raise ValueError(
            'Please provide a valid overlap source (--genes, --go or --gwas)'
        )
    self = cls.create(source, description='CLI Overlap')
    self.source = source
    self.args = args
    # Build base camoco objects
    self.cob = co.COB(args.cob)
    # Generate the ontology of terms that we are going to look
    # at the overlap of
    if source == 'genes':
        # Be smart about this: accept comma, semicolon or space separators
        import re
        args.genes = list(
            chain(*[re.split('[,; ]', x) for x in args.genes]))
        self.ont = pd.DataFrame()
        self.ont.name = 'GeneList'
        args.candidate_window_size = 1
        args.candidate_flank_limit = 0
    elif source == 'go':
        self.ont = co.GOnt(args.go)
        args.candidate_window_size = 1
        args.candidate_flank_limit = 0
    else:
        # source == 'gwas' is the only remaining possibility
        self.ont = co.GWAS(args.gwas)
    try:
        self.generate_output_name()
    except ValueError:
        # NOTE(review): presumably raised when the output already exists;
        # the run is abandoned quietly, as before -- confirm.
        return
    # Save strongest description arguments if applicable
    if 'strongest' in self.args.snp2gene:
        if not (self.ont._global('strongest_attr') == args.strongest_attr):
            self.ont.set_strongest(attr=args.strongest_attr)
        if not (bool(int(self.ont._global('strongest_higher'))) == bool(
                args.strongest_higher)):
            self.ont.set_strongest(higher=args.strongest_higher)
    # Generate a terms iterable
    if self.source == 'genes':
        # make a single term
        loci = self.cob.refgen.from_ids(self.args.genes)
        if len(loci) < len(self.args.genes):
            self.cob.log('Some input genes not in network')
        terms = [Term('CustomTerm', desc='Custom from CLI', loci=loci)]
    else:
        # Generate terms from the ontology
        if 'all' in self.args.terms:
            terms = list(self.ont.iter_terms())
        else:
            terms = [self.ont[term] for term in self.args.terms]
    results = []
    num_total_terms = len(terms)
    # Iterate through terms and calculate
    for i, term in enumerate(terms):
        self.cob.log(' ---------- Calculating overlap for {} of {} Terms',
                     i, num_total_terms)
        if term.id in self.args.skip_terms:
            self.cob.log('Skipping {} since it was in --skip-terms',
                         term.id)
            # Actually skip it; previously the term was still processed.
            continue
        self.cob.log('Generating SNP-to-gene mapping')
        # If appropriate, generate SNP2Gene Loci
        if self.args.candidate_flank_limit > 0:
            loci = self.snp2gene(term, self.ont)
        else:
            loci = list(term.loci)
            for x in loci:
                x.window = 1
        # Filter out terms with insufficient or too many genes
        if len(loci) < 2 or len(loci) < args.min_term_size:
            self.cob.log('Not enough genes to perform overlap')
            continue
        if args.max_term_size is not None and len(loci) > args.max_term_size:
            self.cob.log('Too many genes to perform overlap')
            continue
        # Send some output to the terminal
        self.cob.log(
            "Calculating Overlap for {} of {} in {} with window:{} and flank:{} ({} Loci)",
            term.id, self.ont.name, self.cob.name,
            self.args.candidate_window_size,
            self.args.candidate_flank_limit, len(loci))
        if args.dry_run:
            continue
        # Do the dirty
        try:
            overlap = self.overlap(loci)
        except DataError:
            continue
        self.cob.log('Generating bootstraps')
        bootstraps = self.generate_bootstraps(loci, overlap)
        # Mean of the per-iteration means / standard deviations
        bs_mean = bootstraps.groupby('iter').score.apply(np.mean).mean()
        bs_std = bootstraps.groupby('iter').score.apply(np.std).mean()
        # Calculate z scores for density
        self.cob.log('Calculating Z-Scores')
        if bs_std != 0:
            overlap['zscore'] = (overlap.score - bs_mean) / bs_std
            bootstraps['zscore'] = (bootstraps.score - bs_mean) / bs_std
        else:
            # If there is no variation, make all Z-scores 0
            overlap['zscore'] = bootstraps['zscore'] = 0
        # Calculate FDR
        self.cob.log('Calculating FDR')
        overlap['fdr'] = np.nan
        max_zscore = int(overlap.zscore.max()) + 1
        for zscore in np.arange(0, max_zscore, 0.25):
            # Average number of bootstrap genes at or above this cutoff
            num_random = bootstraps\
                .groupby('iter')\
                .apply(lambda df: sum(df.zscore >= zscore))\
                .mean()
            num_real = sum(overlap.zscore >= zscore)
            # Calculate FDR
            if num_real != 0 and num_random != 0:
                fdr = num_random / num_real
            elif num_real != 0 and num_random == 0:
                fdr = 0
            else:
                fdr = 1
            # Rows passing higher cutoffs are overwritten on later passes
            passing = overlap.zscore >= zscore
            overlap.loc[passing, 'fdr'] = fdr
            overlap.loc[passing, 'num_real'] = num_real
            overlap.loc[passing, 'num_random'] = num_random
            overlap.loc[passing, 'bs_mean'] = bs_mean
            overlap.loc[passing, 'bs_std'] = bs_std
            overlap.sort_values(by=['zscore'], ascending=False, inplace=True)
        # Fraction of bootstrap iterations whose mean score is at least
        # the observed mean score
        overlap_pval = (
            (sum(bootstraps.groupby('iter').apply(lambda x: x.score.mean())
                 >= overlap.score.mean()))
            / len(bootstraps.iter.unique())
        )
        # This gets collated into the results below
        overlap['COB'] = self.cob.name
        overlap['Ontology'] = self.ont.name
        overlap['Term'] = term.id
        overlap['WindowSize'] = self.args.candidate_window_size
        overlap['FlankLimit'] = self.args.candidate_flank_limit
        overlap['TermLoci'] = len(term.loci)
        overlap['TermCollapsedLoci'] = len(loci)
        overlap['TermPValue'] = overlap_pval
        overlap['NumBootstraps'] = len(bootstraps.iter.unique())
        overlap['Method'] = self.args.method
        overlap['SNP2Gene'] = self.args.snp2gene
        results.append(overlap.reset_index())
        # Summarize results
        if self.args.method == 'density':
            overlap_score = np.nanmean(overlap.score) / (
                1 / np.sqrt(overlap.num_trans_edges.mean()))
        elif self.args.method == 'locality':
            overlap_score = np.nanmean(overlap.score)
        self.cob.log('Overlap Score ({}): {} (p<{})'.format(
            self.args.method, overlap_score, overlap_pval))
    if not args.dry_run:
        # Consolidate results and output to files
        self.results = pd.concat(results)
        self.results.to_csv(self.args.out, sep='\t', index=None)
        # Make an actual results object if not exists
        overlap_object = cls.create(self.ont)
        # Save the results to the SQLite table; close the connection
        # explicitly so the handle is not leaked.
        connection = sqlite3.connect(overlap_object.db.filename)
        try:
            self.results.to_sql('overlap',
                                connection,
                                if_exists='append',
                                index=False)
        finally:
            connection.close()
def from_CLI(cls, args):
    '''
    Implements an interface for the CLI to perform overlap Analysis

    The ontology is a ``co.GOnt`` when ``args.go`` is truthy and a
    ``co.GWAS`` otherwise; both are opened by the name in ``args.gwas``.
    For every selected term the overlap, bootstraps, z-scores and FDR are
    computed. Unless ``args.dry_run`` is set, the collated results are
    written to ``self.args.out`` as TSV and appended to the 'overlap'
    table of the overlap object's SQLite database.

    Parameters
    ----------
    args
        Parsed CLI namespace. ``candidate_window_size`` and
        ``candidate_flank_limit`` are rewritten in place for GO input.

    Returns
    -------
    None
    '''
    self = cls.create(args.gwas, description='CLI Overlap')
    # Build base camoco objects
    self.args = args
    self.cob = co.COB(args.cob)
    if args.go:
        self.ont = co.GOnt(args.gwas)
        args.candidate_window_size = 1
        args.candidate_flank_limit = 0
    else:
        self.ont = co.GWAS(args.gwas)
    self.generate_output_name()
    # Generate a terms iterable
    if 'all' in self.args.terms:
        terms = self.ont.iter_terms()
    else:
        terms = [self.ont[term] for term in self.args.terms]
    results = []
    # Iterate through terms and calculate
    for term in terms:
        if term.id in self.args.skip_terms:
            self.cob.log('Skipping {} since it was in --skip-terms',
                         term.id)
            # Actually skip it; previously the term was still processed.
            continue
        # Generate SNP2Gene Loci
        loci = self.snp2gene(term)
        if len(loci) < 2 or len(loci) < args.min_term_size:
            self.cob.log('Not enough genes to perform overlap')
            continue
        if args.max_term_size is not None and len(loci) > args.max_term_size:
            self.cob.log('Too many genes to perform overlap')
            continue
        # Send some output to the terminal
        self.cob.log(
            "Calculating Overlap for {} of {} in {} with window:{} and flank:{} ({} Loci)",
            term.id, self.ont.name, self.cob.name,
            self.args.candidate_window_size,
            self.args.candidate_flank_limit, len(loci))
        if args.dry_run:
            continue
        # Do the dirty
        try:
            overlap = self.overlap(loci)
        except DataError:
            continue
        bootstraps = self.generate_bootstraps(loci, overlap)
        # Mean of the per-iteration means / standard deviations
        bs_mean = bootstraps.groupby('iter').score.apply(np.mean).mean()
        bs_std = bootstraps.groupby('iter').score.apply(np.std).mean()
        # Calculate z scores for density
        if bs_std != 0:
            overlap['zscore'] = (overlap.score - bs_mean) / bs_std
            bootstraps['zscore'] = (bootstraps.score - bs_mean) / bs_std
        else:
            # If there is no variation, make all Z-scores 0
            overlap['zscore'] = bootstraps['zscore'] = 0
        # Calculate FDR
        overlap['fdr'] = np.nan
        max_zscore = int(overlap.zscore.max()) + 1
        for zscore in np.arange(0, max_zscore, 0.25):
            # Average number of bootstrap genes at or above this cutoff
            num_random = bootstraps\
                .groupby('iter')\
                .apply(lambda df: sum(df.zscore >= zscore))\
                .mean()
            num_real = sum(overlap.zscore >= zscore)
            # Calculate FDR
            if num_real != 0 and num_random != 0:
                fdr = num_random / num_real
            elif num_real != 0 and num_random == 0:
                fdr = 0
            else:
                fdr = 1
            # Rows passing higher cutoffs are overwritten on later passes
            passing = overlap.zscore >= zscore
            overlap.loc[passing, 'fdr'] = fdr
            overlap.loc[passing, 'num_real'] = num_real
            overlap.loc[passing, 'num_random'] = num_random
            overlap.loc[passing, 'bs_mean'] = bs_mean
            overlap.loc[passing, 'bs_std'] = bs_std
            overlap.sort_values(by=['zscore'], ascending=False, inplace=True)
        # Fraction of bootstrap iterations whose mean score is at least
        # the observed mean score
        overlap_pval = (
            (sum(bootstraps.groupby('iter').apply(lambda x: x.score.mean())
                 >= overlap.score.mean()))
            / len(bootstraps.iter.unique())
        )
        # This gets collated into the results below
        overlap['COB'] = self.cob.name
        overlap['Ontology'] = self.ont.name
        overlap['Term'] = term.id
        overlap['WindowSize'] = self.args.candidate_window_size
        overlap['FlankLimit'] = self.args.candidate_flank_limit
        overlap['TermLoci'] = len(term.loci)
        overlap['TermCollapsedLoci'] = len(loci)
        overlap['TermPValue'] = overlap_pval
        overlap['NumBootstraps'] = len(bootstraps.iter.unique())
        overlap['Method'] = self.args.method
        overlap['SNP2Gene'] = self.args.snp2gene
        results.append(overlap.reset_index())
    if not args.dry_run:
        self.results = pd.concat(results)
        self.results.to_csv(self.args.out, sep='\t', index=None)
        overlap_object = cls.create(self.ont)
        # NOTE(review): this stores the list of per-term frames, not the
        # concatenated DataFrame kept on self.results -- confirm intent.
        overlap_object.results = results
        # Close the connection explicitly so the handle is not leaked.
        connection = sqlite3.connect(overlap_object.db.filename)
        try:
            self.results.to_sql('overlap',
                                connection,
                                if_exists='append',
                                index=False)
        finally:
            connection.close()
def snp2gene(args):
    '''
        Perform SNP (locus) to candidate gene mapping

        For every term, window size and flank limit combination the
        term's loci are collapsed ('effective' or 'strongest' mapping),
        candidate genes are looked up in the reference genome, optional
        gene info tables are left-joined on their shared columns, and the
        result is written tab separated to ``args.out``. Summary
        statistics are printed to stdout.

        Parameters
        ----------
        args : argparse-style namespace
            Attributes read here: out, force, refgen, gwas, terms,
            candidate_window_size (iterable), candidate_flank_limit
            (iterable), snp2gene, strongest_attr, strongest_higher,
            include_parent_attrs, gene_info (iterable of file names).

        Returns
        -------
        None

        Raises
        ------
        ValueError
            If ``args.refgen`` names neither an available 'Expr' nor
            'RefGen' dataset, or ``args.snp2gene`` contains neither
            'effective' nor 'strongest'.
    '''
    if args.out != sys.stdout:
        # Create any non-existant directories
        out_dir = os.path.dirname(args.out)
        if out_dir != '':
            os.makedirs(out_dir, exist_ok=True)
        if os.path.exists(args.out) and not args.force:
            print(
                "Output for {} exists! Skipping!".format(
                    args.out
                ), file=sys.stderr
            )
            return None

    # Set a flag saying this is from a COB refgen
    from_cob = False
    # Create the refgen (option to create it from a COB)
    if co.Tools.available_datasets('Expr', args.refgen):
        refgen = co.COB(args.refgen).refgen
        from_cob = args.refgen
    elif co.Tools.available_datasets('RefGen', args.refgen):
        refgen = co.RefGen(args.refgen)
    else:
        # Previously fell through and failed later with a NameError
        raise ValueError(
            '{} is not an available COB or RefGen dataset'.format(args.refgen)
        )
    # Create the GWAS object
    ont = co.GWAS(args.gwas)

    if 'all' in args.terms:
        terms = ont.iter_terms()
    else:
        terms = [ont[term] for term in args.terms]

    # Collect one frame per (term, window, flank) and concatenate once
    frames = []
    for term in terms:
        for window_size in args.candidate_window_size:
            for flank_limit in args.candidate_flank_limit:
                if 'effective' in args.snp2gene:
                    # Map to effective
                    effective_loci = term.effective_loci(
                        window_size=window_size
                    )
                elif 'strongest' in args.snp2gene:
                    effective_loci = term.strongest_loci(
                        window_size=window_size,
                        attr=args.strongest_attr,
                        lowest=args.strongest_higher
                    )
                else:
                    raise ValueError('{} not valid snp2gene mapping'.format(
                        args.snp2gene))
                genes = pd.DataFrame([
                    x.as_dict() for x in
                    refgen.candidate_genes(
                        effective_loci,
                        flank_limit=flank_limit,
                        include_parent_locus=True,
                        include_num_siblings=True,
                        include_num_intervening=True,
                        include_rank_intervening=True,
                        include_SNP_distance=True,
                        include_parent_attrs=args.include_parent_attrs,
                        attrs={'Term': term.id},
                    )
                ])
                genes['FlankLimit'] = flank_limit
                genes['WindowSize'] = window_size
                genes['RefGen'] = refgen.name
                if from_cob is not False:
                    genes['COB'] = from_cob
                frames.append(genes)
    data = pd.concat(frames) if frames else pd.DataFrame()

    # Add data from gene info files
    original_number_genes = len(data)
    for info_file in args.gene_info:
        log('Adding info for {}', info_file)
        # Assume the file is a table, fall back to comma separated when
        # tab parsing yields a single column
        info = pd.read_table(info_file, sep='\t')
        if len(info.columns) == 1:
            info = pd.read_table(info_file, sep=',')
        # try to match as many columns as possible
        matching_columns = set(data.columns).intersection(info.columns)
        log("Joining SNP2Gene mappings with info file on: {}",
            ','.join(sorted(matching_columns)))
        # With no `on=` pandas joins on the shared columns
        data = pd.merge(data, info, how='left')
        if len(data) != original_number_genes:
            log.warn(
                'There were multiple info rows for some genes. '
                'Beware of potential duplicate candidate gene entries! '
            )

    # Generate the output file
    data.to_csv(args.out, index=None, sep='\t')

    log("Summary stats")
    print('-'*100)
    print("Mapped {} SNPs to {} genes".format(
        len(data.parent_locus.unique()), len(data.ID.unique())))
    print("Number of candidate genes per term:")
    print(data.groupby('Term').apply(lambda df: len(df.ID)))
bundle.write(fd.read()) bundle.write('\n') # Actually bundle them bundle_files(js_files, 'js') bundle_files(css_files, 'css') # ---------------------------------------- # Load things to memeory to prepare # ---------------------------------------- # Generate network list based on allowed list print('Preloading networks into memory...') if len(conf['networks']) < 1: conf['networks'] = list(co.Tools.available_datasets('Expr')['Name'].values) networks = {x: co.COB(x) for x in conf['networks']} network_info = [] refLinks = {} for name, net in networks.items(): network_info.append({ 'name': net.name, 'refgen': net._global('parent_refgen'), 'desc': net.description, }) if net._global('parent_refgen') in conf['refLinks']: refLinks[net.name] = conf['refLinks'][net._global('parent_refgen')] print('Availible Networks: ' + str(networks)) # Generate ontology list based on allowed list and load them into memory print('Preloading GWASes into Memory...')
def geneneighbors(args):
    """
    Write the strongest co-expression neighbors of every gene to a file.

    The output path is derived from ``args.out`` by replacing its
    extension with "_neighbors.txt"; if that file already exists nothing
    is done. Each output row holds the gene id, its number of neighbors
    and up to ``args.numneighbors`` tab separated "gene,score,distance"
    entries sorted by descending score. Genes for which the network
    reports no neighbors are omitted.

    Parameters
    ----------
    args : argparse-style namespace
        Attributes read here: out, cob, zscore, numneighbors.
        NOTE: ``args.out`` is overwritten with the derived file name.

    Returns
    -------
    None
    """
    args.out = os.path.splitext(args.out)[0] + "_neighbors.txt"
    out_dir = os.path.dirname(args.out)
    if out_dir != "":
        os.makedirs(out_dir, exist_ok=True)
    if os.path.exists(args.out):
        print("Output for {} exists! Skipping!".format(args.out), file=sys.stderr)
        return

    cob = co.COB(args.cob)
    # NOTE(review): int() truncates fractional cutoffs (2.5 -> 2); confirm
    cob.set_sig_edge_zscore(int(args.zscore))
    print("Generating neighbors for {} ".format(cob.name), file=sys.stderr)
    max_neighbors = int(args.numneighbors)

    # one (gene id, neighbor count, neighbor description) tuple per gene
    rows = []
    # iterate through each gene in the network
    for gene in cob.refgen.iter_genes():
        # the edge table is indexed by gene pair; pull both ends out
        edges = cob.neighbors(gene).reset_index()
        neighbor_ids = set(edges.gene_a).union(edges.gene_b)
        if gene.id not in neighbor_ids:
            continue
        # the gene shows up in its own edge list, so drop it
        neighbor_ids.remove(gene.id)

        scored = []
        for neighbor_id in neighbor_ids:
            # co-expression result between the neighbor and original gene
            coex = cob.coexpression(gene, cob.refgen.from_ids(neighbor_id))
            # positions 0 and 2 are used as score and distance
            scored.append((neighbor_id, coex[0], coex[2]))
        # sort by the score and keep the requested number of neighbors;
        # an empty selection simply yields an empty description
        top = sorted(scored, key=lambda entry: entry[1], reverse=True)
        top = top[0:max_neighbors]
        description = "\t".join(
            ",".join(str(field) for field in entry) for entry in top
        )
        rows.append((gene.id, str(len(neighbor_ids)), description))

    # write to a file; the context manager guarantees it is closed
    with open(args.out, "w") as output:
        # NOTE(review): the header names four fields but each neighbor
        # entry carries three (gene, score, distance); left unchanged so
        # existing consumers are not broken.
        output.write(
            "Gene"
            + "\t"
            + "Number of Neighbors"
            + "\t"
            + "Gene,ZScore,Significant,distance"
            + "\n"
        )
        for row in rows:
            output.write("\t".join(row))
            output.write("\n")
def cob_health(args):
    """
    Generate the "health" report (plots and tables) for a network.

    Every artifact is written next to the prefix ``args.out`` (default
    ``'<cob name>_Health'``) and is skipped when its file already exists:
    score and expression plots, a text summary, per-chromosome QC counts
    (only if ``args.refgen`` is given), the degree distribution with
    fitted models, and bootstrapped GO density/locality statistics with a
    summary figure (only if ``args.go`` is given).

    Parameters
    ----------
    args : argparse-style namespace
        Attributes read here: cob, out, refgen, go, max_terms,
        min_term_size, max_term_size, two_tailed_GO, num_bootstraps.
        NOTE: ``args.out`` is filled in when it is None.

    Returns
    -------
    None
    """
    log = coblog()
    log('\n'
        '-----------------------\n'
        ' Network Health \n'
        '-----------------------\n')
    cob = co.COB(args.cob)
    if args.out is None:
        args.out = '{}_Health'.format(cob.name)

    log('Plotting Scores ----------------------------------------------------')
    if not path.exists('{}_CoexPCC_raw.png'.format(args.out)):
        cob.plot_scores('{}_CoexPCC_raw.png'.format(args.out), pcc=True)
    else:
        log('Skipped Raw.')
    if not path.exists('{}_CoexScore_zscore.png'.format(args.out)):
        cob.plot_scores('{}_CoexScore_zscore.png'.format(args.out), pcc=False)
    else:
        log('Skipped Norm.')

    log('Plotting Expression ------------------------------------------------')
    if not path.exists('{}_Expr_raw.png'.format(args.out)):
        cob.plot('{}_Expr_raw.png'.format(args.out),
                 include_accession_labels=True,
                 raw=True,
                 cluster_method=None)
    else:
        log('Skipped raw.')
    if not path.exists('{}_Expr_norm.png'.format(args.out)):
        cob.plot('{}_Expr_norm.png'.format(args.out),
                 include_accession_labels=True,
                 raw=False,
                 cluster_method='leaf',
                 cluster_accessions=True)
    else:
        log('Skipped norm.')

    log('Plotting Cluster Expression-----------------------------------------')
    if not path.exists('{}_Expr_cluster.png'.format(args.out)):
        cob.plot('{}_Expr_cluster.png'.format(args.out),
                 include_accession_labels=True,
                 raw=False,
                 cluster_accessions=True,
                 avg_by_cluster=True)
    else:
        # Was 'Skipped norm.', copied from the section above
        log('Skipped cluster.')

    log('Printing Summary ---------------------------------------------------')
    if not path.exists('{}.summary.txt'.format(args.out)):
        with open('{}.summary.txt'.format(args.out), 'w') as OUT:
            # Print out the network summary
            cob.summary(file=OUT)
    else:
        log('Skipped summary.')

    log('Printing QC Statistics ---------------------------------------------')
    if args.refgen is not None:
        if not path.exists('{}_qc_gene.txt'.format(args.out)):
            # Print out the breakdown of QC Values
            refgen = co.RefGen(args.refgen)
            gene_qc = cob._bcolz('qc_gene')
            gene_qc = gene_qc[gene_qc.pass_membership]
            gene_qc['chrom'] = [
                'chr' + str(refgen[x].chrom) for x in gene_qc.index
            ]
            gene_qc = gene_qc.groupby('chrom').agg(sum, axis=0)
            # Add totals at the bottom (every column but the first).
            # .iloc/.concat replace DataFrame.ix/.append, which no longer
            # exist in current pandas.
            totals = gene_qc.iloc[:, 1:].apply(sum)
            totals.name = 'TOTAL'
            gene_qc = pd.concat([gene_qc, totals.to_frame().T])
            gene_qc.to_csv('{}_qc_gene.txt'.format(args.out), sep='\t')
        else:
            log('Skipped QC summary.')

    log('Plotting Degree Distribution ---------------------------------------')
    if not path.exists('{}_DegreeDist.png'.format(args.out)):
        degree = cob.degree['Degree'].values
        # Using powerlaw makes run-time warning the first time you use it.
        # This is still an open issue on the creators github.
        # The creator recommends removing this warning as long as there is a fit.
        # NOTE: this changes numpy's error state for the whole process.
        np.seterr(divide='ignore', invalid='ignore')
        fit = powerlaw.Fit(degree, discrete=True, xmin=1)
        # get an axis
        ax = plt.subplot()
        # Calculate log ratios (results are not used further here)
        t2p = fit.distribution_compare('truncated_power_law', 'power_law')
        t2e = fit.distribution_compare('truncated_power_law', 'exponential')
        p2e = fit.distribution_compare('power_law', 'exponential')
        # Plot!
        emp = fit.plot_ccdf(ax=ax,
                            color='r',
                            linewidth=3,
                            label='Empirical Data')
        pwr = fit.power_law.plot_ccdf(ax=ax,
                                      color='b',
                                      linestyle='--',
                                      label='Power law')
        tpw = fit.truncated_power_law.plot_ccdf(ax=ax,
                                                color='k',
                                                linestyle='--',
                                                label='Truncated Power')
        exp = fit.exponential.plot_ccdf(ax=ax,
                                        color='g',
                                        linestyle='--',
                                        label='Exponential')
        ####
        ax.set_ylabel("p(Degree≥x)")
        ax.set_xlabel("Degree Frequency")
        ax.legend(loc='best')
        plt.title('{} Degree Distribution'.format(cob.name))
        # Save Fig
        try:
            plt.savefig('{}_DegreeDist.png'.format(args.out))
        except FutureWarning:
            # This is a matplotlib bug
            pass
    else:
        log('Skipping Degree Dist.')

    log('Plotting GO --------------------------------------------------------')
    if args.go is not None:
        if not path.exists('{}_GO.csv'.format(args.out)):
            go = co.GOnt(args.go)
            term_ids = []
            density_emp = []
            density_pvals = []
            locality_emp = []
            locality_pvals = []
            term_sizes = []
            term_desc = []
            terms_tested = 0
            if args.max_terms is not None:
                log('Limiting to {} GO Terms', args.max_terms)
                terms = go.rand(n=args.max_terms,
                                min_term_size=args.min_term_size,
                                max_term_size=args.max_term_size)
            else:
                terms = go.iter_terms(min_term_size=args.min_term_size,
                                      max_term_size=args.max_term_size)
            for term in terms:
                # Only genes present in the network count towards the term
                term.loci = list(filter(lambda x: x in cob, term.loci))
                if (len(term) < args.min_term_size
                        or len(term) > args.max_term_size):
                    continue
                # set density value for two tailed go so we only test it once
                density = cob.density(term.loci)
                # one tailed test: only positive densities are kept
                if args.two_tailed_GO is False and not density > 0:
                    continue
                density_emp.append(density)
                term_ids.append(term.id)
                term_sizes.append(len(term))
                term_desc.append(str(term.desc))
                # ------ Density
                # Calculate PVals from random gene sets of the same size
                density_bs = np.array([
                    cob.density(cob.refgen.random_genes(n=len(term.loci)))
                    for x in range(args.num_bootstraps)
                ])
                if density > 0:
                    pval = sum(density_bs >= density) / args.num_bootstraps
                else:
                    pval = sum(density_bs <= density) / args.num_bootstraps
                density_pvals.append(pval)

                # ------- Locality
                locality = cob.locality(term.loci,
                                        include_regression=True).resid.mean()
                locality_emp.append(locality)
                # Calculate PVals
                locality_bs = np.array([
                    cob.locality(
                        cob.refgen.random_genes(n=len(term.loci)),
                        include_regression=True
                    ).resid.mean()
                    for x in range(args.num_bootstraps)
                ])
                if locality > 0:
                    pval = sum(locality_bs >= locality) / args.num_bootstraps
                else:
                    pval = sum(locality_bs <= locality) / args.num_bootstraps
                locality_pvals.append(pval)
                # -------------
                terms_tested += 1
                if terms_tested % 100 == 0 and terms_tested > 0:
                    log('Processed {} terms'.format(terms_tested))
            go_enrichment = pd.DataFrame({
                'GOTerm': term_ids,
                'desc': term_desc,
                'size': term_sizes,
                'density': density_emp,
                'density_pval': density_pvals,
                'locality': locality_emp,
                'locality_pval': locality_pvals
            })
            go_enrichment\
                .sort_values(by='density_pval', ascending=True)\
                .to_csv('{}_GO.csv'.format(args.out), index=False)
            if terms_tested == 0:
                log.warn('No GO terms met your min/max gene criteria!')
        else:
            go_enrichment = pd.read_table('{}_GO.csv'.format(args.out),
                                          sep=',')

        if path.exists('{}_GO.png'.format(args.out)):
            log('Skipping GO Volcano.')
        elif len(go_enrichment) == 0:
            # The fold enrichment below divides by the number of terms
            log.warn('No GO terms to plot, skipping GO Volcano.')
        else:
            # Convert pvals to log10
            with np.errstate(divide='ignore'):
                # When no bootstraps are more extreme than the term, the
                # minus log pval yields an infinite
                go_enrichment['density_pval'] = -1 * np.log10(
                    go_enrichment['density_pval'])
                go_enrichment['locality_pval'] = -1 * np.log10(
                    go_enrichment['locality_pval'])
            # Fix the infinites so they are plotted (one above the max)
            max_density = np.max(go_enrichment['density_pval'][np.isfinite(
                go_enrichment['density_pval'])])
            max_locality = np.max(
                go_enrichment['locality_pval'][np.isfinite(
                    go_enrichment['locality_pval'])])
            go_enrichment.loc[
                np.logical_not(np.isfinite(go_enrichment['density_pval'])),
                'density_pval'] = max_density + 1
            go_enrichment.loc[
                np.logical_not(np.isfinite(go_enrichment['locality_pval'])),
                'locality_pval'] = max_locality + 1
            plt.clf()
            figure, axes = plt.subplots(3, 2, figsize=(12, 12))
            # -----------
            # Density
            # ----------
            axes[0, 0].scatter(go_enrichment['density'],
                               go_enrichment['density_pval'],
                               alpha=0.05)
            axes[0, 0].set_xlabel('Empirical Density (Z-Score)')
            axes[0, 0].set_ylabel('Bootstraped -log10(p-value)')
            # 1.3 ~ -log10(0.05): observed vs. expected significant terms
            fold = sum(np.array(go_enrichment['density_pval']) > 1.3) / (
                0.05 * len(go_enrichment))
            axes[0, 0].axhline(y=-1 * np.log10(0.05), color='red')
            axes[0, 0].text(min(axes[0, 0].get_xlim()),
                            -1 * np.log10(0.05),
                            '{:.3g} Fold Enrichement'.format(fold),
                            color='red')
            axes[1, 0].scatter(go_enrichment['size'],
                               go_enrichment['density_pval'],
                               alpha=0.05)
            axes[1, 0].set_ylabel('Bootstrapped -log10(p-value)')
            axes[1, 0].set_xlabel('Term Size')
            axes[1, 0].axhline(y=-1 * np.log10(0.05), color='red')
            axes[2, 0].scatter(go_enrichment['size'],
                               go_enrichment['density'],
                               alpha=0.05)
            axes[2, 0].scatter(go_enrichment.query('density_pval>1.3')['size'],
                               go_enrichment.query('density_pval>1.3')['density'],
                               alpha=0.05,
                               color='r')
            axes[2, 0].set_ylabel('Density')
            axes[2, 0].set_xlabel('Term Size')
            # ------------
            # Do Locality
            # ------------
            axes[0, 1].scatter(go_enrichment['locality'],
                               go_enrichment['locality_pval'],
                               alpha=0.05)
            axes[0, 1].set_xlabel('Empirical Locality (Residual)')
            axes[0, 1].set_ylabel('Bootstraped -log10(p-value)')
            fold = sum(np.array(go_enrichment['locality_pval']) > 1.3) / (
                0.05 * len(go_enrichment))
            axes[0, 1].axhline(y=-1 * np.log10(0.05), color='red')
            axes[0, 1].text(min(axes[0, 1].get_xlim()),
                            -1 * np.log10(0.05),
                            '{:.3g} Fold Enrichement'.format(fold),
                            color='red')
            axes[1, 1].scatter(go_enrichment['size'],
                               go_enrichment['locality_pval'],
                               alpha=0.05)
            axes[1, 1].set_xlabel('Term Size')
            axes[1, 1].set_ylabel('Bootstrapped -log10(p-value)')
            axes[1, 1].axhline(y=-1 * np.log10(0.05), color='red')
            axes[2, 1].scatter(go_enrichment['size'],
                               go_enrichment['locality'],
                               alpha=0.05)
            axes[2, 1].scatter(
                go_enrichment.query('locality_pval>1.3')['size'],
                go_enrichment.query('locality_pval>1.3')['locality'],
                alpha=0.05,
                color='r')
            # This panel shows locality, not density
            axes[2, 1].set_ylabel('Locality')
            axes[2, 1].set_xlabel('Term Size')
            # Save Figure
            plt.tight_layout()
            try:
                plt.savefig('{}_GO.png'.format(args.out))
            except FutureWarning:
                pass
bundle.write(fd.read()) bundle.write("\n") # Actually bundle them bundle_files(js_files, "js") bundle_files(css_files, "css") # ---------------------------------------- # Load things to memeory to prepare # ---------------------------------------- # Generate network list based on allowed list print("Preloading networks into memory...") if len(conf["networks"]) < 1: conf["networks"] = list(co.Tools.available_datasets("Expr")["Name"].values) networks = {x: co.COB(x) for x in conf["networks"]} network_info = [] refLinks = {} for name, net in networks.items(): network_info.append( { "name": net.name, "refgen": net._global("parent_refgen"), "desc": net.description, } ) if net._global("parent_refgen") in conf["refLinks"]: refLinks[net.name] = conf["refLinks"][net._global("parent_refgen")] print("Availible Networks: " + str(networks))
usage() sys.exit(2) for opt, arg in opts: if opt in ("-c", "--cob"): cobname = arg elif opt in ("-s", "--secondnetwork"): cob2name = arg elif opt in ("-h", "--help"): usage() sys.exit(2) else: assert False, "unhandled option" # load the network object cob = co.COB(cobname) cob_compare = co.COB(cob2name) cob.set_sig_edge_zscore(2.5) cob_compare.set_sig_edge_zscore(2.5) # change from np.ndarray to pd dataframe Clusters = pd.DataFrame(cob.clusters) # Make a ordered dictionary to each key # is a cluster and each value is the # genes in that cluster ClustDict = collections.OrderedDict() for index, row in Clusters.iterrows(): if row[0] in ClustDict.keys(): ClustDict[row[0]].append(index)
def plot_gwas(args):
    """
    Plot, per GWAS term, the collapsed loci and their surrounding genes.

    One figure is produced per term with one subplot per chromosome seen
    in the term's loci. For each collapsed locus a row is drawn showing
    all reference genes (grey), genes present in the network's reference
    genome (green), candidate genes within ``args.candidate_flank_limit``
    (red), the locus window boundaries and its sub loci. The figure is
    saved to ``args.out`` with ``'_<term id>.png'`` replacing ``'.png'``.

    Parameters
    ----------
    args : argparse-style namespace
        Attributes read here: cob, gwas, terms, snp2gene,
        candidate_window_size, candidate_flank_limit, strongest_attr,
        strongest_higher, out.

    Raises
    ------
    ValueError
        If ``args.snp2gene`` is neither 'effective' nor 'strongest'.
    """
    # snag the appropriate COB
    cob = co.COB(args.cob)
    # snag the GWAS object
    gwas = co.GWAS(args.gwas)

    if 'all' in args.terms:
        terms = gwas.iter_terms()
    else:
        terms = [gwas[term] for term in args.terms]

    # Make a plot for each Term
    for term in terms:
        loci = list(term.loci)
        # create a dictionary of Loci which we can refer to using ids
        locus_lookup = {x.id: x for x in loci}
        # Each chromosome gets a plot
        chroms = {x.chrom for x in loci}
        # Create a figure with a subplot for each chromosome.
        # squeeze=False always returns a 2D array, so indexing also works
        # when the term touches a single chromosome.
        f, axes = plt.subplots(len(chroms),
                               figsize=(15, 4 * len(chroms)),
                               squeeze=False)
        axes = axes[:, 0]
        plt.title('{} Term'.format(term.id))
        # Pull out the snp to gene mappings
        if args.snp2gene == 'effective':
            loci = sorted(
                term.effective_loci(window_size=args.candidate_window_size))
        elif args.snp2gene == 'strongest':
            loci = term.strongest_loci(window_size=args.candidate_window_size,
                                       attr=args.strongest_attr,
                                       lowest=args.strongest_higher)
        else:
            raise ValueError('{} not valid snp2gene mapping'.format(
                args.snp2gene))

        # iterate over Loci
        seen_chroms = set()
        voffset = 1  # Vertical Offset
        current_axis = 0
        y_labels = []
        y_ticks = []
        for locus in loci:
            hoffset = -1 * locus.window
            # Reset the temp variables when a new chromosome starts
            if locus.chrom not in seen_chroms:
                seen_chroms.add(locus.chrom)
                current_axis = len(seen_chroms) - 1
                voffset = 1
                if len(y_labels) > 0 and current_axis > 0:
                    # Flush the labels collected for the previous axis
                    axes[current_axis - 1].set_yticks(y_ticks)
                    axes[current_axis - 1].set_yticklabels(y_labels)
                y_labels = []
                y_ticks = []
            # Get current axis
            cax = axes[current_axis]
            # Set up labels if first time one axis
            if voffset == 1:
                cax.set_ylabel('Chrom: ' + locus.chrom)
                cax.set_xlabel('Loci')
            # (Axes.hold was dropped: holding is matplotlib's default and
            # the method no longer exists in current releases.)
            # Plot ALL Genes
            _plot_gene_bars(
                cax, gwas.refgen.candidate_genes(locus, flank_limit=10e10),
                locus, hoffset, voffset, 'RefGen Genes', 'grey')
            # Plot the genes present in the network
            _plot_gene_bars(
                cax, cob.refgen.candidate_genes(locus, flank_limit=10e10),
                locus, hoffset, voffset, 'Gene Passed QC', 'green')
            # Plot the candidate genes
            _plot_gene_bars(
                cax,
                cob.refgen.candidate_genes(
                    locus, flank_limit=args.candidate_flank_limit),
                locus, hoffset, voffset, 'Candidate Gene', 'red')

            # Plot the Effective Locus
            cax.scatter(  # Upstream
                hoffset, voffset, marker='>', zorder=2)
            cax.scatter(  # Start
                hoffset + locus.window,
                voffset,
                marker='.',
                color='blue',
                zorder=2)
            cax.scatter(  # Stop
                hoffset + locus.window + len(locus),
                voffset,
                marker='.',
                color='blue',
                zorder=2)
            cax.scatter(  # Downstream
                hoffset + locus.window + len(locus) + locus.window,
                voffset,
                marker='<',
                zorder=2)
            # Plot the Sub Loci
            for sub_id in locus.sub_loci:
                if sub_id in locus_lookup:
                    sub_locus = locus_lookup[sub_id]
                    cax.scatter(hoffset + locus.window +
                                abs(sub_locus.start - locus.start),
                                voffset,
                                zorder=2,
                                marker='.',
                                label='SNP',
                                color='blue')

            # place a block for interlocal distance
            y_labels.append(commify(locus.start))
            y_ticks.append(voffset)
            voffset += 10
        # Have to finish off the ticks on the last chromosome
        axes[current_axis].set_yticks(y_ticks)
        axes[current_axis].set_yticklabels(y_labels)
        # Save Plot
        plt.savefig(args.out.replace('.png', '_{}.png'.format(term.id)))
        plt.close()


def _plot_gene_bars(cax, genes, locus, hoffset, voffset, label, color):
    """Draw one horizontal bar per gene on *cax*, positioned relative to *locus*."""
    for gene in genes:
        # Row position and bar length are passed positionally: the first
        # parameter's keyword differs between matplotlib versions.
        cax.barh(voffset,
                 len(gene),
                 height=5,
                 zorder=1,
                 left=hoffset + gene.start - locus.start + locus.window,
                 label=label,
                 color=color)
def from_CLI(cls, args):
    """
    Implements an interface to the CLI to perform GWAS simulation

    For each selected GO term the effective loci that are in the network
    are perturbed by simulated missing and false candidates, overlap is
    scored and compared against bootstraps, and the collated table is
    written tab separated to ``args.out``.

    Parameters
    ----------
    args : argparse-style namespace
        Attributes read here: GOnt, cob, terms ("all", a file holding one
        term id per line, or a list of term ids), min_term_size,
        max_term_size, candidate_window_size, candidate_flank_limit,
        percent_mcr, percent_fcr, method, out.
    """
    self = cls()
    # Build the base objects
    self.args = args
    # Load camoco objects
    self.go = co.GOnt(self.args.GOnt)
    self.cob = co.COB(self.args.cob)
    self.generate_output_name()

    # Generate a list of GO Terms
    if "all" in self.args.terms:
        # Create a list of all terms within the size specification
        terms = list(
            self.go.iter_terms(
                min_term_size=self.args.min_term_size,
                max_term_size=self.args.max_term_size,
            ))
    elif os.path.exists(self.args.terms[0]):
        # If parameter is a filename, read term names from the file and
        # make sure the handle is closed again
        with open(args.terms[0]) as term_file:
            terms = [self.go[line.strip()] for line in term_file]
    else:
        # Generate terms from a parameter list
        terms = [self.go[x] for x in self.args.terms]

    # Iterate and calculate
    log("Simulating GWAS for {} GO Terms", len(terms))
    min_term_size = np.min([len(x) for x in terms])
    max_term_size = np.max([len(x) for x in terms])
    log("All terms are between {} and {} 'SNPs'", min_term_size,
        max_term_size)

    results = []
    for i, term in enumerate(terms):
        log("-" * 75)
        window_size = self.args.candidate_window_size
        flank_limit = self.args.candidate_flank_limit
        # Generate a series of densities for parameters
        num_genes = len([x for x in term.loci if x in self.cob])
        eloci = [
            x for x in term.effective_loci(window_size=window_size)
            if x in self.cob
        ]
        eloci = self.simulate_missing_candidates(eloci,
                                                 self.args.percent_mcr)
        eloci = self.simulate_false_candidates(eloci,
                                               self.args.percent_fcr)
        log(
            "GWAS Simulation {}: {} ({}/{} genes in {})",
            i,
            term.id,
            len(eloci),
            num_genes,
            self.cob.name,
        )
        # Make sure that the number of genes is adequate
        if num_genes > self.args.max_term_size:
            log("Too many genes... skipping")
            continue
        elif num_genes < self.args.min_term_size:
            log("Too few genes... skipping")
            continue
        elif num_genes == 0:
            continue
        # Generate candidate genes from the effecive loci
        candidates = self.cob.refgen.candidate_genes(
            eloci, flank_limit=flank_limit)
        log(
            "SNP to gene mapping finds {} genes at window:{} bp, "
            "flanking:{} genes",
            len(candidates),
            self.args.candidate_window_size,
            self.args.candidate_flank_limit,
        )
        overlap = self.overlap(eloci)
        # Dont bother bootstrapping on terms with overlap score below 0
        if overlap.score.mean() < 0:
            continue
        bootstraps = self.generate_bootstraps(eloci, overlap)
        bs_mean = bootstraps.groupby("iter").score.apply(np.mean).mean()
        bs_std = bootstraps.groupby("iter").score.apply(np.std).mean()
        # Calculate z scores for density
        if bs_std != 0:
            overlap["zscore"] = (overlap.score - bs_mean) / bs_std
            bootstraps["zscore"] = (bootstraps.score - bs_mean) / bs_std
        else:
            # No variation among bootstraps: use 0 rather than inf/NaN,
            # matching the overlap command's behaviour
            overlap["zscore"] = bootstraps["zscore"] = 0
        # Fraction of bootstrap iterations whose mean score is at least
        # as large as the observed mean score
        num_bootstraps = len(bootstraps.iter.unique())
        overlap_pval = (sum(
            bootstraps.groupby("iter").apply(lambda x: x.score.mean()) >=
            overlap.score.mean())) / num_bootstraps
        # Create a results object
        overlap["COB"] = self.cob.name
        overlap["Ontology"] = self.go.name
        overlap["Term"] = term.id
        overlap["WindowSize"] = self.args.candidate_window_size
        overlap["FlankLimit"] = self.args.candidate_flank_limit
        overlap["FCR"] = args.percent_fcr
        overlap["MCR"] = args.percent_mcr
        overlap["NumRealGenes"] = num_genes
        overlap["NumEffective"] = len(eloci)
        overlap["NumCandidates"] = len(candidates)
        overlap["TermSize"] = len(term)
        overlap["TermCollapsedLoci"] = len(eloci)
        overlap["TermPValue"] = overlap_pval
        overlap["NumBootstraps"] = num_bootstraps
        overlap["Method"] = self.args.method
        results.append(overlap.reset_index())

    if not results:
        # pd.concat raises on an empty list; nothing to write
        log("No terms produced simulation results, nothing to write")
        return
    self.results = pd.concat(results)
    self.results.to_csv(args.out, sep="\t", index=False)