def testZmGO(Zm5bFGS):
    """
    Return the "ZmGO" gene ontology, building it from raw files if needed.

    When ``cf.test.force.Ontology`` is set, any existing "ZmGO" GOnt dataset
    is deleted first. If the dataset is then unavailable it is created with
    ``co.GOnt.from_obo`` from ``go.obo.gz`` and ``zm_go.tsv.gz`` found under
    ``<cf.options.testdir>/raw/GOnt``; otherwise the stored dataset is opened
    by name.

    Parameters
    ----------
    Zm5bFGS
        Forwarded to ``co.GOnt.from_obo`` as its last positional argument
        (presumably the maize reference genome -- confirm against caller).
    """
    if cf.test.force.Ontology:
        tools.del_dataset("GOnt", "ZmGO", force=True)

    # Already built: just open it.
    if tools.available_datasets("GOnt", "ZmGO"):
        return co.GOnt("ZmGO")

    raw_dir = os.path.join(cf.options.testdir, "raw", "GOnt")
    return co.GOnt.from_obo(
        os.path.join(raw_dir, "go.obo.gz"),
        os.path.join(raw_dir, "zm_go.tsv.gz"),
        "ZmGO",
        "Maize Gene Ontology",
        Zm5bFGS,
    )
def ZmGO(Zm5bFGS):
    """
    Return the 'ZmGO' gene ontology, creating it on first use.

    A forced rebuild (``cf.test.force.Ontology``) deletes the stored dataset
    before the availability check. A missing dataset is built by
    ``co.GOnt.from_obo`` from ``go.obo.bz2`` and ``zm_go.tsv.bz2`` located in
    ``<cf.options.testdir>/raw/GOnt``.

    Parameters
    ----------
    Zm5bFGS
        Forwarded unchanged to ``co.GOnt.from_obo``.
    """
    dataset = ('GOnt', 'ZmGO')
    if cf.test.force.Ontology:
        tools.del_dataset(*dataset, force=True)

    if tools.available_datasets(*dataset):
        return co.GOnt('ZmGO')

    raw_dir = os.path.join(cf.options.testdir, 'raw', 'GOnt')
    obo_path = os.path.join(raw_dir, 'go.obo.bz2')
    mapping_path = os.path.join(raw_dir, 'zm_go.tsv.bz2')
    return co.GOnt.from_obo(
        obo_path, mapping_path, 'ZmGO', 'Maize Gene Ontology', Zm5bFGS
    )
def TestGO(Zm5bFGS):
    """
    Return the small 'TestGO' ontology, building it when it is not stored.

    Unlike the sibling ontology builders this one manages the dataset through
    ``co.del_dataset`` / ``co.available_datasets``. The raw inputs are
    ``go.test.obo`` and ``go.test.tsv`` under
    ``<cf.options.testdir>/raw/GOnt``.

    Parameters
    ----------
    Zm5bFGS
        Forwarded unchanged to ``co.GOnt.from_obo``.
    """
    if cf.test.force.Ontology:
        co.del_dataset('GOnt', 'TestGO', force=True)

    already_built = co.available_datasets('GOnt', 'TestGO')
    if already_built:
        return co.GOnt('TestGO')

    def raw_file(filename):
        # Resolve a file inside the raw GOnt test-data directory.
        return os.path.join(cf.options.testdir, 'raw', 'GOnt', filename)

    return co.GOnt.from_obo(
        raw_file('go.test.obo'),
        raw_file('go.test.tsv'),
        'TestGO',
        'Test GO',
        Zm5bFGS,
    )
def AtGO(AtTair10):
    """
    Return the 'AtGO' gene ontology, building it when it is not stored.

    With ``cf.test.force.Ontology`` set the stored dataset is removed first.
    A missing dataset is built by ``co.GOnt.from_obo`` from ``go.obo.bz2`` and
    ``ath_go.tsv.bz2`` in ``<cf.options.testdir>/raw/GOnt``, reading gene ids
    from column 0 and GO ids from column 5 of the mapping file.

    Parameters
    ----------
    AtTair10
        Forwarded unchanged to ``co.GOnt.from_obo``.
    """
    name = 'AtGO'
    if cf.test.force.Ontology:
        tools.del_dataset('GOnt', name, force=True)

    if tools.available_datasets('GOnt', name):
        return co.GOnt(name)

    raw_dir = os.path.join(cf.options.testdir, 'raw', 'GOnt')
    return co.GOnt.from_obo(
        os.path.join(raw_dir, 'go.obo.bz2'),
        os.path.join(raw_dir, 'ath_go.tsv.bz2'),
        name,
        'Arabidopsis Gene Ontology',
        AtTair10,
        id_col=0,
        go_col=5,
    )
def list_command(args):
    """
    Print the available datasets, or the term ids of one dataset.

    Parameters
    ----------
    args
        Parsed CLI namespace. Reads ``type``, ``name`` and ``terms``; when
        ``type`` and/or ``name`` are missing they are overwritten with the
        wildcard ``'%'``.

    Behaviour
    ---------
    * type and name given, ``--terms`` set: print one term id per line for
      a 'GWAS' or 'GOnt' dataset (other types print nothing).
    * type and name given, no ``--terms``: print the matching datasets.
    * otherwise: print the table of datasets matching the wildcarded query.
    """
    # Identity tests instead of `!= None`: equality can be overridden by the
    # compared object, identity cannot.
    has_type = args.type is not None
    has_name = args.name is not None

    if has_type and has_name:
        if args.terms:
            if args.type == 'GWAS':
                gwas = co.GWAS(args.name)
                print('\n'.join(x.id for x in gwas.iter_terms()))
            elif args.type == 'GOnt':
                gont = co.GOnt(args.name)
                print('\n'.join(x.id for x in gont.iter_terms()))
        else:
            print(co.available_datasets(args.type, args.name))
        return

    # Fall back to wildcards for whatever was not supplied. A name without a
    # type is treated like "nothing supplied", as before.
    if not has_type:
        args.type = '%'
    args.name = '%'
    print(co.available_datasets(args.type, args.name).to_string())
def list_command(args):
    """
    Print the available datasets, or the term ids of one dataset.

    Parameters
    ----------
    args
        Parsed CLI namespace. Reads ``type``, ``name``, ``terms`` and
        ``names``; when ``type`` and/or ``name`` are missing they are
        overwritten with the wildcard ``"%"``.

    Behaviour
    ---------
    * type and name given: with ``--terms``, first print one term id per
      line for a "GWAS" or "GOnt" dataset; then print either the
      space-separated dataset names (``--names``) or the dataset table.
    * otherwise: print the table of datasets matching the wildcarded query.
    """
    # Identity tests instead of `!= None`: equality can be overridden by the
    # compared object, identity cannot.
    has_type = args.type is not None
    has_name = args.name is not None

    if has_type and has_name:
        if args.terms:
            if args.type == "GWAS":
                gwas = co.GWAS(args.name)
                print("\n".join(x.id for x in gwas.iter_terms()))
            elif args.type == "GOnt":
                gont = co.GOnt(args.name)
                print("\n".join(x.id for x in gont.iter_terms()))
        if args.names:
            print(" ".join(available_datasets(args.type, args.name).Name))
        else:
            print(available_datasets(args.type, args.name))
        return

    # Fall back to wildcards for whatever was not supplied. A name without a
    # type is treated like "nothing supplied", as before.
    if not has_type:
        args.type = "%"
    args.name = "%"
    print(available_datasets(args.type, args.name).to_string())
def from_CLI(cls, args):
    """
    Implements an interface for the CLI to perform overlap Analysis

    Parameters
    ----------
    cls
        The overlap class. ``cls.create`` is used to build both the working
        object and the object whose database receives the results.
    args
        Parsed CLI namespace. The input source is chosen from ``genes``,
        ``go``, ``gwas`` and ``ontology`` (first one supplied wins). For the
        "genes" and "go" sources ``candidate_window_size`` and
        ``candidate_flank_limit`` are overwritten with 1 and 0.

    Returns
    -------
    None. Per-term results are concatenated into ``self.results``, written
    as TSV to ``args.out`` and appended to the "overlap" SQLite table.

    Raises
    ------
    ValueError
        If none of --genes, --go, --gwas or --ontology was supplied.
    """
    # Decide where the terms come from. Without the final `else`, `source`
    # stayed unbound and the function died with a NameError before it could
    # report the real problem.
    if args.genes != [None]:
        source = "genes"
    elif args.go is not None:
        source = "go"
    elif args.gwas is not None:
        source = "gwas"
    elif args.ontology is not None:
        source = "ontology"
    else:
        raise ValueError(
            "Please provide a valid overlap source (--genes, --go or --gwas)"
        )

    self = cls.create(source + "_CLI", description="CLI Overlap")
    self.source = source
    self.args = args
    # Build base camoco objects
    self.cob = co.COB(args.cob)

    # Generate the ontology of terms that we are going to look
    # at the overlap of
    if source == "genes":
        # Accept comma, semicolon or space separated ids in any argument
        import re

        args.genes = list(chain(*[re.split("[,; ]", x) for x in args.genes]))
        self.ont = pd.DataFrame()
        self.ont.name = "GeneList"
        args.candidate_window_size = 1
        args.candidate_flank_limit = 0
    elif source == "go":
        self.ont = co.GOnt(args.go)
        args.candidate_window_size = 1
        args.candidate_flank_limit = 0
    elif source == "gwas":
        self.ont = co.GWAS(args.gwas)
    else:
        self.ont = co.Ontology(args.ontology)

    # A ValueError from generate_output_name aborts the run quietly
    # (NOTE(review): presumably the output already exists -- confirm).
    try:
        self.generate_output_name()
    except ValueError:
        return

    # Save strongest description arguments if applicable
    if "strongest" in self.args.snp2gene:
        if not (self.ont._global("strongest_attr") == args.strongest_attr):
            self.ont.set_strongest(attr=args.strongest_attr)
        if not (
            bool(int(self.ont._global("strongest_higher")))
            == bool(args.strongest_higher)
        ):
            self.ont.set_strongest(higher=args.strongest_higher)

    # Generate a terms iterable
    if self.source == "genes":
        # make a single term
        loci = self.cob.refgen.from_ids(self.args.genes)
        if len(loci) < len(self.args.genes):
            self.cob.log("Some input genes not in network")
        terms = [Term("CustomTerm", desc="Custom from CLI", loci=loci)]
    elif "all" in self.args.terms:
        terms = list(self.ont.iter_terms())
    else:
        terms = [self.ont[term] for term in self.args.terms]

    results = []
    num_total_terms = len(terms)
    # Iterate through terms and calculate
    for i, term in enumerate(terms):
        self.cob.log(
            " ---------- Calculating overlap for {} of {} Terms", i, num_total_terms
        )
        if term.id in self.args.skip_terms:
            self.cob.log("Skipping {} since it was in --skip-terms", term.id)
            # The message used to be logged without actually skipping.
            continue
        self.cob.log("Generating SNP-to-gene mapping")
        # If appropriate, generate SNP2Gene Loci
        if self.args.candidate_flank_limit > 0:
            loci = self.snp2gene(term, self.ont)
        else:
            loci = list(term.loci)
            for x in loci:
                x.window = 1
        # Filter out terms with insufficient or too many genes
        if len(loci) < 2 or len(loci) < args.min_term_size:
            self.cob.log("Not enough genes to perform overlap")
            continue
        if args.max_term_size is not None and len(loci) > args.max_term_size:
            self.cob.log("Too many genes to perform overlap")
            continue
        # Send some output to the terminal
        self.cob.log(
            "Calculating Overlap for {} of {} in {} with window:{} and flank:{} ({} Loci)",
            term.id,
            self.ont.name,
            self.cob.name,
            self.args.candidate_window_size,
            self.args.candidate_flank_limit,
            len(loci),
        )
        if args.dry_run:
            continue
        # Do the dirty
        try:
            overlap = self.overlap(loci)
        except DataError:
            continue
        self.cob.log("Generating bootstraps")
        bootstraps = self.generate_bootstraps(loci, overlap)
        bs_mean = bootstraps.groupby("iter").score.apply(np.mean).mean()
        bs_std = bootstraps.groupby("iter").score.apply(np.std).mean()
        # Calculate z scores for density
        self.cob.log("Calculating Z-Scores")
        if bs_std != 0:
            overlap["zscore"] = (overlap.score - bs_mean) / bs_std
            bootstraps["zscore"] = (bootstraps.score - bs_mean) / bs_std
        else:
            # If there is no variation, make all Z-scores 0
            overlap["zscore"] = bootstraps["zscore"] = 0
        # Calculate FDR: for each z-score threshold compare how many real
        # genes pass against the mean number passing per bootstrap iteration.
        self.cob.log("Calculating FDR")
        overlap["fdr"] = np.nan
        max_zscore = int(overlap.zscore.max()) + 1
        for zscore in np.arange(0, max_zscore, 0.25):
            num_random = (
                bootstraps.groupby("iter")
                .apply(lambda df: sum(df.zscore >= zscore))
                .mean()
            )
            num_real = sum(overlap.zscore >= zscore)
            if num_real != 0 and num_random != 0:
                fdr = num_random / num_real
            elif num_real != 0 and num_random == 0:
                fdr = 0
            else:
                fdr = 1
            # Higher thresholds overwrite the values set by lower ones, so
            # each gene keeps the statistics of the last threshold it passes.
            passing = overlap.zscore >= zscore
            overlap.loc[passing, "fdr"] = fdr
            overlap.loc[passing, "num_real"] = num_real
            overlap.loc[passing, "num_random"] = num_random
            overlap.loc[passing, "bs_mean"] = bs_mean
            overlap.loc[passing, "bs_std"] = bs_std
        # Sorting once is enough: the label-based assignments above do not
        # depend on row order.
        overlap.sort_values(by=["zscore"], ascending=False, inplace=True)
        num_bootstraps = len(bootstraps.iter.unique())
        # Fraction of bootstrap iterations whose mean score reaches the
        # observed mean score.
        overlap_pval = (
            sum(
                bootstraps.groupby("iter").apply(lambda x: x.score.mean())
                >= overlap.score.mean()
            )
        ) / num_bootstraps
        # Annotate the rows; they are concatenated into self.results below
        overlap["COB"] = self.cob.name
        overlap["Ontology"] = self.ont.name
        overlap["Term"] = term.id
        overlap["WindowSize"] = self.args.candidate_window_size
        overlap["FlankLimit"] = self.args.candidate_flank_limit
        overlap["TermLoci"] = len(term.loci)
        overlap["TermCollapsedLoci"] = len(loci)
        overlap["TermPValue"] = overlap_pval
        overlap["NumBootstraps"] = num_bootstraps
        overlap["Method"] = self.args.method
        overlap["SNP2Gene"] = self.args.snp2gene
        results.append(overlap.reset_index())
        # Summarize results
        if self.args.method == "density":
            overlap_score = np.nanmean(overlap.score) / (
                1 / np.sqrt(overlap.num_trans_edges.mean())
            )
        elif self.args.method == "locality":
            overlap_score = np.nanmean(overlap.score)
        self.cob.log(
            "Overlap Score ({}): {} (p<{})".format(
                self.args.method, overlap_score, overlap_pval
            )
        )

    if not args.dry_run:
        if not results:
            # pd.concat raises on an empty list; report the situation instead
            self.cob.log("No terms produced overlap results, nothing to save")
            return
        # Consolidate results and output to files
        self.results = pd.concat(results)
        self.results.to_csv(self.args.out, sep="\t", index=None)
        # Make an actual results object if not exists
        overlap_object = cls.create(self.ont)
        # Save the results to the SQLite table, releasing the connection
        # afterwards instead of leaving it to the garbage collector
        connection = sqlite3.connect(overlap_object.db.filename)
        try:
            self.results.to_sql(
                "overlap",
                connection,
                if_exists="append",
                index=False,
            )
        finally:
            connection.close()
def cob_health(args):
    """
    Write a set of "health" reports (plots and tables) for one network.

    Every report is skipped when its output file already exists, so the
    command can be re-run to fill in only what is missing.

    Parameters
    ----------
    args
        Parsed CLI namespace. Reads ``cob`` (network name), ``out`` (output
        prefix; defaults to ``<cob.name>_Health`` and is written back onto
        ``args``), ``refgen`` (enables the QC table), ``go`` (enables GO
        enrichment), ``max_terms``, ``min_term_size``, ``max_term_size``,
        ``two_tailed_GO`` and ``num_bootstraps``.

    Outputs (all prefixed with ``args.out``)
    ----------------------------------------
    ``_CoexPCC_raw.png``, ``_CoexScore_zscore.png``, ``_Expr_raw.png``,
    ``_Expr_norm.png``, ``_Expr_cluster.png``, ``.summary.txt``,
    ``_qc_gene.txt``, ``_DegreeDist.png``, ``_GO.csv`` and ``_GO.png``.
    """
    log = coblog()
    log('\n'
        '-----------------------\n'
        ' Network Health \n'
        '-----------------------\n')
    cob = co.COB(args.cob)
    if args.out is None:
        args.out = '{}_Health'.format(cob.name)

    log('Plotting Scores ----------------------------------------------------')
    raw_scores_png = '{}_CoexPCC_raw.png'.format(args.out)
    if not path.exists(raw_scores_png):
        cob.plot_scores(raw_scores_png, pcc=True)
    else:
        log('Skipped Raw.')
    zscore_png = '{}_CoexScore_zscore.png'.format(args.out)
    if not path.exists(zscore_png):
        cob.plot_scores(zscore_png, pcc=False)
    else:
        log('Skipped Norm.')

    log('Plotting Expression ------------------------------------------------')
    expr_raw_png = '{}_Expr_raw.png'.format(args.out)
    if not path.exists(expr_raw_png):
        cob.plot(expr_raw_png,
                 include_accession_labels=True,
                 raw=True,
                 cluster_method=None)
    else:
        log('Skipped raw.')
    expr_norm_png = '{}_Expr_norm.png'.format(args.out)
    if not path.exists(expr_norm_png):
        cob.plot(expr_norm_png,
                 include_accession_labels=True,
                 raw=False,
                 cluster_method='leaf',
                 cluster_accessions=True)
    else:
        log('Skipped norm.')

    log('Plotting Cluster Expression-----------------------------------------')
    expr_cluster_png = '{}_Expr_cluster.png'.format(args.out)
    if not path.exists(expr_cluster_png):
        cob.plot(expr_cluster_png,
                 include_accession_labels=True,
                 raw=False,
                 cluster_accessions=True,
                 avg_by_cluster=True)
    else:
        # Was a copy-paste of 'Skipped norm.', which named the wrong report
        log('Skipped cluster.')

    log('Printing Summary ---------------------------------------------------')
    summary_txt = '{}.summary.txt'.format(args.out)
    if not path.exists(summary_txt):
        with open(summary_txt, 'w') as OUT:
            # Print out the network summary
            cob.summary(file=OUT)
    else:
        log('Skipped summary.')

    log('Printing QC Statistics ---------------------------------------------')
    if args.refgen is not None:
        qc_txt = '{}_qc_gene.txt'.format(args.out)
        if not path.exists(qc_txt):
            # Print out the breakdown of QC Values per chromosome
            refgen = co.RefGen(args.refgen)
            gene_qc = cob._bcolz('qc_gene')
            gene_qc = gene_qc[gene_qc.pass_membership]
            gene_qc['chrom'] = [
                'chr' + str(refgen[x].chrom) for x in gene_qc.index
            ]
            gene_qc = gene_qc.groupby('chrom').agg(sum, axis=0)
            # Add totals at the bottom (every column except the first).
            # `.ix` and `DataFrame.append` no longer exist in current pandas;
            # `.iloc` / `pd.concat` behave the same on old and new versions.
            totals = gene_qc.iloc[:, 1:].apply(sum)
            totals.name = 'TOTAL'
            totals_row = totals.to_frame().T
            # Keep the index label so the CSV header is unchanged
            totals_row.index.name = gene_qc.index.name
            gene_qc = pd.concat([gene_qc, totals_row])
            gene_qc.to_csv(qc_txt, sep='\t')
        else:
            log('Skipped QC summary.')

    log('Plotting Degree Distribution ---------------------------------------')
    degree_png = '{}_DegreeDist.png'.format(args.out)
    if not path.exists(degree_png):
        degree = cob.degree['Degree'].values
        # Using powerlaw makes run-time warning the first time you use it.
        # This is still an open issue on the creators github.
        # The creator recommends removing this warning as long as there is a fit.
        # NOTE: np.seterr changes the numpy error state for the rest of the
        # process, not just for this section.
        np.seterr(divide='ignore', invalid='ignore')
        fit = powerlaw.Fit(degree, discrete=True, xmin=1)
        # get an axis
        ax = plt.subplot()
        # Calculate log ratios
        # NOTE(review): these comparison results are not used below; the
        # calls are kept so the amount of work done is unchanged.
        fit.distribution_compare('truncated_power_law', 'power_law')
        fit.distribution_compare('truncated_power_law', 'exponential')
        fit.distribution_compare('power_law', 'exponential')
        # Plot!
        fit.plot_ccdf(ax=ax, color='r', linewidth=3, label='Empirical Data')
        fit.power_law.plot_ccdf(ax=ax,
                                color='b',
                                linestyle='--',
                                label='Power law')
        fit.truncated_power_law.plot_ccdf(ax=ax,
                                          color='k',
                                          linestyle='--',
                                          label='Truncated Power')
        fit.exponential.plot_ccdf(ax=ax,
                                  color='g',
                                  linestyle='--',
                                  label='Exponential')
        ax.set_ylabel("p(Degree≥x)")
        ax.set_xlabel("Degree Frequency")
        ax.legend(loc='best')
        plt.title('{} Degree Distribution'.format(cob.name))
        # Save Fig
        try:
            plt.savefig(degree_png)
        except FutureWarning:
            # This is a matplotlib bug
            pass
    else:
        log('Skipping Degree Dist.')

    log('Plotting GO --------------------------------------------------------')
    if args.go is not None:
        go_csv = '{}_GO.csv'.format(args.out)
        go_png = '{}_GO.png'.format(args.out)
        if not path.exists(go_csv):
            go = co.GOnt(args.go)
            term_ids = []
            density_emp = []
            density_pvals = []
            locality_emp = []
            locality_pvals = []
            term_sizes = []
            term_desc = []
            terms_tested = 0
            if args.max_terms is not None:
                log('Limiting to {} GO Terms', args.max_terms)
                terms = go.rand(n=args.max_terms,
                                min_term_size=args.min_term_size,
                                max_term_size=args.max_term_size)
            else:
                terms = go.iter_terms(min_term_size=args.min_term_size,
                                      max_term_size=args.max_term_size)
            for term in terms:
                # Only genes present in the network count towards the term
                term.loci = [x for x in term.loci if x in cob]
                if (len(term) < args.min_term_size
                        or len(term) > args.max_term_size):
                    continue
                # set density value for two tailed go so we only test it once
                density = cob.density(term.loci)
                # A one tailed test keeps only positive densities
                if args.two_tailed_GO is False and not density > 0:
                    continue
                density_emp.append(density)
                term_ids.append(term.id)
                term_sizes.append(len(term))
                term_desc.append(str(term.desc))
                # ------ Density
                # Empirical p-value: share of random gene sets of the same
                # size that are at least as extreme as the term
                density_bs = np.array([
                    cob.density(cob.refgen.random_genes(n=len(term.loci)))
                    for _ in range(args.num_bootstraps)
                ])
                if density > 0:
                    pval = sum(density_bs >= density) / args.num_bootstraps
                else:
                    pval = sum(density_bs <= density) / args.num_bootstraps
                density_pvals.append(pval)
                # ------- Locality
                locality = cob.locality(term.loci,
                                        include_regression=True).resid.mean()
                locality_emp.append(locality)
                locality_bs = np.array([
                    cob.locality(
                        cob.refgen.random_genes(n=len(term.loci)),
                        include_regression=True
                    ).resid.mean()
                    for _ in range(args.num_bootstraps)
                ])
                if locality > 0:
                    pval = sum(locality_bs >= locality) / args.num_bootstraps
                else:
                    pval = sum(locality_bs <= locality) / args.num_bootstraps
                locality_pvals.append(pval)
                # -------------
                terms_tested += 1
                if terms_tested % 100 == 0 and terms_tested > 0:
                    log('Processed {} terms'.format(terms_tested))
            go_enrichment = pd.DataFrame({
                'GOTerm': term_ids,
                'desc': term_desc,
                'size': term_sizes,
                'density': density_emp,
                'density_pval': density_pvals,
                'locality': locality_emp,
                'locality_pval': locality_pvals
            })
            go_enrichment\
                .sort_values(by='density_pval', ascending=True)\
                .to_csv(go_csv, index=False)
            if terms_tested == 0:
                log.warn('No GO terms met your min/max gene criteria!')
        else:
            go_enrichment = pd.read_table(go_csv, sep=',')

        if go_enrichment.empty:
            # The plot below takes maxima and divides by the number of
            # terms, both of which fail on an empty table
            log('No GO terms to plot, skipping GO Volcano.')
        elif not path.exists(go_png):
            # Convert pvals to log10
            with np.errstate(divide='ignore'):
                # When no bootstraps are more extreme than the term,
                # the minus log pval yields an infinite
                go_enrichment['density_pval'] = -1 * np.log10(
                    go_enrichment['density_pval'])
                go_enrichment['locality_pval'] = -1 * np.log10(
                    go_enrichment['locality_pval'])
                # Fix the infinites so they are plotted: place them one
                # unit above the largest finite value
                max_density = np.max(go_enrichment['density_pval'][
                    np.isfinite(go_enrichment['density_pval'])])
                max_locality = np.max(go_enrichment['locality_pval'][
                    np.isfinite(go_enrichment['locality_pval'])])
                go_enrichment.loc[
                    np.logical_not(np.isfinite(go_enrichment['density_pval'])),
                    'density_pval'] = max_density + 1
                go_enrichment.loc[
                    np.logical_not(np.isfinite(go_enrichment['locality_pval'])),
                    'locality_pval'] = max_locality + 1
            plt.clf()
            figure, axes = plt.subplots(3, 2, figsize=(12, 12))
            # Height of the p = 0.05 line on the -log10 scale
            sig_line = -1 * np.log10(0.05)
            # -----------
            # Density
            # ----------
            axes[0, 0].scatter(go_enrichment['density'],
                               go_enrichment['density_pval'],
                               alpha=0.05)
            axes[0, 0].set_xlabel('Empirical Density (Z-Score)')
            axes[0, 0].set_ylabel('Bootstraped -log10(p-value)')
            # Observed significant terms relative to the 5% expected by chance
            fold = sum(np.array(go_enrichment['density_pval']) > 1.3) / (
                0.05 * len(go_enrichment))
            axes[0, 0].axhline(y=sig_line, color='red')
            axes[0, 0].text(min(axes[0, 0].get_xlim()),
                            sig_line,
                            '{:.3g} Fold Enrichement'.format(fold),
                            color='red')
            axes[1, 0].scatter(go_enrichment['size'],
                               go_enrichment['density_pval'],
                               alpha=0.05)
            axes[1, 0].set_ylabel('Bootstrapped -log10(p-value)')
            axes[1, 0].set_xlabel('Term Size')
            axes[1, 0].axhline(y=sig_line, color='red')
            axes[2, 0].scatter(go_enrichment['size'],
                               go_enrichment['density'],
                               alpha=0.05)
            axes[2, 0].scatter(
                go_enrichment.query('density_pval>1.3')['size'],
                go_enrichment.query('density_pval>1.3')['density'],
                alpha=0.05,
                color='r')
            axes[2, 0].set_ylabel('Density')
            axes[2, 0].set_xlabel('Term Size')
            # ------------
            # Do Locality
            # ------------
            axes[0, 1].scatter(go_enrichment['locality'],
                               go_enrichment['locality_pval'],
                               alpha=0.05)
            axes[0, 1].set_xlabel('Empirical Locality (Residual)')
            axes[0, 1].set_ylabel('Bootstraped -log10(p-value)')
            fold = sum(np.array(go_enrichment['locality_pval']) > 1.3) / (
                0.05 * len(go_enrichment))
            axes[0, 1].axhline(y=sig_line, color='red')
            axes[0, 1].text(min(axes[0, 1].get_xlim()),
                            sig_line,
                            '{:.3g} Fold Enrichement'.format(fold),
                            color='red')
            axes[1, 1].scatter(go_enrichment['size'],
                               go_enrichment['locality_pval'],
                               alpha=0.05)
            axes[1, 1].set_xlabel('Term Size')
            axes[1, 1].set_ylabel('Bootstrapped -log10(p-value)')
            axes[1, 1].axhline(y=sig_line, color='red')
            axes[2, 1].scatter(go_enrichment['size'],
                               go_enrichment['locality'],
                               alpha=0.05)
            axes[2, 1].scatter(
                go_enrichment.query('locality_pval>1.3')['size'],
                go_enrichment.query('locality_pval>1.3')['locality'],
                alpha=0.05,
                color='r')
            # This panel plots locality; it used to be labelled 'Density'
            axes[2, 1].set_ylabel('Locality')
            axes[2, 1].set_xlabel('Term Size')
            # Save Figure
            plt.tight_layout()
            try:
                plt.savefig(go_png)
            except FutureWarning:
                pass
        else:
            log('Skipping GO Volcano.')
refgen = co.RefGen(ref) if refgen.has_annotations(): print('Processing annotations for {}...'.format(ref)) func_data_db[ref] = refgen func_data_db[ref].export_annotations( os.path.join(conf['scratch'], (ref + '.tsv'))) if hasGWS: geneWordBuilder(ref, [os.path.join(conf['scratch'], (ref + '.tsv'))], [1], ['2 end'], ['tab'], [True]) # Find any GO ontologies we have for the networks we have print('Finding applicable GO Ontologies...') GOnt_db = {} for name in co.Tools.available_datasets('GOnt')['Name']: gont = co.GOnt(name) if gont.refgen.name not in GOnt_db: GOnt_db[gont.refgen.name] = gont # Generate in memory term lists print('Finding all available terms...') terms = {} for name, ont in onts.items(): terms[name] = [] for term in ont.iter_terms(): terms[name].append({ 'name': term.id, 'desc': term.desc, 'snps':
def from_CLI(cls, args):
    '''
    Implements an interface for the CLI to perform overlap Analysis

    Parameters
    ----------
    cls
        The overlap class. ``cls.create`` is used to build both the working
        object and the object whose database receives the results.
    args
        Parsed CLI namespace. The input source is chosen from ``genes``,
        ``go`` and ``gwas`` (first one supplied wins). For the 'genes' and
        'go' sources ``candidate_window_size`` and ``candidate_flank_limit``
        are overwritten with 1 and 0.

    Returns
    -------
    None. Per-term results are concatenated into ``self.results``, written
    as TSV to ``args.out`` and appended to the 'overlap' SQLite table.

    Raises
    ------
    ValueError
        If none of --genes, --go or --gwas was supplied.
    '''
    # Decide where the terms come from. Without the final `else`, `source`
    # stayed unbound and the function died with a NameError before it could
    # report the real problem.
    if args.genes != [None]:
        source = 'genes'
    elif args.go is not None:
        source = 'go'
    elif args.gwas is not None:
        source = 'gwas'
    else:
        raise ValueError(
            'Please provide a valid overlap source (--genes, --go or --gwas)'
        )

    self = cls.create(source, description='CLI Overlap')
    self.source = source
    self.args = args
    # Build base camoco objects
    self.cob = co.COB(args.cob)

    # Generate the ontology of terms that we are going to look
    # at the overlap of
    if source == 'genes':
        # Accept comma, semicolon or space separated ids in any argument
        import re

        args.genes = list(
            chain(*[re.split('[,; ]', x) for x in args.genes]))
        self.ont = pd.DataFrame()
        self.ont.name = 'GeneList'
        args.candidate_window_size = 1
        args.candidate_flank_limit = 0
    elif source == 'go':
        self.ont = co.GOnt(args.go)
        args.candidate_window_size = 1
        args.candidate_flank_limit = 0
    else:
        self.ont = co.GWAS(args.gwas)

    # A ValueError from generate_output_name aborts the run quietly
    # (NOTE(review): presumably the output already exists -- confirm).
    try:
        self.generate_output_name()
    except ValueError:
        return

    # Save strongest description arguments if applicable
    if 'strongest' in self.args.snp2gene:
        if not (self.ont._global('strongest_attr') == args.strongest_attr):
            self.ont.set_strongest(attr=args.strongest_attr)
        if not (bool(int(self.ont._global('strongest_higher'))) == bool(
                args.strongest_higher)):
            self.ont.set_strongest(higher=args.strongest_higher)

    # Generate a terms iterable
    if self.source == 'genes':
        # make a single term
        loci = self.cob.refgen.from_ids(self.args.genes)
        if len(loci) < len(self.args.genes):
            self.cob.log('Some input genes not in network')
        terms = [Term('CustomTerm', desc='Custom from CLI', loci=loci)]
    elif 'all' in self.args.terms:
        terms = list(self.ont.iter_terms())
    else:
        terms = [self.ont[term] for term in self.args.terms]

    results = []
    num_total_terms = len(terms)
    # Iterate through terms and calculate
    for i, term in enumerate(terms):
        self.cob.log(' ---------- Calculating overlap for {} of {} Terms',
                     i, num_total_terms)
        if term.id in self.args.skip_terms:
            self.cob.log('Skipping {} since it was in --skip-terms', term.id)
            # The message used to be logged without actually skipping.
            continue
        self.cob.log('Generating SNP-to-gene mapping')
        # If appropriate, generate SNP2Gene Loci
        if self.args.candidate_flank_limit > 0:
            loci = self.snp2gene(term, self.ont)
        else:
            loci = list(term.loci)
            for x in loci:
                x.window = 1
        # Filter out terms with insufficient or too many genes
        if len(loci) < 2 or len(loci) < args.min_term_size:
            self.cob.log('Not enough genes to perform overlap')
            continue
        if args.max_term_size is not None and len(loci) > args.max_term_size:
            self.cob.log('Too many genes to perform overlap')
            continue
        # Send some output to the terminal
        self.cob.log(
            "Calculating Overlap for {} of {} in {} with window:{} and flank:{} ({} Loci)",
            term.id, self.ont.name, self.cob.name,
            self.args.candidate_window_size,
            self.args.candidate_flank_limit, len(loci))
        if args.dry_run:
            continue
        # Do the dirty
        try:
            overlap = self.overlap(loci)
        except DataError:
            continue
        self.cob.log('Generating bootstraps')
        bootstraps = self.generate_bootstraps(loci, overlap)
        bs_mean = bootstraps.groupby('iter').score.apply(np.mean).mean()
        bs_std = bootstraps.groupby('iter').score.apply(np.std).mean()
        # Calculate z scores for density
        self.cob.log('Calculating Z-Scores')
        if bs_std != 0:
            overlap['zscore'] = (overlap.score - bs_mean) / bs_std
            bootstraps['zscore'] = (bootstraps.score - bs_mean) / bs_std
        else:
            # If there is no variation, make all Z-scores 0
            overlap['zscore'] = bootstraps['zscore'] = 0
        # Calculate FDR: for each z-score threshold compare how many real
        # genes pass against the mean number passing per bootstrap iteration.
        self.cob.log('Calculating FDR')
        overlap['fdr'] = np.nan
        max_zscore = int(overlap.zscore.max()) + 1
        for zscore in np.arange(0, max_zscore, 0.25):
            num_random = bootstraps\
                .groupby('iter')\
                .apply(lambda df: sum(df.zscore >= zscore))\
                .mean()
            num_real = sum(overlap.zscore >= zscore)
            if num_real != 0 and num_random != 0:
                fdr = num_random / num_real
            elif num_real != 0 and num_random == 0:
                fdr = 0
            else:
                fdr = 1
            # Higher thresholds overwrite the values set by lower ones, so
            # each gene keeps the statistics of the last threshold it passes.
            passing = overlap.zscore >= zscore
            overlap.loc[passing, 'fdr'] = fdr
            overlap.loc[passing, 'num_real'] = num_real
            overlap.loc[passing, 'num_random'] = num_random
            overlap.loc[passing, 'bs_mean'] = bs_mean
            overlap.loc[passing, 'bs_std'] = bs_std
        # Sorting once is enough: the label-based assignments above do not
        # depend on row order.
        overlap.sort_values(by=['zscore'], ascending=False, inplace=True)
        num_bootstraps = len(bootstraps.iter.unique())
        # Fraction of bootstrap iterations whose mean score reaches the
        # observed mean score.
        overlap_pval = (
            sum(bootstraps.groupby('iter').apply(lambda x: x.score.mean())
                >= overlap.score.mean())
            / num_bootstraps
        )
        # Annotate the rows; they are concatenated into self.results below
        overlap['COB'] = self.cob.name
        overlap['Ontology'] = self.ont.name
        overlap['Term'] = term.id
        overlap['WindowSize'] = self.args.candidate_window_size
        overlap['FlankLimit'] = self.args.candidate_flank_limit
        overlap['TermLoci'] = len(term.loci)
        overlap['TermCollapsedLoci'] = len(loci)
        overlap['TermPValue'] = overlap_pval
        overlap['NumBootstraps'] = num_bootstraps
        overlap['Method'] = self.args.method
        overlap['SNP2Gene'] = self.args.snp2gene
        results.append(overlap.reset_index())
        # Summarize results
        if self.args.method == 'density':
            overlap_score = np.nanmean(overlap.score) / (
                1 / np.sqrt(overlap.num_trans_edges.mean()))
        elif self.args.method == 'locality':
            overlap_score = np.nanmean(overlap.score)
        self.cob.log('Overlap Score ({}): {} (p<{})'.format(
            self.args.method, overlap_score, overlap_pval))

    if not args.dry_run:
        if not results:
            # pd.concat raises on an empty list; report the situation instead
            self.cob.log('No terms produced overlap results, nothing to save')
            return
        # Consolidate results and output to files
        self.results = pd.concat(results)
        self.results.to_csv(self.args.out, sep='\t', index=None)
        # Make an actual results object if not exists
        overlap_object = cls.create(self.ont)
        # Save the results to the SQLite table, releasing the connection
        # afterwards instead of leaving it to the garbage collector
        connection = sqlite3.connect(overlap_object.db.filename)
        try:
            self.results.to_sql('overlap',
                                connection,
                                if_exists='append',
                                index=False)
        finally:
            connection.close()
def from_CLI(cls, args):
    """
    Implements an interface to the CLI to perform GWAS simulation

    Parameters
    ----------
    cls
        The simulation class; instantiated with no arguments.
    args
        Parsed CLI namespace. Reads ``GOnt``, ``cob``, ``terms``,
        ``min_term_size``, ``max_term_size``, ``candidate_window_size``,
        ``candidate_flank_limit``, ``percent_mcr``, ``percent_fcr``,
        ``method`` and ``out``. ``terms`` is either a list containing
        "all", a list whose first element is the path of a file holding one
        term id per line, or a list of term ids.

    Returns
    -------
    None. Per-term results are concatenated into ``self.results`` and
    written as TSV to ``args.out``.
    """
    self = cls()
    # Build the base objects
    self.args = args
    # Load camoco objects
    self.go = co.GOnt(self.args.GOnt)
    self.cob = co.COB(self.args.cob)
    self.generate_output_name()

    # Generate an iterable of GO Terms
    if "all" in self.args.terms:
        # Create a list of all terms within the size specification
        terms = list(
            self.go.iter_terms(
                min_term_size=self.args.min_term_size,
                max_term_size=self.args.max_term_size,
            ))
    elif os.path.exists(self.args.terms[0]):
        # If parameter is a filename, read term names from the file.
        # The handle is closed as soon as the ids are read; it used to be
        # left open.
        with open(args.terms[0]) as term_file:
            terms = [self.go[line.strip()] for line in term_file]
    else:
        # Generate terms from a parameter list
        terms = [self.go[x] for x in self.args.terms]

    # Iterate and calculate
    log("Simulating GWAS for {} GO Terms", len(terms))
    term_sizes = [len(x) for x in terms]
    min_term_size = np.min(term_sizes)
    max_term_size = np.max(term_sizes)
    log("All terms are between {} and {} 'SNPs'", min_term_size,
        max_term_size)

    results = []
    for i, term in enumerate(terms):
        log("-" * 75)
        window_size = self.args.candidate_window_size
        flank_limit = self.args.candidate_flank_limit
        # Generate a series of densities for parameters
        num_genes = len([x for x in term.loci if x in self.cob])
        eloci = [
            x for x in term.effective_loci(window_size=window_size)
            if x in self.cob
        ]
        eloci = self.simulate_missing_candidates(eloci,
                                                 self.args.percent_mcr)
        eloci = self.simulate_false_candidates(eloci, self.args.percent_fcr)
        log(
            "GWAS Simulation {}: {} ({}/{} genes in {})",
            i,
            term.id,
            len(eloci),
            num_genes,
            self.cob.name,
        )
        # Make sure that the number of genes is adequate
        if num_genes > self.args.max_term_size:
            log("Too many genes... skipping")
            continue
        elif num_genes < self.args.min_term_size:
            log("Too few genes... skipping")
            continue
        elif num_genes == 0:
            continue
        # Generate candidate genes from the effective loci
        candidates = self.cob.refgen.candidate_genes(
            eloci, flank_limit=flank_limit)
        log(
            "SNP to gene mapping finds {} genes at window:{} bp, "
            "flanking:{} genes",
            len(candidates),
            self.args.candidate_window_size,
            self.args.candidate_flank_limit,
        )
        overlap = self.overlap(eloci)
        # Dont bother bootstrapping on terms with overlap score below 0
        if overlap.score.mean() < 0:
            continue
        bootstraps = self.generate_bootstraps(eloci, overlap)
        bs_mean = bootstraps.groupby("iter").score.apply(np.mean).mean()
        bs_std = bootstraps.groupby("iter").score.apply(np.std).mean()
        # Calculate z scores for density
        if bs_std != 0:
            overlap["zscore"] = (overlap.score - bs_mean) / bs_std
            bootstraps["zscore"] = (bootstraps.score - bs_mean) / bs_std
        else:
            # If there is no variation, make all Z-scores 0 instead of
            # dividing by zero (same convention as the overlap command)
            overlap["zscore"] = bootstraps["zscore"] = 0
        num_bootstraps = len(bootstraps.iter.unique())
        # Fraction of bootstrap iterations whose mean score reaches the
        # observed mean score
        overlap_pval = (sum(
            bootstraps.groupby("iter").apply(lambda x: x.score.mean()) >=
            overlap.score.mean())) / num_bootstraps
        # Create a results object
        overlap["COB"] = self.cob.name
        overlap["Ontology"] = self.go.name
        overlap["Term"] = term.id
        overlap["WindowSize"] = self.args.candidate_window_size
        overlap["FlankLimit"] = self.args.candidate_flank_limit
        overlap["FCR"] = args.percent_fcr
        overlap["MCR"] = args.percent_mcr
        overlap["NumRealGenes"] = num_genes
        overlap["NumEffective"] = len(eloci)
        overlap["NumCandidates"] = len(candidates)
        overlap["TermSize"] = len(term)
        overlap["TermCollapsedLoci"] = len(eloci)
        overlap["TermPValue"] = overlap_pval
        overlap["NumBootstraps"] = num_bootstraps
        overlap["Method"] = self.args.method
        results.append(overlap.reset_index())

    if not results:
        # pd.concat raises on an empty list; report the situation instead
        log("No terms produced simulation results, nothing to save")
        return
    self.results = pd.concat(results)
    self.results.to_csv(args.out, sep="\t", index=False)
def from_CLI(cls, args):
    '''
    Implements an interface for the CLI to perform overlap Analysis

    Parameters
    ----------
    cls
        The overlap class. ``cls.create`` is used to build both the working
        object and the object whose database receives the results.
    args
        Parsed CLI namespace. Reads ``gwas``, ``go``, ``cob``, ``terms``,
        ``skip_terms``, ``min_term_size``, ``max_term_size``, ``dry_run``,
        ``candidate_window_size``, ``candidate_flank_limit``, ``method``,
        ``snp2gene`` and ``out``. When ``go`` is truthy the window size and
        flank limit are overwritten with 1 and 0.

    Returns
    -------
    None. Per-term results are concatenated into ``self.results``, written
    as TSV to ``args.out`` and appended to the 'overlap' SQLite table.
    '''
    self = cls.create(args.gwas, description='CLI Overlap')
    # Build base camoco objects
    self.args = args
    self.cob = co.COB(args.cob)
    if args.go:
        # NOTE(review): the ontology name is taken from args.gwas even in GO
        # mode -- looks like args.go is only a flag here; confirm.
        self.ont = co.GOnt(args.gwas)
        args.candidate_window_size = 1
        args.candidate_flank_limit = 0
    else:
        self.ont = co.GWAS(args.gwas)
    self.generate_output_name()

    # Generate a terms iterable
    if 'all' in self.args.terms:
        terms = self.ont.iter_terms()
    else:
        terms = [self.ont[term] for term in self.args.terms]

    results = []
    # Iterate through terms and calculate
    for term in terms:
        if term.id in self.args.skip_terms:
            self.cob.log('Skipping {} since it was in --skip-terms', term.id)
            # The message used to be logged without actually skipping.
            continue
        # Generate SNP2Gene Loci
        loci = self.snp2gene(term)
        if len(loci) < 2 or len(loci) < args.min_term_size:
            self.cob.log('Not enough genes to perform overlap')
            continue
        if args.max_term_size is not None and len(loci) > args.max_term_size:
            self.cob.log('Too many genes to perform overlap')
            continue
        # Send some output to the terminal
        self.cob.log(
            "Calculating Overlap for {} of {} in {} with window:{} and flank:{} ({} Loci)",
            term.id, self.ont.name, self.cob.name,
            self.args.candidate_window_size,
            self.args.candidate_flank_limit, len(loci))
        if args.dry_run:
            continue
        # Do the dirty
        try:
            overlap = self.overlap(loci)
        except DataError:
            continue
        bootstraps = self.generate_bootstraps(loci, overlap)
        bs_mean = bootstraps.groupby('iter').score.apply(np.mean).mean()
        bs_std = bootstraps.groupby('iter').score.apply(np.std).mean()
        # Calculate z scores for density
        if bs_std != 0:
            overlap['zscore'] = (overlap.score - bs_mean) / bs_std
            bootstraps['zscore'] = (bootstraps.score - bs_mean) / bs_std
        else:
            # If there is no variation, make all Z-scores 0
            overlap['zscore'] = bootstraps['zscore'] = 0
        # Calculate FDR: for each z-score threshold compare how many real
        # genes pass against the mean number passing per bootstrap iteration.
        overlap['fdr'] = np.nan
        max_zscore = int(overlap.zscore.max()) + 1
        for zscore in np.arange(0, max_zscore, 0.25):
            num_random = bootstraps\
                .groupby('iter')\
                .apply(lambda df: sum(df.zscore >= zscore))\
                .mean()
            num_real = sum(overlap.zscore >= zscore)
            if num_real != 0 and num_random != 0:
                fdr = num_random / num_real
            elif num_real != 0 and num_random == 0:
                fdr = 0
            else:
                fdr = 1
            # Higher thresholds overwrite the values set by lower ones, so
            # each gene keeps the statistics of the last threshold it passes.
            passing = overlap.zscore >= zscore
            overlap.loc[passing, 'fdr'] = fdr
            overlap.loc[passing, 'num_real'] = num_real
            overlap.loc[passing, 'num_random'] = num_random
            overlap.loc[passing, 'bs_mean'] = bs_mean
            overlap.loc[passing, 'bs_std'] = bs_std
        # Sorting once is enough: the label-based assignments above do not
        # depend on row order.
        overlap.sort_values(by=['zscore'], ascending=False, inplace=True)
        num_bootstraps = len(bootstraps.iter.unique())
        # Fraction of bootstrap iterations whose mean score reaches the
        # observed mean score.
        overlap_pval = (
            sum(bootstraps.groupby('iter').apply(lambda x: x.score.mean())
                >= overlap.score.mean())
            / num_bootstraps
        )
        # Annotate the rows; they are concatenated into self.results below
        overlap['COB'] = self.cob.name
        overlap['Ontology'] = self.ont.name
        overlap['Term'] = term.id
        overlap['WindowSize'] = self.args.candidate_window_size
        overlap['FlankLimit'] = self.args.candidate_flank_limit
        overlap['TermLoci'] = len(term.loci)
        overlap['TermCollapsedLoci'] = len(loci)
        overlap['TermPValue'] = overlap_pval
        overlap['NumBootstraps'] = num_bootstraps
        overlap['Method'] = self.args.method
        overlap['SNP2Gene'] = self.args.snp2gene
        results.append(overlap.reset_index())

    if not args.dry_run:
        if not results:
            # pd.concat raises on an empty list; report the situation instead
            self.cob.log('No terms produced overlap results, nothing to save')
            return
        self.results = pd.concat(results)
        self.results.to_csv(self.args.out, sep='\t', index=None)
        overlap_object = cls.create(self.ont)
        overlap_object.results = results
        # Release the connection afterwards instead of leaving it to the
        # garbage collector
        connection = sqlite3.connect(overlap_object.db.filename)
        try:
            self.results.to_sql('overlap',
                                connection,
                                if_exists='append',
                                index=False)
        finally:
            connection.close()
sys.exit(2) for opt, arg in opts: if opt in ("-c", "--cob"): cob = arg elif opt in ("-g", "--go"): go = arg elif opt in ("-h", "--help"): usage() sys.exit(2) else: assert False, "unhandled option" # Set the network and GO object cob = co.COB(cob) go = co.GOnt(go) TotGenes = cob.num_genes() # change from np.ndarray to pd dataframe Clusters = pd.DataFrame(cob.clusters) # Make a ordered dictionary to each key # is a cluster and each value is the # genes in that cluster ClustDict = collections.OrderedDict() for index, row in Clusters.iterrows(): if row[0] in ClustDict.keys(): ClustDict[row[0]].append(index) else: ClustDict[row[0]] = [index]
def cob_health(args):
    """Generate the network health reports for a COB co-expression network.

    Each report is written next to the current working directory using
    ``args.out`` as a file prefix, and is skipped when its output file
    already exists.

    Attributes read from ``args``:
        cob: name of the COB network to load.
        out: output prefix; defaults to ``"<cob.name>_Health"`` when None
            (the default is written back onto ``args.out``).
        edge_zscore_cutoff: optional significant-edge z-score cutoff.
        refgen: optional RefGen name; enables the per-chromosome QC table.
        go: optional GOnt name; enables the GO enrichment table and plot.
        two_tailed: halves the significance threshold (0.05 -> 0.025).
        max_terms: optional number of randomly sampled GO terms.
        min_term_size, max_term_size: bounds on GO term size.
        num_bootstraps: number of random gene sets drawn per GO term.
    """
    log = coblog()
    log(
        f"\n"
        f"-----------------------------\n"
        f" Network Health:{args.cob} \n"
        f"-----------------------------\n"
    )
    log(f"\nCreating reports in {os.getcwd()}\n\n")

    cob = co.COB(args.cob)
    if args.out is None:
        args.out = "{}_Health".format(cob.name)
    log(f"Output prefix: {args.out}")

    if args.edge_zscore_cutoff is not None:
        log("Changing Z-Score cutoff to {}", args.edge_zscore_cutoff)
        cob.set_sig_edge_zscore(args.edge_zscore_cutoff)

    log("Printing Summary ---------------------------------------------------")
    summary_file = f"{args.out}.summary.txt"
    if not path.exists(summary_file):
        with open(summary_file, "w") as OUT:
            # Print out the network summary
            cob.summary(file=OUT)
    else:
        log("Skipped summary.")

    log("Plotting Scores ----------------------------------------------------")
    raw_scores_file = f"{args.out}_CoexPCC_raw.png"
    if not path.exists(raw_scores_file):
        cob.plot_scores(raw_scores_file, pcc=True)
    else:
        log("Skipped Raw.")
    norm_scores_file = f"{args.out}_CoexScore_zscore.png"
    if not path.exists(norm_scores_file):
        cob.plot_scores(norm_scores_file, pcc=False)
    else:
        log("Skipped Norm.")

    log("Plotting Expression ------------------------------------------------")
    # NOTE: the raw-expression and per-cluster expression plots are currently
    # disabled; only the normalized heatmap is produced.
    expr_file = f"{args.out}_Expr_norm.png"
    if not path.exists(expr_file):
        cob.plot_heatmap(
            expr_file,
            include_accession_labels=True,
            raw=False,
            cluster_method="ward",
            cluster_accessions=True,
        )
    else:
        log("Skipped norm.")

    log("Printing QC Statistics ---------------------------------------------")
    if args.refgen is not None:
        qc_file = f"{args.out}_qc_gene.txt"
        if not path.exists(qc_file):
            _write_gene_qc(cob, args.refgen, qc_file)
        else:
            log("Skipped QC summary.")

    log("Plotting Degree Distribution ---------------------------------------")
    degree_file = f"{args.out}_DegreeDist.png"
    if not path.exists(degree_file):
        _plot_degree_distribution(cob, degree_file)
    else:
        log("Skipping Degree Dist.")

    if args.go is not None:
        log("Plotting GO --------------------------------------------------------")
        # Set the alpha based on the tails
        alpha = 0.05 / 2 if args.two_tailed else 0.05

        # Generate (or reload) the GO table
        go_csv = f"{args.out}_GO.csv"
        if not path.exists(go_csv):
            go_enrichment = _go_enrichment_table(cob, args, alpha, log, go_csv)
        else:
            go_enrichment = pd.read_csv(go_csv)

        go_png = f"{args.out}_GO.png"
        if go_enrichment is None or len(go_enrichment) == 0:
            # Nothing to plot; every later step divides by the number of terms.
            log.warn("No GO terms met your min/max gene criteria!")
        elif not path.exists(go_png):
            _plot_go_volcano(go_enrichment, alpha, go_png)
        else:
            log("Skipping GO Volcano.")


def _write_gene_qc(cob, refgen_name, out_file):
    """Write the per-chromosome breakdown of gene QC values to `out_file`."""
    refgen = co.RefGen(refgen_name)
    gene_qc = cob._bcolz("qc_gene")
    gene_qc = gene_qc[gene_qc.pass_membership]
    gene_qc["chrom"] = ["chr" + str(refgen[x].chrom) for x in gene_qc.index]
    gene_qc = gene_qc.groupby("chrom").sum()
    # Add totals at the bottom (the first column is left out of the totals).
    totals = gene_qc.iloc[:, 1:].apply(sum)
    totals.name = "TOTAL"
    totals_row = totals.to_frame().T
    # Keep the "chrom" index label so the CSV header is unchanged.
    totals_row.index.name = gene_qc.index.name
    gene_qc = pd.concat([gene_qc, totals_row])
    gene_qc.to_csv(out_file, sep="\t")


def _plot_degree_distribution(cob, out_file):
    """Plot the empirical degree CCDF against three fitted distributions."""
    degree = cob.degree["Degree"].values
    # powerlaw emits run-time warnings the first time it is used (an open
    # upstream issue); its author recommends silencing them as long as there
    # is a fit. Scope the suppression so NumPy's global error state is restored.
    with np.errstate(divide="ignore", invalid="ignore"):
        fit = powerlaw.Fit(degree, discrete=True, xmin=1)
        ax = plt.subplot()
        # Calculate log ratios
        # NOTE(review): these results are not reported anywhere; confirm
        # whether they should be logged or can be dropped.
        t2p = fit.distribution_compare("truncated_power_law", "power_law")
        t2e = fit.distribution_compare("truncated_power_law", "exponential")
        p2e = fit.distribution_compare("power_law", "exponential")
        # Plot!
        fit.plot_ccdf(ax=ax, color="r", linewidth=3, label="Empirical Data")
        fit.power_law.plot_ccdf(
            ax=ax, linewidth=2, color="b", linestyle=":", label="Power law"
        )
        fit.truncated_power_law.plot_ccdf(
            ax=ax, linewidth=2, color="k", linestyle="-.", label="Truncated Power"
        )
        fit.exponential.plot_ccdf(
            ax=ax, linewidth=2, color="g", linestyle="--", label="Exponential"
        )
    ax.set_ylabel("p(Degree≥x)")
    ax.set_xlabel("Degree Frequency")
    ax.legend(loc="best")
    plt.title("{} Degree Distribution".format(cob.name))
    # Save Fig
    try:
        plt.savefig(out_file)
    except FutureWarning:
        # This is a matplotlib bug; saving is best-effort.
        pass


def _bootstrap_pval(observed, bootstraps):
    """Return the fraction of `bootstraps` at least as extreme as `observed`.

    Positive observations are compared against the upper tail; all other
    observations are compared against the lower tail.
    """
    if observed > 0:
        extreme = bootstraps >= observed
    else:
        extreme = bootstraps <= observed
    return np.count_nonzero(extreme) / len(bootstraps)


def _go_enrichment_table(cob, args, alpha, log, out_file):
    """Bootstrap density and locality for each GO term and write `out_file`.

    Returns the (unsorted) results DataFrame, or None when no term passed the
    size filter, in which case nothing is written.
    """
    go = co.GOnt(args.go)
    # max_terms limits the number of GO terms tested (sub-sampling)
    if args.max_terms is not None:
        log("Limiting to {} GO Terms", args.max_terms)
        terms = go.rand(
            n=args.max_terms,
            min_term_size=args.min_term_size,
            max_term_size=args.max_term_size,
        )
    else:
        # Else do the whole set
        terms = go.iter_terms(
            min_term_size=args.min_term_size, max_term_size=args.max_term_size
        )

    rows = []
    for term in terms:
        # Some terms will lose genes that are not in networks
        term.loci = [locus for locus in term.loci if locus in cob]
        # Skip terms that are not an adequate size
        if len(term) < args.min_term_size or len(term) > args.max_term_size:
            continue
        n_genes = len(term.loci)

        # ------ Density
        density = cob.density(term.loci)
        density_bs = np.array(
            [
                cob.density(cob.refgen.random_genes(n=n_genes))
                for _ in range(args.num_bootstraps)
            ]
        )
        # ------ Locality
        locality = cob.locality(term.loci, include_regression=True).resid.mean()
        locality_bs = np.array(
            [
                cob.locality(
                    cob.refgen.random_genes(n=n_genes),
                    include_regression=True,
                ).resid.mean()
                for _ in range(args.num_bootstraps)
            ]
        )
        rows.append(
            {
                "GOTerm": term.id,
                "desc": str(term.desc),
                "size": len(term),
                "density": density,
                "density_pval": _bootstrap_pval(density, density_bs),
                "locality": locality,
                "locality_pval": _bootstrap_pval(locality, locality_bs),
            }
        )
        if len(rows) % 100 == 0:
            log("Processed {} terms".format(len(rows)))

    if not rows:
        # The Bonferroni threshold below divides by the number of terms.
        return None

    go_enrichment = pd.DataFrame(
        rows,
        columns=[
            "GOTerm",
            "desc",
            "size",
            "density",
            "density_pval",
            "locality",
            "locality_pval",
        ],
    )
    # Calculate significance
    go_enrichment["density_significant"] = go_enrichment.density_pval < alpha
    go_enrichment["locality_significant"] = go_enrichment.locality_pval < alpha
    # Calculate bonferroni
    bonferroni = alpha / len(go_enrichment)
    go_enrichment["density_bonferroni"] = go_enrichment.density_pval < bonferroni
    go_enrichment["locality_bonferroni"] = go_enrichment.locality_pval < bonferroni
    # Store the GO results in a CSV
    go_enrichment.sort_values(by="density_pval", ascending=True).to_csv(
        out_file, index=False
    )
    return go_enrichment


def _plot_go_metric(column_axes, scores, metric, xlabel, alpha, point_alpha):
    """Draw the three stacked panels for one metric ("density" or "locality").

    `scores` must already hold -log10 p-values in the ``<metric>_pval`` column.
    The same `alpha` drives the red threshold lines, the fold-enrichment
    estimate and the red highlighting, so all three agree.
    """
    top, middle, bottom = column_axes
    pval_col = "{}_pval".format(metric)
    label = metric.capitalize()
    log_alpha = -1 * np.log10(alpha)
    significant = scores[pval_col] > log_alpha

    # Volcano: effect size against -log10(p-value)
    top.scatter(scores[metric], scores[pval_col], alpha=point_alpha, color="blue")
    top.set_xlabel(xlabel)
    top.set_ylabel("Bootstrapped -log10(p-value)")
    # Observed significant terms relative to the number expected by chance
    fold = significant.sum() / (alpha * len(scores))
    top.axhline(y=log_alpha, color="red")
    top.text(
        min(top.get_xlim()),
        log_alpha + 0.1,
        "{:.3g} Fold Enrichment".format(fold),
        color="red",
    )
    top.set_title("{} Health".format(label))

    # Plot pvalue by term size
    middle.scatter(scores["size"], scores[pval_col], alpha=point_alpha, color="blue")
    middle.set_xlabel("Term Size")
    middle.set_ylabel("Bootstrapped -log10(p-value)")
    middle.axhline(y=log_alpha, color="red")

    # Plot the raw metric by term size, significant terms highlighted in red
    bottom.scatter(scores["size"], scores[metric], alpha=point_alpha, color="blue")
    bottom.scatter(
        scores.loc[significant, "size"],
        scores.loc[significant, metric],
        alpha=point_alpha,
        color="r",
    )
    bottom.set_ylabel(label)
    bottom.set_xlabel("Term Size")


def _plot_go_volcano(go_enrichment, alpha, out_file):
    """Plot density and locality enrichment of GO terms and save `out_file`."""
    scores = go_enrichment.copy()
    for column in ("density_pval", "locality_pval"):
        # When no bootstraps are more extreme than the term, the minus log
        # pval yields an infinite.
        with np.errstate(divide="ignore"):
            neg_log = -1 * np.log10(scores[column])
        # Fix the infinites so they are plotted just above the finite maximum
        finite = np.isfinite(neg_log)
        scores[column] = neg_log.where(finite, neg_log[finite].max() + 1)

    plt.clf()
    # Calculate the transparency based on the number of terms
    point_alpha = 0.05 if len(scores) > 20 else 1
    _, axes = plt.subplots(3, 2, figsize=(12, 12))
    _plot_go_metric(
        axes[:, 0], scores, "density", "Empirical Density (Z-Score)", alpha, point_alpha
    )
    _plot_go_metric(
        axes[:, 1],
        scores,
        "locality",
        "Empirical Locality (Residual)",
        alpha,
        point_alpha,
    )
    # Save Figure
    plt.tight_layout()
    try:
        plt.savefig(out_file)
    except FutureWarning:
        # Best-effort save, same workaround as the degree-distribution plot.
        pass