def Zm5bFGS():
    """Return the ``Zm5bFGS`` RefGen dataset, building it first when needed.

    When ``cf.test.force.RefGen`` is set, an existing ``Zm5bFGS`` RefGen
    dataset is deleted so that it is rebuilt from the raw GFF file located
    under ``<cf.options.testdir>/raw/RefGen``.
    """
    dataset = 'Zm5bFGS'
    if cf.test.force.RefGen:
        tools.del_dataset('RefGen', dataset, force=True)
    if not tools.available_datasets('RefGen', dataset):
        # Nothing is registered under this name yet: import it from the GFF.
        raw_gff = os.path.join(
            cf.options.testdir, 'raw', 'RefGen', 'ZmB73_5b_FGS.gff.gz')
        co.RefGen.from_gff(
            os.path.expanduser(raw_gff),
            dataset,
            'Maize 5b Filtered Gene Set',
            '5b',
            'Zea Mays',
        )
    # The object returned by ``from_gff`` is ignored on purpose and the
    # dataset is re-opened by name; the original author notes this is needed
    # because pytables will not allow more than one table to be open.
    return co.RefGen(dataset)
def Zm5bFGS():
    """Return the ``Zm5bFGS`` RefGen dataset, building it first when needed.

    When ``cf.test.force.RefGen`` is set, an existing ``Zm5bFGS`` RefGen
    dataset is deleted so that it is rebuilt from the raw GFF file located
    under ``<cf.options.testdir>/raw/RefGen``.
    """
    dataset = "Zm5bFGS"
    if cf.test.force.RefGen:
        tools.del_dataset("RefGen", dataset, force=True)
    if not tools.available_datasets("RefGen", dataset):
        # Nothing is registered under this name yet: import it from the GFF.
        raw_gff = os.path.join(
            cf.options.testdir, "raw", "RefGen", "ZmB73_5b_FGS.gff.gz"
        )
        co.RefGen.from_gff(
            os.path.expanduser(raw_gff),
            dataset,
            "Maize 5b Filtered Gene Set",
            "5b",
            "Zea Mays",
        )
    # The object returned by ``from_gff`` is ignored on purpose and the
    # dataset is re-opened by name; the original author notes this is needed
    # because pytables will not allow more than one table to be open.
    return co.RefGen(dataset)
def ZmRoot(self):
    """Rebuild the ``ZmRoot`` co-expression network from its raw FPKM table.

    An existing ``ZmRoot`` Expr dataset is deleted first (``safe=False``),
    then the network is built against the ``Zm5bFGS`` RefGen. Nothing is
    returned and nothing is asserted.
    """
    co.del_dataset('Expr', 'ZmRoot', safe=False)
    refgen = co.RefGen('Zm5bFGS')
    fpkm_table = os.path.join(
        cf.get('options', 'testdir'), 'raw', 'Expression', 'ROOTFPKM.tsv')
    build_options = {
        'rawtype': 'RNASEQ',
        'max_gene_missing_data': 0.4,
        'min_expr': 0.1,
        'dry_run': False,
        'max_val': 300,
    }
    co.COB.from_table(
        fpkm_table, 'ZmRoot', 'Maize Root Network', refgen, **build_options)
def build_gont(args):
    """Build a GOnt (gene ontology) dataset from parsed CLI arguments.

    Reads ``args.refgen``, ``args.name``, ``args.force``,
    ``args.obo_filename``, ``args.filename``, ``args.description``,
    ``args.go_col`` and ``args.id_col``. If a GOnt dataset called
    ``args.name`` is already registered, a warning is printed and the old
    dataset is deleted before the build.
    """
    refgen = co.RefGen(args.refgen)

    already_built = co.available_datasets('GOnt', args.name)
    if already_built:
        print('Warning! This dataset has already been built.')
        co.del_dataset('GOnt', args.name, force=args.force)

    column_options = {'go_col': args.go_col, 'id_col': args.id_col}
    go = co.GOnt.from_obo(
        args.obo_filename,
        args.filename,
        args.name,
        args.description,
        refgen,
        **column_options
    )
    summary = go.summary()
    print("Done: {}".format(summary))
    print('Build Successful')
def build_cob(args):
    """Build a COB (co-expression network) dataset from parsed CLI arguments.

    The expression table ``args.filename`` is parsed once with separator
    ``args.sep``. If it yields a single column the separator is assumed to
    be wrong: a message is printed and ``None`` is returned without building.

    When ``args.allow_non_membership`` is set, the RefGen is copied to
    ``<name>_tmp`` and one gene per index entry of the parsed table is added
    to the copy.

    If an Expr dataset called ``args.name`` already exists, a warning is
    printed and the old dataset is deleted before the build. On success the
    network summary is printed.
    """
    refgen = co.RefGen(args.refgen)

    # Parse the table a single time; it is needed for the separator sanity
    # check and, optionally, for the non-membership gene ids.
    table = pd.read_table(args.filename, sep=args.sep)
    if len(table.columns) == 1:
        print(("Detected only 1 column in {}, are you sure "
               "colunms are separated by '{}'?").format(
                   args.filename, args.sep))
        return None

    if args.allow_non_membership:
        refgen = refgen.copy('{}_tmp'.format(refgen.name), 'temp refgen')
        # Add non membership genes
        for gid in table.index:
            refgen.add_gene(Gene(None, None, id=gid))

    quality_control = not args.skip_quality_control
    normalize = not args.skip_normalization

    # Check to see if this dataset is already built
    if co.available_datasets('Expr', args.name):
        print('Warning! This dataset has already been built.')
        # NOTE(review): ``safe`` receives ``args.force`` unchanged. If
        # ``safe=True`` means "ask before deleting", --force would have the
        # opposite of the intended effect -- confirm against co.del_dataset.
        co.del_dataset('Expr', args.name, safe=args.force)

    # Basically just pass all the CLI arguments to the COB class method
    cob = co.COB.from_table(
        args.filename,
        args.name,
        args.description,
        refgen,
        # Optional arguments
        sep=args.sep,
        rawtype=args.rawtype,
        # Data Processing
        quality_control=quality_control,
        normalization=normalize,
        quantile=args.quantile,
        # Data processing parameters
        max_gene_missing_data=args.max_gene_missing_data,
        max_accession_missing_data=args.max_accession_missing_data,
        min_single_sample_expr=args.min_single_sample_expr,
        min_expr=args.min_expr,
        max_val=args.max_val,
        dry_run=args.dry_run,
        index_col=args.index_col)
    print("Build successful!")
    print(cob.summary())
def ZmSAM(self):
    """Rebuild the ``ZmSAM`` co-expression network from its raw table.

    An existing ``ZmSAM`` Expr dataset is deleted first (``safe=False``),
    then the network is built against the ``Zm5bFGS`` RefGen. Nothing is
    returned and nothing is asserted.
    """
    co.del_dataset('Expr', 'ZmSAM', safe=False)
    refgen = co.RefGen('Zm5bFGS')
    expression_table = os.path.join(
        cf.get('options', 'testdir'),
        'raw',
        'Expression',
        'TranscriptomeProfiling_B73_Atlas_SAM_FGS_LiLin_20140316.txt',
    )
    build_options = {
        'rawtype': 'RNASEQ',
        'max_gene_missing_data': 0.4,
        'min_expr': 0.1,
        'dry_run': False,
        'max_val': 300,
    }
    # The description string is the same one the ZmRoot build uses.
    co.COB.from_table(
        expression_table, 'ZmSAM', 'Maize Root Network', refgen,
        **build_options)
def build_GWAS(args):
    """Build a GWAS dataset from a delimited SNP table given on the CLI.

    Reads ``args.filename`` with separator ``args.sep``, drops every row
    whose ``args.trait_col`` value is listed in ``args.skip_traits`` and
    hands the remainder to ``co.GWAS.from_DataFrame``. The summary of the
    built dataset is printed.

    Raises
    ------
    ValueError
        If the parsed table has a single column, which indicates a wrong
        ``--sep``.
    """
    # ``DataFrame.from_csv`` has been removed from pandas; ``read_csv`` with
    # ``index_col=0, parse_dates=True`` reproduces the defaults it applied.
    df = pd.read_csv(
        args.filename, sep=args.sep, index_col=0, parse_dates=True
    ).reset_index()
    if len(df.columns) == 1:
        raise ValueError("Only 1 column found, check --sep, see --help")
    print('Loading {}'.format(args.refgen))
    refgen = co.RefGen(args.refgen)
    # Filter out traits that are in args.skip_traits
    keep_row = [trait not in args.skip_traits for trait in df[args.trait_col]]
    df = df[keep_row]
    # Build
    gwas = co.GWAS.from_DataFrame(
        df,
        args.name,
        args.description,
        refgen,
        term_col=args.trait_col,
        chr_col=args.chrom_col,
        pos_col=args.pos_col)
    print("Build Successful:")
    print(gwas.summary())
def build_gont(args):
    """Build a GOnt (gene ontology) dataset from parsed CLI arguments.

    Reads ``args.refgen``, ``args.name``, ``args.force``,
    ``args.obo_filename``, ``args.filename``, ``args.description``,
    ``args.go_col``, ``args.id_col`` and ``args.gene_term_header``. If a
    GOnt dataset called ``args.name`` is already registered, a warning is
    printed and the old dataset is deleted before the build.
    """
    refgen = co.RefGen(args.refgen)

    already_built = available_datasets("GOnt", args.name)
    if already_built:
        print("Warning! This dataset has already been built.")
        co.Tools.del_dataset("GOnt", args.name, force=args.force)

    mapping_options = {
        "go_col": args.go_col,
        "id_col": args.id_col,
        "headers": args.gene_term_header,
    }
    go = co.GOnt.from_obo(
        args.obo_filename,
        args.filename,
        args.name,
        args.description,
        refgen,
        **mapping_options
    )
    summary = go.summary()
    print("Done: {}".format(summary))
    print("Build Successful")
def AtSeed(self):
    """Build the ``AtSeed`` network from the seed GEO SOFT family files.

    The families are summed into one and its series matrix (restricted by
    ``raw/GSE/SeedKeep.tsv``) is turned into a MICROARRAY COB on the
    ``Tair10`` RefGen. Nothing is returned and nothing is asserted.
    """
    accessions = [
        'GSE12404',
        # 'GSE30223' is disabled
        'GSE1051',
        'GSE11852',
        'GSE5634',
    ]
    families = []
    for accession in accessions:
        soft_file = "raw/GSE/{}_family.soft".format(accession)
        families.append(co.Family.from_file(soft_file))
    seed_family = sum(families)
    # The keep file is read as-is; the original carried a disabled
    # ``to_keepfile("SeedKeep.tsv", keep_hint='seed')`` call next to this.
    expression = seed_family.series_matrix(keepfile="raw/GSE/SeedKeep.tsv")
    co.COB.from_DataFrame(
        expression,
        'AtSeed',
        'Arabidopsis Seed',
        co.RefGen('Tair10'),
        rawtype='MICROARRAY',
    )
def build_cob(args):
    """Build a COB (co-expression network) dataset from parsed CLI arguments.

    The expression table ``args.filename`` is parsed once with separator
    ``args.sep`` and checked before building:

    * a single column means the separator is probably wrong; a message is
      printed and ``None`` is returned;
    * fewer than 20 columns triggers an interactive confirmation unless
      ``args.non_interactive`` is set; any answer other than ``y``/``Y``
      exits with status 1.

    When ``args.allow_non_membership`` is set, the RefGen is copied to
    ``<name>_tmp`` and one gene per index entry of the parsed table is added
    to the copy. An already existing Expr dataset called ``args.name`` is
    deleted before the build. On success the network summary is printed.

    Raises
    ------
    Exception
        Any exception raised along the way is re-raised after the Expr
        dataset ``args.name`` has been force-deleted.
    """
    try:
        # Build the refgen
        refgen = co.RefGen(args.refgen)

        # Parse the table a single time; the column count drives both sanity
        # checks and the index supplies the non-membership gene ids.
        table = pd.read_table(args.filename, sep=args.sep)
        num_columns = len(table.columns)
        if num_columns == 1:
            print(("Detected only 1 column in {}, are you sure "
                   "colunms are separated by '{}'?").format(
                       args.filename, args.sep))
            return None
        elif num_columns < 20 and not args.non_interactive:
            print((
                "Detected fewer than 20 accessions in the expression matrix. "
                "Calculating co-expression with this many datapoints is not advised"
            ))
            answer = input('are you sure you want to continue? [y/n]: ')
            if answer.upper() != 'Y':
                sys.exit(1)

        if args.allow_non_membership:
            refgen = refgen.copy('{}_tmp'.format(refgen.name), 'temp refgen')
            # Add non membership genes
            for gid in table.index:
                refgen.add_gene(Gene(None, None, id=gid))

        quality_control = not args.skip_quality_control
        normalize = not args.skip_normalization
        quantile = not args.skip_quantile

        # Check to see if this dataset is already built
        if available_datasets('Expr', args.name):
            print('Warning! This dataset has already been built.')
            co.Tools.del_dataset('Expr', args.name, force=args.force)

        # Basically just pass all the CLI arguments to the COB class method
        cob = co.COB.from_table(
            args.filename,
            args.name,
            args.description,
            refgen,
            # Optional arguments
            sep=args.sep,
            rawtype=args.rawtype,
            # Data Processing
            quality_control=quality_control,
            normalization=normalize,
            quantile=quantile,
            # Data processing parameters
            max_gene_missing_data=args.max_gene_missing_data,
            max_accession_missing_data=args.max_accession_missing_data,
            min_single_sample_expr=args.min_single_sample_expr,
            min_expr=args.min_expr,
            max_val=args.max_val,
            dry_run=args.dry_run,
            zscore_cutoff=args.zscore_cutoff,
            index_col=args.index_col)
        print(cob.summary())
    except Exception:
        # NOTE(review): this also runs when the failure happened before the
        # build started (e.g. while loading the RefGen), in which case a
        # pre-existing dataset with this name is deleted -- confirm that is
        # intended.
        print("Build failed. Rolling back: removing corrupted files...")
        co.Tools.del_dataset('Expr', args.name, force=True)
        # Bare ``raise`` re-raises the active exception with its traceback.
        raise
def getNodes(genes,
             cob,
             term,
             primary=None,
             render=None,
             gwasData=pd.DataFrame(),
             nodeCutoff=0,
             windowSize=None,
             flankLimit=None,
             fdrCutoff=None,
             hpo=False):
    """Build one node record per gene for the network ``cob``.

    Parameters
    ----------
    genes : iterable
        Gene objects exposing ``id``, ``chrom``, ``start``, ``end`` and a
        mapping ``attr``. It is iterated more than once.
    cob : object
        Network providing ``locality(genes)`` and
        ``_global('parent_refgen')``.
    term : object
        Stored unchanged under the ``'term'`` key of every node.
    primary : container, optional
        When truthy, genes whose id is in it get origin ``'query'`` and all
        others ``'neighbor'``; otherwise the origin stays ``'N/A'``.
    render : container, optional
        When truthy, only genes whose id is in it may be flagged to render.
    gwasData : pandas.DataFrame, optional
        Table with ``'gene'`` and ``'fdr'`` columns. The shared default
        instance is only read, never modified.
    nodeCutoff : number, optional
        Minimum local degree a gene needs to be flagged to render.
    windowSize, flankLimit : object, optional
        Stored as strings in every node.
    fdrCutoff : number, optional
        When truthy and ``gwasData`` is not empty, a gene additionally needs
        ``fdr <= fdrCutoff`` to be flagged to render.
    hpo : bool, optional
        When true the ``'fdr'`` field is the literal ``'HPO'``.

    Returns
    -------
    dict
        Maps gene id to its node dictionary.

    Notes
    -----
    Genes lacking ``attr['parent_locus']`` get a placeholder written into
    their ``attr`` mapping.
    """
    # Cache the locality
    locality = cob.locality(genes)
    # Containers for the node info
    nodes = {}
    gene_ids = [gene.id for gene in genes]
    parent_refgen = cob._global('parent_refgen')
    # Look for aliases
    aliases = co.RefGen(parent_refgen).aliases(gene_ids)
    # Look for annotations
    if parent_refgen in func_data_db:
        func_data = func_data_db[parent_refgen].get_annotations(gene_ids)
    else:
        func_data = {}
    # Pre cache a list of the contained genes
    gwasDataGenes = set()
    if not gwasData.empty:
        gwasDataGenes = set(gwasData['gene'])

    for gene in genes:
        # Catch for translating the way camoco works to the way we need for
        # COB. ``.loc`` replaces the ``.ix`` indexer removed from pandas.
        try:
            gene_locality = locality.loc[gene.id]
            ldegree = gene_locality['local']
            gdegree = gene_locality['global']
        except KeyError:
            # A float NaN still prints as 'nan' but, unlike the string
            # 'nan', it can be compared with nodeCutoff below (the
            # comparison is False, so the gene is not rendered) instead of
            # raising TypeError.
            ldegree = gdegree = np.nan

        # Catch for bug in camoco
        try:
            numInterv = str(gene.attr['num_intervening'])
            rankIntervening = str(gene.attr['intervening_rank'])
            numSiblings = str(gene.attr['num_siblings'])
        except KeyError:
            numInterv = rankIntervening = numSiblings = '-'

        # Pull any aliases from our database (each followed by a space)
        alias = ''
        if gene.id in aliases:
            alias = ''.join(a + ' ' for a in aliases[gene.id])

        # Fetch the FDR if we can
        fdr = np.nan
        if gene.id in gwasDataGenes:
            fdr = gwasData[gwasData['gene'] == gene.id]['fdr'].min()

        # Pull any annotations from our databases (each followed by a space)
        anote = ''
        if gene.id in func_data:
            anote = ''.join(a + ' ' for a in func_data[gene.id])

        # Fetch parent locus if we can
        if 'parent_locus' not in gene.attr:
            gene.attr['parent_locus'] = '[Unknown]{}:{}-{}'.format(
                gene.chrom, gene.start, gene.end)

        # Build the data object from our data
        node = {
            'group': 'nodes',
            'data': {
                'id': gene.id,
                'type': 'gene',
                'render': False,
                'term': term,
                'snp': gene.attr['parent_locus'].replace('<', '[').replace(
                    '>', ']'),
                'alias': alias,
                'origin': 'N/A',
                'chrom': str(gene.chrom),
                'start': str(gene.start),
                'end': str(gene.end),
                'cur_ldegree': str(0),
                'ldegree': str(ldegree),
                'gdegree': str(gdegree),
                'fdr': 'HPO' if hpo else str(fdr),
                'windowSize': str(windowSize),
                'flankLimit': str(flankLimit),
                'numIntervening': numInterv,
                'rankIntervening': rankIntervening,
                'numSiblings': numSiblings,
                'annotations': anote,
            }
        }

        # Denote the query genes
        if primary:
            if gene.id in primary:
                node['data']['origin'] = 'query'
            else:
                node['data']['origin'] = 'neighbor'

        # Denote whether or not to render it
        if ldegree >= nodeCutoff:
            if (not fdrCutoff) or gwasData.empty or fdr <= fdrCutoff:
                if (not render) or (gene.id in render):
                    node['data']['render'] = True

        # Save the node to the list
        nodes[gene.id] = node

    return nodes
onts_info[net.name] = [] for n, ont in onts.items(): if ont.refgen.name == ref: onts_info[net.name].append({ 'name': ont.name, 'refgen': ont.refgen.name, 'desc': ont.description }) print('Availible GWASes: ' + str(onts_info)) # Prefetch the gene names for all the networks print('Fetching gene names for networks...') network_genes = {} for name, net in networks.items(): ids = list(net._expr.index.values) als = co.RefGen(net._global('parent_refgen')).aliases(ids) for k, v in als.items(): ids += v network_genes[name] = list(set(ids)) print('Found gene names') # Find all of the GWAS data we have available print('Finding GWAS Data...') gwas_data_db = {} for gwas in co.Tools.available_datasets('Overlap')['Name']: print("Loading {}".format(gwas)) gwas_data_db[gwas] = co.Overlap(gwas) # Find the available window sizes and flank limits for each GWAS/COB combo print('Finding GWAS Metadata...') gwas_meta_db = {}
import camoco as co

# Read the reference genome the network is built against.
ZMFGS = co.RefGen("Zm5bFGS")

# Create the KSS_T network from the comma separated expression table.
ZmTissueNetwork = co.COB.from_table(
    'data/splits/KSS.txt',
    'KSS_T',  # Dataset Name
    'Co-expression network for all KLS annotated samples',  # Short Description
    ZMFGS,  # A RefGen instance
    rawtype='RNASEQ',  # Expression datatype, either 'RNASEQ' or 'MICROARRAY'
    # Spelled ``max_gene_missing_data``, the keyword the other
    # ``COB.from_table`` build calls use for this threshold.
    # NOTE(review): confirm against Expr._quality_control.
    max_gene_missing_data=0.4,  # See Expr._quality_control
    min_expr=0.1,  # See Expr._quality_control
    quantile=False,  # See Expr._quality_control
    dry_run=False,  # See Expr._quality_control
    sep=',',  # table is comma separated
    max_val=300,  # See Expr._normalize
)
def getNodes(genes, cob, term, primary=None, render=None,
             gwas_data=pd.DataFrame(), nodeCutoff=0):
    """Build the list of node records for ``genes`` in the network ``cob``.

    Parameters
    ----------
    genes : iterable
        Gene objects exposing ``id``, ``chrom``, ``start``, ``end`` and a
        mapping ``attr`` that holds ``'parent_locus'``,
        ``'intervening_rank'`` and ``'num_siblings'``. Iterated more than
        once.
    cob : object
        Network providing ``locality(genes)`` and
        ``_global('parent_refgen')``.
    term : object
        Stored unchanged under the ``'term'`` key of every node.
    primary : container, optional
        When truthy, genes whose id is in it get origin ``'query'`` and all
        others ``'neighbor'``; otherwise the origin stays ``'N/A'``.
    render : container, optional
        When truthy, every gene is returned and flagged ``'x'`` only if its
        id is in ``render`` and its local degree reaches ``nodeCutoff``
        (``' '`` otherwise). When falsy, only genes whose local degree
        reaches ``nodeCutoff`` are returned, all flagged ``'x'``.
    gwas_data : pandas.DataFrame, optional
        Table indexed by gene id with an ``'fdr'`` column. The shared
        default instance is only read, never modified.
    nodeCutoff : number, optional
        Minimum local degree, see ``render``.

    Returns
    -------
    list
        Node dictionaries in gene order.

    Raises
    ------
    KeyError
        If a gene lacks one of the required ``attr`` entries listed above.
    """
    # Cache the locality
    locality = cob.locality(genes)
    # Containers for the node info
    nodes = []
    gene_ids = [gene.id for gene in genes]
    parent_refgen = cob._global('parent_refgen')
    # Look for aliases
    aliases = co.RefGen(parent_refgen).aliases(gene_ids)
    # Look for annotations
    if parent_refgen in func_data_db:
        func_data = func_data_db[parent_refgen][gene_ids]
    else:
        func_data = {}

    for gene in genes:
        # Catch for translating the way camoco works to the way we need for
        # COB. ``.loc`` replaces the ``.ix`` indexer removed from pandas.
        try:
            gene_locality = locality.loc[gene.id]
            local_degree = gene_locality['local']
            global_degree = gene_locality['global']
        except KeyError:
            local_degree = global_degree = 0

        # Catch for bug in camoco
        try:
            num_interv = str(gene.attr['num_intervening'])
        except KeyError:
            num_interv = 'NAN'

        # Pull any aliases from our database (each followed by a space)
        alias = ''
        if gene.id in aliases:
            alias = ''.join(a + ' ' for a in aliases[gene.id])

        # Fetch the FDR if we can
        fdr = np.nan
        if gene.id in gwas_data.index:
            fdr = gwas_data.loc[gene.id]['fdr']

        # Pull any annotations from our databases (each followed by a space)
        anote = ''
        if gene.id in func_data:
            anote = ''.join(a + ' ' for a in func_data[gene.id])

        # Build the data object from our data
        node = {
            'group': 'nodes',
            'data': {
                'id': gene.id,
                'type': 'gene',
                'render': 'x',
                'term': term,
                'snp': gene.attr['parent_locus'],
                'alias': alias,
                'origin': 'N/A',
                'chrom': str(gene.chrom),
                'start': str(gene.start),
                'end': str(gene.end),
                'cur_ldegree': str(0),
                'ldegree': str(local_degree),
                'gdegree': str(global_degree),
                'fdr': str(fdr),
                'num_intervening': num_interv,
                'rank_intervening': str(gene.attr['intervening_rank']),
                'num_siblings': str(gene.attr['num_siblings']),
                'annotations': anote,
            }
        }

        # Denote the query genes
        if primary:
            if gene.id in primary:
                node['data']['origin'] = 'query'
            else:
                node['data']['origin'] = 'neighbor'

        # Denote whether or not to render it if there is a list
        if render:
            if (gene.id in render) and (local_degree >= nodeCutoff):
                node['data']['render'] = 'x'
            else:
                node['data']['render'] = ' '
            # Save the node to the list
            nodes.append(node)
        else:
            # Without a render list, genes below the cutoff are left out.
            if local_degree >= nodeCutoff:
                node['data']['render'] = 'x'
                nodes.append(node)

    return nodes
def cob_health(args):
    """Write health-check plots and tables for one co-expression network.

    Every output file is named from the prefix ``args.out`` (defaulting to
    ``<network name>_Health``) and is only produced when it does not exist
    yet; otherwise a "Skipped"/"Skipping" message is logged. The steps are:

    * score distributions (``_CoexPCC_raw.png``, ``_CoexScore_zscore.png``)
    * expression heatmaps (``_Expr_raw.png``, ``_Expr_norm.png``,
      ``_Expr_cluster.png``)
    * network summary (``.summary.txt``)
    * per-chromosome QC table (``_qc_gene.txt``), only if ``args.refgen``
      is given
    * degree distribution with fitted curves (``_DegreeDist.png``)
    * GO term density/locality bootstrap table (``_GO.csv``) and its plots
      (``_GO.png``), only if ``args.go`` is given

    The GO step additionally reads ``args.max_terms``,
    ``args.min_term_size``, ``args.max_term_size``, ``args.two_tailed_GO``
    and ``args.num_bootstraps``.
    """
    log = coblog()
    log('\n'
        '-----------------------\n'
        ' Network Health \n'
        '-----------------------\n')
    cob = co.COB(args.cob)
    if args.out is None:
        args.out = '{}_Health'.format(cob.name)

    log('Plotting Scores ----------------------------------------------------')
    if not path.exists('{}_CoexPCC_raw.png'.format(args.out)):
        cob.plot_scores('{}_CoexPCC_raw.png'.format(args.out), pcc=True)
    else:
        log('Skipped Raw.')
    if not path.exists('{}_CoexScore_zscore.png'.format(args.out)):
        cob.plot_scores('{}_CoexScore_zscore.png'.format(args.out), pcc=False)
    else:
        log('Skipped Norm.')

    log('Plotting Expression ------------------------------------------------')
    if not path.exists('{}_Expr_raw.png'.format(args.out)):
        cob.plot('{}_Expr_raw.png'.format(args.out),
                 include_accession_labels=True,
                 raw=True,
                 cluster_method=None)
    else:
        log('Skipped raw.')
    if not path.exists('{}_Expr_norm.png'.format(args.out)):
        cob.plot('{}_Expr_norm.png'.format(args.out),
                 include_accession_labels=True,
                 raw=False,
                 cluster_method='leaf',
                 cluster_accessions=True)
    else:
        log('Skipped norm.')

    log('Plotting Cluster Expression-----------------------------------------')
    if not path.exists('{}_Expr_cluster.png'.format(args.out)):
        cob.plot('{}_Expr_cluster.png'.format(args.out),
                 include_accession_labels=True,
                 raw=False,
                 cluster_accessions=True,
                 avg_by_cluster=True)
    else:
        log('Skipped norm.')

    log('Printing Summary ---------------------------------------------------')
    if not path.exists('{}.summary.txt'.format(args.out)):
        with open('{}.summary.txt'.format(args.out), 'w') as OUT:
            # Print out the network summary
            cob.summary(file=OUT)
    else:
        log('Skipped summary.')

    log('Printing QC Statistics ---------------------------------------------')
    if args.refgen is not None:
        if not path.exists('{}_qc_gene.txt'.format(args.out)):
            # Print out the breakdown of QC Values
            refgen = co.RefGen(args.refgen)
            gene_qc = cob._bcolz('qc_gene')
            gene_qc = gene_qc[gene_qc.pass_membership]
            gene_qc['chrom'] = [
                'chr' + str(refgen[x].chrom) for x in gene_qc.index
            ]
            gene_qc = gene_qc.groupby('chrom').agg(sum, axis=0)
            # Add totals at the bottom: every column but the first is
            # summed. ``.iloc`` replaces the positional use of the ``.ix``
            # indexer, which has been removed from pandas.
            totals = gene_qc.iloc[:, 1:].apply(sum)
            totals.name = 'TOTAL'
            gene_qc = gene_qc.append(totals)
            gene_qc.to_csv('{}_qc_gene.txt'.format(args.out), sep='\t')
        else:
            log('Skipped QC summary.')

    log('Plotting Degree Distribution ---------------------------------------')
    if not path.exists('{}_DegreeDist.png'.format(args.out)):
        degree = cob.degree['Degree'].values
        # Using powerlaw makes run-time warning the first time you use it.
        # This is still an open issue on the creators github.
        # The creator recommends removing this warning as long as there is
        # a fit.
        np.seterr(divide='ignore', invalid='ignore')
        fit = powerlaw.Fit(degree, discrete=True, xmin=1)
        # get an axis
        ax = plt.subplot()
        # Calculate log ratios (the results are not used any further here)
        t2p = fit.distribution_compare('truncated_power_law', 'power_law')
        t2e = fit.distribution_compare('truncated_power_law', 'exponential')
        p2e = fit.distribution_compare('power_law', 'exponential')
        # Plot!
        fit.plot_ccdf(ax=ax, color='r', linewidth=3, label='Empirical Data')
        fit.power_law.plot_ccdf(ax=ax,
                                color='b',
                                linestyle='--',
                                label='Power law')
        fit.truncated_power_law.plot_ccdf(ax=ax,
                                          color='k',
                                          linestyle='--',
                                          label='Truncated Power')
        fit.exponential.plot_ccdf(ax=ax,
                                  color='g',
                                  linestyle='--',
                                  label='Exponential')
        # The label had a mis-encoded "greater than or equal" sign.
        ax.set_ylabel("p(Degree≥x)")
        ax.set_xlabel("Degree Frequency")
        ax.legend(loc='best')
        plt.title('{} Degree Distribution'.format(cob.name))
        # Save Fig
        try:
            plt.savefig('{}_DegreeDist.png'.format(args.out))
        except FutureWarning:
            # This is a matplotlib bug
            pass
    else:
        log('Skipping Degree Dist.')

    log('Plotting GO --------------------------------------------------------')
    if args.go is not None:
        if not path.exists('{}_GO.csv'.format(args.out)):
            go = co.GOnt(args.go)
            term_ids = []
            density_emp = []
            density_pvals = []
            locality_emp = []
            locality_pvals = []
            term_sizes = []
            term_desc = []
            terms_tested = 0
            if args.max_terms is not None:
                log('Limiting to {} GO Terms', args.max_terms)
                terms = go.rand(n=args.max_terms,
                                min_term_size=args.min_term_size,
                                max_term_size=args.max_term_size)
            else:
                terms = go.iter_terms(min_term_size=args.min_term_size,
                                      max_term_size=args.max_term_size)
            for term in terms:
                # Only genes present in the network count towards the term
                term.loci = list(filter(lambda x: x in cob, term.loci))
                if (len(term) < args.min_term_size
                        or len(term) > args.max_term_size):
                    continue
                # set density value for two tailed go so we only test it once
                density = cob.density(term.loci)
                # one tailed vs two tailed test
                if args.two_tailed_GO is False:
                    # run one tail for only positive values,
                    # skip negative density values
                    if density > 0:
                        density_emp.append(density)
                    else:
                        continue
                else:
                    density_emp.append(density)
                term_ids.append(term.id)
                term_sizes.append(len(term))
                term_desc.append(str(term.desc))
                # ------ Density
                # Calculate PVals
                density_bs = np.array([
                    cob.density(cob.refgen.random_genes(n=len(term.loci)))
                    for _ in range(args.num_bootstraps)
                ])
                if density > 0:
                    pval = sum(density_bs >= density) / args.num_bootstraps
                else:
                    pval = sum(density_bs <= density) / args.num_bootstraps
                density_pvals.append(pval)
                # ------- Locality
                locality = cob.locality(
                    term.loci, include_regression=True).resid.mean()
                locality_emp.append(locality)
                # Calculate PVals
                locality_bs = np.array([
                    cob.locality(
                        cob.refgen.random_genes(n=len(term.loci)),
                        include_regression=True
                    ).resid.mean()
                    for _ in range(args.num_bootstraps)
                ])
                if locality > 0:
                    pval = sum(locality_bs >= locality) / args.num_bootstraps
                else:
                    pval = sum(locality_bs <= locality) / args.num_bootstraps
                locality_pvals.append(pval)
                # -------------
                terms_tested += 1
                if terms_tested % 100 == 0 and terms_tested > 0:
                    log('Processed {} terms'.format(terms_tested))
            go_enrichment = pd.DataFrame({
                'GOTerm': term_ids,
                'desc': term_desc,
                'size': term_sizes,
                'density': density_emp,
                'density_pval': density_pvals,
                'locality': locality_emp,
                'locality_pval': locality_pvals
            })
            (go_enrichment
                .sort_values(by='density_pval', ascending=True)
                .to_csv('{}_GO.csv'.format(args.out), index=False))
            if terms_tested == 0:
                log.warn('No GO terms met your min/max gene criteria!')
        else:
            go_enrichment = pd.read_table('{}_GO.csv'.format(args.out),
                                          sep=',')

        if not path.exists('{}_GO.png'.format(args.out)):
            # Convert pvals to log10
            with np.errstate(divide='ignore'):
                # When no bootstraps are more extreme than the term, the
                # minus log pval yields an infinite
                go_enrichment['density_pval'] = -1 * np.log10(
                    go_enrichment['density_pval'])
                go_enrichment['locality_pval'] = -1 * np.log10(
                    go_enrichment['locality_pval'])
                # Fix the infinites so they are plotted
                max_density = np.max(go_enrichment['density_pval'][
                    np.isfinite(go_enrichment['density_pval'])])
                max_locality = np.max(go_enrichment['locality_pval'][
                    np.isfinite(go_enrichment['locality_pval'])])
                go_enrichment.loc[
                    np.logical_not(
                        np.isfinite(go_enrichment['density_pval'])),
                    'density_pval'] = max_density + 1
                go_enrichment.loc[
                    np.logical_not(
                        np.isfinite(go_enrichment['locality_pval'])),
                    'locality_pval'] = max_locality + 1
            plt.clf()
            figure, axes = plt.subplots(3, 2, figsize=(12, 12))
            # -----------
            # Density
            # ----------
            axes[0, 0].scatter(go_enrichment['density'],
                               go_enrichment['density_pval'],
                               alpha=0.05)
            axes[0, 0].set_xlabel('Empirical Density (Z-Score)')
            axes[0, 0].set_ylabel('Bootstraped -log10(p-value)')
            fold = sum(np.array(go_enrichment['density_pval']) > 1.3) / (
                0.05 * len(go_enrichment))
            axes[0, 0].axhline(y=-1 * np.log10(0.05), color='red')
            axes[0, 0].text(min(axes[0, 0].get_xlim()),
                            -1 * np.log10(0.05),
                            '{:.3g} Fold Enrichement'.format(fold),
                            color='red')
            axes[1, 0].scatter(go_enrichment['size'],
                               go_enrichment['density_pval'],
                               alpha=0.05)
            axes[1, 0].set_ylabel('Bootstrapped -log10(p-value)')
            axes[1, 0].set_xlabel('Term Size')
            axes[1, 0].axhline(y=-1 * np.log10(0.05), color='red')
            axes[2, 0].scatter(go_enrichment['size'],
                               go_enrichment['density'],
                               alpha=0.05)
            axes[2, 0].scatter(
                go_enrichment.query('density_pval>1.3')['size'],
                go_enrichment.query('density_pval>1.3')['density'],
                alpha=0.05,
                color='r')
            axes[2, 0].set_ylabel('Density')
            axes[2, 0].set_xlabel('Term Size')
            # ------------
            # Do Locality
            # ------------
            axes[0, 1].scatter(go_enrichment['locality'],
                               go_enrichment['locality_pval'],
                               alpha=0.05)
            axes[0, 1].set_xlabel('Empirical Locality (Residual)')
            axes[0, 1].set_ylabel('Bootstraped -log10(p-value)')
            fold = sum(np.array(go_enrichment['locality_pval']) > 1.3) / (
                0.05 * len(go_enrichment))
            axes[0, 1].axhline(y=-1 * np.log10(0.05), color='red')
            axes[0, 1].text(min(axes[0, 1].get_xlim()),
                            -1 * np.log10(0.05),
                            '{:.3g} Fold Enrichement'.format(fold),
                            color='red')
            axes[1, 1].scatter(go_enrichment['size'],
                               go_enrichment['locality_pval'],
                               alpha=0.05)
            axes[1, 1].set_xlabel('Term Size')
            axes[1, 1].set_ylabel('Bootstrapped -log10(p-value)')
            axes[1, 1].axhline(y=-1 * np.log10(0.05), color='red')
            axes[2, 1].scatter(go_enrichment['size'],
                               go_enrichment['locality'],
                               alpha=0.05)
            axes[2, 1].scatter(
                go_enrichment.query('locality_pval>1.3')['size'],
                go_enrichment.query('locality_pval>1.3')['locality'],
                alpha=0.05,
                color='r')
            # This panel plots locality against term size; it used to carry
            # the 'Density' label copied from the left-hand column.
            axes[2, 1].set_ylabel('Locality')
            axes[2, 1].set_xlabel('Term Size')
            # Save Figure
            plt.tight_layout()
            try:
                plt.savefig('{}_GO.png'.format(args.out))
            except FutureWarning:
                pass
        else:
            log('Skipping GO Volcano.')
def getNodes(
    genes,
    cob,
    term,
    primary=None,
    render=None,
    gwasData=pd.DataFrame(),
    nodeCutoff=0,
    windowSize=None,
    flankLimit=None,
    fdrCutoff=None,
    hpo=False,
):
    """Build one node record per gene for the network ``cob``.

    Parameters
    ----------
    genes : iterable
        Gene objects exposing ``id``, ``chrom``, ``start``, ``end`` and a
        mapping ``attr``. It is iterated more than once.
    cob : object
        Network providing ``locality(genes)`` and
        ``_global("parent_refgen")``.
    term : object
        Stored unchanged under the ``"term"`` key of every node.
    primary : container, optional
        When truthy, genes whose id is in it get origin ``"query"`` and all
        others ``"neighbor"``; otherwise the origin stays ``"N/A"``.
    render : container, optional
        When truthy, only genes whose id is in it may be flagged to render.
    gwasData : pandas.DataFrame, optional
        Table with ``"gene"`` and ``"fdr"`` columns. The shared default
        instance is only read, never modified.
    nodeCutoff : number, optional
        Minimum local degree a gene needs to be flagged to render.
    windowSize, flankLimit : object, optional
        Stored as strings in every node.
    fdrCutoff : number, optional
        When truthy and ``gwasData`` is not empty, a gene additionally needs
        ``fdr <= fdrCutoff`` to be flagged to render.
    hpo : bool, optional
        When true the ``"fdr"`` field is the literal ``"HPO"``.

    Returns
    -------
    dict
        Maps gene id to its node dictionary.

    Notes
    -----
    Genes lacking ``attr["parent_locus"]`` get a placeholder written into
    their ``attr`` mapping.
    """
    # Cache the locality
    locality = cob.locality(genes)
    # Containers for the node info
    nodes = {}
    gene_ids = [gene.id for gene in genes]
    parent_refgen = cob._global("parent_refgen")
    # Look for aliases
    aliases = co.RefGen(parent_refgen).aliases(gene_ids)
    # Look for annotations
    if parent_refgen in func_data_db:
        func_data = func_data_db[parent_refgen].get_annotations(gene_ids)
    else:
        func_data = {}
    # Pre cache a list of the contained genes
    gwasDataGenes = set()
    if not gwasData.empty:
        gwasDataGenes = set(gwasData["gene"])

    for gene in genes:
        # Catch for translating the way camoco works to the way we need for
        # COB. ``.loc`` replaces the ``.ix`` indexer removed from pandas.
        try:
            gene_locality = locality.loc[gene.id]
            ldegree = gene_locality["local"]
            gdegree = gene_locality["global"]
        except KeyError:
            # A float NaN still prints as "nan" but, unlike the string
            # "nan", it can be compared with nodeCutoff below (the
            # comparison is False, so the gene is not rendered) instead of
            # raising TypeError.
            ldegree = gdegree = np.nan

        # Catch for bug in camoco
        try:
            numInterv = str(gene.attr["num_intervening"])
            rankIntervening = str(gene.attr["intervening_rank"])
            numSiblings = str(gene.attr["num_siblings"])
        except KeyError:
            numInterv = rankIntervening = numSiblings = "-"

        # Pull any aliases from our database (each followed by a space)
        alias = ""
        if gene.id in aliases:
            alias = "".join(a + " " for a in aliases[gene.id])

        # Fetch the FDR if we can
        fdr = np.nan
        if gene.id in gwasDataGenes:
            fdr = gwasData[gwasData["gene"] == gene.id]["fdr"].min()

        # Pull any annotations from our databases (each followed by a space)
        anote = ""
        if gene.id in func_data:
            anote = "".join(a + " " for a in func_data[gene.id])

        # Fetch parent locus if we can
        if "parent_locus" not in gene.attr:
            gene.attr["parent_locus"] = "[Unknown]{}:{}-{}".format(
                gene.chrom, gene.start, gene.end
            )

        # Build the data object from our data
        node = {
            "group": "nodes",
            "data": {
                "id": gene.id,
                "type": "gene",
                "render": False,
                "term": term,
                "snp": gene.attr["parent_locus"].replace("<", "[").replace(">", "]"),
                "alias": alias,
                "origin": "N/A",
                "chrom": str(gene.chrom),
                "start": str(gene.start),
                "end": str(gene.end),
                "cur_ldegree": str(0),
                "ldegree": str(ldegree),
                "gdegree": str(gdegree),
                "fdr": "HPO" if hpo else str(fdr),
                "windowSize": str(windowSize),
                "flankLimit": str(flankLimit),
                "numIntervening": numInterv,
                "rankIntervening": rankIntervening,
                "numSiblings": numSiblings,
                "annotations": anote,
            },
        }

        # Denote the query genes
        if primary:
            if gene.id in primary:
                node["data"]["origin"] = "query"
            else:
                node["data"]["origin"] = "neighbor"

        # Denote whether or not to render it
        if ldegree >= nodeCutoff:
            if (not fdrCutoff) or gwasData.empty or fdr <= fdrCutoff:
                if (not render) or (gene.id in render):
                    node["data"]["render"] = True

        # Save the node to the list
        nodes[gene.id] = node

    return nodes
def AtLeaf(self):
    """Build the ``AtLeaf`` network from the leaf GEO SOFT family files.

    The families are summed into one and its series matrix (restricted by
    ``raw/GSE/LeafKeep.tsv``) is turned into a MICROARRAY COB on the
    ``Tair10`` RefGen. The test asserts that the result is a ``co.COB``.
    """
    accessions = [
        'GSE14578', 'GSE5630', 'GSE13739',
        # 'GSE26199' is disabled
        'GSE5686', 'GSE5615', 'GSE5620', 'GSE5628', 'GSE5624', 'GSE5626',
        'GSE5621', 'GSE5622', 'GSE5623', 'GSE5625', 'GSE5688',
    ]
    families = []
    for accession in accessions:
        soft_file = "raw/GSE/{}_family.soft".format(accession)
        families.append(co.Family.from_file(soft_file))
    leaf_family = sum(families)
    # The keep file is read as-is; the original carried a disabled
    # ``to_keepfile("LeafKeep.tsv", keep_hint="lea")`` call for generating it.
    expression = leaf_family.series_matrix(keepfile="raw/GSE/LeafKeep.tsv")
    network = co.COB.from_DataFrame(
        expression,
        'AtLeaf',
        'Arabidopsis Leaf',
        co.RefGen('Tair10'),
        rawtype='MICROARRAY',
    )
    self.assertIsInstance(network, co.COB)
def BuildIonome(self):
    """Build the ``ZmIonome`` ontology from the combined GWAS SNP table.

    The CSV under ``<testdir>/raw`` is loaded without an index column and
    passed to ``co.Ontology.from_DataFrame`` with the ``Zm5bFGS`` RefGen;
    the test asserts that the result is a ``co.Ontology``.
    """
    csv = os.path.join(
        cf.get('options', 'testdir'),
        'raw',
        'sigGWASsnpsCombinedIterations.longhorn.allLoc.csv',
    )
    ZM = co.RefGen('Zm5bFGS')
    # ``DataFrame.from_csv`` has been removed from pandas. With no index
    # column its date parsing (which targeted the index) had nothing to
    # parse, so a plain ``read_csv`` is the equivalent call.
    df = pd.read_csv(csv, index_col=None)
    IONS = co.Ontology.from_DataFrame(
        df,
        'ZmIonome',
        'Maize Ionome',
        ZM,
        term_col='el',
        chr_col='chr',
        pos_col='pos',
    )
    self.assertIsInstance(IONS, co.Ontology)
def snp2gene(args):
    '''
        Perform SNP (locus) to candidate gene mapping

        For every GWAS term, window size and flank limit combination the
        term's loci are mapped to candidate genes of the reference genome.
        All mappings are stacked into one table, optionally left-joined
        with the tables listed in ``args.gene_info``, and written tab
        separated to ``args.out``. A short summary is printed afterwards.

        ``args.refgen`` may name either an Expr (COB) dataset, in which
        case that network's RefGen is used and a ``COB`` column is added,
        or a RefGen dataset.

        Returns ``None``. When ``args.out`` already exists and
        ``args.force`` is not set, a message is written to stderr and
        nothing else is done.

        Raises
        ------
        ValueError
            If ``args.refgen`` names neither an Expr nor a RefGen dataset,
            or if ``args.snp2gene`` contains neither ``'effective'`` nor
            ``'strongest'``.
    '''
    if args.out != sys.stdout:
        # Create any non-existent directories
        out_dir = os.path.dirname(args.out)
        if out_dir != '':
            os.makedirs(out_dir, exist_ok=True)
        if os.path.exists(args.out) and not args.force:
            print(
                "Output for {} exists! Skipping!".format(args.out),
                file=sys.stderr
            )
            return None

    # Set a flag saying this is from a COB refgen
    from_cob = False
    # Create the refgen (option to create it from a COB)
    if co.Tools.available_datasets('Expr', args.refgen):
        refgen = co.COB(args.refgen).refgen
        from_cob = args.refgen
    elif co.Tools.available_datasets('RefGen', args.refgen):
        refgen = co.RefGen(args.refgen)
    else:
        # Fail here with a clear message instead of hitting an unbound
        # ``refgen`` further down.
        raise ValueError(
            "{} is neither an Expr nor a RefGen dataset".format(args.refgen))

    # Create the GWAS object
    ont = co.GWAS(args.gwas)
    if 'all' in args.terms:
        terms = ont.iter_terms()
    else:
        terms = [ont[term] for term in args.terms]

    # One frame per (term, window size, flank limit); concatenated once at
    # the end rather than growing a DataFrame inside the loop.
    results = []
    for term in terms:
        for window_size in args.candidate_window_size:
            for flank_limit in args.candidate_flank_limit:
                if 'effective' in args.snp2gene:
                    # Map to effective
                    effective_loci = term.effective_loci(
                        window_size=window_size
                    )
                elif 'strongest' in args.snp2gene:
                    effective_loci = term.strongest_loci(
                        window_size=window_size,
                        attr=args.strongest_attr,
                        lowest=args.strongest_higher
                    )
                else:
                    raise ValueError(
                        "Unknown snp2gene method: {}".format(args.snp2gene))
                genes = pd.DataFrame([
                    x.as_dict() for x in refgen.candidate_genes(
                        effective_loci,
                        flank_limit=flank_limit,
                        include_parent_locus=True,
                        include_num_siblings=True,
                        include_num_intervening=True,
                        include_rank_intervening=True,
                        include_SNP_distance=True,
                        include_parent_attrs=args.include_parent_attrs,
                        attrs={'Term': term.id},
                    )
                ])
                genes['FlankLimit'] = flank_limit
                genes['WindowSize'] = window_size
                genes['RefGen'] = refgen.name
                if from_cob != False:
                    genes['COB'] = from_cob
                results.append(genes)
    # ``pd.concat`` refuses an empty list, so fall back to an empty frame.
    data = pd.concat(results) if results else pd.DataFrame()

    # Add data from gene info files
    original_number_genes = len(data)
    for info_file in args.gene_info:
        log('Adding info for {}', info_file)
        # Assume the file is a table; retry comma separated when the tab
        # separated parse yields a single column
        info = pd.read_table(info_file, sep='\t')
        if len(info.columns) == 1:
            info = pd.read_table(info_file, sep=',')
        # try to match as many columns as possible
        matching_columns = set(data.columns).intersection(info.columns)
        log("Joining SNP2Gene mappings with info file on: {}",
            ','.join(matching_columns))
        data = pd.merge(data, info, how='left')
        if len(data) != original_number_genes:
            log.warn(
                'There were multiple info rows for some genes. '
                'Beware of potential duplicate candidate gene entries! '
            )

    # Generate the output file
    data.to_csv(args.out, index=None, sep='\t')
    log("Summary stats")
    print('-' * 100)
    print("Mapped {} SNPs to {} genes".format(
        len(data.parent_locus.unique()), len(data.ID.unique())))
    print("Number of candidate genes per term:")
    print(data.groupby('Term').apply(lambda df: len(df.ID)))
#!/usr/bin/python3 #packages used to build the three networks import camoco as co import pandas as pd import numpy as np import scipy as sp import matplotlib.pyplot as plt #Brachipodium Ref Bd21 = co.RefGen("Bd21") co.RefGen.from_gff("BdistachyonBd21-3v1.1.gene.gff3" ,"Bd21", "BdistachyonBd21-3v1.1", "BdistachyonBd21-3v1.1", "Brachipodium Ref") #Brachipodium GO propogation co.GOnt.from_obo('go.obo', 'Brach.gene.GO.txt', 'Bd21GO', 'Brachi Gene Ontology', Bd21) #Build Brachy network co.COB.from_table('Bd21_Meesh_V3.csv', 'Bd21_Treated', 'Bd21_Treated Samples with mock on day two', Bd21, rawtype='RNASEQ', max_gene_missing_data=0.4, max_accession_missing_data=0.4, min_single_sample_expr=1, min_expr=0.001,
def AtGen(self):
    """Build the ``AtGen`` network from the general GEO SOFT family files.

    The families are summed into one and its series matrix (restricted by
    ``raw/GSE/GenKeep.tsv``) is turned into a MICROARRAY COB on the
    ``Tair10`` RefGen. Nothing is returned and nothing is asserted.
    """
    accessions = [
        'GSE18975', 'GSE39384', 'GSE19271', 'GSE5632', 'GSE39385',
        'GSE5630', 'GSE15617', 'GSE5617', 'GSE5686', 'GSE2473',
        'GSE5633', 'GSE5620', 'GSE5628', 'GSE5624', 'GSE5626',
        'GSE5621', 'GSE5622', 'GSE5623', 'GSE5625', 'GSE5688',
    ]
    families = []
    for accession in accessions:
        soft_file = "raw/GSE/{}_family.soft".format(accession)
        families.append(co.Family.from_file(soft_file))
    general_family = sum(families)
    # The keep file is read as-is; the original carried a disabled
    # ``to_keepfile("GenKeep.tsv")`` call next to this.
    expression = general_family.series_matrix(keepfile="raw/GSE/GenKeep.tsv")
    co.COB.from_DataFrame(
        expression,
        'AtGen',
        'Arab General',
        co.RefGen('Tair10'),
        rawtype='MICROARRAY',
    )
def AtRoot(self):
    """Build the ``AtRoot`` network from the root GEO SOFT family files.

    The families are summed into one and its series matrix (restricted by
    ``raw/GSE/RootKeep.tsv``) is turned into a MICROARRAY COB on the
    ``Tair10`` RefGen. Nothing is returned and nothing is asserted.
    """
    accessions = [
        'GSE14578', 'GSE46205', 'GSE7631', 'GSE10576', 'GSE42007',
        'GSE34130', 'GSE21611', 'GSE22966', 'GSE7641', 'GSE5620',
        'GSE8934', 'GSE5628', 'GSE30095', 'GSE30097', 'GSE5624',
        'GSE5626', 'GSE5749', 'GSE5621', 'GSE5622', 'GSE5623',
        'GSE5625', 'GSE5688',
    ]
    families = []
    for accession in accessions:
        soft_file = "raw/GSE/{}_family.soft".format(accession)
        families.append(co.Family.from_file(soft_file))
    root_family = sum(families)
    # The keep file is read as-is; the original carried a disabled
    # ``to_keepfile("RootKeep.tsv", keep_hint='root')`` call next to this.
    expression = root_family.series_matrix(keepfile="raw/GSE/RootKeep.tsv")
    co.COB.from_DataFrame(
        expression,
        'AtRoot',
        'Arab Root',
        co.RefGen('Tair10'),
        rawtype='MICROARRAY',
    )
def _empirical_pval(observed, bootstrapped, num_bootstraps):
    """Return the fraction of bootstrapped values at least as extreme as `observed`.

    A positive `observed` is compared against the upper tail (``>=``),
    anything else against the lower tail (``<=``).
    """
    if observed > 0:
        as_extreme = bootstrapped >= observed
    else:
        as_extreme = bootstrapped <= observed
    return np.count_nonzero(as_extreme) / num_bootstraps


def _neg_log10_capped(pvals, fallback_cap):
    """Return ``-log10(pvals)`` with non-finite scores replaced by a finite cap.

    A p-value of 0 (no bootstrap was more extreme than the term) yields an
    infinite score that cannot be plotted, so non-finite scores are set to
    one more than the largest finite score. `fallback_cap` is used when no
    score is finite at all.
    """
    with np.errstate(divide="ignore"):
        scores = -1 * np.log10(pvals)
    finite = np.isfinite(scores)
    cap = scores[finite].max() + 1 if finite.any() else fallback_cap
    return scores.where(finite, cap)


def _plot_go_metric(
    axes, column, go_enrichment, metric, xlabel, alpha, log_alpha,
    point_alpha, label_offset,
):
    """Draw the three stacked GO panels for one metric in one column of `axes`.

    `metric` is "density" or "locality"; ``go_enrichment[metric + "_pval"]``
    must already hold -log10 p-values. Every threshold line, the
    significance mask and the fold enrichment share the same `alpha` /
    `log_alpha`, so the panels agree for one- and two-tailed runs.
    """
    pval_col = "{}_pval".format(metric)
    pretty = metric.capitalize()
    significant = go_enrichment[pval_col] > log_alpha
    top, middle, bottom = axes[0, column], axes[1, column], axes[2, column]

    # Top panel: empirical value vs. bootstrapped significance.
    top.scatter(
        go_enrichment[metric],
        go_enrichment[pval_col],
        alpha=point_alpha,
        color="blue",
    )
    top.set_xlabel(xlabel)
    top.set_ylabel("Bootstraped -log10(p-value)")
    # Observed number of significant terms over the number expected at alpha.
    fold = np.count_nonzero(significant) / (alpha * len(go_enrichment))
    top.axhline(y=log_alpha, color="red")
    top.text(
        min(top.get_xlim()),
        log_alpha + label_offset,
        "{:.3g} Fold Enrichement".format(fold),
        color="red",
    )
    top.set_title("{} Health".format(pretty))

    # Middle panel: significance by term size.
    middle.scatter(
        go_enrichment["size"],
        go_enrichment[pval_col],
        alpha=point_alpha,
        color="blue",
    )
    middle.set_xlabel("Term Size")
    middle.set_ylabel("Bootstrapped -log10(p-value)")
    middle.axhline(y=log_alpha, color="red")

    # Bottom panel: raw value by term size, significant terms drawn in red.
    bottom.scatter(
        go_enrichment["size"],
        go_enrichment[metric],
        alpha=point_alpha,
        color="blue",
    )
    bottom.scatter(
        go_enrichment.loc[significant, "size"],
        go_enrichment.loc[significant, metric],
        alpha=point_alpha,
        color="r",
    )
    bottom.set_ylabel(pretty)
    bottom.set_xlabel("Term Size")


def cob_health(args):
    """Write a set of health reports for a co-expression network.

    Every report is written to a file prefixed with ``args.out`` in the
    current working directory and is skipped when its file already exists.

    Attributes read from `args`:
        cob: name of the COB to load.
        out: output prefix; defaults to ``"<cob name>_Health"`` when None.
        edge_zscore_cutoff: when not None, passed to
            ``cob.set_sig_edge_zscore``.
        refgen: when not None, name of the RefGen used for the per-chromosome
            QC table (``<out>_qc_gene.txt``).
        go: when not None, name of the GOnt used for the GO term tests
            (``<out>_GO.csv`` and ``<out>_GO.png``).
        two_tailed: halves the significance level (0.05 -> 0.025).
        max_terms: when not None, number of randomly sampled GO terms.
        min_term_size, max_term_size: bounds on the number of in-network
            genes a term may have.
        num_bootstraps: number of random gene sets drawn per term.

    Returns:
        None. All results are written to files.
    """
    log = coblog()
    log(
        f"\n"
        f"-----------------------------\n"
        f" Network Health:{args.cob} \n"
        f"-----------------------------\n"
    )
    log(f"\nCreating reports in {os.getcwd()}\n\n")

    cob = co.COB(args.cob)
    if args.out is None:
        args.out = "{}_Health".format(cob.name)
    log(f"Output prefix: {args.out}")

    if args.edge_zscore_cutoff is not None:
        log("Changing Z-Score cutoff to {}", args.edge_zscore_cutoff)
        cob.set_sig_edge_zscore(args.edge_zscore_cutoff)

    log("Printing Summary ---------------------------------------------------")
    summary_file = "{}.summary.txt".format(args.out)
    if not path.exists(summary_file):
        with open(summary_file, "w") as OUT:
            # Print out the network summary
            cob.summary(file=OUT)
    else:
        log("Skipped summary.")

    log("Plotting Scores ----------------------------------------------------")
    raw_scores_file = "{}_CoexPCC_raw.png".format(args.out)
    if not path.exists(raw_scores_file):
        cob.plot_scores(raw_scores_file, pcc=True)
    else:
        log("Skipped Raw.")
    zscore_file = "{}_CoexScore_zscore.png".format(args.out)
    if not path.exists(zscore_file):
        cob.plot_scores(zscore_file, pcc=False)
    else:
        log("Skipped Norm.")

    log("Plotting Expression ------------------------------------------------")
    heatmap_file = "{}_Expr_norm.png".format(args.out)
    if not path.exists(heatmap_file):
        cob.plot_heatmap(
            heatmap_file,
            include_accession_labels=True,
            raw=False,
            cluster_method="ward",
            cluster_accessions=True,
        )
    else:
        log("Skipped norm.")

    log("Printing QC Statistics ---------------------------------------------")
    if args.refgen is not None:
        qc_file = "{}_qc_gene.txt".format(args.out)
        if not path.exists(qc_file):
            # Print out the breakdown of QC Values
            refgen = co.RefGen(args.refgen)
            gene_qc = cob._bcolz("qc_gene")
            gene_qc = gene_qc[gene_qc.pass_membership]
            gene_qc["chrom"] = ["chr" + str(refgen[x].chrom) for x in gene_qc.index]
            gene_qc = gene_qc.groupby("chrom").agg(sum, axis=0)
            # Add totals at the bottom (every column but the first).
            # `.iloc` / `pd.concat` replace `.ix` / `DataFrame.append`, which
            # no longer exist in current pandas.
            totals = gene_qc.iloc[:, 1:].apply(sum)
            totals.name = "TOTAL"
            total_row = totals.to_frame().T
            # Keep the index label so the CSV header is unchanged.
            total_row.index.name = gene_qc.index.name
            gene_qc = pd.concat([gene_qc, total_row])
            gene_qc.to_csv(qc_file, sep="\t")
        else:
            log("Skipped QC summary.")

    log("Plotting Degree Distribution ---------------------------------------")
    degree_file = "{}_DegreeDist.png".format(args.out)
    if not path.exists(degree_file):
        degree = cob.degree["Degree"].values
        # powerlaw emits run-time warnings the first time it is used (an open
        # issue upstream). Silence them only for this section instead of
        # changing numpy's error state for the whole process.
        with np.errstate(divide="ignore", invalid="ignore"):
            fit = powerlaw.Fit(degree, discrete=True, xmin=1)
            # get an axis
            ax = plt.subplot()
            # Plot the empirical CCDF with the three candidate fits on top.
            fit.plot_ccdf(ax=ax, color="r", linewidth=3, label="Empirical Data")
            fit.power_law.plot_ccdf(
                ax=ax, linewidth=2, color="b", linestyle=":", label="Power law"
            )
            fit.truncated_power_law.plot_ccdf(
                ax=ax, linewidth=2, color="k", linestyle="-.", label="Truncated Power"
            )
            fit.exponential.plot_ccdf(
                ax=ax, linewidth=2, color="g", linestyle="--", label="Exponential"
            )
            # U+2265 is the greater-than-or-equal sign; the literal here was
            # mis-encoded.
            ax.set_ylabel("p(Degree\u2265x)")
            ax.set_xlabel("Degree Frequency")
            ax.legend(loc="best")
            plt.title("{} Degree Distribution".format(cob.name))
            # Save Fig
            try:
                plt.savefig(degree_file)
            except FutureWarning:
                # This is a matplotlib bug; it only surfaces as an exception
                # when warnings are configured to raise.
                log("Ignored matplotlib FutureWarning while saving {}", degree_file)
    else:
        log("Skipping Degree Dist.")

    if args.go is not None:
        log("Plotting GO --------------------------------------------------------")
        # Set the alpha based on the tails
        alpha = 0.05 / 2 if args.two_tailed else 0.05
        log_alpha = -1 * np.log10(alpha)

        # Generate the GO Table
        go_csv = "{}_GO.csv".format(args.out)
        if not path.exists(go_csv):
            go = co.GOnt(args.go)
            # max_terms limits the number of GO terms tested (sub-sampling)
            if args.max_terms is not None:
                log("Limiting to {} GO Terms", args.max_terms)
                terms = go.rand(
                    n=args.max_terms,
                    min_term_size=args.min_term_size,
                    max_term_size=args.max_term_size,
                )
            else:
                # Else do the whole set
                terms = go.iter_terms(
                    min_term_size=args.min_term_size,
                    max_term_size=args.max_term_size,
                )

            rows = []
            for term in terms:
                # Some terms will lose genes that are not in networks
                term.loci = list(filter(lambda x: x in cob, term.loci))
                # Skip terms that are not an adequate size
                if len(term) < args.min_term_size or len(term) > args.max_term_size:
                    continue
                num_loci = len(term.loci)

                # ------ Density
                density = cob.density(term.loci)
                density_bs = np.array(
                    [
                        cob.density(cob.refgen.random_genes(n=num_loci))
                        for _ in range(args.num_bootstraps)
                    ]
                )
                density_pval = _empirical_pval(
                    density, density_bs, args.num_bootstraps
                )

                # ------- Locality
                locality = cob.locality(
                    term.loci, include_regression=True
                ).resid.mean()
                locality_bs = np.array(
                    [
                        cob.locality(
                            cob.refgen.random_genes(n=num_loci),
                            include_regression=True,
                        ).resid.mean()
                        for _ in range(args.num_bootstraps)
                    ]
                )
                locality_pval = _empirical_pval(
                    locality, locality_bs, args.num_bootstraps
                )

                rows.append(
                    (
                        term.id,
                        str(term.desc),
                        len(term),
                        density,
                        density_pval,
                        locality,
                        locality_pval,
                    )
                )
                if len(rows) % 100 == 0:
                    log("Processed {} terms".format(len(rows)))

            # Explicit columns keep the table well-formed when `rows` is empty.
            go_enrichment = pd.DataFrame(
                rows,
                columns=[
                    "GOTerm",
                    "desc",
                    "size",
                    "density",
                    "density_pval",
                    "locality",
                    "locality_pval",
                ],
            )
            # Calculate significance
            go_enrichment["density_significant"] = go_enrichment.density_pval < alpha
            go_enrichment["locality_significant"] = go_enrichment.locality_pval < alpha
            # Calculate bonferonni. max(..., 1) prevents a ZeroDivisionError
            # when no term was tested, so the warning below is reached.
            bonferroni_alpha = alpha / max(len(go_enrichment), 1)
            go_enrichment["density_bonferroni"] = (
                go_enrichment.density_pval < bonferroni_alpha
            )
            go_enrichment["locality_bonferroni"] = (
                go_enrichment.locality_pval < bonferroni_alpha
            )
            # Store the GO results in a CSV
            go_enrichment.sort_values(by="density_pval", ascending=True).to_csv(
                go_csv, index=False
            )
            if not rows:
                log.warn("No GO terms met your min/max gene criteria!")
        else:
            go_enrichment = pd.read_table(go_csv, sep=",")

        go_png = "{}_GO.png".format(args.out)
        if path.exists(go_png):
            log("Skipping GO Volcano.")
        elif len(go_enrichment) == 0:
            # Nothing to draw; the fold enrichment would divide by zero.
            log.warn("No GO terms to plot, skipping GO Volcano.")
        else:
            # Convert pvals to -log10, keeping p == 0 terms plottable. The
            # fallback cap sits above the significance line.
            go_enrichment["density_pval"] = _neg_log10_capped(
                go_enrichment["density_pval"], log_alpha + 1
            )
            go_enrichment["locality_pval"] = _neg_log10_capped(
                go_enrichment["locality_pval"], log_alpha + 1
            )
            plt.clf()
            # Calculate the transparency based on the number of terms
            transparency_alpha = 0.05 if len(go_enrichment) > 20 else 1
            figure, axes = plt.subplots(3, 2, figsize=(12, 12))
            _plot_go_metric(
                axes,
                0,
                go_enrichment,
                "density",
                "Empirical Density (Z-Score)",
                alpha,
                log_alpha,
                transparency_alpha,
                label_offset=0.1,
            )
            _plot_go_metric(
                axes,
                1,
                go_enrichment,
                "locality",
                "Empirical Locality (Residual)",
                alpha,
                log_alpha,
                transparency_alpha,
                label_offset=0,
            )
            # Save Figure
            plt.tight_layout()
            try:
                plt.savefig(go_png)
            except FutureWarning:
                # Same matplotlib issue as for the degree plot.
                log("Ignored matplotlib FutureWarning while saving {}", go_png)