def transcriptome_filter(poisson_cutoff, transcriptome_size, transcriptome_reads, cluster):
    """Compute the transcriptome-wide Poisson p-value for one cluster.

    poisson_cutoff -- float, user-set cutoff (accepted for interface
        compatibility; not consulted in this implementation)
    transcriptome_size -- int, number of genes in the transcriptome
    transcriptome_reads -- int, total number of reads analyzed
    cluster -- Peak namedtuple; only number_reads_in_peak and size are read

    Returns the Poisson p-value, or np.Inf when the computed p-value is
    NaN (the NaN case is logged at INFO level).  NOTE(review): a later
    definition of the same name in this file shadows this one.
    """
    p_value = poissonP(transcriptome_reads, cluster.number_reads_in_peak,
                       transcriptome_size, cluster.size)
    if not math.isnan(p_value):
        return p_value
    logging.info("""Transcriptome P is NaN, transcriptome_reads = %d, cluster reads = %d, transcriptome_size = %d, cluster_size = %d""" % (transcriptome_reads, cluster.number_reads_in_peak, transcriptome_size, cluster.size))
    return np.Inf
def transcriptome_filter(poisson_cutoff, transcriptome_size, transcriptome_reads, cluster): """ filters each cluster by if it passes a transciptome wide cutoff or not, returns true if it passes transcriptome cutoff, false if not poisson_cutoff - float,user set cutoff transcriptome_size - int number of genes in transcriptome transcritpmoe_reads - int total number of reads analized cluster - dict, stats about the cluster we are analizing {'Nreads' : int, 'size' : int} """ transcriptome_p = poissonP(transcriptome_reads, cluster['Nreads'], transcriptome_size, cluster['size']) if math.isnan(transcriptome_p): verboseprint("""Transcriptome P is NaN, transcriptome_reads = %d, cluster reads = %d, transcriptome_size = %d, cluster_size = %d""" % (transcriptome_reads, cluster['Nreads'], transcriptome_size, cluster['size'])) return False if transcriptome_p > poisson_cutoff: print """%s\n Failed Transcriptome cutoff with %s reads, pval: %s""" % (cluster, cluster['Nreads'], transcriptome_p) return False return True
def superlocal_poissonP(cluster):
    """Poisson p-value of the peak against its super-local (flanking) window.

    cluster -- Peak record read by attribute: area_reads,
        number_reads_in_peak, area_size and size.
    """
    # Bug fix: the original read cluster['size'] while every other field
    # here is attribute access; a namedtuple-style Peak (see the doc of
    # transcriptome_filter above) does not support string indexing.
    return poissonP(cluster.area_reads, cluster.number_reads_in_peak,
                    cluster.area_size, cluster.size)
def transcript_poissonP(cluster):
    """Poisson p-value of the peak against its whole transcript.

    cluster -- Peak record read by attribute: nreads_in_gene,
        number_reads_in_peak, effective_length and size.
    """
    # Bug fix: the original read cluster['size'] while every other field
    # here is attribute access; a namedtuple-style Peak does not support
    # string indexing.
    return poissonP(cluster.nreads_in_gene, cluster.number_reads_in_peak,
                    cluster.effective_length, cluster.size)
def transcriptome_poissonP(cluster):
    """Poisson p-value of the peak against the whole transcriptome.

    cluster -- Peak record read by attribute: transcriptome_reads,
        number_reads_in_peak, transcriptome_size and size.
    """
    # Bug fix: the original read cluster['size'] while every other field
    # here is attribute access; a namedtuple-style Peak does not support
    # string indexing.
    return poissonP(cluster.transcriptome_reads, cluster.number_reads_in_peak,
                    cluster.transcriptome_size, cluster.size)
def main(options):
    """Top-level driver for peak calling.

    Fans per-gene peak calling out across a multiprocessing pool, pools
    the per-gene results, re-tests every cluster against a
    transcriptome-wide Poisson model, and writes the surviving peaks to
    <outfile>.BED as a colored, stranded track.  Returns 1 on completion.

    options -- parsed command-line options; reads np, bam, species,
        geneBEDfile, geneMRNAfile, genePREMRNAfile, premRNA, margin,
        maxgenes, minreads, poisson_cutoff, gene, FDR_alpha, threshold,
        plotit, SloP, save_pickle, outfile and color.
    """
    # 'autodetect' means: one worker per reported CPU core.
    if options.np == 'autodetect':
        options.np = multiprocessing.cpu_count()
    pool = multiprocessing.Pool(int(options.np))
    #job_server = pp.Server(ncpus=options.np) #old pp stuff
    bamfile = options.bam
    if os.path.exists(bamfile):
        #re-set to include the full path to bamfile
        bamfile = os.path.abspath(bamfile)
        verboseprint("bam file is set to %s\n" % (bamfile))
    else:
        sys.stderr.write("Bam file not defined")
        raise IOError
    # Gene annotations plus per-gene lengths for the chosen species.
    genes, lengths = build_transcript_data(options.species,
                                           options.geneBEDfile,
                                           options.geneMRNAfile,
                                           options.genePREMRNAfile,
                                           options.premRNA)
    margin = int(options.margin)
    #this should be fixed, args should initally be ints if passed
    if options.maxgenes is not None:
        maxgenes = int(options.maxgenes)
    minreads = int(options.minreads)
    poisson_cutoff = options.poisson_cutoff
    #gets all the genes to call peaks on
    if options.gene is not None and len(options.gene) > 0:
        gene_list = options.gene
    else:
        #selects all genes
        gene_list = genes.keys()
    results = []
    #Set up peak calling by gene
    running_list = [genes[gene] for gene in gene_list]
    length_list = [lengths[gene] for gene in gene_list]
    #truncates for max genes
    if options.maxgenes is not None:
        running_list = running_list[:maxgenes]
        length_list = length_list[:maxgenes]
    # Total length of everything analysed; denominator of the
    # transcriptome-wide Poisson test applied to each cluster below.
    transcriptome_size = sum(length_list)
    #do the parralization
    # One task tuple per gene, unpacked by func_star for the worker.
    # NOTE(review): the literal 10, 1000 and False arguments are opaque
    # here -- confirm their meaning against func_star's signature.
    tasks = [(gene, length, None, bamfile, margin, options.FDR_alpha,
              options.threshold, minreads, poisson_cutoff, options.plotit,
              10, 1000, options.SloP, False)
             for gene, length in zip(running_list, length_list)]
    #jobs = []
    #for job in tasks:
        #func_star(job)
        #growth = objgraph.show_growth(limit=10)
        #if growth is not None:
        #    print job
        #    print objgraph.show_growth(limit=10)
        #jobs.append(func_star(job))
    #sets chunk size to be a fair bit smaller, than total input, but not
    #to small
    chunk_size = len(tasks) // int(options.np) * 10
    if chunk_size < 1:
        chunk_size = 1
    jobs = pool.map(func_star, tasks, chunksize=chunk_size)
    for job in jobs:
        results.append(job)
    # NOTE(review): this string literal was broken across a mangled line
    # break in the source; reconstructed with an embedded newline.
    verboseprint("finished with \ncalling peaks")
    #if we are going to save and output as a pickle file we should
    #output as a pickle file we should factor instead create a method
    #or object to handle all file output
    if options.save_pickle is True:
        pickle_file = open(options.outfile + ".pickle", 'w')
        pickle.dump(results, file=pickle_file)
    #combine results
    allpeaks = set([])
    #count total number of reads in transcriptiome
    transcriptome_reads = 0
    for gene_result in results:
        if gene_result is not None:
            verboseprint("nreads", gene_result['nreads'])
            transcriptome_reads += gene_result['nreads']
    print """Transcriptome size is %d, transcriptome reads are %d""" % (transcriptome_size, transcriptome_reads)
    #is this a missed indent?
    for gener in results:
        if gener['clusters'] is None:
            print >> sys.stderr, gener, "no clusters"
            continue
        for cluster in gener['clusters'].keys():
            try:
                # Transcriptome-wide Poisson p-value for this cluster.
                transcriptome_p = poissonP(
                    transcriptome_reads,
                    gener['clusters'][cluster]['Nreads'],
                    transcriptome_size,
                    gener['clusters'][cluster]['size'])
                if math.isnan(transcriptome_p):
                    print """Transcriptome P is NaN, transcriptome_reads = %d, cluster reads = %d, transcriptome_size = %d, cluster_size = %d""" % (transcriptome_reads, gener['clusters'][cluster]['Nreads'], transcriptome_size, gener['clusters'][cluster]['size'])
                    continue
                if transcriptome_p > poisson_cutoff:
                    print """%s\n Failed Transcriptome cutoff with %s reads, pval: %s""" % (cluster, gener['clusters'][cluster]['Nreads'], transcriptome_p)
                    continue
                min_pval = 1
                corrected_SloP_pval = gener['clusters'][cluster]['SloP']
                corrected_gene_pval = gener['clusters'][cluster]['GeneP']
                # A cluster survives when either the super-local or the
                # gene-level p-value beats the cutoff; the smaller of the
                # two becomes the reported score.
                if (corrected_SloP_pval < poisson_cutoff or corrected_gene_pval < poisson_cutoff):
                    min_pval = min([corrected_SloP_pval, corrected_gene_pval])
                else:
                    verboseprint("Failed Gene Pvalue: %s and failed SloP Pvalue: %s for cluster %s" % (corrected_gene_pval, corrected_SloP_pval, cluster))
                    continue
                # Cluster keys are tab-delimited BED-like strings.
                (chrom, g_start, g_stop, peak_name, geneP, signstrand,
                 thick_start, thick_stop) = cluster.split("\t")
                #print >> sys.stderr, cluster
                bedline = "%s\t%d\t%d\t%s\t%s\t%s\t%d\t%d" % (chrom, int(g_start), int(g_stop), peak_name, min_pval, signstrand, int(thick_start), int(thick_stop))
                allpeaks.add(bedline)
            except NameError as error:
                print >> sys.stderr, error
                print >> sys.stderr, "parsing failed"
                raise error
    #again redundant code
    outbed = options.outfile + ".BED"
    color = options.color
    pybedtools.BedTool("\n".join(allpeaks), from_string=True).sort(stream=True).saveas(outbed, trackline="track name=\"%s\" visibility=2 colorByStrand=\"%s %s\"" % (outbed, color, color))
    print "wrote peaks to %s" % (options.outfile)
    # NOTE(review): the next line is a no-op; its result is discarded.
    "\n".join(allpeaks)
    return 1