def __init__(self, fl_min, fl_max, fl_density, stats=None):
    """Store a fragment length density and cache its cumulative sums.

    fl_min, fl_max -- smallest and largest fragment length covered (both
        inclusive); must be plain ints with fl_min <= fl_max < 100000
    fl_density -- one density value per length in [fl_min, fl_max]; the
        values must sum to 1 (within 1e-6). It is used through .cumsum()
        and elementwise multiplication, so it is presumably a numpy
        array -- TODO confirm against callers.
    stats -- optional object stored unchanged on the instance

    Raises AssertionError if any of the checks above fails.
    """
    assert fl_min <= fl_max
    assert fl_max < 100000
    assert abs(sum(fl_density) - 1.) < 1e-6
    try:
        assert type(fl_min) == int
        assert type(fl_max) == int
        assert len(fl_density) == (fl_max - fl_min + 1)
    except:
        # log the offending inputs before propagating the failure
        config.log_statement(fl_density)
        config.log_statement((fl_min, fl_max))
        raise

    self.stats = stats
    self.fl_min = fl_min
    self.fl_max = fl_max
    self.fl_density = fl_density
    self.fl_density_cumsum = fl_density.cumsum()

    # cumsum weighted by the fragment length, just a caching optimization
    frag_lens = numpy.arange(fl_min, fl_max + 1)
    self.fl_density_weighted_cumsum = (self.fl_density * frag_lens).cumsum()

    # build and set the hash value
    hash_key = (self.fl_min, self.fl_max, tuple(self.fl_density))
    self._hash_value = hash(hash_key)
def _build_rnaseq_arrays(self, gene, rnaseq_reads, fl_dists):
    """Append the RNASeq observed/expected arrays for `gene` to this object.

    Always appends 'RNASeq' to self.array_types together with one entry in
    self.obs_cnt_arrays and self.expected_freq_arrays. Both entries are
    None when no bins are returned for the given fragment length
    distributions. Otherwise the unobservable transcripts are also merged
    into self.unobservable_transcripts.
    """
    # bin the rnaseq reads
    binned_cnts = build_expected_and_observed_rnaseq_counts(
        gene, rnaseq_reads, fl_dists)
    expected_rnaseq_cnts, observed_rnaseq_cnts = binned_cnts

    # NOTE(review): the clustering result is not used below; the call is
    # retained only to keep the existing behaviour unchanged.
    cluster_bins(expected_rnaseq_cnts)

    # if no transcripts are observable given the fl dist, then return nothing
    if len(expected_rnaseq_cnts) == 0:
        self.array_types.append('RNASeq')
        self.obs_cnt_arrays.append(None)
        self.expected_freq_arrays.append(None)
        return

    # build the expected and observed counts, and convert them to frequencies
    built_arrays = build_expected_and_observed_arrays(
        expected_rnaseq_cnts, observed_rnaseq_cnts, normalize=True)
    (expected_rnaseq_array, observed_rnaseq_array,
     unobservable_rnaseq_trans) = built_arrays
    # the raw count mappings are no longer needed
    del expected_rnaseq_cnts, observed_rnaseq_cnts

    if config.DEBUG_VERBOSE:
        config.log_statement("Clustering bins in RNAseq array")
    clustered = cluster_rows(expected_rnaseq_array, observed_rnaseq_array)
    expected_rnaseq_array, observed_rnaseq_array = clustered[0], clustered[1]

    self.array_types.append('RNASeq')
    self.obs_cnt_arrays.append(observed_rnaseq_array)
    self.expected_freq_arrays.append(expected_rnaseq_array)
    self.unobservable_transcripts.update(unobservable_rnaseq_trans)
def get_new_gene():
    """Take the next gene id off the queue and load what is needed for it.

    Reads `gene_ids`, `data`, `trans_index_cntrs` and `bnd_type` from the
    enclosing scope.

    Returns (gene, f_mat, mle_estimate, trans_indices, cntr) where
    trans_indices holds one (transcript index, design matrix row, bnd_type)
    tuple per transcript index reported by the design matrix.

    Raises IndexError when no gene id arrives within the timeout, and
    re-raises NoDesignMatrixError when the gene has no design matrix.
    """
    # get a gene to process
    try:
        gene_id = gene_ids.get(timeout=0.1)
    except Queue.Empty:
        assert gene_ids.qsize() == 0
        raise IndexError("No genes left")

    config.log_statement("Loading design matrix for gene '%s'" % gene_id)
    gene = data.get_gene(gene_id)
    try:
        f_mat = data.get_design_matrix(gene_id)
    except NoDesignMatrixError:
        if config.DEBUG_VERBOSE:
            config.log_statement(
                "No design matrix for '%s'" % gene_id, log=True)
        raise
    mle_estimate = data.get_mle(gene_id)

    # rows are offset by one relative to the enumeration index
    trans_indices = [
        (t_index, row_num + 1, bnd_type)
        for row_num, t_index in enumerate(f_mat.transcript_indices())]

    cntr = trans_index_cntrs[gene_id]
    with cntr.get_lock():
        # -1000 marks a counter that has not been initialised yet
        if cntr.value == -1000:
            cntr.value = len(trans_indices) - 1

    return gene, f_mat, mle_estimate, trans_indices, cntr
def set_design_matrix(self, gene_id, f_mat):
    """Pickle `f_mat` to a temp file and register it for `gene_id`.

    The design matrix may only be set once per gene; a second call logs a
    message and returns without touching anything. On the first call the
    per-sample read counters are also incremented by the read counts stored
    on `f_mat` (counts that are None are skipped).

    Raises KeyError if `gene_id` has no entry in self.design_mat_filenames.
    """
    ofname = config.get_fmat_tmp_fname(gene_id, SAMPLE_ID, REP_ID)

    # because there's no cache invalidation mechanism, we're only
    # allowed to set the f_mat object once. This also allows us to
    # move the load outside of the lock
    #
    # This is an explicit test rather than an assert so that the guard
    # still works when python runs with -O (which strips asserts).
    existing_fname = self.design_mat_filenames[gene_id].value
    if existing_fname != '':
        config.log_statement(
            "%s has already had its design matrix set (%s)" % (
                gene_id, existing_fname),
            log=True)
        return

    with open(ofname, "w") as ofp:
        pickle.dump(f_mat, ofp)

    with self.design_mat_lock:
        self.design_mat_filenames[gene_id].value = ofname

    # NOTE(review): the flattened source does not show whether the counter
    # updates below were nested under design_mat_lock; each counter takes
    # its own lock, so they are kept outside -- confirm.
    if f_mat.num_rnaseq_reads is not None:
        with self.num_rnaseq_reads.get_lock():
            self.num_rnaseq_reads.value += f_mat.num_rnaseq_reads
    if f_mat.num_fp_reads is not None:
        with self.num_cage_reads.get_lock():
            self.num_cage_reads.value += f_mat.num_fp_reads
    if f_mat.num_tp_reads is not None:
        with self.num_polya_reads.get_lock():
            self.num_polya_reads.value += f_mat.num_tp_reads

    return
def get_RNAseq_densities( all_reads, polyAs ):
    ''' get the local RNA-seq read densities

    Returns (densities, header): a plain dict copy of the shared result
    mapping filled in by get_RNAseq_density_worker, and the list of column
    names (21 per entry of all_reads, derived from its filename).
    '''
    windows = ('10', '50', '100')
    header = []
    for sample in (x.filename for x in all_reads):
        for read_tag in ('rd1', 'rd2'):
            # NOTE(review): the 'down' columns have no leading underscore,
            # unlike the '_up' columns; kept exactly as in the original.
            for win in windows:
                header.append(sample + '_up_' + win + '_' + read_tag)
                header.append(sample + 'down_' + win + '_' + read_tag)
            for win in windows:
                header.append(sample + '_up_down_rat_' + win + '_' + read_tag)
        for win in windows:
            header.append(sample + '_up_down_rat_' + win + '_rd1_rd2')

    # process a list of arguments for multithreading
    import multiprocessing
    manager = multiprocessing.Manager()
    dense = manager.dict()
    sites = manager.list()
    sites_lock = manager.Lock()

    # NOTE(review): the flattened source does not show where this loop
    # ends; the worker dispatch is kept inside it because it consumes
    # `reads` and drains `sites` -- confirm against the original layout.
    for reads in all_reads:
        for (chrm, strand), polyA in polyAs.iteritems():
            chrm = clean_chr_name( chrm )
            for pos, cnt in sorted(polyA.iteritems()):
                sites.append( (chrm, strand, pos, cnt) )

        if VERBOSE:
            config.log_statement(
                "Finding RNASeq read coverage around poly(A) sites with %i threads"\
                    % NTHREADS)

        if NTHREADS != 1:
            from lib.multiprocessing_utils import Pool
            all_args = [( reads, sites, sites_lock, dense )]*NTHREADS
            p = Pool(NTHREADS)
            p.apply( get_RNAseq_density_worker, all_args )
        else:
            get_RNAseq_density_worker( reads, sites, sites_lock, dense )

    if VERBOSE:
        config.log_statement("FINISHED finding poly(A) coverage")

    return dict(dense), header
def iter_good_exons():
    """Yield a GenomicInterval for each non-overlapping exon in `elements`.

    `elements` comes from the enclosing scope and is walked in sorted
    (chrm, strand) order.
    """
    num = 0
    for contig_key, exons in sorted(elements.iteritems()):
        chrm, strand = contig_key
        for start, stop in iter_nonoverlapping_exons(exons):
            num += 1
            yield GenomicInterval(chrm, strand, start, stop)
        # NOTE(review): assumed to run once per (chrm, strand) with the
        # running total; the flattened source does not show the nesting.
        if config.DEBUG_VERBOSE:
            config.log_statement(
                "FL ESTIMATION: %s %s" % ((chrm, strand), num))
    return
def find_confidence_bounds_in_gene(gene, num_reads_in_bams, f_mat,
                                   mle_estimate, trans_indices, cntr,
                                   cb_alpha):
    """Estimate confidence bounds for the remaining transcripts of `gene`.

    `cntr` is a shared counter holding the position in `trans_indices` of
    the next transcript to process. Each pass claims the current position
    under the counter's lock and decrements it; the loop stops when the
    counter reads -1.

    gene -- gene object (id, chrm, strand, start, stop, transcripts are read)
    num_reads_in_bams, f_mat, cb_alpha -- forwarded unchanged to
        frequency_estimation.estimate_confidence_bound
    mle_estimate -- estimate array; only the out-of-gene entry plus the
        observable transcripts are kept
    trans_indices -- list of (transcript index, design matrix row, bnd_type)

    Returns a list of (bnd_type, transcript index, bound), one entry per
    transcript processed by this call. If an estimate raises, the error is
    logged and the bound falls back to 0.0 for 'lb' and 1.0 otherwise.
    """
    # update the mle_estimate array to only store observable transcripts
    # add 1 to skip the out of gene bin
    observable_trans_indices = (
        numpy.array([-1, ] + f_mat.transcript_indices().tolist()) + 1)
    mle_estimate = mle_estimate[observable_trans_indices]

    if config.VERBOSE:
        config.log_statement(
            "Estimating confidence bounds for gene %s" % gene.id)

    res = []
    while True:
        # claim the next transcript position
        with cntr.get_lock():
            index = cntr.value
            if index == -1:
                break
            cntr.value -= 1

        trans_index, exp_mat_row, bnd_type = trans_indices[index]
        config.log_statement(
            "Estimating %s confidence bound for gene %s (%i/%i remain)" % (
                bnd_type, gene.id, cntr.value + 1, len(gene.transcripts)))
        try:
            p_value, bnd = frequency_estimation.estimate_confidence_bound(
                f_mat, num_reads_in_bams, exp_mat_row, mle_estimate,
                bnd_type, cb_alpha)
        except Exception as inst:
            # fall back to the least informative bound for this transcript
            p_value = 1.
            bnd = 0.0 if bnd_type == 'lb' else 1.0
            error_msg = "%i: Skipping %s (%s:%s:%i-%i): %s" % (
                os.getpid(), gene.id, gene.chrm, gene.strand,
                gene.start, gene.stop, inst)
            config.log_statement(error_msg, log=True)
            config.log_statement(traceback.format_exc(), log=True)

        if config.DEBUG_VERBOSE:
            config.log_statement(
                "FINISHED %s BOUND %s\t%s\t%i/%i\t%.2e\t%.2e" % (
                    bnd_type, gene.id, None, trans_index,
                    len(gene.transcripts), bnd, p_value))
        res.append((bnd_type, trans_index, bnd))

    # the collected bounds were previously dropped; the caller stores the
    # return value of this function, so hand them back
    return res
def cluster_bins(expected_rnaseq_cnts):
    """Group bins whose normalized expected counts are identical.

    expected_rnaseq_cnts maps each bin to a mapping of transcript -> count.
    For every bin the counts (ordered by sorted transcript key) are scaled
    to sum to 100000 and rounded; bins sharing the same rounded profile are
    collected together.

    Returns the groups of bins (the values of the grouping dictionary).
    """
    if config.DEBUG_VERBOSE:
        config.log_statement("Normalizing bin frequencies")

    bins_by_profile = defaultdict(list)
    for bin_id, cnts_by_transcript in expected_rnaseq_cnts.items():
        ordered_cnts = sorted(cnts_by_transcript.iteritems())
        row = numpy.array([cnt for _, cnt in ordered_cnts])
        scaled = 100000 * row / row.sum()
        profile = tuple(scaled.round().tolist())
        bins_by_profile[profile].append(bin_id)

    return bins_by_profile.values()
def load_gene_bndry_bins(genes, contig, strand, contig_len):
    """Build one GeneElements bin per cluster of connected transcribed regions.

    Only genes on (contig, strand) are considered. Regions of the same gene
    are chained together in a graph, as are regions that overlap after
    sorting; each connected component becomes one GeneElements object
    holding a "GENE" SegmentBin per flattened region.

    contig_len is accepted but not used by this function.

    Returns a list of GeneElements (empty if no gene matches).
    """
    if config.VERBOSE:
        config.log_statement(
            "Loading gene boundaries from annotated genes in %s:%s" % (
                contig, strand))

    regions_graph = nx.Graph()
    for gene in genes:
        if gene.chrm != contig or gene.strand != strand:
            continue
        regions = [tuple(x) for x in gene.find_transcribed_regions()]
        regions_graph.add_nodes_from(regions)
        # link consecutive regions of the same gene
        regions_graph.add_edges_from(izip(regions[:-1], regions[1:]))

    # group overlapping regions
    all_regions = sorted(regions_graph.nodes())
    if len(all_regions) == 0:
        return []

    grpd_regions = [[]]
    _, curr_stop = all_regions[0]
    for region in all_regions:
        if region[0] >= curr_stop:
            # no overlap with the running group: start a new one
            _, curr_stop = region
            grpd_regions.append([region])
        else:
            curr_stop = max(region[1], curr_stop)
            grpd_regions[-1].append(region)

    # add edges for overlapping regions
    for grp in grpd_regions:
        regions_graph.add_edges_from(izip(grp[:-1], grp[1:]))

    # build gene objects with the intervals
    gene_bndry_bins = []
    for regions_cluster in nx.connected_components(regions_graph):
        gene_bin = GeneElements(contig, strand)
        for start, stop in sorted(files.gtf.flatten(regions_cluster)):
            segment = SegmentBin(
                start, stop, ["ESTART"], ["ESTOP"], "GENE")
            gene_bin.regions.append(segment)
        gene_bndry_bins.append(gene_bin)

    # XXX TODO expand gene boundaries
    # actually, it's probably better just to go through discovery
    return gene_bndry_bins
def add_elements_for_contig_and_strand_worker(args_queue, elements,
                                              gene_id_cntr, output, gtf_ofp,
                                              tracking_ofp, fasta_fp,
                                              ref_genes):
    """Process ((contig, strand), grpd_exons) items until 'FINISHED' arrives.

    Each item taken from args_queue is forwarded, together with the
    remaining arguments, to add_elements_for_contig_and_strand. An empty
    statement is logged once the 'FINISHED' marker is seen.
    """
    # iter(callable, sentinel) keeps calling get() until the marker shows up
    for args in iter(args_queue.get, 'FINISHED'):
        (contig, strand), grpd_exons = args
        add_elements_for_contig_and_strand(
            (contig, strand), grpd_exons, elements, gene_id_cntr, output,
            gtf_ofp, tracking_ofp, fasta_fp, ref_genes)
    config.log_statement("")
    return
def build_transcripts_worker(elements, output, gtf_ofp, tracking_ofp,
                             fasta_fp, ref_genes):
    """Build and write genes taken from `elements` until 'FINISHED' arrives.

    When fasta_fp is given, a Fastafile is opened on its name and handed to
    build_and_write_gene; otherwise None is passed instead.
    """
    # if appropriate, open the fasta file
    fasta = Fastafile(fasta_fp.name) if fasta_fp != None else None

    while True:
        config.log_statement(
            "Waiting for gene to process. (%i)" % elements.qsize())
        gene_elements = elements.get()
        if gene_elements == 'FINISHED':
            break
        build_and_write_gene(
            gene_elements, output, gtf_ofp, tracking_ofp, fasta, ref_genes)

    config.log_statement("")
    return
def calc_max_feasible_step_size_and_limiting_index_BAD(x0, gradient):
    """Calculate the maximum step size to stay in the feasible region.

    solve y - x*gradient = MIN_TRANSCRIPT_FREQ for x
    x = (y - MIN_TRANSCRIPT_FREQ)/gradient

    we use minus because we return a positive step

    x0, gradient -- arrays of matching shape (boolean masking and
        .nonzero() are applied to the quotient, so numpy arrays are
        presumably expected -- TODO confirm)

    Returns (step_size, step_size_i): the positive step size and the
    indices whose step equals -step_size.

    Any failure (for instance, no negative step exists) is logged together
    with the computed steps and then re-raised.
    """
    steps = None
    try:
        # the small constant guards against division by an exactly-zero
        # gradient entry
        steps = (x0 - MIN_TRANSCRIPT_FREQ) / (gradient + 1e-12)
        step_size = -steps[steps < 0].max()
        step_size_i = (steps == -step_size).nonzero()[0]
    except Exception:
        # format instead of concatenating: "str + ndarray" raises and
        # would hide the real error
        config.log_statement("steps=%s" % (steps,))
        raise

    return step_size, step_size_i
def add_elements_for_contig_and_strand_worker(
        args_queue, elements, gene_id_cntr,
        output, gtf_ofp, tracking_ofp, fasta_fp, ref_genes):
    """Process ((contig, strand), grpd_exons) items until 'FINISHED' arrives.

    Each item taken from args_queue is forwarded, together with the
    remaining arguments, to add_elements_for_contig_and_strand. An empty
    statement is logged once the 'FINISHED' marker is seen.
    """
    # iter(callable, sentinel) keeps calling get() until the marker shows up
    for args in iter(args_queue.get, 'FINISHED'):
        (contig, strand), grpd_exons = args
        add_elements_for_contig_and_strand(
            (contig, strand), grpd_exons, elements, gene_id_cntr, output,
            gtf_ofp, tracking_ofp, fasta_fp, ref_genes)
    config.log_statement("")
    return
def build_transcripts_worker( elements, output,
                              gtf_ofp, tracking_ofp,
                              fasta_fp, ref_genes ):
    """Build and write genes taken from `elements` until 'FINISHED' arrives.

    When fasta_fp is given, a Fastafile is opened on its name and handed to
    build_and_write_gene; otherwise None is passed instead.
    """
    # if appropriate, open the fasta file
    fasta = Fastafile(fasta_fp.name) if fasta_fp != None else None

    while True:
        config.log_statement(
            "Waiting for gene to process. (%i)" % elements.qsize())
        gene_elements = elements.get()
        if gene_elements == 'FINISHED':
            break
        build_and_write_gene(
            gene_elements, output, gtf_ofp, tracking_ofp, fasta, ref_genes)

    config.log_statement("")
    return
def feed_elements(raw_elements, elements, output,
                  gtf_ofp, tracking_ofp, fasta_fp, ref_genes ):
    """Fork workers that add elements and then build transcripts.

    Every (contig, strand) entry of raw_elements is queued for the forked
    children. Each child runs add_elements_for_contig_and_strand_worker,
    decrements the shared remaining-workers counter, and continues as a
    build_transcripts_worker before exiting. The parent polls the counter
    once a second; when it reaches zero it puts config.NTHREADS+1
    'FINISHED' markers on `elements` and waits for the children.
    """
    all_args = multiprocessing.Queue()
    for (contig, strand), grpd_exons in raw_elements.iteritems():
        all_args.put([(contig, strand), dict(grpd_exons)])
    # one termination marker per potential worker
    for _ in xrange(config.NTHREADS):
        all_args.put('FINISHED')

    num_add_element_threads = min(len(raw_elements), config.NTHREADS)
    gene_id_cntr = multiprocessing.Value('i', 0)
    nthreads_remaining = multiprocessing.Value('i', num_add_element_threads)
    worker_args = [ all_args, elements, gene_id_cntr,
                    output, gtf_ofp, tracking_ofp, fasta_fp, ref_genes ]

    def run_child():
        # executed in the forked child only; never returns
        add_elements_for_contig_and_strand_worker(*worker_args)
        with nthreads_remaining.get_lock():
            nthreads_remaining.value -= 1
        config.log_statement("Finished adding elements (%i left)"
                             % nthreads_remaining.value)
        build_transcripts_worker(
            elements, output, gtf_ofp, tracking_ofp, fasta_fp, ref_genes )
        os._exit(0)

    cluster_pids = []
    for _ in xrange(num_add_element_threads):
        pid = os.fork()
        if pid == 0:
            run_child()
        cluster_pids.append(pid)

    # wait until every child has finished adding elements, then release
    # the transcript builders
    while True:
        with nthreads_remaining.get_lock():
            all_added = (nthreads_remaining.value == 0)
            if all_added:
                for _ in xrange(config.NTHREADS+1):
                    elements.put('FINISHED')
        if all_added:
            break
        time.sleep(1.0)

    for pid in cluster_pids:
        os.waitpid(pid, 0)

    config.log_statement("Finished adding elements")
    return
def build_and_write_gene(gene_elements, output,
                         gtf_ofp, tracking_ofp,
                         fasta, ref_genes ):
    """Build the gene for `gene_elements` and write it to the output files.

    On success the gene is written to a temp file, the tuple
    (gene id, number of transcripts, temp file name) is put on `output`,
    and the gene is written to the gtf and tracking files. Nothing is
    written when build_gene returns None. TooManyCandidateTranscriptsError
    and any other exception raised while building or writing are logged
    and swallowed.
    """
    def boundary_elements():
        # every element type that contributes to the reported gene extent
        return chain(
            gene_elements.tss_exons, gene_elements.tes_exons,
            gene_elements.promoter, gene_elements.polyas)

    # build the gene with transcripts, and optionally call orfs
    start = min(x[0] for x in boundary_elements())
    stop = max(x[1] for x in boundary_elements())
    locus = ( gene_elements.id, gene_elements.chrm,
              gene_elements.strand, start, stop )
    try:
        config.log_statement(
            "Building transcripts and ORFs for %s (%s:%s:%i-%i)" % locus )

        gene = build_gene(gene_elements, fasta, ref_genes)
        if gene == None:
            return
        config.log_statement(
            "FINISHED Building transcript and ORFs for Gene %s" % gene.id)

        # dump a pickle of the gene to a temp file, and set that in the
        # output manager
        tmp_fname = config.get_gene_tmp_fname(gene.id, SAMPLE_TYPE, REP_ID)
        ofname = gene.write_to_file(tmp_fname)
        output.put((gene.id, len(gene.transcripts), ofname))
        write_gene_to_gtf(gtf_ofp, gene)
        write_gene_to_tracking_file(tracking_ofp, gene)
    except TooManyCandidateTranscriptsError:
        config.log_statement(
            "Too many candidate transcripts in %s(%s:%s:%i-%i)" % locus,
            log=True)
        return
    except Exception as inst:
        config.log_statement(
            "ERROR building transcript in %s(%s:%s:%i-%i): %s" % (
                locus + (inst,)),
            log=True)
        if config.DEBUG_VERBOSE:
            config.log_statement( traceback.format_exc(), log=True )
def load_gene_bndry_bins( genes, contig, strand, contig_len ):
    """Build one GeneElements bin per cluster of connected transcribed regions.

    Only genes on (contig, strand) are considered. Regions of the same gene
    are chained together in a graph, as are regions that overlap after
    sorting; each connected component becomes one GeneElements object
    holding a "GENE" SegmentBin per flattened region.

    contig_len is accepted but not used by this function.

    Returns a list of GeneElements (empty if no gene matches).
    """
    if config.VERBOSE:
        config.log_statement(
            "Loading gene boundaries from annotated genes in %s:%s" % (
                contig, strand) )

    regions_graph = nx.Graph()
    for gene in genes:
        if gene.chrm != contig or gene.strand != strand:
            continue
        regions = [tuple(x) for x in gene.find_transcribed_regions()]
        regions_graph.add_nodes_from(regions)
        # link consecutive regions of the same gene
        regions_graph.add_edges_from(izip(regions[:-1], regions[1:]))

    # group overlapping regions
    all_regions = sorted(regions_graph.nodes())
    if len(all_regions) == 0:
        return []

    grpd_regions = [[]]
    _, curr_stop = all_regions[0]
    for region in all_regions:
        if region[0] >= curr_stop:
            # no overlap with the running group: start a new one
            _, curr_stop = region
            grpd_regions.append([region])
        else:
            curr_stop = max(region[1], curr_stop)
            grpd_regions[-1].append(region)

    # add edges for overlapping regions
    for grp in grpd_regions:
        regions_graph.add_edges_from(izip(grp[:-1], grp[1:]))

    # build gene objects with the intervals
    gene_bndry_bins = []
    for regions_cluster in nx.connected_components(regions_graph):
        gene_bin = GeneElements( contig, strand )
        for start, stop in sorted(files.gtf.flatten(regions_cluster)):
            segment = SegmentBin(
                start, stop, ["ESTART"], ["ESTOP"], "GENE")
            gene_bin.regions.append( segment )
        gene_bndry_bins.append( gene_bin )

    # XXX TODO expand gene boundaries
    # actually, it's probably better just to go through discovery
    return gene_bndry_bins
def build_and_write_gene(gene_elements, output, gtf_ofp, tracking_ofp, fasta,
                         ref_genes):
    """Build the gene for `gene_elements` and write it to the output files.

    On success the gene is written to a temp file, the tuple
    (gene id, number of transcripts, temp file name) is put on `output`,
    and the gene is written to the gtf and tracking files. Nothing is
    written when build_gene returns None. TooManyCandidateTranscriptsError
    and any other exception raised while building or writing are logged
    and swallowed.
    """
    def boundary_elements():
        # every element type that contributes to the reported gene extent
        return chain(gene_elements.tss_exons, gene_elements.tes_exons,
                     gene_elements.promoter, gene_elements.polyas)

    # build the gene with transcripts, and optionally call orfs
    start = min(x[0] for x in boundary_elements())
    stop = max(x[1] for x in boundary_elements())
    locus = (gene_elements.id, gene_elements.chrm, gene_elements.strand,
             start, stop)
    try:
        config.log_statement(
            "Building transcripts and ORFs for %s (%s:%s:%i-%i)" % locus)

        gene = build_gene(gene_elements, fasta, ref_genes)
        if gene == None:
            return
        config.log_statement(
            "FINISHED Building transcript and ORFs for Gene %s" % gene.id)

        # dump a pickle of the gene to a temp file, and set that in the
        # output manager
        tmp_fname = config.get_gene_tmp_fname(gene.id, SAMPLE_TYPE, REP_ID)
        ofname = gene.write_to_file(tmp_fname)
        output.put((gene.id, len(gene.transcripts), ofname))
        write_gene_to_gtf(gtf_ofp, gene)
        write_gene_to_tracking_file(tracking_ofp, gene)
    except TooManyCandidateTranscriptsError:
        config.log_statement(
            "Too many candidate transcripts in %s(%s:%s:%i-%i)" % locus,
            log=True)
        return
    except Exception as inst:
        config.log_statement(
            "ERROR building transcript in %s(%s:%s:%i-%i): %s" %
            (locus + (inst,)),
            log=True)
        if config.DEBUG_VERBOSE:
            config.log_statement(traceback.format_exc(), log=True)
def feed_elements(raw_elements, elements, output, gtf_ofp, tracking_ofp,
                  fasta_fp, ref_genes):
    """Fork workers that add elements and then build transcripts.

    Every (contig, strand) entry of raw_elements is queued for the forked
    children. Each child runs add_elements_for_contig_and_strand_worker,
    decrements the shared remaining-workers counter, and continues as a
    build_transcripts_worker before exiting. The parent polls the counter
    once a second; when it reaches zero it puts config.NTHREADS + 1
    'FINISHED' markers on `elements` and waits for the children.
    """
    all_args = multiprocessing.Queue()
    for (contig, strand), grpd_exons in raw_elements.iteritems():
        all_args.put([(contig, strand), dict(grpd_exons)])
    # one termination marker per potential worker
    for _ in xrange(config.NTHREADS):
        all_args.put('FINISHED')

    num_add_element_threads = min(len(raw_elements), config.NTHREADS)
    gene_id_cntr = multiprocessing.Value('i', 0)
    nthreads_remaining = multiprocessing.Value('i', num_add_element_threads)
    worker_args = [
        all_args, elements, gene_id_cntr, output, gtf_ofp, tracking_ofp,
        fasta_fp, ref_genes
    ]

    def run_child():
        # executed in the forked child only; never returns
        add_elements_for_contig_and_strand_worker(*worker_args)
        with nthreads_remaining.get_lock():
            nthreads_remaining.value -= 1
        config.log_statement("Finished adding elements (%i left)" %
                             nthreads_remaining.value)
        build_transcripts_worker(elements, output, gtf_ofp, tracking_ofp,
                                 fasta_fp, ref_genes)
        os._exit(0)

    cluster_pids = []
    for _ in xrange(num_add_element_threads):
        pid = os.fork()
        if pid == 0:
            run_child()
        cluster_pids.append(pid)

    # wait until every child has finished adding elements, then release
    # the transcript builders
    while True:
        with nthreads_remaining.get_lock():
            all_added = (nthreads_remaining.value == 0)
            if all_added:
                for _ in xrange(config.NTHREADS + 1):
                    elements.put('FINISHED')
        if all_added:
            break
        time.sleep(1.0)

    for pid in cluster_pids:
        os.waitpid(pid, 0)

    config.log_statement("Finished adding elements")
    return
def estimate_read_and_control_cov_in_gene(
        gene, signal_reads, reads_type, rnaseq_reads, alpha=0.01):
    """Return the signal coverage and the RNAseq-based control for `gene`.

    reads_type must be 'promoter' or 'polya'. It selects the '5p' or '3p'
    end respectively, and the choice is swapped for genes on the '-'
    strand. `alpha` is accepted but not used by this function.

    Returns (signal_cov, control_cov).
    """
    assert reads_type in ('promoter', 'polya')
    read_end = '5p' if reads_type == 'promoter' else '3p'
    if gene.strand == '-':
        # flip the end for reverse strand genes
        read_end = '3p' if read_end == '5p' else '5p'

    signal_cov = gene.find_coverage(signal_reads)
    if DEBUG_VERBOSE:
        config.log_statement("Finished building signal coverage array")

    control_cov = build_control_in_gene_regions(
        gene, rnaseq_reads, read_end, SMOOTH_WIN_LEN)
    if DEBUG_VERBOSE:
        config.log_statement("Finished building control coverage array")

    return signal_cov, control_cov
def get_elements_from_gene( gene, get_tss=True, get_jns=True,
                            get_tes=True, get_exons=False ):
    """Collect TSS exons, introns, TES exons and exons from a gene.

    For each transcript the first and last exon (taken from
    trans.exon_bnds) become the TSS/TES exon according to the strand.
    Introns are built from consecutive exon boundaries, and all exons are
    collected only when get_exons is set. Transcripts of a gene whose
    strand is neither '+' nor '-' are logged and skipped entirely.

    Returns (tss_exons, introns, tes_exons, exons) as sets of
    GenomicInterval.
    """
    tss_exons, tes_exons = set(), set()
    introns, exons = set(), set()

    chrm = clean_chr_name(gene.chrm)
    strand = gene.strand

    for trans in gene.transcripts:
        bndries = trans.exon_bnds
        fp_region = GenomicInterval(chrm, strand, bndries[0], bndries[1])
        tp_region = GenomicInterval(chrm, strand, bndries[-2], bndries[-1])

        if strand == '+':
            tss_region, tes_region = fp_region, tp_region
        elif strand == '-':
            tss_region, tes_region = tp_region, fp_region
        else:
            # NOTE(review): strand is passed as a second positional
            # argument rather than formatted into the message -- confirm
            # this is what log_statement expects.
            config.log_statement("BADBADBAD", strand)
            continue

        if get_tss:
            tss_exons.add( tss_region )
        if get_tes:
            tes_exons.add( tes_region )

        if get_jns:
            donors, acceptors = bndries[1:-2:2], bndries[2:-1:2]
            for start, stop in izip( donors, acceptors ):
                # add and subtract 1 to get the inclusive intron boundaries,
                # rather than the exon boundaries
                if start < stop:
                    introns.add(
                        GenomicInterval(chrm, strand, start+1, stop-1) )

        if get_exons:
            for start, stop in izip( bndries[::2], bndries[1::2] ):
                exons.add( GenomicInterval(chrm, strand, start, stop) )

    return tss_exons, introns, tes_exons, exons
def remove_overlapping_elements( tes_dict, elements_I, w ):
    ''' Remove all elements (tes's) overlapping another element type

    tes_dict -- maps (chrm, strand) to a mapping of tes position -> value
    elements_I -- maps (chrm, strand) to an object whose find(start, stop)
        returns a truthy result when something falls in that window
    w -- window half width; the window queried is [tes-w, tes+w+1]

    Returns a new dict with the same layout as tes_dict holding deep
    copies of the entries for which find() came back empty. (chrm, strand)
    keys missing from elements_I are reported and skipped.
    '''
    start = w
    end = w+1

    over = dict()
    for (chrm, strand) in tes_dict.keys():
        if (chrm, strand) not in elements_I:
            # NOTE(review): chrm is passed as a second positional argument
            # rather than formatted into the message -- confirm intent.
            config.log_statement(
                "warning, element_intersecter does not contain the chrm: ",
                chrm)
            continue

        intersecter = elements_I[(chrm, strand)]
        for tes in tes_dict[(chrm, strand)].keys():
            H = intersecter.find(tes-start, tes+end)
            if H:
                continue
            # keep only sites that overlap nothing
            kept = over.setdefault( (chrm, strand), dict() )
            kept[tes] = copy.deepcopy( tes_dict[(chrm, strand)][tes] )

    return over
def estimate_read_and_control_cov_in_gene(gene, signal_reads, reads_type,
                                          rnaseq_reads, alpha=0.01):
    """Return the signal coverage and the RNAseq-based control for `gene`.

    reads_type must be 'promoter' or 'polya'. It selects the '5p' or '3p'
    end respectively, and the choice is swapped for genes on the '-'
    strand. `alpha` is accepted but not used by this function.

    Returns (signal_cov, control_cov).
    """
    assert reads_type in ('promoter', 'polya')
    read_end = '5p' if reads_type == 'promoter' else '3p'
    if gene.strand == '-':
        # flip the end for reverse strand genes
        read_end = '3p' if read_end == '5p' else '5p'

    signal_cov = gene.find_coverage(signal_reads)
    if DEBUG_VERBOSE:
        config.log_statement("Finished building signal coverage array")

    control_cov = build_control_in_gene_regions(gene, rnaseq_reads, read_end,
                                                SMOOTH_WIN_LEN)
    if DEBUG_VERBOSE:
        config.log_statement("Finished building control coverage array")

    return signal_cov, control_cov
def parse_fasta( fn ):
    ''' load a fasta file into a dictionary pointing to single strings,
    one for each chromosome

    Header lines (starting with '>') set the current chromosome after
    passing through clean_chr_name. Every other line is stripped, lower
    cased and appended to the current chromosome. Lines that appear before
    the first header are stored under the empty string key.

    Returns a dict mapping chromosome name -> sequence string.
    '''
    genome = dict()
    chrm = ''
    # the context manager closes the file even if parsing fails part way
    with open(fn) as fid:
        for line in fid:
            data = line.strip()
            if data.startswith('>'):
                chrm = clean_chr_name(data[1:])
                continue
            if chrm not in genome:
                genome[chrm] = []
                config.log_statement(chrm)
            genome[chrm].append(data.lower())

    # collapse the per-line chunks into one string per chromosome
    for chrm in list(genome.keys()):
        genome[chrm] = ''.join(genome[chrm])

    return genome
def get_new_gene():
    """Take the next gene id off the queue and load what is needed for it.

    Reads `gene_ids`, `data`, `trans_index_cntrs` and `bnd_type` from the
    enclosing scope.

    Returns (gene, f_mat, mle_estimate, trans_indices, cntr) where
    trans_indices holds one (transcript index, design matrix row, bnd_type)
    tuple per transcript index reported by the design matrix.

    Raises IndexError when no gene id arrives within the timeout, and
    re-raises NoDesignMatrixError when the gene has no design matrix.
    """
    # get a gene to process
    try:
        gene_id = gene_ids.get(timeout=0.1)
    except Queue.Empty:
        assert gene_ids.qsize() == 0
        config.log_statement("")
        raise IndexError("No genes left")

    config.log_statement(
        "Loading design matrix for gene '%s'" % gene_id)
    gene = data.get_gene(gene_id)
    try:
        f_mat = data.get_design_matrix(gene_id)
    except NoDesignMatrixError:
        if config.DEBUG_VERBOSE:
            config.log_statement(
                "No design matrix for '%s'" % gene_id, log=True)
        raise
    mle_estimate = data.get_mle(gene_id)

    # rows are offset by one relative to the enumeration index
    trans_indices = [
        (t_index, row_num+1, bnd_type)
        for row_num, t_index in enumerate(f_mat.transcript_indices())]

    cntr = trans_index_cntrs[gene_id]
    with cntr.get_lock():
        # -1000 marks a counter that has not been initialised yet
        if cntr.value == -1000:
            cntr.value = len(trans_indices)-1

    return gene, f_mat, mle_estimate, trans_indices, cntr
def nnls_cvxopt(X, Y, fixed_indices_and_values=None):
    """Solve a constrained least squares problem with cvxopt's QP solver.

    Minimizes the quadratic form built from X and Y (P = X'X, q = -X'Y)
    subject to every coefficient being at least MIN_TRANSCRIPT_FREQ, the
    coefficients summing to one, and each index listed in
    fixed_indices_and_values being equal to its given value.

    X, Y -- design matrix and observation vector (anything accepted by
        cvxopt.matrix)
    fixed_indices_and_values -- optional mapping of coefficient index ->
        fixed value; defaults to no fixed coefficients

    Returns the solution as a numpy array after clipping at
    MIN_TRANSCRIPT_FREQ and applying project_onto_simplex.
    """
    from cvxopt import matrix, solvers

    if fixed_indices_and_values is None:
        fixed_indices_and_values = {}

    X = matrix(X)
    Y = matrix(Y)

    m, n = X.size
    num_constraint = len(fixed_indices_and_values)

    # inequality constraints: -x_i <= -MIN_TRANSCRIPT_FREQ; stepping
    # through the flat storage by n+1 addresses the diagonal
    G = matrix(0.0, (n, n))
    G[::n + 1] = -1.0
    h = matrix(-MIN_TRANSCRIPT_FREQ, (n, 1))

    # Add the equality constraints
    A = matrix(0., (1 + num_constraint, n))
    b = matrix(0., (1 + num_constraint, 1))

    # Add the sum to one constraint
    A[0, :] = 1.
    b[0, 0] = 1.

    # Add the fixed value constraints
    for const_i, (i, val) in enumerate(
            fixed_indices_and_values.iteritems()):
        A[const_i + 1, i] = 1.
        b[const_i + 1, 0] = val

    solvers.options['show_progress'] = DEBUG_OPTIMIZATION
    res = solvers.qp(P=X.T * X, q=-X.T * Y, G=G, h=h, A=A, b=b)
    x = numpy.array(res['x']).T[0, ]

    if DEBUG_OPTIMIZATION:
        # flatten the residual column so that every residual is summed
        # (indexing row 0 of the column only picked up the first one)
        residuals = numpy.array(X * res['x'] - Y).ravel()
        rss = (residuals**2).sum()
        for key, val in res.iteritems():
            # skip the solution vectors; they are too long to log
            if key in 'syxz':
                continue
            config.log_statement("%s:\t%s" % (key.ljust(22), val))
        config.log_statement("RSS: ".ljust(22) + str(rss))

    x[x < MIN_TRANSCRIPT_FREQ] = MIN_TRANSCRIPT_FREQ
    x = project_onto_simplex(x)
    return x
def estimate_confidence_bounds( data, bnd_type ):
    """Queue every gene and fork workers that estimate confidence bounds.

    Genes are queued largest first (by data.gene_ntranscripts_mapping) and
    each gets a shared counter initialised to -1000. config.NTHREADS
    children are forked; each runs find_confidence_bounds_worker, logs the
    traceback of any exception, and always exits with status 0.

    NOTE(review): in the code visible here the child pids are never
    recorded or waited on -- confirm whether that happens elsewhere.
    """
    config.log_statement(
        "Populating estimate confidence bounds queue.")

    ## populate the queue
    # sort so that the biggest genes are processed first
    gene_ids = multiprocessing.Queue()
    trans_index_cntrs = {}
    sorted_gene_ids = sorted(
        data.gene_ids,
        key=lambda x: data.gene_ntranscripts_mapping[x],
        reverse=True)
    for gene_id in sorted_gene_ids:
        gene_ids.put(gene_id)
        trans_index_cntrs[gene_id] = multiprocessing.Value('i', -1000)

    config.log_statement("Waiting on gene bounds children")
    if False and config.NTHREADS == 1:
        # single process path, currently disabled; it used to reference an
        # undefined name and would have raised NameError once enabled
        find_confidence_bounds_worker(
            data, gene_ids, trans_index_cntrs, bnd_type )
    else:
        pids = []
        for i in xrange(config.NTHREADS):
            pid = os.fork()
            if pid == 0:
                try:
                    find_confidence_bounds_worker(
                        data, gene_ids, trans_index_cntrs, bnd_type)
                except Exception as inst:
                    config.log_statement(
                        traceback.format_exc(), log=True )
                finally:
                    # never fall back into the parent's code path
                    os._exit(0)
def extract_genome_sequence( genome, tes_dict, w ):
    ''' Return an array of sequences each of size 2*w + 1

    genome -- maps chromosome name -> sequence string
    tes_dict -- maps (chrm, strand) to a mapping of tes position -> value
    w -- number of bases taken on each side of the position

    Returns a list of [[chrm, strand, tes, value], sequence] entries. The
    sequence is genome[chrm][tes-w:tes+w+1], passed through reverse_strand
    for the '-' strand. Chromosomes missing from `genome` are reported and
    skipped. Any strand other than '-' must be '+'.

    NOTE(review): positions closer than w to either end of the chromosome
    are not special cased, so they yield a different slice length.
    '''
    seqs = []
    start = w
    end = w+1

    for (chrm, strand) in tes_dict.keys():
        if chrm not in genome:
            # NOTE(review): chrm is passed as a second positional argument
            # rather than formatted into the message -- confirm intent.
            config.log_statement(
                "warning, genome sequence does not contain the chrm: ",
                chrm)
            continue

        sites = tes_dict[(chrm, strand)]
        for tes in sites.keys():
            seq = genome[chrm][tes-start:tes+end]
            if strand == "-":
                seq = reverse_strand(seq)
            else:
                assert strand == "+"
            seqs.append([[chrm, strand, tes, sites[tes]], seq])

    return seqs
def write_data_to_tracking_file(data, fl_dists, ofp):
    """Write the tracking header plus one block of lines per gene to `ofp`.

    Genes are ordered by the integer after the last underscore of their id
    when every id can be parsed that way; otherwise data.gene_ids is used
    as is. A gene whose lines cannot be built is logged and skipped.
    """
    num_reads_in_bams = data.get_num_reads_in_bams()

    header_fields = ["tracking_id", "gene_id ",
                     "coverage", "FPKM ",
                     "FPKM_lo ", "FPKM_hi ", "status"]
    ofp.write("\t".join(header_fields) + "\n")

    def numeric_suffix(gene_id):
        return int(gene_id.split("_")[-1])

    try:
        sorted_gene_ids = sorted(data.gene_ids, key=numeric_suffix)
    except:
        # ids without a numeric suffix: keep the stored order
        sorted_gene_ids = data.gene_ids

    for gene_id in sorted_gene_ids:
        try:
            lines = build_gene_lines_for_tracking_file(
                gene_id, data, num_reads_in_bams, fl_dists)
        except Exception as inst:
            config.log_statement("Skipping '%s': %s" % (gene_id, str(inst)))
            config.log_statement( traceback.format_exc(), log=True )
            continue
        ofp.write("\n".join(lines) + "\n" )
def get_RNAseq_density_worker( reads, sites, sites_lock, dense ):
    """Pop poly(A) sites off `sites` and store their predictors in `dense`.

    One site is taken per iteration while holding sites_lock; the loop
    ends once `sites` is empty. The predictors from
    get_predictors_for_polya_site are stored under the key
    'chrm_strand_pos', or added to the value already stored there.
    """
    while True:
        # NOTE(review): the pop is assumed to happen under the lock, next
        # to the emptiness check -- the flattened source does not show it.
        with sites_lock:
            sites_len = len( sites )
            if sites_len == 0:
                break
            # using the commented out code appears slower because
            # some regions ( like M ) have so many reads, that them
            # all getting stuck in 1 group outweighs the lock overhead
            # of doing 1 at a time. A random sort might fix this, but it
            # seems fast enough as is.
            chrm, strand, pos, cnt = sites.pop()

        if DEBUG_VERBOSE and sites_len%1000 == 0:
            config.log_statement("%i polyA sites remain" % sites_len)

        key = '_'.join((chrm, strand, str(pos)))
        predictors = get_predictors_for_polya_site(
            reads, chrm, strand, pos )
        if dense.has_key(key):
            dense[key] = dense[key] + predictors
        else:
            dense[key] = predictors

    return
def estimate_mles( data ):
    """Fork config.NTHREADS children that each run estimate_mle_worker.

    Every child logs any exception raised by the worker (message and
    traceback) and always exits with status 0.

    NOTE(review): in the code visible here the queue is never populated
    from sorted_gene_ids and the child pids are neither recorded nor
    waited on -- confirm whether that happens elsewhere.
    """
    config.log_statement("Initializing MLE queue")

    gene_ids = multiprocessing.Queue()
    # sort so that the biggest genes are processed first
    sorted_gene_ids = sorted(
        data.gene_ids,
        key=lambda x: data.gene_ntranscripts_mapping[x],
        reverse=True)

    args = [ gene_ids, data ]
    if False and config.NTHREADS == 1:
        # single process path, currently disabled
        estimate_mle_worker(*args)
    else:
        ps = []
        for i in xrange(config.NTHREADS):
            pid = os.fork()
            if pid == 0:
                try:
                    estimate_mle_worker(*args)
                except Exception as inst:
                    # log the caught exception itself; the name used here
                    # before was undefined, so the resulting NameError
                    # suppressed both log lines
                    config.log_statement( str(inst), log=True )
                    config.log_statement(
                        traceback.format_exc(), log=True )
                finally:
                    # never fall back into the parent's code path
                    os._exit(0)
def build_transcripts(exons_bed_fp, gtf_ofname, tracking_ofname,
                      fasta_fp=None, ref_genes=None,
                      sample_type=None, rep_id=None):
    """Build transcripts

    Loads the elements from exons_bed_fp, forks transcript building
    workers plus one element feeder, and writes the gtf and tracking
    files. Output goes to '<name>.unfinished' first and is moved into
    place once the children have finished. The module level SAMPLE_TYPE
    and REP_ID are set from sample_type and rep_id.

    Returns the list of entries collected from the output queue.
    """
    # set the sample type and rep id for the output tmp file names
    global SAMPLE_TYPE, REP_ID
    SAMPLE_TYPE = sample_type
    REP_ID = rep_id

    # make sure that we're starting from the start of the
    # elements files
    config.log_statement( "Loading %s" % exons_bed_fp.name, log=True )
    exons_bed_fp.seek(0)
    raw_elements = load_elements( exons_bed_fp )
    config.log_statement( "Finished Loading %s" % exons_bed_fp.name )

    gtf_ofp = ThreadSafeFile(gtf_ofname + ".unfinished", "w")
    track_name = ".".join(gtf_ofname.split(".")[:-1])
    gtf_ofp.write("track name=%s useScore=1\n" % track_name)

    tracking_ofp = ThreadSafeFile(tracking_ofname + ".unfinished", "w")
    tracking_header = [
        "tracking_id".ljust(20),
        "class_code",
        "nearest_ref_id".ljust(20),
        "gene_id".ljust(20),
        "gene_short_name".ljust(20),
        "tss_id".ljust(10),
        "locus".ljust(30),
        "length"]
    tracking_ofp.write("\t".join(tracking_header) + "\n")

    config.log_statement( "Building Transcripts", log=True )
    manager = multiprocessing.Manager()
    elements = manager.Queue(2*config.NTHREADS)
    output = manager.Queue()

    transcript_building_children_args = [
        elements, output, gtf_ofp, tracking_ofp, fasta_fp, ref_genes]

    # dedicated builders are only forked when there are more threads than
    # element groups
    pids = []
    num_builders = max(0, config.NTHREADS - len(raw_elements))
    for _ in xrange(num_builders):
        pid = os.fork()
        if pid == 0:
            build_transcripts_worker(*transcript_building_children_args)
            os._exit(0)
        pids.append(pid)

    elements_feeder_pid = os.fork()
    if elements_feeder_pid == 0:
        feed_elements( raw_elements, elements, output,
                       gtf_ofp, tracking_ofp, fasta_fp, ref_genes )
        os._exit(0)

    for pid in pids:
        os.waitpid(pid, 0)
    os.waitpid(elements_feeder_pid, 0)

    # drain whatever the children reported
    genes = []
    while output.qsize() > 0:
        try:
            gene_entry = output.get_nowait()
        except Queue.Empty:
            continue
        genes.append(gene_entry)

    assert len(genes) == len(set(genes))
    config.log_statement("Finished building transcripts")

    gtf_ofp.close()
    tracking_ofp.close()

    # we store to unfinished so we know if it errors out early
    shutil.move(gtf_ofname + ".unfinished", gtf_ofname)
    shutil.move(tracking_ofname + ".unfinished", tracking_ofname)

    manager.shutdown()

    return genes
def estimate_mle_worker( gene_ids, data ):
    """Estimate transcript frequencies for genes taken from `gene_ids`.

    Runs until the 'FINISHED' marker is read from the queue. For each gene
    the design matrix is loaded and the maximum likelihood estimate is
    computed; the result is stored with data.set_mle as an array with one
    entry per transcript plus one, where entries that received no
    estimate stay at -1.

    Genes without a design matrix, or without expected/observed arrays,
    are skipped. Any other failure is logged and the gene is skipped.
    """
    while True:
        config.log_statement("Retrieving gene from queue")
        gene_id = gene_ids.get()
        if gene_id == 'FINISHED':
            config.log_statement("")
            return

        # reset per gene so the error handler never reports the previous
        # gene (or hits an unbound name) when loading fails
        gene = None
        try:
            config.log_statement( "Loading gene %s" % gene_id )
            gene = data.get_gene(gene_id)
            config.log_statement(
                "Finding MLE for Gene %s(%s:%s:%i-%i) - %i transcripts" \
                    % (gene.id, gene.chrm, gene.strand,
                       gene.start, gene.stop, len(gene.transcripts) ) )

            try:
                f_mat = data.get_design_matrix(gene_id)
            except NoDesignMatrixError:
                if config.DEBUG_VERBOSE:
                    config.log_statement(
                        "No design matrix for '%s'" % gene_id, log=True)
                continue

            num_reads_in_bams = data.get_num_reads_in_bams()
            expected_array, observed_array = f_mat.expected_and_observed(
                num_reads_in_bams)
            # identity checks: comparing arrays to None with == is
            # elementwise in numpy and has no single truth value
            if expected_array is None and observed_array is None:
                continue

            mle = frequency_estimation.estimate_transcript_frequencies(
                observed_array, expected_array)
        except Exception as inst:
            if gene is None:
                error_msg = "%i: Skipping %s: %s" % (
                    os.getpid(), gene_id, inst)
            else:
                error_msg = "%i: Skipping %s (%s:%s:%i-%i): %s" % (
                    os.getpid(), gene.id,
                    gene.chrm, gene.strand, gene.start, gene.stop, inst)
            config.log_statement( error_msg, log=True )
            config.log_statement( traceback.format_exc(), log=True )
            continue

        log_lhd = frequency_estimation.calc_lhd(
            mle, observed_array, expected_array)

        # add back in the missing transcripts
        full_mle = -1*numpy.ones(len(gene.transcripts)+1, dtype=float)
        observable = numpy.array(
            [-1,] + f_mat.transcript_indices().tolist()) + 1
        full_mle[observable] = mle

        data.set_mle(gene, full_mle)
        config.log_statement(
            "FINISHED MLE %s\t%.2f - updating queues" % (
                gene.id, log_lhd ) )
def find_confidence_bounds_worker(
        data, gene_ids, trans_index_cntrs, bnd_type ):
    """Worker loop: estimate `bnd_type` confidence bounds for queued genes.

    New genes are taken from `gene_ids` first. Once that queue is empty the
    worker helps with a gene that still has transcripts left, chosen via
    the shared counters in `trans_index_cntrs`. It returns when neither
    source yields any work. Exceptions raised while processing a gene are
    logged and the loop continues.
    """
    def build_trans_indices(f_mat):
        # (transcript index, design matrix row, bound type); rows are
        # shifted by one relative to the enumeration order
        return [ (t_index, row_num+1, bnd_type)
                 for row_num, t_index
                 in enumerate(f_mat.transcript_indices()) ]

    def get_new_gene():
        # get a gene to process
        try:
            gene_id = gene_ids.get(timeout=0.1)
        except Queue.Empty:
            assert gene_ids.qsize() == 0
            config.log_statement("")
            raise IndexError("No genes left")

        config.log_statement(
            "Loading design matrix for gene '%s'" % gene_id)
        gene = data.get_gene(gene_id)
        try:
            f_mat = data.get_design_matrix(gene_id)
        except NoDesignMatrixError:
            if config.DEBUG_VERBOSE:
                config.log_statement(
                    "No design matrix for '%s'" % gene_id, log=True)
            raise
        mle_estimate = data.get_mle(gene_id)
        trans_indices = build_trans_indices(f_mat)

        cntr = trans_index_cntrs[gene_id]
        with cntr.get_lock():
            # -1000 marks a counter nobody has initialised yet; start it
            # at the last transcript index so workers count down from it
            if cntr.value == -1000:
                cntr.value = len(trans_indices)-1

        return gene, f_mat, mle_estimate, trans_indices, cntr

    def get_gene_being_processed():
        # pick the gene whose counter is largest (and strictly positive)
        longest_gene_id = None
        gene_len = 0
        for gene_id, cntr in trans_index_cntrs.iteritems():
            value = cntr.value
            if value > gene_len:
                longest_gene_id = gene_id
                gene_len = value

        if longest_gene_id is None:
            return None

        gene = data.get_gene(longest_gene_id)
        f_mat = data.get_design_matrix(longest_gene_id)
        mle_estimate = data.get_mle(longest_gene_id)
        trans_indices = build_trans_indices(f_mat)

        return ( gene, f_mat, mle_estimate, trans_indices,
                 trans_index_cntrs[longest_gene_id] )

    num_reads_in_bams = data.get_num_reads_in_bams()
    while True:
        try:
            try:
                gene, f_mat, mle_estimate, trans_indices, cntr = \
                    get_new_gene()
            except NoDesignMatrixError:
                continue
            except IndexError:
                # queue exhausted: help with a gene already in progress
                res = get_gene_being_processed()
                if res is None:
                    break
                gene, f_mat, mle_estimate, trans_indices, cntr = res

            cbs = find_confidence_bounds_in_gene(
                gene, num_reads_in_bams,
                f_mat, mle_estimate,
                trans_indices, cntr,
                cb_alpha=config.CB_SIG_LEVEL)
            data.set_cbs(gene.id, cbs)

            if config.VERBOSE:
                config.log_statement("Finished processing '%s'" % gene.id)
        except Exception:
            config.log_statement( traceback.format_exc(), log=True )
def call_peaks( signal_cov, original_control_cov, reads_type, gene,
                alpha, min_noise_frac,
                min_merge_size, min_rel_merge_size,
                min_rd_cnt,
                trim_fraction,
                min_peak_size, max_peak_size,
                max_exp_sum_fraction, max_exp_mean_cvg_fraction):
    """Call peaks in `signal_cov` relative to a control coverage track.

    Steps:
      1. For each of N_REPS repetitions, iteratively estimate the noise
         regions, noise fraction and control regression coefficients until
         they stabilise (or MAX_NUM_ITERATIONS is reached). Every base in a
         final noise region loses 1/N_REPS of its (initially 1.0) score.
      2. Runs of consecutive bases with a remaining score above 1e-6
         become candidate peaks, which are merged with
         merge_adjacent_intervals until the peak count stops changing.
      3. Each peak is trimmed using `trim_fraction` of its coverage and
         stripped of zero-coverage edges.
      4. Peaks are filtered on read count, length, and their count / mean
         coverage relative to the strongest peak.

    Returns:
        list of (start, stop, cnt) tuples with inclusive coordinates; an
        empty list when no base survives step 1.
    """
    signal = numpy.ones(len(signal_cov))
    for _ in xrange(N_REPS):
        noise_frac = 1.0
        noise_regions = [(0, len(signal)),]
        reg_coef, control_cov = \
            update_control_cov_for_five_prime_bias(
                noise_regions, noise_frac,
                signal_cov, original_control_cov, reads_type)
        for i in xrange(MAX_NUM_ITERATIONS):
            if DEBUG_VERBOSE:
                region = {'chrm': gene.chrm, 'strand': gene.strand,
                          'start': gene.start, 'stop': gene.stop}
                write_bedgraph_from_array(
                    1000*control_cov, region, "control.%i"%i)
                write_bedgraph_from_array(
                    signal_cov, region, "signal.%i"%i)
                config.log_statement(
                    "Iter %i: Noise Frac %.2f%%\tReg Coef: %s" % (
                        i+1, noise_frac*100, reg_coef))
            noise_regions = find_noise_regions(
                signal_cov, control_cov,
                noise_frac, alpha=alpha,
                min_peak_size=min_peak_size )
            new_noise_frac = estimate_noise_frac(
                noise_regions, signal_cov, control_cov, min_noise_frac)
            new_reg_coef, control_cov = \
                update_control_cov_for_five_prime_bias(
                    noise_regions, noise_frac,
                    signal_cov, original_control_cov, reads_type)
            # converged: noise fraction and both coefficients are stable
            if noise_frac - new_noise_frac <= 1e-3 \
                    and abs(reg_coef[0] - new_reg_coef[0]) < 1e-3 \
                    and abs(reg_coef[1] - new_reg_coef[1]) < 1e-3:
                break
            noise_frac = new_noise_frac
            reg_coef = new_reg_coef

        for start, stop in noise_regions:
            signal[start:stop] -= 1./N_REPS

    # build a list of inclusive peak starts and stops
    peaks = []
    nonzero_bases = (signal>1e-6).nonzero()[0].tolist()
    if len(nonzero_bases) == 0:
        return peaks
    curr_start = nonzero_bases.pop(0)
    curr_stop = curr_start
    for base in nonzero_bases:
        if base == curr_stop+1:
            curr_stop += 1
        else:
            peaks.append((curr_start, curr_stop))
            curr_start, curr_stop = base, base
    peaks.append((curr_start, curr_stop))

    # merge until the number of peaks stops changing
    while True:
        new_peaks = merge_adjacent_intervals(
            peaks, min_merge_size, min_rel_merge_size, max_peak_size)
        converged = (len(new_peaks) == len(peaks))
        peaks = new_peaks
        if converged:
            break

    # trim peaks
    new_peaks = []
    for start, stop in peaks:
        assert stop >= start
        cov_region = signal_cov[start:stop+1]
        total_cov = cov_region.sum()
        cov_cumsum = cov_region.cumsum()-cov_region[0]
        # .max()/.min() of an empty index array raises ValueError, and
        # int() of a non-finite threshold raises ValueError/OverflowError;
        # in both cases fall back to the untrimmed edge. Anything else
        # (e.g. KeyboardInterrupt) is no longer swallowed.
        try:
            trim_start = numpy.flatnonzero(
                cov_cumsum < int(trim_fraction*total_cov)).max()
        except (ValueError, OverflowError):
            trim_start = 0
        try:
            trim_stop = numpy.flatnonzero(
                cov_cumsum > (1.0-trim_fraction)*total_cov).min()
        except (ValueError, OverflowError):
            trim_stop = len(cov_region)-1
        # drop zero-coverage bases from both ends
        while trim_start < len(cov_region)-1 \
                and cov_region[trim_start] == 0:
            trim_start += 1
        while trim_stop > trim_start and cov_region[trim_stop] == 0:
            trim_stop -= 1
        new_peaks.append((trim_start+start,
                          trim_stop+start,
                          cov_region[trim_start:trim_stop+1].sum()))

    # filter peaks
    exp_filtered_peaks = []
    max_peak_cnt = float(max(cnt for start, stop, cnt in new_peaks))
    max_peak_mean_cnt = float(
        max(cnt/float(stop-start+1) for start, stop, cnt in new_peaks))
    for start, stop, cnt in new_peaks:
        length = stop - start + 1
        if (cnt >= min_rd_cnt
                and length >= min_peak_size
                and length <= max_peak_size
                and cnt/max_peak_cnt > max_exp_sum_fraction
                and (cnt/float(length))/max_peak_mean_cnt
                    > max_exp_mean_cvg_fraction ):
            exp_filtered_peaks.append((start, stop, cnt))

    return exp_filtered_peaks
def find_confidence_bounds_in_gene( gene, num_reads_in_bams,
                                    f_mat, mle_estimate,
                                    trans_indices, cntr,
                                    cb_alpha):
    """Estimate confidence bounds for the transcripts of one gene.

    Work is handed out through the shared counter `cntr`: under its lock
    each caller reads the current index, stops when it is -1, and
    otherwise decrements it and processes `trans_indices[index]`. Several
    workers can therefore share one gene.

    A failure while estimating a single bound is logged and replaced by a
    trivial bound (0.0 for 'lb', 1.0 otherwise) with p-value 1.

    Returns:
        list of (bnd_type, trans_index, bnd) tuples for the transcripts
        this call processed.
    """
    # update the mle_estimate array to only store observable transcripts
    # add 1 to skip the out of gene bin
    observable_trans_indices = (
        numpy.array([-1,] + f_mat.transcript_indices().tolist())+1 )
    mle_estimate = mle_estimate[observable_trans_indices]

    if config.VERBOSE:
        config.log_statement(
            "Estimating confidence bounds for gene %s" % gene.id )

    # XXX the count of skipped (filtered) transcripts was flagged as
    # unverified in an earlier revision of this function

    res = []
    while True:
        # claim the next transcript index, or stop when none remain
        with cntr.get_lock():
            index = cntr.value
            if index == -1:
                config.log_statement('')
                break
            cntr.value -= 1

        trans_index, exp_mat_row, bnd_type = trans_indices[index]

        config.log_statement(
            "Estimating %s confidence bound for gene %s (%i/%i remain)" % (
                bnd_type, gene.id, cntr.value+1, len(gene.transcripts)))
        try:
            p_value, bnd = frequency_estimation.estimate_confidence_bound(
                f_mat, num_reads_in_bams,
                exp_mat_row, mle_estimate, bnd_type, cb_alpha )
        except Exception as inst:
            # fall back to the uninformative bound for this transcript
            p_value = 1.
            bnd = 0.0 if bnd_type == 'lb' else 1.0
            error_msg = "%i: Skipping %s (%s:%s:%i-%i): %s" % (
                os.getpid(), gene.id,
                gene.chrm, gene.strand, gene.start, gene.stop, inst)
            config.log_statement( error_msg, log=True )
            config.log_statement( traceback.format_exc(), log=True )

        if config.DEBUG_VERBOSE:
            config.log_statement(
                "FINISHED %s BOUND %s\t%s\t%i/%i\t%.2e\t%.2e" % (
                    bnd_type, gene.id, None,
                    trans_index, len(gene.transcripts),
                    bnd, p_value ) )
        res.append((bnd_type, trans_index, bnd))

    if config.VERBOSE:
        config.log_statement(
            "FINISHED Estimating confidence bound for gene %s" % gene.id )

    # callers store this value, so it must be returned
    return res
bnd = 0.0 if bnd_type == 'lb' else 1.0 error_msg = "%i: Skipping %s (%s:%s:%i-%i): %s" % ( os.getpid(), gene.id, gene.chrm, gene.strand, gene.start, gene.stop, inst) config.log_statement( error_msg, log=True ) config.log_statement( traceback.format_exc(), log=True ) if config.DEBUG_VERBOSE: config.log_statement( "FINISHED %s BOUND %s\t%s\t%i/%i\t%.2e\t%.2e" % ( bnd_type, gene.id, None, trans_index, len(gene.transcripts), bnd, p_value ) ) res.append((bnd_type, trans_index, bnd)) if config.VERBOSE: config.log_statement( "FINISHED Estimating confidence bound for gene %s" % gene.id ) return res def find_confidence_bounds_worker( data, gene_ids, trans_index_cntrs, bnd_type ): def get_new_gene(): # get a gene to process try: gene_id = gene_ids.get(timeout=0.1) except Queue.Empty: assert gene_ids.qsize() == 0 config.log_statement("") raise IndexError, "No genes left" config.log_statement(
def call_peaks(signal_cov, original_control_cov, reads_type, gene,
               alpha, min_noise_frac,
               min_merge_size, min_rel_merge_size,
               min_rd_cnt,
               trim_fraction,
               min_peak_size, max_peak_size,
               max_exp_sum_fraction, max_exp_mean_cvg_fraction):
    """Call peaks in `signal_cov` relative to a control coverage track.

    Steps:
      1. For each of N_REPS repetitions, iteratively estimate the noise
         regions, noise fraction and control regression coefficients until
         they stabilise (or MAX_NUM_ITERATIONS is reached). Every base in a
         final noise region loses 1/N_REPS of its (initially 1.0) score.
      2. Runs of consecutive bases with a remaining score above 1e-6
         become candidate peaks, which are merged with
         merge_adjacent_intervals until the peak count stops changing.
      3. Each peak is trimmed using `trim_fraction` of its coverage and
         stripped of zero-coverage edges.
      4. Peaks are filtered on read count, length, and their count / mean
         coverage relative to the strongest peak.

    Returns:
        list of (start, stop, cnt) tuples with inclusive coordinates; an
        empty list when no base survives step 1.
    """
    signal = numpy.ones(len(signal_cov))
    for _ in xrange(N_REPS):
        noise_frac = 1.0
        noise_regions = [
            (0, len(signal)),
        ]
        reg_coef, control_cov = \
            update_control_cov_for_five_prime_bias(
                noise_regions, noise_frac,
                signal_cov, original_control_cov, reads_type)
        for i in xrange(MAX_NUM_ITERATIONS):
            if DEBUG_VERBOSE:
                region = {
                    'chrm': gene.chrm,
                    'strand': gene.strand,
                    'start': gene.start,
                    'stop': gene.stop
                }
                write_bedgraph_from_array(1000 * control_cov, region,
                                          "control.%i" % i)
                write_bedgraph_from_array(signal_cov, region,
                                          "signal.%i" % i)
                config.log_statement(
                    "Iter %i: Noise Frac %.2f%%\tReg Coef: %s" %
                    (i + 1, noise_frac * 100, reg_coef))
            noise_regions = find_noise_regions(signal_cov,
                                               control_cov,
                                               noise_frac,
                                               alpha=alpha,
                                               min_peak_size=min_peak_size)
            new_noise_frac = estimate_noise_frac(noise_regions, signal_cov,
                                                 control_cov, min_noise_frac)
            new_reg_coef, control_cov = \
                update_control_cov_for_five_prime_bias(
                    noise_regions, noise_frac,
                    signal_cov, original_control_cov, reads_type)
            # converged: noise fraction and both coefficients are stable
            if noise_frac - new_noise_frac <= 1e-3 \
                    and abs(reg_coef[0] - new_reg_coef[0]) < 1e-3 \
                    and abs(reg_coef[1] - new_reg_coef[1]) < 1e-3:
                break
            noise_frac = new_noise_frac
            reg_coef = new_reg_coef

        for start, stop in noise_regions:
            signal[start:stop] -= 1. / N_REPS

    # build a list of inclusive peak starts and stops
    peaks = []
    nonzero_bases = (signal > 1e-6).nonzero()[0].tolist()
    if len(nonzero_bases) == 0:
        return peaks
    curr_start = nonzero_bases.pop(0)
    curr_stop = curr_start
    for base in nonzero_bases:
        if base == curr_stop + 1:
            curr_stop += 1
        else:
            peaks.append((curr_start, curr_stop))
            curr_start, curr_stop = base, base
    peaks.append((curr_start, curr_stop))

    # merge until the number of peaks stops changing
    while True:
        new_peaks = merge_adjacent_intervals(peaks, min_merge_size,
                                             min_rel_merge_size,
                                             max_peak_size)
        converged = (len(new_peaks) == len(peaks))
        peaks = new_peaks
        if converged:
            break

    # trim peaks
    new_peaks = []
    for start, stop in peaks:
        assert stop >= start
        cov_region = signal_cov[start:stop + 1]
        total_cov = cov_region.sum()
        cov_cumsum = cov_region.cumsum() - cov_region[0]
        # .max()/.min() of an empty index array raises ValueError, and
        # int() of a non-finite threshold raises ValueError/OverflowError;
        # in both cases fall back to the untrimmed edge. Anything else
        # (e.g. KeyboardInterrupt) is no longer swallowed.
        try:
            trim_start = numpy.flatnonzero(
                cov_cumsum < int(trim_fraction * total_cov)).max()
        except (ValueError, OverflowError):
            trim_start = 0
        try:
            trim_stop = numpy.flatnonzero(
                cov_cumsum > (1.0 - trim_fraction) * total_cov).min()
        except (ValueError, OverflowError):
            trim_stop = len(cov_region) - 1
        # drop zero-coverage bases from both ends
        while trim_start < len(cov_region) - 1 \
                and cov_region[trim_start] == 0:
            trim_start += 1
        while trim_stop > trim_start and cov_region[trim_stop] == 0:
            trim_stop -= 1
        new_peaks.append((trim_start + start, trim_stop + start,
                          cov_region[trim_start:trim_stop + 1].sum()))

    # filter peaks
    exp_filtered_peaks = []
    max_peak_cnt = float(max(cnt for start, stop, cnt in new_peaks))
    max_peak_mean_cnt = float(
        max(cnt / float(stop - start + 1) for start, stop, cnt in new_peaks))
    for start, stop, cnt in new_peaks:
        length = stop - start + 1
        if (cnt >= min_rd_cnt and length >= min_peak_size
                and length <= max_peak_size
                and cnt / max_peak_cnt > max_exp_sum_fraction
                and (cnt / float(length)) / max_peak_mean_cnt
                > max_exp_mean_cvg_fraction):
            exp_filtered_peaks.append((start, stop, cnt))

    return exp_filtered_peaks
def quantify_transcript_expression(
        promoter_reads, rnaseq_reads, polya_reads,
        pickled_gene_fnames, ofname,
        sample_type=None, rep_id=None ):
    """Quantify transcript expression and write it to a tracking file.

    Pipeline: build the shared data from the pickled genes, build design
    matrices, populate the expression queue, estimate MLEs, optionally
    estimate lower / upper confidence bounds (controlled by the
    config.ESTIMATE_*_CONFIDENCE_BOUNDS flags), then write everything to
    `ofname`.

    Requires `rnaseq_reads.fl_dists` to be set.
    Side effects: sets the module globals SAMPLE_ID and REP_ID.
    Returns None.
    """
    assert rnaseq_reads.fl_dists is not None
    global SAMPLE_ID
    SAMPLE_ID = sample_type
    global REP_ID
    REP_ID = rep_id

    if config.VERBOSE:
        config.log_statement( "Initializing processing data" )
    data = SharedData(pickled_gene_fnames)

    if config.VERBOSE:
        config.log_statement( "Building design matrices" )
    build_design_matrices(
        data, rnaseq_reads.fl_dists,
        (rnaseq_reads, promoter_reads, polya_reads))

    if config.VERBOSE:
        config.log_statement(
            "Populating input queue from expression queue" )
    data.populate_expression_queue()

    if config.VERBOSE:
        config.log_statement( "Estimating MLEs" )
    estimate_mles( data )

    if config.VERBOSE:
        config.log_statement(
            "Calculating FPKMS and Writing mle's to output mle" )

    if config.ESTIMATE_LOWER_CONFIDENCE_BOUNDS:
        if config.VERBOSE:
            config.log_statement( "Estimating lower confidence bounds" )
        estimate_confidence_bounds(data, 'lb')
        if config.VERBOSE:
            config.log_statement(
                "FINISHED Estimating lower confidence bounds" )

    if config.ESTIMATE_UPPER_CONFIDENCE_BOUNDS:
        if config.VERBOSE:
            config.log_statement( "Estimating upper confidence bounds" )
        estimate_confidence_bounds(data, 'ub')
        if config.VERBOSE:
            config.log_statement(
                "FINISHED Estimating upper confidence bounds" )

    if config.VERBOSE:
        config.log_statement( "Writing output data to tracking file" )

    expression_ofp = ThreadSafeFile(ofname, "w")
    try:
        write_data_to_tracking_file(
            data, rnaseq_reads.fl_dists, expression_ofp)
    finally:
        # close the output file even when writing fails part-way
        expression_ofp.close()

    return
def find_all_gene_segments( rnaseq_reads, promoter_reads, polya_reads,
                            ref_genes, ref_elements_to_include,
                            region_to_use=None ):
    """Find gene segments, the fragment length dists and unique read counts.

    Forks config.NTHREADS children that scan genome segments for
    transcribed regions and junctions, then merges the transcribed
    regions, filters the junctions against the opposite strand, builds the
    fragment length distributions, and clusters intron-connected regions
    into GeneElements objects.

    Returns:
        (new_genes, fl_dists, num_unique_reads). Genes spanning fewer than
        config.MIN_GENE_LENGTH bases are dropped.
    """
    def run_child_and_exit(target, args):
        """Run `target(*args)` in a forked child; never returns.

        Without the unconditional os._exit an exception raised by `target`
        would unwind into the parent's call stack *inside the child*.
        """
        import traceback
        exit_code = 1
        try:
            target(*args)
            exit_code = 0
        except Exception:
            config.log_statement(traceback.format_exc(), log=True)
        finally:
            os._exit(exit_code)

    config.log_statement("Finding gene segments")
    contig_lens = dict(zip(*get_contigs_and_lens(
        [ reads for reads in [rnaseq_reads, promoter_reads, polya_reads]
          if reads is not None ] )))

    config.log_statement("Spawning gene segment finding children")
    segments_queue = multiprocessing.Queue()
    global_gene_data = GlobalGeneSegmentData(contig_lens)

    # translate the inclusion flags into reference element type names
    ref_element_types_to_include = set()
    if ref_elements_to_include.junctions:
        ref_element_types_to_include.add('intron')
    if ref_elements_to_include.TSS:
        ref_element_types_to_include.add('tss_exon')
    if ref_elements_to_include.TES:
        ref_element_types_to_include.add('tes_exon')
    if ref_elements_to_include.promoters:
        ref_element_types_to_include.add('promoter')
    if ref_elements_to_include.polya_sites:
        ref_element_types_to_include.add('polya')
    if ref_elements_to_include.exons:
        ref_element_types_to_include.add('exon')
    # to give full gene connectivity
    if ref_elements_to_include.genes:
        ref_element_types_to_include.add('intron')
        ref_element_types_to_include.add('exon')

    pids = []
    for i in xrange(config.NTHREADS):
        pid = os.fork()
        if pid == 0:
            run_child_and_exit(
                find_segments_and_jns_worker,
                [ segments_queue, global_gene_data,
                  rnaseq_reads, promoter_reads, polya_reads,
                  ref_genes, ref_element_types_to_include ])
        pids.append(pid)

    config.log_statement("Populating gene segment queue")
    segments = split_genome_into_segments(contig_lens, region_to_use)
    for segment in segments:
        segments_queue.put(segment)
    # one sentinel per child
    for i in xrange(config.NTHREADS):
        segments_queue.put('FINISHED')

    while segments_queue.qsize() > 2*config.NTHREADS:
        config.log_statement(
            "Waiting on gene segment finding children (%i/%i segments remain)"
            %(segments_queue.qsize(), len(segments)))
        time.sleep(0.5)

    for i, pid in enumerate(pids):
        config.log_statement(
            "Waiting on gene segment finding children (%i/%i children remain)"
            %(len(pids)-i, len(pids)))
        os.waitpid(pid, 0)

    config.log_statement("Merging gene segments")
    merged_transcribed_regions = {}
    for key, intervals in global_gene_data.transcribed_regions.iteritems():
        merged_transcribed_regions[key] = merge_adjacent_intervals(
            intervals, config.MAX_EMPTY_REGION_SIZE)
    transcribed_regions = merged_transcribed_regions

    config.log_statement("Filtering junctions")
    filtered_jns = defaultdict(dict)
    for contig in contig_lens:
        # sum the per-junction counts for each strand
        plus_jns = defaultdict(int)
        for jn, cnt in global_gene_data.jns[(contig, '+')]:
            plus_jns[jn] += cnt
        minus_jns = defaultdict(int)
        for jn, cnt in global_gene_data.jns[(contig, '-')]:
            minus_jns[jn] += cnt
        filtered_jns[(contig, '+')] = filter_jns(plus_jns, minus_jns)
        filtered_jns[(contig, '-')] = filter_jns(minus_jns, plus_jns)

    config.log_statement("Building FL dist")
    fl_dists = build_fl_dists_from_fls_dict(dict(global_gene_data.frag_lens))

    # reference introns that were not observed are added with a zero count
    if ref_elements_to_include.junctions:
        for gene in ref_genes:
            for jn in gene.extract_elements()['intron']:
                if jn not in filtered_jns[(gene.chrm, gene.strand)]:
                    filtered_jns[(gene.chrm, gene.strand)][jn] = 0

    config.log_statement("Clustering gene segments")
    # build bins for all of the genes and junctions, converting them to 1-based
    # in the process
    new_genes = []
    # NOTE(review): new_introns is filled but never returned or read here
    new_introns = []
    for contig in contig_lens:
        for strand in '+-':
            key = (contig, strand)
            jns = [ (start, stop, cnt)
                    for (start, stop), cnt
                    in sorted(filtered_jns[key].iteritems()) ]
            for start, stop, cnt in jns:
                new_introns.append(
                    SegmentBin(start, stop, ["D_JN",], ["R_JN",], "INTRON"))
            intervals = cluster_intron_connected_segments(
                transcribed_regions[key],
                [(start, stop) for start, stop, cnt in jns ] )
            # add the intergenic space, since there could be interior genes
            for gene_segments in intervals:
                new_gene = GeneElements( contig, strand )
                for start, stop in gene_segments:
                    new_gene.regions.append(
                        SegmentBin(start, stop,
                                   ["ESTART",], ["ESTOP",], "GENE"))
                if new_gene.stop-new_gene.start+1 < config.MIN_GENE_LENGTH:
                    continue
                new_genes.append(new_gene)

    # the counts are either objects exposing `.value` or plain numbers
    try:
        num_unique_reads = ReadCounts(*[
            float(x.value) for x in global_gene_data.num_unique_reads])
    except AttributeError:
        num_unique_reads = ReadCounts(*global_gene_data.num_unique_reads)

    global_gene_data.shutdown()

    config.log_statement("")

    return new_genes, fl_dists, num_unique_reads
if key not in grpd_exons: args.append(set()) else: exons = [tuple(x) for x in grpd_exons[key].tolist() if x[0] >= g_start and x[1] <= g_stop] args.append(set(exons)) yield args def add_elements_for_contig_and_strand((contig, strand), grpd_exons, elements, gene_id_cntr, output, gtf_ofp, tracking_ofp, fasta_fp, ref_genes): if fasta_fp != None: fasta = Fastafile(fasta_fp.name) else: fasta = None config.log_statement( "Clustering elements into genes for %s:%s" % ( contig, strand ) ) """ old code that actually clustered elements args = [] for key in ('tss_exon', 'internal_exon', 'tes_exon', 'single_exon_gene', 'promoter', 'polya', 'intron'): if key not in grpd_exons: args.append(set()) else: args.append( set(map(tuple, grpd_exons[key].tolist()))) args.append(strand) """ for ( tss_es, internal_es, tes_es, se_ts, promoters, polyas, jns ) in group_elements_in_gene(grpd_exons): # skip genes without all of the element types
def find_segments_and_jns_worker(
        segments, global_gene_data,
        rnaseq_reads, promoter_reads, polya_reads,
        ref_elements, ref_elements_to_include ):
    """Worker loop: find transcribed regions and junctions per segment.

    Segments are read from the `segments` queue until the 'FINISHED'
    sentinel is seen. Results are accumulated locally and pushed to
    `global_gene_data` whenever more than 1e5 reads have been counted, and
    once more on exit.

    A segment that raises TooManyReadsError is split in half and both
    halves are processed by this worker. They are deliberately NOT put
    back on the shared queue: there they would sit behind the 'FINISHED'
    sentinels and could be dropped once every worker has exited.
    """
    rnaseq_reads = rnaseq_reads.reload()
    if promoter_reads is not None:
        promoter_reads = promoter_reads.reload()
    if polya_reads is not None:
        polya_reads = polya_reads.reload()

    local_frag_lens = defaultdict(int)
    local_transcribed_regions = defaultdict(list)
    local_jns = defaultdict(list)
    local_rd_cnts = [0.0, 0.0, 0.0]

    # halves of segments that were too dense to process in one piece
    pending_segments = []

    while True:
        if pending_segments:
            segment = pending_segments.pop()
        else:
            try:
                config.log_statement("Waiting for segment")
                segment = segments.get(timeout=1.0)
            except Queue.Empty:
                continue
            if segment == 'FINISHED':
                config.log_statement("")
                break

        config.log_statement("Finding genes and jns in %s" % str(segment) )
        try:
            ( r_transcribed_regions, r_jns, r_n_unique_reads, r_frag_lens,
              ) = find_transcribed_regions_and_jns_in_segment(
                segment, rnaseq_reads, promoter_reads, polya_reads,
                ref_elements, ref_elements_to_include)
        except TooManyReadsError:
            # split at the midpoint; // keeps the coordinate an integer
            seg1 = list(segment)
            seg1[2] = segment[1] + (segment[2]-segment[1])//2
            seg2 = list(segment)
            seg2[1] = seg1[2]
            # pushed in reverse so that seg1 is processed first
            pending_segments.append(seg2)
            pending_segments.append(seg1)
            config.log_statement("")
            continue

        for (rd_key, rls), fls in r_frag_lens.iteritems():
            for fl, cnt in fls.iteritems():
                local_frag_lens[(rd_key, rls, fl)] += cnt

        # region coordinates come back relative to the segment start
        # (segment[1]); shift them to contig coordinates
        for strand in ('+', '-'):
            local_transcribed_regions[(segment[0], strand)].extend([
                (start+segment[1], stop+segment[1])
                for start, stop in r_transcribed_regions[strand]])
            local_jns[(segment[0], strand)].extend(r_jns[strand])

        for i, val in enumerate(r_n_unique_reads):
            local_rd_cnts[i] += val

        # flush periodically to bound the size of the local accumulators
        if sum(local_rd_cnts) > 1e5:
            global_gene_data.update_all_data(
                local_frag_lens,
                local_transcribed_regions,
                local_jns,
                local_rd_cnts)

            local_frag_lens = defaultdict(int)
            local_transcribed_regions = defaultdict(list)
            local_jns = defaultdict(list)
            local_rd_cnts = [0.0, 0.0, 0.0]

    # final flush of whatever is left
    global_gene_data.update_all_data(
        local_frag_lens,
        local_transcribed_regions,
        local_jns,
        local_rd_cnts)

    return