Code example #1
    def __init__(self, fl_min, fl_max, fl_density, stats=None):
        assert fl_min <= fl_max
        assert fl_max < 100000
        assert abs(sum(fl_density) - 1.) < 1e-6
        try:
            assert type(fl_min) == int
            assert type(fl_max) == int
            assert len(fl_density) == (fl_max - fl_min + 1)
        except:
            config.log_statement(fl_density)
            config.log_statement((fl_min, fl_max))
            raise

        self.fl_min = fl_min
        self.fl_max = fl_max
        self.fl_density = fl_density
        self.fl_density_cumsum = fl_density.cumsum()
        # cumsum weighted by the fragment length, just a caching optimization
        self.fl_density_weighted_cumsum = (
            self.fl_density * numpy.arange(fl_min, fl_max + 1)).cumsum()

        # build and set the hash value
        self._hash_value = hash( ( self.fl_min, self.fl_max, \
                                   tuple( self.fl_density ) ) )
        self.stats = stats
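
The two cached arrays are straightforward to reproduce outside the class. A minimal sketch, assuming a toy density over fragment lengths 100-103 (the enclosing class is not shown in the snippet, so the cached attributes are computed as plain variables here):

import numpy

fl_min, fl_max = 100, 103
fl_density = numpy.array([0.2, 0.3, 0.3, 0.2])
assert abs(fl_density.sum() - 1.) < 1e-6

# P(fragment length <= l), indexed from fl_min
fl_density_cumsum = fl_density.cumsum()
# running sum of length * P(length), the cached weighted cumsum
fl_density_weighted_cumsum = (
    fl_density * numpy.arange(fl_min, fl_max + 1)).cumsum()
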
Code example #2
    def _build_rnaseq_arrays(self, gene, rnaseq_reads, fl_dists):
        # bin the rnaseq reads
        expected_rnaseq_cnts, observed_rnaseq_cnts = \
            build_expected_and_observed_rnaseq_counts(
                gene, rnaseq_reads, fl_dists )
        clustered_bins = cluster_bins(expected_rnaseq_cnts)
        #for cluster in clustered_bins:
        #    print cluster
        #print

        # if no transcripts are observable given the fl dist, then return nothing
        if len(expected_rnaseq_cnts) == 0:
            self.array_types.append('RNASeq')
            self.obs_cnt_arrays.append(None)
            self.expected_freq_arrays.append(None)
            return

        # build the expected and observed counts, and convert them to frequencies
        ( expected_rnaseq_array, observed_rnaseq_array, unobservable_rnaseq_trans ) = \
              build_expected_and_observed_arrays(
                expected_rnaseq_cnts, observed_rnaseq_cnts, normalize=True )

        del expected_rnaseq_cnts, observed_rnaseq_cnts

        if config.DEBUG_VERBOSE:
            config.log_statement("Clustering bins in RNAseq array")
        expected_rnaseq_array, observed_rnaseq_array, clusters = cluster_rows(
            expected_rnaseq_array, observed_rnaseq_array)

        self.array_types.append('RNASeq')
        self.obs_cnt_arrays.append(observed_rnaseq_array)
        self.expected_freq_arrays.append(expected_rnaseq_array)
        self.unobservable_transcripts.update(unobservable_rnaseq_trans)
Code example #3
    def get_new_gene():

        # get a gene to process
        try:
            gene_id = gene_ids.get(timeout=0.1)
        except Queue.Empty:
            assert gene_ids.qsize() == 0
            raise IndexError, "No genes left"

        config.log_statement("Loading design matrix for gene '%s'" % gene_id)

        gene = data.get_gene(gene_id)
        try:
            f_mat = data.get_design_matrix(gene_id)
        except NoDesignMatrixError:
            if config.DEBUG_VERBOSE:
                config.log_statement("No design matrix for '%s'" % gene_id,
                                     log=True)
            raise

        mle_estimate = data.get_mle(gene_id)

        trans_indices = []
        for row_num, t_index in enumerate(f_mat.transcript_indices()):
            trans_indices.append((t_index, row_num + 1, bnd_type))

        cntr = trans_index_cntrs[gene_id]
        with cntr.get_lock():
            if cntr.value == -1000:
                cntr.value = len(trans_indices) - 1

        return gene, f_mat, mle_estimate, trans_indices, cntr
Code example #4
 def set_design_matrix(self, gene_id, f_mat):
     ofname = config.get_fmat_tmp_fname(gene_id, SAMPLE_ID, REP_ID)
     
     # because there's no cache invalidation mechanism, we're only
     # allowed to set the f_mat object once. This also allows us to
     # move the load outside of the lock
     try: assert self.design_mat_filenames[gene_id].value == ''
     except:
         config.log_statement(
             "%s has already had its design matrix set (%s)" % (
                 gene_id, self.design_mat_filenames[gene_id].value ), 
             log=True)
         return
     
     with open(ofname, "w") as ofp:
         pickle.dump(f_mat, ofp)
     
     with self.design_mat_lock: 
         self.design_mat_filenames[gene_id].value = ofname
     
     if f_mat.num_rnaseq_reads != None:
         with self.num_rnaseq_reads.get_lock():
             self.num_rnaseq_reads.value += f_mat.num_rnaseq_reads
     if f_mat.num_fp_reads != None:
         with self.num_cage_reads.get_lock():
             self.num_cage_reads.value += f_mat.num_fp_reads
     if f_mat.num_tp_reads != None:
         with self.num_polya_reads.get_lock():
             self.num_polya_reads.value += f_mat.num_tp_reads
     
     return
Code example #5
File: fit_forests_human_polyA.py  Project: bdgp/grit
def get_RNAseq_densities( all_reads, polyAs ):
    '''
    get the local RNA-seq read densities 
    '''
    dense = dict()
    header = []
    for sample in (x.filename for x in all_reads):
        header.extend( [ sample + '_up_10_rd1', 
                         sample + '_down_10_rd1', 
                         sample + '_up_50_rd1', 
                         sample + '_down_50_rd1', 
                         sample + '_up_100_rd1', 
                         sample + '_down_100_rd1', 
                         sample + '_up_down_rat_10_rd1',
                         sample + '_up_down_rat_50_rd1', 
                         sample + '_up_down_rat_100_rd1' ] )
        header.extend( [ sample + '_up_10_rd2', 
                         sample + '_down_10_rd2', 
                         sample + '_up_50_rd2', 
                         sample + '_down_50_rd2', 
                         sample + '_up_100_rd2', 
                         sample + '_down_100_rd2', 
                         sample + '_up_down_rat_10_rd2', 
                         sample + '_up_down_rat_50_rd2', 
                         sample + '_up_down_rat_100_rd2' ] )
        header.extend( [ sample + '_up_down_rat_10_rd1_rd2', 
                         sample + '_up_down_rat_50_rd1_rd2', 
                         sample + '_up_down_rat_100_rd1_rd2' ] )
    
    # process a list of arguments for multithreading
    import multiprocessing
    manager = multiprocessing.Manager()
    dense = manager.dict()
    sites = manager.list()
    sites_lock = manager.Lock()
    
    for reads in all_reads:
        for (chrm, strand), polyA in polyAs.iteritems():
            chrm = clean_chr_name( chrm )
            for pos, cnt in sorted(polyA.iteritems()):
                sites.append( (chrm, strand, pos, cnt) )
    
    if VERBOSE: 
        config.log_statement(
            "Finding RNASeq read coverage around poly(A) sites with %i threads"\
                % NTHREADS)
    if NTHREADS == 1:
        get_RNAseq_density_worker( reads, sites, sites_lock, dense )
    else:
        from lib.multiprocessing_utils import Pool
        all_args = [( reads, sites, sites_lock, dense )]*NTHREADS
        p = Pool(NTHREADS)
        p.apply( get_RNAseq_density_worker, all_args )
    
    if VERBOSE: config.log_statement("FINISHED finding poly(A) coverage")
    
    return dict(dense), header
Code example #6
 def iter_good_exons():
     num = 0
     for (chrm, strand), exons in sorted( elements.iteritems()):
         for start,stop in iter_nonoverlapping_exons(exons):
             num += 1
             yield GenomicInterval(chrm, strand, start, stop)
         if config.DEBUG_VERBOSE:
             config.log_statement("FL ESTIMATION: %s %s" % ((chrm, strand), num ))
     return
Code example #7
def find_confidence_bounds_in_gene(gene, num_reads_in_bams, f_mat,
                                   mle_estimate, trans_indices, cntr,
                                   cb_alpha):
    # update the mle_estimate array to only store observable transcripts
    # add 1 to skip the out of gene bin
    observable_trans_indices = numpy.array(
        [-1] + f_mat.transcript_indices().tolist()) + 1
    mle_estimate = mle_estimate[observable_trans_indices]

    if config.VERBOSE:
        config.log_statement("Estimating confidence bounds for gene %s" %
                             gene.id)

    #n_skipped = sum( 1 for x in sorted(f_mat.filtered_transcripts)
    #                 if x < trans_indices[0])
    # XXX Make sure that this is being counted correctly
    #n_skipped_tmp = len(set(xrange(trans_indices[0])) - \
    #    set(x-1 for x in observable_trans_indices[1:] if x-1 < trans_indices[0]))
    #config.log_statement( str([n_skipped_tmp, n_skipped, f_mat.filtered_transcripts, \
    #    observable_trans_indices, trans_indices]), log=True)
    #assert n_skipped == n_skipped_tmp

    res = []
    while True:
        with cntr.get_lock():
            index = cntr.value
            if index == -1:
                break
            cntr.value -= 1

        trans_index, exp_mat_row, bnd_type = trans_indices[index]

        config.log_statement(
            "Estimating %s confidence bound for gene %s (%i/%i remain)" %
            (bnd_type, gene.id, cntr.value + 1, len(gene.transcripts)))
        try:
            p_value, bnd = frequency_estimation.estimate_confidence_bound(
                f_mat, num_reads_in_bams, exp_mat_row, mle_estimate, bnd_type,
                cb_alpha)
        except Exception, inst:
            p_value = 1.
            bnd = 0.0 if bnd_type == 'lb' else 1.0
            error_msg = "%i: Skipping %s (%s:%s:%i-%i): %s" % (os.getpid(
            ), gene.id, gene.chrm, gene.strand, gene.start, gene.stop, inst)
            config.log_statement(error_msg, log=True)
            config.log_statement(traceback.format_exc(), log=True)

        if config.DEBUG_VERBOSE:
            config.log_statement(
                "FINISHED %s BOUND %s\t%s\t%i/%i\t%.2e\t%.2e" %
                (bnd_type, gene.id, None, trans_index, len(
                    gene.transcripts), bnd, p_value))
        res.append((bnd_type, trans_index, bnd))

    if config.VERBOSE:
        config.log_statement(
            "FINISHED Estimating confidence bound for gene %s" % gene.id)

    return res
Code example #8
def cluster_bins(expected_rnaseq_cnts):
    if config.DEBUG_VERBOSE:
        config.log_statement("Normalizing bin frequencies")

    clustered_bins = defaultdict(list)
    for bin, transcripts_and_cnts in expected_rnaseq_cnts.items():
        row = numpy.array(
            [x[1] for x in sorted(transcripts_and_cnts.iteritems())])
        key = tuple((100000 * row / row.sum()).round().tolist())
        clustered_bins[key].append(bin)
    return clustered_bins.values()
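
cluster_bins keys each bin by its expected-count row normalized to sum to 100000 and rounded, so bins whose expected counts are proportional collapse into one cluster. A small standalone sketch of that keying, using hypothetical bin and transcript ids:

from collections import defaultdict
import numpy

expected_rnaseq_cnts = {
    'bin_a': {0: 1.0, 1: 3.0},
    'bin_b': {0: 10.0, 1: 30.0},  # proportional to bin_a
    'bin_c': {0: 5.0, 1: 1.0},
}
clustered_bins = defaultdict(list)
for bin, transcripts_and_cnts in expected_rnaseq_cnts.items():
    row = numpy.array([x[1] for x in sorted(transcripts_and_cnts.items())])
    key = tuple((100000 * row / row.sum()).round().tolist())
    clustered_bins[key].append(bin)
# clustered_bins.values() groups bin_a with bin_b and leaves bin_c alone
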
Code example #9
File: genes.py  Project: bdgp/grit
def load_gene_bndry_bins(genes, contig, strand, contig_len):
    if config.VERBOSE:
        config.log_statement(
            "Loading gene boundaries from annotated genes in %s:%s" %
            (contig, strand))

    regions_graph = nx.Graph()
    for gene in genes:
        if gene.chrm != contig: continue
        if gene.strand != strand: continue
        regions = [tuple(x) for x in gene.find_transcribed_regions()]
        regions_graph.add_nodes_from(regions)
        regions_graph.add_edges_from(izip(regions[:-1], regions[1:]))

    # group overlapping regions
    all_regions = sorted(regions_graph.nodes())
    if len(all_regions) == 0: return []

    grpd_regions = [[]]
    curr_start, curr_stop = all_regions[0]
    for x in all_regions:
        if x[0] < curr_stop:
            curr_stop = max(x[1], curr_stop)
            grpd_regions[-1].append(x)
        else:
            curr_start, curr_stop = x
            grpd_regions.append([x])
    # add edges for overlapping regions
    for grp in grpd_regions:
        regions_graph.add_edges_from(izip(grp[:-1], grp[1:]))

    # build gene objects with the intervals
    gene_bndry_bins = []
    for regions_cluster in nx.connected_components(regions_graph):
        gene_bin = GeneElements(contig, strand)
        regions = sorted(files.gtf.flatten(regions_cluster))
        for start, stop in regions:
            gene_bin.regions.append(
                SegmentBin(start, stop, ["ESTART"], ["ESTOP"], "GENE"))
        gene_bndry_bins.append(gene_bin)

    # XXX TODO expand gene boundaries
    # actually, it's probably better just to go through discovery

    return gene_bndry_bins
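
The grouping sweep in the middle of the function merges sorted intervals whenever a start falls before the running stop. A standalone trace of just that logic, with made-up coordinates:

all_regions = sorted([(1, 10), (5, 20), (30, 40), (35, 50), (60, 70)])
grpd_regions = [[]]
curr_start, curr_stop = all_regions[0]
for x in all_regions:
    if x[0] < curr_stop:
        curr_stop = max(x[1], curr_stop)
        grpd_regions[-1].append(x)
    else:
        curr_start, curr_stop = x
        grpd_regions.append([x])
# grpd_regions -> [[(1, 10), (5, 20)], [(30, 40), (35, 50)], [(60, 70)]]
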
Code example #10
def add_elements_for_contig_and_strand_worker(args_queue, elements,
                                              gene_id_cntr, output, gtf_ofp,
                                              tracking_ofp, fasta_fp,
                                              ref_genes):
    while True:
        args = args_queue.get()
        if args == 'FINISHED':
            config.log_statement("")
            return
        (contig, strand), grpd_exons = args
        add_elements_for_contig_and_strand(
            (contig, strand), grpd_exons, elements, gene_id_cntr, output,
            gtf_ofp, tracking_ofp, fasta_fp, ref_genes)
Code example #11
def build_transcripts_worker(elements, output, gtf_ofp, tracking_ofp, fasta_fp,
                             ref_genes):
    # if appropriate, open the fasta file
    if fasta_fp != None: fasta = Fastafile(fasta_fp.name)
    else: fasta = None
    while True:
        config.log_statement("Waiting for gene to process. (%i)" %
                             elements.qsize())
        gene_elements = elements.get()
        if gene_elements == 'FINISHED':
            config.log_statement("")
            return
        build_and_write_gene(gene_elements, output, gtf_ofp, tracking_ofp,
                             fasta, ref_genes)
    return
Code example #12
File: frequency_estimation.py  Project: bdgp/grit
def calc_max_feasible_step_size_and_limiting_index_BAD(x0, gradient):
    #Calculate the maximum step size to stay in the feasible region.
    #
    #solve y - x*gradient = MIN_TRANSCRIPT_FREQ for x
    #x = (y - MIN_TRANSCRIPT_FREQ)/gradient
    #
    # we use minus because we return a positive step
    try:
        steps = (x0 - MIN_TRANSCRIPT_FREQ) / (gradient + 1e-12)
        step_size = -steps[steps < 0].max()
        step_size_i = (steps == -step_size).nonzero()[0]
    except:
        config.log_statement("steps=" + steps)
        raise
    return step_size, step_size_i
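
A numeric trace of the mechanics, assuming a stand-in value for MIN_TRANSCRIPT_FREQ (the real constant lives elsewhere in frequency_estimation.py):

import numpy

MIN_TRANSCRIPT_FREQ = 1e-12  # assumed here for illustration

x0 = numpy.array([0.5, 0.3, 0.2])
gradient = numpy.array([0.1, -0.2, 0.1])

steps = (x0 - MIN_TRANSCRIPT_FREQ) / (gradient + 1e-12)
# steps ~ [5.0, -1.5, 2.0]; the negative entry closest to zero limits the move
step_size = -steps[steps < 0].max()               # 1.5
step_size_i = (steps == -step_size).nonzero()[0]  # array([1])
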
Code example #13
File: build_transcripts.py  Project: nboley/grit
def add_elements_for_contig_and_strand_worker(
        args_queue, elements, gene_id_cntr,
        output, gtf_ofp, tracking_ofp, 
        fasta_fp, ref_genes):
    while True:
        args = args_queue.get()
        if args == 'FINISHED': 
            config.log_statement("")
            return
        (contig, strand), grpd_exons = args
        add_elements_for_contig_and_strand(
            (contig, strand), grpd_exons,
            elements, gene_id_cntr,
            output, gtf_ofp, tracking_ofp, 
            fasta_fp, ref_genes)
Code example #14
File: build_transcripts.py  Project: nboley/grit
def build_transcripts_worker( elements, 
                              output,
                              gtf_ofp, tracking_ofp,
                              fasta_fp, ref_genes ):
    # if appropriate, open the fasta file
    if fasta_fp != None: fasta = Fastafile(fasta_fp.name)
    else: fasta = None
    while True:
        config.log_statement("Waiting for gene to process. (%i)" % elements.qsize())
        gene_elements = elements.get()
        if gene_elements == 'FINISHED':
            config.log_statement("")
            return
        build_and_write_gene( gene_elements, output, 
                              gtf_ofp, tracking_ofp,
                              fasta, ref_genes)
    return
Code example #15
File: build_transcripts.py  Project: nboley/grit
def feed_elements(raw_elements, elements, 
                  output, gtf_ofp, tracking_ofp, 
                  fasta_fp, ref_genes ):
    all_args = multiprocessing.Queue()
    for (contig, strand), grpd_exons in raw_elements.iteritems():
        all_args.put([(contig, strand), dict(grpd_exons)])
    for i in xrange(config.NTHREADS):
        all_args.put('FINISHED')

    num_add_element_threads = min(len(raw_elements), config.NTHREADS)
    gene_id_cntr = multiprocessing.Value('i', 0)
    nthreads_remaining = multiprocessing.Value('i', num_add_element_threads)
    worker_args = [ all_args, elements, gene_id_cntr,
                    output, gtf_ofp, tracking_ofp, 
                    fasta_fp, ref_genes ]
    cluster_pids = []
    for i in xrange(num_add_element_threads):
        pid = os.fork()
        if pid == 0:
            add_elements_for_contig_and_strand_worker(*worker_args)
            with nthreads_remaining.get_lock():
                nthreads_remaining.value -= 1
                config.log_statement("Finished adding elements (%i left)" 
                                     % nthreads_remaining.value)
            build_transcripts_worker( elements, 
                                      output,
                                      gtf_ofp, tracking_ofp,
                                      fasta_fp, ref_genes )      
            os._exit(0)

        cluster_pids.append(pid)

    while True:
        with nthreads_remaining.get_lock():
            if nthreads_remaining.value == 0:
                for i in xrange(config.NTHREADS+1):
                    elements.put('FINISHED')
                break
        time.sleep(1.0)

    for pid in cluster_pids:
        os.waitpid(pid, 0) 
    
    config.log_statement("Finished adding elements")
    return
Code example #16
File: build_transcripts.py  Project: nboley/grit
def build_and_write_gene(gene_elements, output,
                         gtf_ofp, tracking_ofp,
                         fasta, ref_genes ):
    # build the gene with transcripts, and optionally call orfs
    start = min(x[0] for x in chain(
        gene_elements.tss_exons, gene_elements.tes_exons,
        gene_elements.promoter, gene_elements.polyas))
    stop = max(x[1] for x in chain(
        gene_elements.tss_exons, gene_elements.tes_exons,
        gene_elements.promoter, gene_elements.polyas))
    try:
        config.log_statement(
            "Building transcripts and ORFs for %s (%s:%s:%i-%i)" % (
                gene_elements.id, gene_elements.chrm, gene_elements.strand, 
                start, stop) )
        
        gene = build_gene(gene_elements, fasta, ref_genes)
        if gene == None: 
            return
        config.log_statement(
            "FINISHED Building transcript and ORFs for Gene %s" % gene.id)

        # dump a pickle of the gene to a temp file, and set that in the 
        # output manager
        ofname = gene.write_to_file(
            config.get_gene_tmp_fname(gene.id, SAMPLE_TYPE, REP_ID))
        
        output.put((gene.id, len(gene.transcripts), ofname))
        write_gene_to_gtf(gtf_ofp, gene)
        write_gene_to_tracking_file(tracking_ofp, gene)
    except TooManyCandidateTranscriptsError:
        config.log_statement(
            "Too many candidate transcripts in %s(%s:%s:%i-%i)" % (
                gene_elements.id, gene_elements.chrm, gene_elements.strand, 
                start, stop), 
            log=True)
        return
    except Exception, inst:
        config.log_statement(
            "ERROR building transcript in %s(%s:%s:%i-%i): %s" % (
                gene_elements.id, gene_elements.chrm, gene_elements.strand, 
                start, stop, inst), 
            log=True)
        if config.DEBUG_VERBOSE:
            config.log_statement( traceback.format_exc(), log=True )
Code example #17
File: genes.py  Project: nboley/grit
def load_gene_bndry_bins( genes, contig, strand, contig_len ):  
    if config.VERBOSE:
        config.log_statement( 
            "Loading gene boundaries from annotated genes in %s:%s" % (  
                contig, strand) )  
  
    regions_graph = nx.Graph()
    for gene in genes:
        if gene.chrm != contig: continue  
        if gene.strand != strand: continue
        regions = [tuple(x) for x in gene.find_transcribed_regions()]
        regions_graph.add_nodes_from(regions)
        regions_graph.add_edges_from(izip(regions[:-1], regions[1:]))

    # group overlapping regions
    all_regions = sorted(regions_graph.nodes())
    if len(all_regions) == 0: return []  

    grpd_regions = [[],]
    curr_start, curr_stop = all_regions[0]
    for x in all_regions:
        if x[0] < curr_stop:
            curr_stop = max(x[1], curr_stop)
            grpd_regions[-1].append(x)
        else:
            curr_start, curr_stop = x
            grpd_regions.append([x,])
    # add edges for overlapping regions
    for grp in grpd_regions:
        regions_graph.add_edges_from(izip(grp[:-1], grp[1:]))

    # build gene objects with the intervals  
    gene_bndry_bins = []  
    for regions_cluster in nx.connected_components(regions_graph):
        gene_bin = GeneElements( contig, strand )
        regions = sorted(files.gtf.flatten(regions_cluster))
        for start, stop in regions:
            gene_bin.regions.append(
                SegmentBin(start, stop, ["ESTART",], ["ESTOP",], "GENE"))
        gene_bndry_bins.append( gene_bin )  

    # XXX TODO expand gene boundaries
    # actually, it's probably better just to go through discovery
    
    return gene_bndry_bins
Code example #18
def build_and_write_gene(gene_elements, output, gtf_ofp, tracking_ofp, fasta,
                         ref_genes):
    # build the gene with transcripts, and optionally call orfs
    start = min(
        x[0] for x in chain(gene_elements.tss_exons, gene_elements.tes_exons,
                            gene_elements.promoter, gene_elements.polyas))
    stop = max(x[1]
               for x in chain(gene_elements.tss_exons, gene_elements.tes_exons,
                              gene_elements.promoter, gene_elements.polyas))
    try:
        config.log_statement(
            "Building transcripts and ORFs for %s (%s:%s:%i-%i)" %
            (gene_elements.id, gene_elements.chrm, gene_elements.strand, start,
             stop))

        gene = build_gene(gene_elements, fasta, ref_genes)
        if gene == None:
            return
        config.log_statement(
            "FINISHED Building transcript and ORFs for Gene %s" % gene.id)

        # dump a pickle of the gene to a temp file, and set that in the
        # output manager
        ofname = gene.write_to_file(
            config.get_gene_tmp_fname(gene.id, SAMPLE_TYPE, REP_ID))

        output.put((gene.id, len(gene.transcripts), ofname))
        write_gene_to_gtf(gtf_ofp, gene)
        write_gene_to_tracking_file(tracking_ofp, gene)
    except TooManyCandidateTranscriptsError:
        config.log_statement(
            "Too many candidate transcripts in %s(%s:%s:%i-%i)" %
            (gene_elements.id, gene_elements.chrm, gene_elements.strand, start,
             stop),
            log=True)
        return
    except Exception, inst:
        config.log_statement(
            "ERROR building transcript in %s(%s:%s:%i-%i): %s" %
            (gene_elements.id, gene_elements.chrm, gene_elements.strand, start,
             stop, inst),
            log=True)
        if config.DEBUG_VERBOSE:
            config.log_statement(traceback.format_exc(), log=True)
Code example #19
def feed_elements(raw_elements, elements, output, gtf_ofp, tracking_ofp,
                  fasta_fp, ref_genes):
    all_args = multiprocessing.Queue()
    for (contig, strand), grpd_exons in raw_elements.iteritems():
        all_args.put([(contig, strand), dict(grpd_exons)])
    for i in xrange(config.NTHREADS):
        all_args.put('FINISHED')

    num_add_element_threads = min(len(raw_elements), config.NTHREADS)
    gene_id_cntr = multiprocessing.Value('i', 0)
    nthreads_remaining = multiprocessing.Value('i', num_add_element_threads)
    worker_args = [
        all_args, elements, gene_id_cntr, output, gtf_ofp, tracking_ofp,
        fasta_fp, ref_genes
    ]
    cluster_pids = []
    for i in xrange(num_add_element_threads):
        pid = os.fork()
        if pid == 0:
            add_elements_for_contig_and_strand_worker(*worker_args)
            with nthreads_remaining.get_lock():
                nthreads_remaining.value -= 1
                config.log_statement("Finished adding elements (%i left)" %
                                     nthreads_remaining.value)
            build_transcripts_worker(elements, output, gtf_ofp, tracking_ofp,
                                     fasta_fp, ref_genes)
            os._exit(0)

        cluster_pids.append(pid)

    while True:
        with nthreads_remaining.get_lock():
            if nthreads_remaining.value == 0:
                for i in xrange(config.NTHREADS + 1):
                    elements.put('FINISHED')
                break
        time.sleep(1.0)

    for pid in cluster_pids:
        os.waitpid(pid, 0)

    config.log_statement("Finished adding elements")
    return
Code example #20
File: peaks.py  Project: nboley/grit
def estimate_read_and_control_cov_in_gene(
        gene, signal_reads, reads_type, 
        rnaseq_reads, alpha=0.01):
    assert reads_type in ('promoter', 'polya')
    reads_type = '5p' if reads_type == 'promoter' else '3p'
    if gene.strand == '-': 
        reads_type = {'3p':'5p', '5p':'3p'}[reads_type]
    
    signal_cov = gene.find_coverage(signal_reads)    
    if DEBUG_VERBOSE:
        config.log_statement("Finished building signal coverage array")
    #signal_cov = build_false_signal(rnaseq_reads, '5p')
    
    control_cov = build_control_in_gene_regions(
        gene, rnaseq_reads, reads_type, SMOOTH_WIN_LEN)
    if DEBUG_VERBOSE:
        config.log_statement("Finished building control coverage array")
    
    return signal_cov, control_cov
Code example #21
File: fit_forests_human_polyA.py  Project: bdgp/grit
def get_elements_from_gene( gene, get_tss=True, get_jns=True, \
                                get_tes=True, get_exons=False ):
    tss_exons = set()
    tes_exons = set()
    introns = set()
    exons = set()
    
    chrm, strand = clean_chr_name(gene.chrm), gene.strand
    transcripts = gene.transcripts
    
    for trans in transcripts:
        bndries = trans.exon_bnds

        fp_region = GenomicInterval(chrm, strand, bndries[0], bndries[1])
        tp_region = GenomicInterval(chrm, strand, bndries[-2], bndries[-1])
        if strand == '+':
            if get_tss:
                tss_exons.add( fp_region )
            if get_tes:
                tes_exons.add( tp_region )
        else:
            if strand != '-':
                config.log_statement("BADBADBAD %s" % strand)
                continue
            assert strand == '-'
            if get_tss:
                tss_exons.add( tp_region )
            if get_tes:
                tes_exons.add( fp_region )
        
        if get_jns:
            for start, stop in izip( bndries[1:-2:2], bndries[2:-1:2] ):
                # add and subtract 1 to get the inclusive intron boundaries,
                # rather than the exon boundaries
                if start >= stop:
                    continue
                introns.add( GenomicInterval(chrm, strand, start+1, stop-1) )

        if get_exons:
            for start, stop in izip( bndries[::2], bndries[1::2] ):
                exons.add( GenomicInterval(chrm, strand, start, stop) )
    
    return tss_exons, introns, tes_exons, exons
Code example #22
File: fit_forests_human_polyA.py  Project: bdgp/grit
def remove_overlapping_elements( tes_dict, elements_I, w ):
    '''
    Remove all elements (tes's) overlapping another element type
    '''
    start = w
    end = w+1
    over = dict()
    for (chrm,strand) in tes_dict.keys():
        if not elements_I.has_key((chrm,strand)):
            config.log_statement(
                "warning, element_intersecter does not contain the chrm: %s"
                % chrm)
            continue
        for tes in tes_dict[(chrm,strand)].keys():
            H = elements_I[(chrm,strand)].find(tes-start,tes+end)
            if not H:
                if not over.has_key( (chrm,strand) ):
                    over[ (chrm,strand) ] = dict()
                over[ (chrm,strand) ][tes] = copy.deepcopy( 
                    tes_dict[ (chrm,strand) ][tes] )
    return over
Code example #23
File: peaks.py  Project: bdgp/grit
def estimate_read_and_control_cov_in_gene(gene,
                                          signal_reads,
                                          reads_type,
                                          rnaseq_reads,
                                          alpha=0.01):
    assert reads_type in ('promoter', 'polya')
    reads_type = '5p' if reads_type == 'promoter' else '3p'
    if gene.strand == '-':
        reads_type = {'3p': '5p', '5p': '3p'}[reads_type]

    signal_cov = gene.find_coverage(signal_reads)
    if DEBUG_VERBOSE:
        config.log_statement("Finished building signal coverage array")
    #signal_cov = build_false_signal(rnaseq_reads, '5p')

    control_cov = build_control_in_gene_regions(gene, rnaseq_reads, reads_type,
                                                SMOOTH_WIN_LEN)
    if DEBUG_VERBOSE:
        config.log_statement("Finished building control coverage array")

    return signal_cov, control_cov
Code example #24
File: fit_forests_human_polyA.py  Project: bdgp/grit
def parse_fasta( fn ):
    '''
    load a fasta file into a dictionary pointing to single strings, one for
    each chromosome
    '''
    genome = dict()
    fid = open(fn)
    chrm = ''
    for line in fid:
        data = line.strip()
        if data.startswith('>'):
            chrm = clean_chr_name(data[1:])
        else:
            if not genome.has_key(chrm):
                genome[chrm] = []
                config.log_statement(chrm)
            genome[chrm].append(data.lower())
    for chrm in genome.keys():
        genome[chrm] = ''.join(genome[chrm])
    fid.close()
    return genome
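
A usage sketch: write a tiny FASTA file and parse it back (the exact keys depend on what clean_chr_name strips from the record names):

with open('toy.fa', 'w') as fp:
    fp.write('>chr1\nACGT\nacgt\n>chr2\nTTTT\n')

genome = parse_fasta('toy.fa')
# one joined, lowercased string per record, e.g. 'acgtacgt' and 'tttt'
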
Code example #25
    def get_new_gene():
        
        # get a gene to process
        try: gene_id = gene_ids.get(timeout=0.1)
        except Queue.Empty: 
            assert gene_ids.qsize() == 0
            config.log_statement("")
            raise IndexError, "No genes left"
        
        config.log_statement(
            "Loading design matrix for gene '%s'" % gene_id)

        gene = data.get_gene(gene_id)
        try: 
            f_mat = data.get_design_matrix(gene_id)
        except NoDesignMatrixError:
            if config.DEBUG_VERBOSE:
                config.log_statement("No design matrix for '%s'" % gene_id, 
                                     log=True)
            raise

        mle_estimate = data.get_mle(gene_id)
        
        trans_indices = []
        for row_num, t_index in enumerate(f_mat.transcript_indices()):
            trans_indices.append((t_index, row_num+1, bnd_type))

        cntr = trans_index_cntrs[gene_id]
        with cntr.get_lock():
            if cntr.value == -1000: 
                cntr.value = len(trans_indices)-1
        
        return gene, f_mat, mle_estimate, trans_indices, cntr
Code example #26
def nnls_cvxopt(X, Y, fixed_indices_and_values={}):
    from cvxopt import matrix, solvers
    X = matrix(X)
    Y = matrix(Y)

    m, n = X.size
    num_constraint = len(fixed_indices_and_values)

    G = matrix(0.0, (n, n))
    G[::n + 1] = -1.0
    h = matrix(-MIN_TRANSCRIPT_FREQ, (n, 1))

    # Add the equality constraints
    A = matrix(0., (1 + num_constraint, n))
    b = matrix(0., (1 + num_constraint, 1))

    # Add the sum to one constraint
    A[0, :] = 1.
    b[0, 0] = 1.

    # Add the fixed value constraints
    for const_i, (i, val) in enumerate(fixed_indices_and_values.iteritems()):
        A[const_i + 1, i] = 1.
        b[const_i + 1, 0] = val

    solvers.options['show_progress'] = DEBUG_OPTIMIZATION
    res = solvers.qp(P=X.T * X, q=-X.T * Y, G=G, h=h, A=A, b=b)
    x = numpy.array(res['x']).T[0, ]
    rss = ((numpy.array(X * res['x'] - Y)[0, ])**2).sum()

    if DEBUG_OPTIMIZATION:
        for key, val in res.iteritems():
            if key in 'syxz': continue
            config.log_statement("%s:\t%s" % (key.ljust(22), val))

        config.log_statement("RSS: ".ljust(22) + str(rss))

    x[x < MIN_TRANSCRIPT_FREQ] = MIN_TRANSCRIPT_FREQ
    x = project_onto_simplex(x)
    return x
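
A usage sketch for the solver, assuming the module-level constants (MIN_TRANSCRIPT_FREQ, DEBUG_OPTIMIZATION) and project_onto_simplex are in scope as above, and that cvxopt is installed; the design matrix here is made up:

import numpy

# hypothetical design matrix: three bins (rows) by two transcripts (columns)
X = numpy.array([[0.7, 0.1],
                 [0.2, 0.2],
                 [0.1, 0.7]])
true_freqs = numpy.array([0.25, 0.75])
Y = X.dot(true_freqs)

x_hat = nnls_cvxopt(X, Y)  # recovers approximately [0.25, 0.75]
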
Code example #27
def estimate_confidence_bounds( data, bnd_type ):
    config.log_statement(
        "Populating estimate confidence bounds queue.")

    ## populate the queue
    # sort so that the biggest genes are processed first
    gene_ids = multiprocessing.Queue()
    trans_index_cntrs = {}
    sorted_gene_ids = sorted(data.gene_ids, 
                             key=lambda x:data.gene_ntranscripts_mapping[x],
                             reverse=True)
    for i, gene_id in enumerate(sorted_gene_ids):
        gene_ids.put(gene_id)
        trans_index_cntrs[gene_id] = multiprocessing.Value( 'i', -1000)
    
    config.log_statement("Waiting on gene bounds children")

    if False and config.NTHREADS == 1:
        find_confidence_bounds_worker( 
            data, gene_ids, trans_index_cntrs, bnd_type )
    else:
        pids = []
        for i in xrange(config.NTHREADS):
            pid = os.fork()
            if pid == 0:
                try: 
                    find_confidence_bounds_worker(
                        data, gene_ids, 
                        trans_index_cntrs, bnd_type)
                except Exception, inst:
                    config.log_statement( traceback.format_exc(), log=True )
                finally:
                    os._exit(0)
Code example #28
File: fit_forests_human_polyA.py  Project: bdgp/grit
def extract_genome_sequence( genome, tes_dict, w ):
    '''
    Return an array of sequences each of size 2*w + 1 
    '''
    seqs = []
    start = w
    end = w+1
    for (chrm,strand) in tes_dict.keys():
        if not genome.has_key(chrm):
            config.log_statement(
                "warning, genome sequence does not contain the chrm: %s"
                % chrm)
            continue
        for tes in tes_dict[(chrm,strand)].keys():
            seq = genome[chrm][tes-start:tes+end]
            if strand == "-":
                seqs.append([[chrm,strand,tes,tes_dict[(chrm,strand)][tes]], 
                             reverse_strand(seq)])
            else:
                assert strand == "+"
                seqs.append([[chrm,strand,tes,tes_dict[(chrm,strand)][tes]], 
                             seq])
    return seqs
Code example #29
def write_data_to_tracking_file(data, fl_dists, ofp):
    num_reads_in_bams = data.get_num_reads_in_bams()
    ofp.write("\t".join(
            ["tracking_id", "gene_id ",
             "coverage", "FPKM    ",
             "FPKM_lo ", "FPKM_hi ", "status"] 
            ) + "\n")

    try: 
        sorted_gene_ids = sorted(
            data.gene_ids, key=lambda x: int(x.split("_")[-1]))
    except:
        sorted_gene_ids = data.gene_ids

    for gene_id in sorted_gene_ids:
        try: 
            lines = build_gene_lines_for_tracking_file(
                gene_id, data, num_reads_in_bams, fl_dists)
        except Exception, inst:
            config.log_statement("Skipping '%s': %s" % (gene_id, str(inst)))
            config.log_statement( traceback.format_exc(), log=True )
        else:
            ofp.write("\n".join(lines) + "\n" )
Code example #30
File: fit_forests_human_polyA.py  Project: bdgp/grit
def get_RNAseq_density_worker( reads, sites, sites_lock, dense ):
    while True:
        with sites_lock:
            sites_len = len( sites )
            if sites_len == 0: break
            # using the commented-out code appears slower because some
            # regions (like M) have so many reads that having them all
            # stuck in one group outweighs the lock overhead of taking one
            # at a time. A random sort might fix this, but it seems fast
            # enough as is.
            args = [sites.pop(),] #[-1:]
            #del sites[-1:]
        if DEBUG_VERBOSE and sites_len%1000 == 0:
            config.log_statement("%i polyA sites remain" % sites_len)
        for chrm, strand, pos, cnt in args:
            key = '_'.join([chrm,strand,str(pos)])
            predictors = get_predictors_for_polya_site( 
                reads, chrm, strand, pos )
            if not dense.has_key(key):
                dense[key] = predictors
            else:
                dense[key] = dense[key] + predictors

    return
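
The lock-guarded pop on a manager list is the work-sharing pattern here: each worker takes one site at a time until the shared list drains. A stripped-down sketch of the same pattern with dummy work items:

import multiprocessing

def worker(items, items_lock, results):
    while True:
        with items_lock:
            if len(items) == 0:
                break
            item = items.pop()
        results[item] = item * 2  # stand-in for the real per-site work

if __name__ == '__main__':
    manager = multiprocessing.Manager()
    items = manager.list(range(100))
    items_lock = manager.Lock()
    results = manager.dict()
    procs = [multiprocessing.Process(target=worker,
                                     args=(items, items_lock, results))
             for _ in range(4)]
    for p in procs: p.start()
    for p in procs: p.join()
    # results now holds one entry per drained item
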
Code example #31
def estimate_mles( data ):
    config.log_statement("Initializing MLE queue")

    gene_ids = multiprocessing.Queue()
    sorted_gene_ids = sorted(data.gene_ids, 
                             key=lambda x:data.gene_ntranscripts_mapping[x],
                             reverse=True)
    # sort so that the biggest genes are processed first
    
    args = [ gene_ids, data ]
    if False and config.NTHREADS == 1:
        estimate_mle_worker(*args)
    else:
        ps = []
        for i in xrange(config.NTHREADS):
            pid = os.fork()
            if pid == 0:
                try:
                    estimate_mle_worker(*args)
                except Exception, inst:
                    config.log_statement( str(inst), log=True )
                    config.log_statement( traceback.format_exc(), log=True )
                finally:
                    os._exit(0)
Code example #32
File: build_transcripts.py  Project: nboley/grit
def build_transcripts(exons_bed_fp, gtf_ofname, tracking_ofname, 
                      fasta_fp=None, ref_genes=None,
                      sample_type=None, rep_id=None):
    """Build transcripts
    """    
    # set the sample type and rep id for the output tmp file names
    global SAMPLE_TYPE
    SAMPLE_TYPE = sample_type
    global REP_ID
    REP_ID = rep_id
    
    # make sure that we're starting from the start of the 
    # elements files
    config.log_statement( "Loading %s" % exons_bed_fp.name, log=True )
    exons_bed_fp.seek(0)
    raw_elements = load_elements( exons_bed_fp )
    config.log_statement( "Finished Loading %s" % exons_bed_fp.name )
    
    gtf_ofp = ThreadSafeFile(gtf_ofname + ".unfinished", "w")
    gtf_ofp.write("track name=%s useScore=1\n" 
                  % ".".join(gtf_ofname.split(".")[:-1]))
    
    tracking_ofp = ThreadSafeFile(tracking_ofname + ".unfinished", "w")
    tracking_ofp.write("\t".join(
            ["tracking_id".ljust(20), 
             "class_code", 
             "nearest_ref_id".ljust(20), 
             "gene_id".ljust(20), 
             "gene_short_name".ljust(20), 
             "tss_id".ljust(10), 
             "locus".ljust(30), 
             "length"]) + "\n")
    
    config.log_statement( "Building Transcripts", log=True )
    manager = multiprocessing.Manager()
    elements = manager.Queue(2*config.NTHREADS)
    output = manager.Queue()

    transcript_building_children_args = [
        elements, output, 
        gtf_ofp, tracking_ofp,
        fasta_fp, ref_genes]

    
    pids = []
    for i in xrange(max(0,config.NTHREADS - len(raw_elements))):
        pid = os.fork()
        if pid == 0:
            build_transcripts_worker(elements, 
                                     output,
                                     gtf_ofp, tracking_ofp,
                                     fasta_fp, ref_genes)
            os._exit(0)
        pids.append(pid)

    elements_feeder_pid = os.fork()
    if elements_feeder_pid == 0:
        feed_elements( raw_elements, elements, 
                       output, gtf_ofp, tracking_ofp, 
                       fasta_fp, ref_genes )
        os._exit(0)

    for pid in pids:
        os.waitpid(pid, 0) 

    os.waitpid(elements_feeder_pid, 0)
    
    genes = []
    while output.qsize() > 0:
        try: 
            genes.append(output.get_nowait())
        except Queue.Empty: 
            continue
    
    assert len(genes) == len(set(genes))
    config.log_statement("Finished building transcripts")

    gtf_ofp.close()
    tracking_ofp.close()

    # we store to unfinished so we know if it errors out early
    shutil.move(gtf_ofname + ".unfinished", gtf_ofname)
    shutil.move(tracking_ofname + ".unfinished", tracking_ofname)

    manager.shutdown()
    
    return genes
Code example #33
def estimate_mle_worker( gene_ids, data ):
    while True:
        config.log_statement("Retrieving gene from queue")
        gene_id = gene_ids.get()
        if gene_id == 'FINISHED': 
            config.log_statement("")
            return
        
        try:
            config.log_statement(
                "Loading gene %s" % gene_id )
            gene = data.get_gene(gene_id)
              
            config.log_statement(
                "Finding MLE for Gene %s(%s:%s:%i-%i) - %i transcripts" \
                    % (gene.id, gene.chrm, gene.strand, 
                       gene.start, gene.stop, len(gene.transcripts) ) )
            
            try: 
                f_mat = data.get_design_matrix(gene_id)
            except NoDesignMatrixError:
                if config.DEBUG_VERBOSE:
                    config.log_statement("No design matrix for '%s'" % gene_id, 
                                         log=True)
                continue
            num_reads_in_bams = data.get_num_reads_in_bams()
            expected_array, observed_array = f_mat.expected_and_observed(
                num_reads_in_bams)
            if (expected_array, observed_array) == (None, None): 
                continue
            mle = frequency_estimation.estimate_transcript_frequencies( 
                observed_array, expected_array)
        except Exception, inst:
            error_msg = "%i: Skipping %s (%s:%s:%i-%i): %s" % (
                os.getpid(), gene.id, 
                gene.chrm, gene.strand, gene.start, gene.stop, inst)
            config.log_statement( error_msg, log=True )
            config.log_statement( traceback.format_exc(), log=True )
            continue

        log_lhd = frequency_estimation.calc_lhd( 
            mle, observed_array, expected_array)

        # add back in the missing transcripts
        full_mle = -1*numpy.ones(len(gene.transcripts)+1, dtype=float)
        full_mle[numpy.array([-1,]+f_mat.transcript_indices().tolist())+1] = mle
        
        data.set_mle(gene, full_mle)
        config.log_statement( "FINISHED MLE %s\t%.2f - updating queues" % ( 
                gene.id, log_lhd ) )
Code example #34
def find_confidence_bounds_worker( 
        data, gene_ids, trans_index_cntrs, bnd_type ):
    def get_new_gene():
        
        # get a gene to process
        try: gene_id = gene_ids.get(timeout=0.1)
        except Queue.Empty: 
            assert gene_ids.qsize() == 0
            config.log_statement("")
            raise IndexError, "No genes left"
        
        config.log_statement(
            "Loading design matrix for gene '%s'" % gene_id)

        gene = data.get_gene(gene_id)
        try: 
            f_mat = data.get_design_matrix(gene_id)
        except NoDesignMatrixError:
            if config.DEBUG_VERBOSE:
                config.log_statement("No design matrix for '%s'" % gene_id, 
                                     log=True)
            raise

        mle_estimate = data.get_mle(gene_id)
        
        trans_indices = []
        for row_num, t_index in enumerate(f_mat.transcript_indices()):
            trans_indices.append((t_index, row_num+1, bnd_type))

        cntr = trans_index_cntrs[gene_id]
        with cntr.get_lock():
            if cntr.value == -1000: 
                cntr.value = len(trans_indices)-1
        
        return gene, f_mat, mle_estimate, trans_indices, cntr
    
    def get_gene_being_processed():
        longest_gene_id = None
        gene_len = 0
        for gene_id, cntr in trans_index_cntrs.iteritems():
            value = cntr.value
            if value > gene_len:
                longest_gene_id = gene_id
                gene_len = value
        
        if longest_gene_id == None: 
            return None
        
        gene = data.get_gene(longest_gene_id)
        f_mat = data.get_design_matrix(longest_gene_id)
        mle_estimate = data.get_mle(longest_gene_id)

        trans_indices = []
        for row_num, t_index in enumerate(f_mat.transcript_indices()):
            trans_indices.append((t_index, row_num+1, bnd_type))
        
        return ( gene, f_mat, mle_estimate, 
                 trans_indices, trans_index_cntrs[longest_gene_id] )
    
    no_new_genes = False    
    num_reads_in_bams = data.get_num_reads_in_bams()
    while True:
        try:
            try: 
                gene, f_mat, mle_estimate, trans_indices, cntr = get_new_gene()
            except NoDesignMatrixError:
                continue
            except IndexError: 
                res = get_gene_being_processed()
                if res == None: 
                    break
                gene, f_mat, mle_estimate, trans_indices, cntr = res
            
            cbs = find_confidence_bounds_in_gene( 
                gene, num_reads_in_bams,
                f_mat, mle_estimate, 
                trans_indices, cntr,
                cb_alpha=config.CB_SIG_LEVEL)
            data.set_cbs(gene.id, cbs)
            
            if config.VERBOSE:
                config.log_statement("Finished processing '%s'" % gene.id)
        except Exception, inst:
            config.log_statement( traceback.format_exc(), log=True )
Code example #35
File: peaks.py  Project: nboley/grit
def call_peaks( signal_cov, original_control_cov, reads_type,
                gene,
                alpha, min_noise_frac, 
                min_merge_size, min_rel_merge_size,
                min_rd_cnt,
                trim_fraction,
                min_peak_size, max_peak_size,
                max_exp_sum_fraction, max_exp_mean_cvg_fraction):
    signal = numpy.ones(len(signal_cov))
    for k in xrange(N_REPS):
        noise_frac = 1.0
        noise_regions = [(0, len(signal)),]
        reg_coef, control_cov = \
            update_control_cov_for_five_prime_bias(
                noise_regions, noise_frac, 
                signal_cov, original_control_cov, reads_type)
        for i in xrange(MAX_NUM_ITERATIONS):
            if DEBUG_VERBOSE: 
                region = {'chrm': gene.chrm, 'strand': gene.strand, 
                          'start': gene.start, 'stop': gene.stop}
                write_bedgraph_from_array(
                    1000*control_cov, region, "control.%i"%i)
                write_bedgraph_from_array(
                    signal_cov, region, "signal.%i"%i)
                config.log_statement(
                    "Iter %i: Noise Frac %.2f%%\tReg Coef: %s" % (
                        i+1, noise_frac*100, reg_coef))
            noise_regions = find_noise_regions(
                signal_cov, control_cov, 
                noise_frac, alpha=alpha, min_peak_size=min_peak_size )
            new_noise_frac = estimate_noise_frac(
                noise_regions, signal_cov, control_cov, min_noise_frac)
            new_reg_coef, control_cov = \
                update_control_cov_for_five_prime_bias(
                    noise_regions, noise_frac, 
                    signal_cov, original_control_cov, reads_type)
            if noise_frac - new_noise_frac <= 1e-3 \
                    and abs(reg_coef[0] - new_reg_coef[0]) < 1e-3 \
                    and abs(reg_coef[1] - new_reg_coef[1]) < 1e-3: 
                break
            else: 
                noise_frac = new_noise_frac
                reg_coef = new_reg_coef
        
        for start, stop in noise_regions: 
            signal[start:stop] -= 1./N_REPS
    
    # build a list of inclusive peak starts and stops
    peaks = []
    nonzero_bases = (signal>1e-6).nonzero()[0].tolist()
    if len(nonzero_bases) == 0: return peaks
    curr_start = nonzero_bases.pop(0)
    curr_stop = curr_start
    for base in nonzero_bases:
        if base == curr_stop+1: 
            curr_stop += 1
        else:
            peaks.append((curr_start, curr_stop))
            curr_start, curr_stop = base, base
    
    peaks.append((curr_start, curr_stop))
    while True:
        new_peaks = merge_adjacent_intervals(
            peaks, min_merge_size, min_rel_merge_size, max_peak_size)
        if len(new_peaks) == len(peaks):
            peaks = new_peaks
            break
        else:
            peaks = new_peaks

    # trim peaks
    new_peaks = []
    for start, stop in peaks:
        assert stop >= start
        cov_region = signal_cov[start:stop+1]
        total_cov = cov_region.sum()
        cov_cumsum = cov_region.cumsum()-cov_region[0]
        try: trim_start = numpy.flatnonzero(
                cov_cumsum < int(trim_fraction*total_cov)).max()
        except:
            trim_start = 0
        try: trim_stop = numpy.flatnonzero(
                cov_cumsum > (1.0-trim_fraction)*total_cov).min()
        except: trim_stop=len(cov_region)-1
        while trim_start < len(cov_region)-1 and cov_region[trim_start] == 0:
            trim_start += 1
        while trim_stop > trim_start and cov_region[trim_stop] == 0:
            trim_stop -= 1
        new_peaks.append((trim_start+start, 
                          trim_stop+start,
                          cov_region[trim_start:trim_stop+1].sum()))
    
    # filter peaks
    exp_filtered_peaks = []
    max_peak_cnt = float(max(cnt for start, stop, cnt in new_peaks))
    max_peak_mean_cnt = float(max(cnt/float(stop-start+1) 
                                  for start, stop, cnt in new_peaks))
    for start, stop, cnt in new_peaks:
        length = stop - start + 1
        if (cnt >= min_rd_cnt
            and length >= min_peak_size
            and length <= max_peak_size
            and cnt/max_peak_cnt > max_exp_sum_fraction
            and (cnt/float(length))/max_peak_mean_cnt 
                > max_exp_mean_cvg_fraction ): 
            exp_filtered_peaks.append((start, stop, cnt))

    return exp_filtered_peaks
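
The peak-building step turns the averaged signal into inclusive (start, stop) runs of above-noise bases. A standalone trace of just that run-length logic on a toy signal:

import numpy

signal = numpy.array([0., 1., 1., 0., 0., 1., 0., 1., 1., 1.])
peaks = []
nonzero_bases = (signal > 1e-6).nonzero()[0].tolist()
curr_start = nonzero_bases.pop(0)
curr_stop = curr_start
for base in nonzero_bases:
    if base == curr_stop + 1:
        curr_stop += 1
    else:
        peaks.append((curr_start, curr_stop))
        curr_start, curr_stop = base, base
peaks.append((curr_start, curr_stop))
# peaks -> [(1, 2), (5, 5), (7, 9)]
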
Code example #36
def find_confidence_bounds_in_gene( gene, num_reads_in_bams,
                                    f_mat, mle_estimate, 
                                    trans_indices, cntr,
                                    cb_alpha):
    # update the mle_estimate array to only store observable transcripts
    # add 1 to skip the out of gene bin
    observable_trans_indices = (
        numpy.array([-1,] + f_mat.transcript_indices().tolist())+1 )
    mle_estimate = mle_estimate[observable_trans_indices]

    if config.VERBOSE:
        config.log_statement( 
            "Estimating confidence bounds for gene %s" % gene.id )
    
    #n_skipped = sum( 1 for x in sorted(f_mat.filtered_transcripts)
    #                 if x < trans_indices[0])
    # XXX Make sure that this is being counted correctly
    #n_skipped_tmp = len(set(xrange(trans_indices[0])) - \
    #    set(x-1 for x in observable_trans_indices[1:] if x-1 < trans_indices[0]))
    #config.log_statement( str([n_skipped_tmp, n_skipped, f_mat.filtered_transcripts, \
    #    observable_trans_indices, trans_indices]), log=True)
    #assert n_skipped == n_skipped_tmp
    
    res = []
    while True:
        with cntr.get_lock():
            index = cntr.value
            if index == -1: 
                config.log_statement('')
                break
            cntr.value -= 1
                
        trans_index, exp_mat_row, bnd_type = trans_indices[index]
        
        config.log_statement( 
            "Estimating %s confidence bound for gene %s (%i/%i remain)" % ( 
                bnd_type, gene.id, cntr.value+1, len(gene.transcripts)))
        try:
            p_value, bnd = frequency_estimation.estimate_confidence_bound( 
                f_mat, num_reads_in_bams,
                exp_mat_row, mle_estimate, bnd_type, cb_alpha )
        except Exception, inst:
            p_value = 1.
            bnd = 0.0 if bnd_type == 'lb' else 1.0
            error_msg = "%i: Skipping %s (%s:%s:%i-%i): %s" % (
                os.getpid(), gene.id, 
                gene.chrm, gene.strand, gene.start, gene.stop, inst)
            config.log_statement( error_msg, log=True )
            config.log_statement( traceback.format_exc(), log=True )
        
        if config.DEBUG_VERBOSE: config.log_statement( 
            "FINISHED %s BOUND %s\t%s\t%i/%i\t%.2e\t%.2e" % (
            bnd_type, gene.id, None, 
            trans_index, len(gene.transcripts), 
            bnd, p_value ) )
        res.append((bnd_type, trans_index, bnd))

    if config.VERBOSE:
        config.log_statement(
            "FINISHED Estimating confidence bound for gene %s" % gene.id )

    return res
Code example #37
File: peaks.py  Project: bdgp/grit
def call_peaks(signal_cov, original_control_cov, reads_type, gene, alpha,
               min_noise_frac, min_merge_size, min_rel_merge_size, min_rd_cnt,
               trim_fraction, min_peak_size, max_peak_size,
               max_exp_sum_fraction, max_exp_mean_cvg_fraction):
    signal = numpy.ones(len(signal_cov))
    for k in xrange(N_REPS):
        noise_frac = 1.0
        noise_regions = [
            (0, len(signal)),
        ]
        reg_coef, control_cov = \
            update_control_cov_for_five_prime_bias(
                noise_regions, noise_frac,
                signal_cov, original_control_cov, reads_type)
        for i in xrange(MAX_NUM_ITERATIONS):
            if DEBUG_VERBOSE:
                region = {
                    'chrm': gene.chrm,
                    'strand': gene.strand,
                    'start': gene.start,
                    'stop': gene.stop
                }
                write_bedgraph_from_array(1000 * control_cov, region,
                                          "control.%i" % i)
                write_bedgraph_from_array(signal_cov, region, "signal.%i" % i)
                config.log_statement(
                    "Iter %i: Noise Frac %.2f%%\tReg Coef: %s" %
                    (i + 1, noise_frac * 100, reg_coef))
            noise_regions = find_noise_regions(signal_cov,
                                               control_cov,
                                               noise_frac,
                                               alpha=alpha,
                                               min_peak_size=min_peak_size)
            new_noise_frac = estimate_noise_frac(noise_regions, signal_cov,
                                                 control_cov, min_noise_frac)
            new_reg_coef, control_cov = \
                update_control_cov_for_five_prime_bias(
                    noise_regions, noise_frac,
                    signal_cov, original_control_cov, reads_type)
            if noise_frac - new_noise_frac <= 1e-3 \
                    and abs(reg_coef[0] - new_reg_coef[0]) < 1e-3 \
                    and abs(reg_coef[1] - new_reg_coef[1]) < 1e-3:
                break
            else:
                noise_frac = new_noise_frac
                reg_coef = new_reg_coef

        for start, stop in noise_regions:
            signal[start:stop] -= 1. / N_REPS

    # build a list of inclusive peak starts and stops
    peaks = []
    nonzero_bases = (signal > 1e-6).nonzero()[0].tolist()
    if len(nonzero_bases) == 0: return peaks
    curr_start = nonzero_bases.pop(0)
    curr_stop = curr_start
    for base in nonzero_bases:
        if base == curr_stop + 1:
            curr_stop += 1
        else:
            peaks.append((curr_start, curr_stop))
            curr_start, curr_stop = base, base

    peaks.append((curr_start, curr_stop))
    while True:
        new_peaks = merge_adjacent_intervals(peaks, min_merge_size,
                                             min_rel_merge_size, max_peak_size)
        if len(new_peaks) == len(peaks):
            peaks = new_peaks
            break
        else:
            peaks = new_peaks

    # trim peaks
    new_peaks = []
    for start, stop in peaks:
        assert stop >= start
        cov_region = signal_cov[start:stop + 1]
        total_cov = cov_region.sum()
        cov_cumsum = cov_region.cumsum() - cov_region[0]
        try:
            trim_start = numpy.flatnonzero(
                cov_cumsum < int(trim_fraction * total_cov)).max()
        except ValueError:  # no bases fall below the trim threshold
            trim_start = 0
        try:
            trim_stop = numpy.flatnonzero(
                cov_cumsum > (1.0 - trim_fraction) * total_cov).min()
        except ValueError:  # no bases exceed the trim threshold
            trim_stop = len(cov_region) - 1
        while trim_start < len(cov_region) - 1 and cov_region[trim_start] == 0:
            trim_start += 1
        while trim_stop > trim_start and cov_region[trim_stop] == 0:
            trim_stop -= 1
        new_peaks.append((trim_start + start, trim_stop + start,
                          cov_region[trim_start:trim_stop + 1].sum()))

    # filter peaks: drop peaks that are too short, too long, too weakly read-
    # covered, or too far below the strongest peak's total and mean coverage
    exp_filtered_peaks = []
    max_peak_cnt = float(max(cnt for start, stop, cnt in new_peaks))
    max_peak_mean_cnt = float(
        max(cnt / float(stop - start + 1) for start, stop, cnt in new_peaks))
    for start, stop, cnt in new_peaks:
        length = stop - start + 1
        if (cnt >= min_rd_cnt and length >= min_peak_size
                and length <= max_peak_size
                and cnt / max_peak_cnt > max_exp_sum_fraction
                and (cnt / float(length)) / max_peak_mean_cnt >
                max_exp_mean_cvg_fraction):
            exp_filtered_peaks.append((start, stop, cnt))

    return exp_filtered_peaks
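
The scan above that converts the averaged signal into inclusive (start, stop)
peaks is a plain run-length grouping over the nonzero bases. A minimal
standalone sketch of the same idea (the function name and the default
threshold are illustrative, not part of grit):

import numpy

def group_nonzero_runs(signal, eps=1e-6):
    """Group indices where signal > eps into inclusive (start, stop) runs."""
    runs = []
    bases = numpy.flatnonzero(signal > eps).tolist()
    if len(bases) == 0:
        return runs
    curr_start = curr_stop = bases[0]
    for base in bases[1:]:
        if base == curr_stop + 1:
            curr_stop = base                      # extend the current run
        else:
            runs.append((curr_start, curr_stop))  # run ended; start a new one
            curr_start = curr_stop = base
    runs.append((curr_start, curr_stop))
    return runs

# e.g. group_nonzero_runs(numpy.array([0., 1., 1., 0., 1.])) -> [(1, 2), (4, 4)]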
Code example #39
def quantify_transcript_expression(
    promoter_reads, rnaseq_reads, polya_reads,
    pickled_gene_fnames, 
    ofname, sample_type=None, rep_id=None ):
    """Build transcripts
    """
    assert rnaseq_reads.fl_dists != None

    global SAMPLE_ID
    SAMPLE_ID=sample_type
    global REP_ID
    REP_ID = rep_id
    
    write_design_matrices=False

    if config.VERBOSE: config.log_statement( 
        "Initializing processing data" )        
    data = SharedData(pickled_gene_fnames)
    if config.VERBOSE: config.log_statement( 
        "Building design matrices" )
    build_design_matrices( data, rnaseq_reads.fl_dists,
                           (rnaseq_reads, promoter_reads, polya_reads))
    
    if config.VERBOSE: config.log_statement( 
        "Populating input queue from expression queue" )
    data.populate_expression_queue()
    if config.VERBOSE: config.log_statement( 
        "Estimating MLEs" )
    estimate_mles( data )

    if config.VERBOSE: config.log_statement(
        "Calculating FPKMs and writing MLEs to the output file" )
    
    if config.ESTIMATE_LOWER_CONFIDENCE_BOUNDS:
        if config.VERBOSE: config.log_statement( 
            "Estimating lower confidence bounds" )
        estimate_confidence_bounds(data, 'lb')
        if config.VERBOSE: config.log_statement( 
            "FINISHED Estimating lower confidence bounds" )
    
    if config.ESTIMATE_UPPER_CONFIDENCE_BOUNDS:
        if config.VERBOSE: config.log_statement( 
            "Estimating upper confidence bounds" )
        estimate_confidence_bounds(data, 'ub')
        if config.VERBOSE: config.log_statement( 
            "FINISHED Estimating upper confidence bounds" )
    
    if config.VERBOSE: config.log_statement( 
        "Writing output data to tracking file" )

    expression_ofp = ThreadSafeFile(ofname, "w")
    write_data_to_tracking_file(data, rnaseq_reads.fl_dists, expression_ofp)    
    expression_ofp.close()
    
    return
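
ThreadSafeFile is grit's own helper; the property the code above relies on is
that lines written by concurrent workers never interleave. A minimal sketch of
such a wrapper, assuming only the standard library (illustrative, not grit's
actual implementation):

import threading

class ThreadSafeFile(object):
    """File-like object that serializes writes with a lock (sketch)."""
    def __init__(self, fname, mode="w"):
        self._fp = open(fname, mode)
        self._lock = threading.Lock()

    def write(self, data):
        # hold the lock across the whole write so records never interleave
        with self._lock:
            self._fp.write(data)
            self._fp.flush()

    def close(self):
        with self._lock:
            self._fp.close()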
Code example #40
def find_all_gene_segments( rnaseq_reads, promoter_reads, polya_reads,
                            ref_genes, ref_elements_to_include,
                            region_to_use=None ):
    config.log_statement("Finding gene segments")

    contig_lens = dict(zip(*get_contigs_and_lens( 
        [ reads for reads in [rnaseq_reads, promoter_reads, polya_reads]
          if reads is not None ] )))

    config.log_statement("Spawning gene segment finding children")    
    segments_queue = multiprocessing.Queue()
    global_gene_data = GlobalGeneSegmentData(contig_lens)
    
    ref_element_types_to_include = set()
    if ref_elements_to_include.junctions: 
        ref_element_types_to_include.add('intron')
    if ref_elements_to_include.TSS: 
        ref_element_types_to_include.add('tss_exon')
    if ref_elements_to_include.TES: 
        ref_element_types_to_include.add('tes_exon')
    if ref_elements_to_include.promoters: 
        ref_element_types_to_include.add('promoter')
    if ref_elements_to_include.polya_sites: 
        ref_element_types_to_include.add('polya')
    if ref_elements_to_include.exons: 
        ref_element_types_to_include.add('exon')
    # to give full gene connectivity
    if ref_elements_to_include.genes:
        ref_element_types_to_include.add('intron')
        ref_element_types_to_include.add('exon')
    
    pids = []
    for i in xrange(config.NTHREADS):
        pid = os.fork()
        if pid == 0:
            find_segments_and_jns_worker(
                segments_queue, 
                global_gene_data,
                rnaseq_reads, promoter_reads, polya_reads,
                ref_genes, ref_element_types_to_include)
            os._exit(0)
        pids.append(pid)

    config.log_statement("Populating gene segment queue")        
    segments = split_genome_into_segments(contig_lens, region_to_use)
    for segment in segments: 
        segments_queue.put(segment)
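    # one 'FINISHED' sentinel per worker so every child exits its loop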
    for i in xrange(config.NTHREADS): segments_queue.put('FINISHED')
    
    while segments_queue.qsize() > 2*config.NTHREADS:
        config.log_statement(
            "Waiting on gene segment finding children (%i/%i segments remain)" 
            %(segments_queue.qsize(), len(segments)))        
        time.sleep(0.5)
    
    for i, pid in enumerate(pids):
        config.log_statement(
            "Waiting on gene segment finding children (%i/%i children remain)" 
            %(len(pids)-i, len(pids)))
        os.waitpid(pid, 0) 
            
    config.log_statement("Merging gene segments")
    merged_transcribed_regions = {}
    for key, intervals in global_gene_data.transcribed_regions.iteritems():
        merged_transcribed_regions[key] = merge_adjacent_intervals(
            intervals, config.MAX_EMPTY_REGION_SIZE)
    transcribed_regions = merged_transcribed_regions
    
    config.log_statement("Filtering junctions")    
    filtered_jns = defaultdict(dict)
    for contig in contig_lens.keys():
        plus_jns = defaultdict(int)
        for jn, cnt in global_gene_data.jns[(contig, '+')]: plus_jns[jn] += cnt
        minus_jns = defaultdict(int)
        for jn, cnt in global_gene_data.jns[(contig, '-')]: minus_jns[jn] += cnt
        filtered_jns[(contig, '+')] = filter_jns(plus_jns, minus_jns)
        filtered_jns[(contig, '-')] = filter_jns(minus_jns, plus_jns)

    config.log_statement("Building FL dist")        
    fl_dists = build_fl_dists_from_fls_dict(dict(global_gene_data.frag_lens))
        
    if ref_elements_to_include.junctions:
        for gene in ref_genes:
            for jn in gene.extract_elements()['intron']:
                if jn not in filtered_jns[(gene.chrm, gene.strand)]:
                    filtered_jns[(gene.chrm, gene.strand)][jn] = 0
        
    config.log_statement("Clustering gene segments")    
    # build bins for all of the genes and junctions, converting them to 1-based
    # in the process
    new_genes = []
    new_introns = []
    for contig, contig_len in contig_lens.iteritems():
        for strand in '+-':
            key = (contig, strand)
            jns = [ (start, stop, cnt) 
                    for (start, stop), cnt 
                    in sorted(filtered_jns[key].iteritems()) ]
            for start, stop, cnt in jns:
                new_introns.append(
                    SegmentBin(start, stop, ["D_JN",], ["R_JN",], "INTRON"))
            intervals = cluster_intron_connected_segments(
                transcribed_regions[key], 
                [(start, stop) for start, stop, cnt in jns ] )
            # add the intergenic space, since there could be interior genes
            for segments in intervals: 
                new_gene = GeneElements( contig, strand )
                for start, stop in segments:
                    new_gene.regions.append( 
                        SegmentBin(start, stop, ["ESTART",],["ESTOP",],"GENE"))
                if new_gene.stop-new_gene.start+1 < config.MIN_GENE_LENGTH: 
                    continue
                new_genes.append(new_gene)

    try: 
        num_unique_reads = ReadCounts(*[
            float(x.value) for x in global_gene_data.num_unique_reads])
    except AttributeError:
        num_unique_reads = ReadCounts(*global_gene_data.num_unique_reads)

    global_gene_data.shutdown()

    config.log_statement("")    
        
    return new_genes, fl_dists, num_unique_reads 
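
find_all_gene_segments uses a pattern that recurs throughout grit: fork
NTHREADS children that drain a shared queue, feed the queue, append one
'FINISHED' sentinel per child, and then waitpid on every pid. Reduced to a
self-contained sketch (POSIX-only because of os.fork; the worker callable is
a stand-in for find_segments_and_jns_worker):

import os
import multiprocessing

def run_forked_workers(items, worker, nthreads):
    """Fan items out to forked children; one sentinel per child (sketch)."""
    queue = multiprocessing.Queue()
    pids = []
    for _ in range(nthreads):
        pid = os.fork()
        if pid == 0:
            # child: process items until the sentinel arrives, then exit
            while True:
                item = queue.get()
                if item == 'FINISHED':
                    break
                worker(item)
            os._exit(0)
        pids.append(pid)

    for item in items:
        queue.put(item)
    for _ in range(nthreads):
        queue.put('FINISHED')
    for pid in pids:
        os.waitpid(pid, 0)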
Code example #41
File: build_transcripts.py Project: nboley/grit
            if key not in grpd_exons: 
                args.append(set())
            else:
                exons = [tuple(x) for x in grpd_exons[key].tolist()
                         if x[0] >= g_start and x[1] <= g_stop]
                args.append(set(exons))
        yield args

def add_elements_for_contig_and_strand((contig, strand), 
                                       grpd_exons, elements, gene_id_cntr,
                                       output, gtf_ofp, tracking_ofp, 
                                       fasta_fp, ref_genes):
    if fasta_fp is not None: fasta = Fastafile(fasta_fp.name)
    else: fasta = None
    
    config.log_statement( 
        "Clustering elements into genes for %s:%s" % ( contig, strand ) )

    """ old code that actually clustered elements
    args = []
    for key in ('tss_exon', 'internal_exon', 'tes_exon', 
                'single_exon_gene', 'promoter', 'polya', 'intron'):
        if key not in grpd_exons: 
            args.append(set())
        else:
            args.append(
                set(map(tuple, grpd_exons[key].tolist())))
    args.append(strand)
    """
    for ( tss_es, internal_es, tes_es,
          se_ts, promoters, polyas, jns ) in group_elements_in_gene(grpd_exons):
        # skip genes without all of the element types
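
The first fragment in this example clips each element type to the current
gene's span before yielding it. The core operation in isolation, assuming (as
the fragment does) that grpd_exons maps element types to numpy arrays of
(start, stop) rows (the helper name is illustrative):

def clip_elements_to_gene(grpd_exons, key, g_start, g_stop):
    """Return the elements of one type that lie inside [g_start, g_stop]."""
    if key not in grpd_exons:
        return set()
    return set(tuple(x) for x in grpd_exons[key].tolist()
               if x[0] >= g_start and x[1] <= g_stop)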
Code example #42
File: genes.py Project: nboley/grit
def find_segments_and_jns_worker(
        segments, global_gene_data,
        rnaseq_reads, promoter_reads, polya_reads,
        ref_elements, ref_elements_to_include ):
    rnaseq_reads = rnaseq_reads.reload()
    if promoter_reads is not None:
        promoter_reads = promoter_reads.reload()
    if polya_reads is not None:
        polya_reads = polya_reads.reload()

    local_frag_lens = defaultdict(int)
    local_transcribed_regions = defaultdict(list)
    local_jns = defaultdict(list)
    local_rd_cnts = [0.0, 0.0, 0.0]
    
    # just use this to keep track of where we are in the queue
    length_of_segments = segments.qsize()
    while True:
        try: 
            config.log_statement("Waiting for segment")
            segment = segments.get(timeout=1.0)
        except Queue.Empty: 
            continue
        if segment == 'FINISHED': 
            config.log_statement("")
            break
        config.log_statement("Finding genes and jns in %s" % str(segment) )
        try:
            (r_transcribed_regions, r_jns, r_n_unique_reads, r_frag_lens) = \
                find_transcribed_regions_and_jns_in_segment(
                    segment, rnaseq_reads, promoter_reads, polya_reads,
                    ref_elements, ref_elements_to_include)
        except TooManyReadsError:
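            # the segment was too read-dense to process at once: split it at
            # its midpoint and push both halves back onto the queue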
            seg1 = list(segment)
            seg1[2] = segment[1] + (segment[2]-segment[1])/2
            seg2 = list(segment)
            seg2[1] = seg1[2]
            segments.put(seg1)
            segments.put(seg2)
            config.log_statement("")
            continue

        for (rd_key, rls), fls in r_frag_lens.iteritems():
            for fl, cnt in fls.iteritems():
                local_frag_lens[(rd_key, rls, fl)] += cnt
        local_transcribed_regions[(segment[0], '+')].extend([
            (start+segment[1], stop+segment[1])
            for start, stop in r_transcribed_regions['+']])
        local_transcribed_regions[(segment[0], '-')].extend([
            (start+segment[1], stop+segment[1])
            for start, stop in r_transcribed_regions['-']])

        local_jns[(segment[0], '+')].extend(r_jns['+'])
        local_jns[(segment[0], '-')].extend(r_jns['-'])

        for i, val in enumerate(r_n_unique_reads):
            local_rd_cnts[i] += val

        if sum(local_rd_cnts) > 1e5:
            global_gene_data.update_all_data(
                local_frag_lens,
                local_transcribed_regions,
                local_jns,
                local_rd_cnts)
            
            local_frag_lens = defaultdict(int)
            local_transcribed_regions = defaultdict(list)
            local_jns = defaultdict(list)
            local_rd_cnts = [0.0, 0.0, 0.0]

    global_gene_data.update_all_data(
        local_frag_lens,
        local_transcribed_regions,
        local_jns,
        local_rd_cnts)
    
    return
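
One detail worth noting in the worker above: per-segment results accumulate in
worker-local dicts and are only pushed to the shared GlobalGeneSegmentData
once roughly 1e5 reads have been counted, so the (presumably locked) shared
update happens in large batches rather than per segment. The pattern in
isolation, with the shared store's update method as a stand-in:

from collections import defaultdict

FLUSH_THRESHOLD = 1e5  # flush after ~100k reads, matching the worker above

def accumulate_and_flush(keyed_counts, shared_update):
    """Batch per-key counts locally; push them upstream in coarse chunks."""
    local_cnts = defaultdict(int)
    n_since_flush = 0.0
    for key, cnt in keyed_counts:
        local_cnts[key] += cnt
        n_since_flush += cnt
        if n_since_flush > FLUSH_THRESHOLD:
            shared_update(local_cnts)       # one coarse-grained update
            local_cnts = defaultdict(int)
            n_since_flush = 0.0
    shared_update(local_cnts)               # flush the final partial batch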