Example #1
File: ORF.py Project: bdgp/grit
def find_all_orfs(genes, fasta_fn, gtf_ofp, fa_ofp, num_threads=1):
    # create queues to store input and output data
    manager = multiprocessing.Manager()
    input_queue = manager.Queue()

    if MIN_VERBOSE:
        config.log_statement('Processing all transcripts for ORFs.')

    # populate input_queue
    for gene in genes:
        input_queue.put(gene)

    # spawn worker processes to find the ORFs, and write them to the output streams
    args = (input_queue, gtf_ofp, fa_ofp, fasta_fn)
    if num_threads == 1:
        find_gene_orfs_worker(*args)
    else:
        processes = []
        for thread_id in xrange(num_threads):
            p = multiprocessing.Process(target=find_gene_orfs_worker,
                                        args=args)
            p.start()
            processes.append(p)

        for p in processes:
            p.join()

    return
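The function above fans gene records out to worker processes through a shared queue. A minimal, self-contained sketch of the same pattern (the worker body and integer payloads are hypothetical stand-ins for GRIT's genes):

import multiprocessing
try:
    import queue  # Python 3
except ImportError:
    import Queue as queue  # Python 2

def worker(input_queue):
    # drain the shared queue until it is empty, as find_gene_orfs_worker does
    while not input_queue.empty():
        try:
            item = input_queue.get(block=False)
        except queue.Empty:
            break
        print('processed gene %s' % item)

if __name__ == '__main__':
    manager = multiprocessing.Manager()
    input_queue = manager.Queue()
    for i in range(10):
        input_queue.put(i)
    processes = []
    for _ in range(4):
        p = multiprocessing.Process(target=worker, args=(input_queue,))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()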
Example #2
def merge_genes(all_sources_and_genes, ofp, sources_ofp):
    # write the gtf header
    ofp.write("track name=%s\n" % ofp.name)

    # group overlapping genes
    config.log_statement("Grouping genes", log=True)
    manager = multiprocessing.Manager()
    grpd_genes = manager.list()
    grpd_genes_lock = multiprocessing.Lock()

    for genes in group_overlapping_genes(all_sources_and_genes):
        grpd_genes.append(genes)

    # merge the transcripts within each gene cluster
    config.log_statement("Merging transcripts", log=True)

    gene_id_cntr = multiprocessing.Value('i', 0)

    if config.NTHREADS == 1:
        merge_clustered_genes_worker(grpd_genes, grpd_genes_lock, ofp,
                                     sources_ofp, gene_id_cntr)
    else:
        pids = []
        for i in xrange(config.NTHREADS):
            pid = os.fork()
            if pid == 0:
                merge_clustered_genes_worker(grpd_genes, grpd_genes_lock, ofp,
                                             sources_ofp, gene_id_cntr)
                os._exit(0)
            else:
                pids.append(pid)
        for pid in pids:
            os.waitpid(pid, 0)

    return
Example #3
def iter_coverage_intervals_for_read(read):
    # we loop through each operation in the cigar string to deal
    # with junction reads.
    # note that the bam files are 0-based
    start = read.pos
    for contig_type, length in read.cigar:
        # if this is a match, add it 
        if contig_type == 0:
            yield ( start, start + length - 1 )
            start += length
        # skip reference insertions
        elif contig_type == 1:
            pass
            # start += length
        # move past reference deletions
        elif contig_type == 2:
            start += length
        # skip past skipped regions
        elif contig_type == 3:
            start += length
        # since read positions don't include clipped regions,
        # ignore clipping
        elif contig_type == 4 or contig_type == 5:
            pass
        else:
            config.log_statement("Unrecognized cigar format:", read.cigar)

    return
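A stand-alone version of the same CIGAR walk, runnable without pysam (FakeRead is a hypothetical stand-in for a pysam read):

from collections import namedtuple

# stand-in for a pysam read: 0-based position plus a parsed CIGAR
FakeRead = namedtuple('FakeRead', ['pos', 'cigar'])

def coverage_intervals(read):
    # same walk as above: M (0) advances and yields, I (1) is skipped,
    # D (2) and N (3) advance without yielding, S/H (4/5) are ignored
    start = read.pos
    for op, length in read.cigar:
        if op == 0:
            yield (start, start + length - 1)
            start += length
        elif op in (2, 3):
            start += length

# a 50M 100N 25M read (a junction read) starting at reference position 1000
read = FakeRead(pos=1000, cigar=[(0, 50), (3, 100), (0, 25)])
print(list(coverage_intervals(read)))  # [(1000, 1049), (1150, 1174)]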
Example #4
File: ORF.py Project: bdgp/grit
def find_gene_orfs_worker(input_queue, gtf_ofp, fa_ofp, fasta_fn):
    # open the fasta file separately in each worker
    fasta = Fastafile(fasta_fn)

    # process genes for orfs until input queue is empty
    while not input_queue.empty():
        try:
            gene = input_queue.get(block=False)
        except Queue.Empty:
            break

        if VERBOSE: config.log_statement('\tProcessing ' + gene.id)
        ann_trans = find_cds_for_gene(gene, fasta, ONLY_USE_LONGEST_ORF)
        op_str = "\n".join(
            [tr.build_gtf_lines(gene.id, {}) for tr in ann_trans])
        gtf_ofp.write(op_str + "\n")

        if fa_ofp is not None:
            for trans in ann_trans:
                fa_ofp.write(">%s\n" % trans.id)
                for line in iter_x_char_lines(trans.coding_sequence):
                    fa_ofp.write(line + "\n")

        if VERBOSE: config.log_statement('\tFinished ' + gene.id)

    return
Example #5
def get_read_group( r1, r2 ):        
    #return 'mean'
    r1_read_group = [ val for key, val in r1.tags if key == 'RG' ]
    r1_read_group = r1_read_group[0] if len( r1_read_group ) == 1 else 'mean'
    r2_read_group = [ val for key, val in r2.tags if key == 'RG' ]
    r2_read_group = r2_read_group[0] if len( r2_read_group ) == 1 else 'mean'
    if r1_read_group == r2_read_group:
        return r1_read_group
    else: 
        config.log_statement("WARNING: Read groups do not match.")
        return None
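The same RG-tag lookup, demonstrated on hypothetical stand-in reads (pysam exposes the tag list as .tags here):

from collections import namedtuple

Tagged = namedtuple('Tagged', ['tags'])

def read_group(read, default='mean'):
    # use the RG tag if it is present exactly once, else fall back
    groups = [val for key, val in read.tags if key == 'RG']
    return groups[0] if len(groups) == 1 else default

r1 = Tagged(tags=[('RG', 'lane1'), ('NM', 2)])
r2 = Tagged(tags=[('NM', 0)])
print(read_group(r1))  # lane1
print(read_group(r2))  # mean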
Example #6
def main():
    gtf_fps, ofp, sources_ofp = parse_arguments()
    gtf_fnames = [os.path.abspath(fp.name) for fp in gtf_fps]
        
    config.log_statement("Loading gtfs")
    all_genes_and_fnames = load_multiple_gtfs_into_pickled_files(gtf_fnames)
    
    merge_genes(all_genes_and_fnames, ofp, sources_ofp)
    
    ofp.close()
    if sources_ofp is not None:
        sources_ofp.close()
Example #7
def main():
    gtf_fps, ofp, sources_ofp = parse_arguments()
    gtf_fnames = [os.path.abspath(fp.name) for fp in gtf_fps]
        
    config.log_statement("Loading gtfs")
    all_genes_and_fnames = load_multiple_gtfs_into_pickled_files(gtf_fnames)
    
    merge_genes(all_genes_and_fnames, ofp, sources_ofp)
    
    ofp.close()
    if sources_ofp is not None:
        sources_ofp.close()
Example #8
def determine_read_pair_params( bam_obj, min_num_reads_to_check=50000, 
                                max_num_reads_to_check=100000 ):
    # keep track of what fraction of reads are on the same strand
    paired_cnts = {'no_mate': 0, 'same_strand': 1e-4, 'diff_strand': 1e-4}
    
    num_good_reads = 0
    num_observed_reads = 0
    for read in bam_obj: 
        num_observed_reads += 1
        if num_observed_reads > max_num_reads_to_check:
            break
        
        if read.is_paired and read.mate_is_unmapped:
            continue
        
        map_prb = get_rd_posterior_prb(read)
        if map_prb < 0.99: continue
        
        if not read.is_paired:
            paired_cnts['no_mate'] += 1        
        elif read.is_reverse != read.mate_is_reverse:
            paired_cnts['diff_strand'] += 1
        else:
            paired_cnts['same_strand'] += 1
        # keep collecting reads until we observe enough
        num_good_reads += 1
        if num_good_reads > min_num_reads_to_check \
                and num_good_reads%min_num_reads_to_check == 0:
            # if the reads are single ended, then return 'unpaired'
            # (because pair strandedness doesn't apply)
            if paired_cnts['no_mate'] >= 0.95*num_good_reads:
                return ('unpaired',)
            if float(paired_cnts['same_strand'])/paired_cnts['diff_strand'] > 5:
                return ('paired', 'same_strand')
            elif float(paired_cnts['diff_strand'])/paired_cnts['same_strand'] > 5:
                return ('paired', 'diff_strand')
    
    # if we have run out of reads, see if we can build the statistic
    if paired_cnts['no_mate'] >= 0.95*num_good_reads:
        return ('unpaired',)
    if float(paired_cnts['same_strand'])/paired_cnts['diff_strand'] > 5:
        return ('paired', 'same_strand')
    elif float(paired_cnts['diff_strand'])/paired_cnts['same_strand'] > 5:
        return ('paired', 'diff_strand')
    
    config.log_statement("Paired Cnts:", paired_cnts, "Num Reads", num_observed_reads)
    raise ValueError, "Reads appear to be a mix of unpaired and paired reads that are both on the same and different strands. (%s)" % paired_cnts
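The decision rule above reduces to two thresholds: call the library unpaired when at least 95% of good reads have no mate, otherwise call it paired on whichever strand configuration outnumbers the other at least 5 to 1. A sketch of just that rule (thresholds taken from the code above; the tiny pseudocounts keep the ratios finite before either class is observed):

def classify_pairing(no_mate, same_strand, diff_strand, n_good):
    same_strand += 1e-4
    diff_strand += 1e-4
    if no_mate >= 0.95 * n_good:
        return ('unpaired',)
    if same_strand / diff_strand > 5:
        return ('paired', 'same_strand')
    if diff_strand / same_strand > 5:
        return ('paired', 'diff_strand')
    return None  # ambiguous: keep collecting reads

print(classify_pairing(0, 980, 20, 1000))  # ('paired', 'same_strand')
print(classify_pairing(990, 5, 5, 1000))   # ('unpaired',)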
Example #9
def fork_and_wait(n_proc, target, args=[]):
    """Fork n_proc processes, run target(*args) in each, and wait to finish.
    
    """
    if n_proc == 1:
        target(*args)
        return
    else:
        pids = []
        for i in xrange(n_proc):
            pid = os.fork()
            if pid == 0:
                try:
                    signal.signal(signal.SIGINT, handle_interrupt_signal)
                    target(*args)
                    os._exit(os.EX_OK)
                except Exception, inst:
                    config.log_statement("Uncaught exception in subprocess\n" +
                                         traceback.format_exc(),
                                         log=True)
                    os._exit(os.EX_SOFTWARE)
            else:
                pids.append(pid)
        try:
            while len(pids) > 0:
                ret_pid, error_code = os.wait()
                if ret_pid in pids:
                    pids.remove(ret_pid)
                if error_code != os.EX_OK:
                    raise OSError, "Process '{}' returned error code '{}'".format(
                        ret_pid, error_code)
        except KeyboardInterrupt:
            for pid in pids:
                try:
                    os.kill(pid, signal.SIGHUP)
                except:
                    pass
            raise
        except OSError:
            for pid in pids:
                try:
                    os.kill(pid, signal.SIGHUP)
                except:
                    pass
            raise
        return
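Usage is a one-liner; this assumes the fork_and_wait defined above is in scope (os.fork is POSIX-only, so this will not run on Windows):

import os

def say_hello(name):
    # each forked child runs this with its own pid
    print('hello from %s (pid %i)' % (name, os.getpid()))

fork_and_wait(4, say_hello, args=['worker'])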
Example #10
File: reads.py Project: neevor/grit
    def init(self,
             reverse_read_strand,
             pairs_are_opp_strand=None,
             reads_are_paired=True,
             ref_genes=None):
        assert self.is_indexed()

        assert reads_are_paired, "GRIT can not use unpaired RAMPAGE reads."
        reads_are_stranded = True

        # reads strandedness
        if pairs_are_opp_strand is None:
            pairs_are_opp_strand = (not read_pairs_are_on_same_strand(self))

        if reverse_read_strand in ('auto', None):
            if ref_genes in ([], None):
                raise ValueError, "Determining reverse_read_strand requires reference genes"
            reverse_read_strand_params = determine_read_strand_params(
                self, ref_genes, pairs_are_opp_strand, 'tss_exon', 300, 50)
            assert 'stranded' in reverse_read_strand_params
            if 'reverse_read_strand' in reverse_read_strand_params:
                reverse_read_strand = True
            elif 'dont_reverse_read_strand' in reverse_read_strand_params:
                reverse_read_strand = False
            else:
                assert False

            if config.VERBOSE:
                config.log_statement(
                    "Set reverse_read_strand to '%s' for '%s'" %
                    (reverse_read_strand, self.filename),
                    log=True)

        Reads.init(self, reads_are_paired, pairs_are_opp_strand,
                   reads_are_stranded, reverse_read_strand)

        self._init_kwargs = {
            'reverse_read_strand': reverse_read_strand,
            'pairs_are_opp_strand': pairs_are_opp_strand,
            'reads_are_paired': reads_are_paired
        }

        return self
Example #11
File: reads.py Project: neevor/grit
    def init(self,
             reverse_read_strand=None,
             pairs_are_opp_strand=None,
             reads_are_paired=False,
             ref_genes=None):
        assert reverse_read_strand in ('auto', None, True, False), \
            "Invalid option for reverse read strand"
        reads_are_paired = False
        pairs_are_opp_strand = False
        assert not reads_are_paired, "GRIT can not use paired CAGE reads."

        # CAGE reads are always stranded
        reads_are_stranded = True

        if reverse_read_strand in ('auto', None):
            if ref_genes in ([], None):
                raise ValueError, "Determining reverse_read_strand requires reference genes"
            reverse_read_strand_params = determine_read_strand_params(
                self, ref_genes, pairs_are_opp_strand, 'tss_exon', 300, 50)
            assert 'stranded' in reverse_read_strand_params
            if 'reverse_read_strand' in reverse_read_strand_params:
                reverse_read_strand = True
            elif 'dont_reverse_read_strand' in reverse_read_strand_params:
                reverse_read_strand = False
            else:
                assert False
            if config.VERBOSE:
                config.log_statement(
                    "Set reverse_read_strand to '%s' for '%s'" %
                    (reverse_read_strand, self.filename),
                    log=True)

        Reads.init(self, reads_are_paired, pairs_are_opp_strand,
                   reads_are_stranded, reverse_read_strand)

        self._init_kwargs = {
            'reverse_read_strand': reverse_read_strand,
            'pairs_are_opp_strand': pairs_are_opp_strand,
            'reads_are_paired': reads_are_paired
        }

        return self
Example #12
    def iter_paired_reads( self, chrm, strand, start, stop ):
        # whether or not the gene is on the reverse strand
        gene_strnd_is_rev = ( strand == '-' )
        chrm = clean_chr_name( chrm )

        # get all of the first pairs
        def iter_pair1_reads():
            for read in self.iter_reads(chrm, strand, start, stop):
                if read.is_read1: 
                    yield read
        
        # index the pair 2 reads
        reads_pair2 = {}
        for read in self.iter_reads(chrm, strand, start, stop):
            if not read.is_read1: 
                reads_pair2[read.qname] = read
        
        # iterate through the read pairs
        for read1 in iter_pair1_reads():
            try:
                read2 = reads_pair2[ read1.qname ]
            # if there is no mate, skip this read
            except KeyError:
                if DEBUG:
                    config.log_statement("No mate: ", read1.pos, read1.aend-1)
                continue

            assert read1.query is None or \
                   ( read1.alen == read1.aend - read1.pos ) \
                   or ( len( read1.cigar ) > 1 )
            assert read2.query is None or \
                   ( read2.alen == read2.aend - read2.pos ) \
                   or ( len( read2.cigar ) > 1 )
            
            #if read1.qlen != read2.qlen:
            #    config.log_statement( "ERROR: unequal read lengths %i and %i\n", \
            #           read1.qlen, read2.qlen )
            #    continue

            yield read1, read2

        return
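The pairing logic is just a dictionary keyed on qname: index all second mates, then look each first mate up and skip it if the mate is missing. A minimal sketch with hypothetical records:

from collections import namedtuple

Rec = namedtuple('Rec', ['qname', 'is_read1', 'pos'])

reads = [Rec('a', True, 100), Rec('a', False, 250),
         Rec('b', True, 400)]  # 'b' has no mate

# index the second mates by query name, as iter_paired_reads does
mates = dict((r.qname, r) for r in reads if not r.is_read1)

for r in (r for r in reads if r.is_read1):
    try:
        mate = mates[r.qname]
    except KeyError:
        continue  # no mate: skip this read
    print('%s: %i / %i' % (r.qname, r.pos, mate.pos))  # a: 100 / 250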
Example #13
def compare(ref_fname,
            gtf_fname,
            build_maps,
            build_maps_stats,
            out_prefix,
            num_threads=1):
    """Compare refernce to another 'gtf' annotation by element types
    """
    # load the gtf files
    ref_genes = load_gtf(ref_fname)
    t_genes = load_gtf(gtf_fname)

    output_stats = OutputStats(ref_fname, gtf_fname)

    # get recall and precision stats for all types of exons and introns
    build_element_stats(ref_genes, t_genes, output_stats)
    if VERBOSE: config.log_statement("Finished building element stats")

    clustered_transcripts = cluster_overlapping_genes((ref_genes, t_genes))
    if VERBOSE:
        n_clusters = sum(
            len(val) for val in clustered_transcripts.itervalues())
        config.log_statement("Finished clustering genes into %i clusters." %
                             n_clusters)

    # calculate transcript overlaps and class match counts
    # also write map files if requested
    trans_class_cnts = \
        match_all_transcripts( clustered_transcripts, build_maps,
                               build_maps_stats, out_prefix, output_stats )

    if out_prefix is None:
        # dump stats to stdout
        config.log_statement(str(output_stats) + '\n')
    else:
        # prepare formatted stats output
        op = [
            str(output_stats),
        ]
        if build_maps_stats:
            op.append(
                make_class_cnts_string(trans_class_cnts, ref_fname, gtf_fname))

        with open(out_prefix + ".stats", "w") as stats_fp:
            stats_fp.write("\n".join(op) + '\n')

        if VERBOSE:
            config.log_statement("\n".join(op) + '\n')

    return
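The element stats boil down to recall (fraction of reference elements recovered) and precision (fraction of called elements that match the reference). A worked sketch on hypothetical intron sets:

def recall_and_precision(ref_elements, test_elements):
    ref, test = set(ref_elements), set(test_elements)
    matched = len(ref & test)
    return float(matched) / len(ref), float(matched) / len(test)

ref_introns = [(100, 200), (300, 400), (500, 600)]
called_introns = [(100, 200), (300, 400), (700, 800), (900, 950)]
print(recall_and_precision(ref_introns, called_introns))  # (0.666..., 0.5)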
Example #14
File: reads.py Project: nboley/grit
    def init(self, reverse_read_strand, pairs_are_opp_strand=None,
             reads_are_paired=True, ref_genes=None ):
        assert self.is_indexed()

        assert reads_are_paired, "GRIT can not use unpaired RAMPAGE reads."
        reads_are_stranded = True

        # reads strandedness
        if pairs_are_opp_strand is None:
            pairs_are_opp_strand = (not read_pairs_are_on_same_strand( self ))

        if reverse_read_strand in ('auto', None):
            if ref_genes in ([], None):
                raise ValueError, "Determining reverse_read_strand requires reference genes"
            reverse_read_strand_params = determine_read_strand_params(
                self, ref_genes, pairs_are_opp_strand, 'tss_exon',
                300, 50 )
            assert 'stranded' in reverse_read_strand_params
            if 'reverse_read_strand' in reverse_read_strand_params:
                reverse_read_strand = True
            elif 'dont_reverse_read_strand' in reverse_read_strand_params:
                reverse_read_strand = False
            else: assert False

            if config.VERBOSE:
                config.log_statement(
                    "Set reverse_read_strand to '%s' for '%s'" % (
                        reverse_read_strand, self.filename), log=True )


        Reads.init(self, reads_are_paired, pairs_are_opp_strand,
                         reads_are_stranded, reverse_read_strand )

        self._init_kwargs = {
            'reverse_read_strand': reverse_read_strand,
            'pairs_are_opp_strand': pairs_are_opp_strand,
            'reads_are_paired': reads_are_paired
        }

        return self
Example #15
File: reads.py Project: nboley/grit
    def init(self, reverse_read_strand=None, pairs_are_opp_strand=None,
             reads_are_paired=False, ref_genes=None ):
        assert reverse_read_strand in ('auto', None, True, False), \
            "Invalid option for reverse read strand"
        reads_are_paired = False
        pairs_are_opp_strand = False
        assert not reads_are_paired, "GRIT can not use paired CAGE reads."

        # CAGE reads are always stranded
        reads_are_stranded = True

        if reverse_read_strand in ('auto', None):
            if ref_genes in ([], None):
                raise ValueError, "Determining reverse_read_strand requires reference genes"
            reverse_read_strand_params = determine_read_strand_params(
                self, ref_genes, pairs_are_opp_strand, 'tss_exon',
                300, 50 )
            assert 'stranded' in reverse_read_strand_params
            if 'reverse_read_strand' in reverse_read_strand_params:
                reverse_read_strand = True
            elif 'dont_reverse_read_strand' in reverse_read_strand_params:
                reverse_read_strand = False
            else: assert False
            if config.VERBOSE:
                config.log_statement(
                    "Set reverse_read_strand to '%s' for '%s'" % (
                        reverse_read_strand, self.filename), log=True )


        Reads.init(self, reads_are_paired, pairs_are_opp_strand,
                         reads_are_stranded, reverse_read_strand )

        self._init_kwargs = {
            'reverse_read_strand': reverse_read_strand,
            'pairs_are_opp_strand': pairs_are_opp_strand,
            'reads_are_paired': reads_are_paired
        }

        return self
Example #16
def load_junctions_worker(all_jns, all_jns_lock, segments_queue,
                          segments_queue_lock, reads):
    jns = defaultdict(list)
    while len(segments_queue) > 0:
        with segments_queue_lock:
            if len(segments_queue) == 0: break
            chrm, strand, start, stop = segments_queue.pop()
        if config.VERBOSE:
            config.log_statement("Finding jns in '%s:%s:%i:%i'" %
                                 (chrm, strand, start, stop))
        jns[(chrm, strand)].extend(
            extract_junctions_in_region(reads, chrm, strand, start, stop,
                                        True))

    # finally, block until we can offload the remaining junctions
    with all_jns_lock:
        for key, region_jns in jns.iteritems():
            if key not in all_jns: all_jns_key = []
            else: all_jns_key = all_jns[key]
            all_jns_key.extend(region_jns)
            all_jns[key] = all_jns_key
    del jns
    return
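The worker pops work items under one lock, accumulates results locally, and merges into the shared dict under a second lock only once at the end; because values fetched from a manager dict are copies, the merge reads, extends, and reassigns each key. A runnable sketch of that pattern with hypothetical payloads:

import multiprocessing
from collections import defaultdict

def worker(shared, shared_lock, segments, segments_lock):
    local = defaultdict(list)
    while len(segments) > 0:
        with segments_lock:
            if len(segments) == 0:
                break
            seg = segments.pop()  # pop under the lock, process outside it
        local[seg % 2].append(seg)
    with shared_lock:  # merge local results once, at the end
        for key, vals in local.items():
            shared[key] = shared.get(key, []) + vals

if __name__ == '__main__':
    manager = multiprocessing.Manager()
    shared = manager.dict()
    segments = manager.list(range(20))
    shared_lock, segments_lock = multiprocessing.Lock(), multiprocessing.Lock()
    ps = [multiprocessing.Process(
              target=worker, args=(shared, shared_lock, segments, segments_lock))
          for _ in range(3)]
    for p in ps: p.start()
    for p in ps: p.join()
    print(dict(shared))  # all 20 items, grouped by parity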
Example #17
def merge_genes(all_sources_and_genes, ofp, sources_ofp):
    # write the gtf header
    ofp.write("track name=%s\n" % ofp.name)
    
    # group overlapping genes
    config.log_statement("Grouping genes", log=True)
    manager = multiprocessing.Manager()
    grpd_genes = manager.list()
    grpd_genes_lock = multiprocessing.Lock()

    for genes in group_overlapping_genes(all_sources_and_genes):
        grpd_genes.append(genes)
    
    # merge the transcripts within each gene cluster
    config.log_statement("Merging transcripts", log=True)
    
    gene_id_cntr = multiprocessing.Value('i', 0)

    if config.NTHREADS == 1:
        merge_clustered_genes_worker(
            grpd_genes, grpd_genes_lock, 
            ofp, sources_ofp, gene_id_cntr)
    else:
        pids = []
        for i in xrange(config.NTHREADS):
            pid = os.fork()
            if pid == 0:
                merge_clustered_genes_worker(
                    grpd_genes, grpd_genes_lock, 
                    ofp, sources_ofp, gene_id_cntr)
                os._exit(0)
            else:
                pids.append(pid)
        for pid in pids:
            os.waitpid(pid, 0)

    return
Example #18
def load_junctions_worker(all_jns, all_jns_lock, 
                          segments_queue, segments_queue_lock, reads):
    jns = defaultdict(list)
    while len(segments_queue) > 0:
        with segments_queue_lock:
            if len(segments_queue) == 0: break
            chrm, strand, start, stop = segments_queue.pop()
        if config.VERBOSE: 
            config.log_statement("Finding jns in '%s:%s:%i:%i'" % 
                          (chrm, strand, start, stop))
        jns[(chrm, strand)].extend(
            extract_junctions_in_region(
                reads, chrm, strand, start, stop, True))
    
    # finally, block until we can offload the remaining junctions
    with all_jns_lock:
        for key, region_jns in jns.iteritems():
            if key not in all_jns: all_jns_key = []
            else: all_jns_key = all_jns[key]
            all_jns_key.extend( region_jns )
            all_jns[key] = all_jns_key
    del jns
    if config.VERBOSE: config.log_statement( "" )
    return
Example #19
def load_junctions_in_bam( reads, regions=None, nthreads=1):
    if regions is None:
        regions = []
        for contig, contig_len in zip(*get_contigs_and_lens([reads,])):
            for strand in '+-':
                regions.append( (contig, strand, 0, contig_len) )
    
    if nthreads == 1:
        jns = defaultdict(list)
        for chrm, strand, region_start, region_stop in regions:
            jns[(chrm, strand)].extend( extract_junctions_in_region( 
                    reads, chrm, strand, region_start, region_stop ) )
        return jns
    else:
        from multiprocessing import Process, Manager
        manager = Manager()
        all_jns = manager.dict()
        all_jns_lock = multiprocessing.Lock()
        
        segments_queue = manager.list()
        segments_queue_lock = multiprocessing.Lock()
        
        for chrm, strand, region_start, region_stop in regions:
            # add all the regions to search for junctions in
            seg_len = min(5000, int((region_stop - region_start + 1)/nthreads))
            pos = region_start
            while pos < region_stop:
                segments_queue.append( (chrm, strand, pos, pos+seg_len) )
                pos += seg_len
            # make sure the last region doesn't extend past the stop
            segments_queue[-1] = (
                chrm, strand, segments_queue[-1][2], region_stop)
        
        ps = []
        for i in xrange(nthreads):
            p = Process(target=load_junctions_worker,
                        args=( all_jns, all_jns_lock, 
                               segments_queue, segments_queue_lock, reads))
            
            p.start()
            ps.append( p )

        if config.VERBOSE:
            config.log_statement( "Waiting on jn finding children" )
        while len(segments_queue) > 0:
            if config.VERBOSE:
                config.log_statement( 
                    "Waiting on jn finding children (%i in queue)" 
                    % len(segments_queue) )
            time.sleep( 0.5 )

        if config.VERBOSE:
            config.log_statement("Waiting on jn finding children (0 in queue)")
        for p in ps: p.join()
        #while any( not p.is_alive() for p in ps ):

        if config.VERBOSE:
            config.log_statement("Merging junctions from threads")
        junctions = {}
        for key in all_jns.keys():
            junctions[key] = sorted(all_jns[key])
        return junctions
    assert False
Example #20
def load_junctions_in_bam(reads, regions=None, nthreads=1):
    if regions is None:
        regions = []
        for contig, contig_len in zip(*get_contigs_and_lens([
                reads,
        ])):
            for strand in '+-':
                regions.append((contig, strand, 0, contig_len))

    if nthreads == 1:
        jns = defaultdict(list)
        for chrm, strand, region_start, region_stop in regions:
            jns[(chrm, strand)].extend(
                extract_junctions_in_region(reads, chrm, strand, region_start,
                                            region_stop))
        return jns
    else:
        from multiprocessing import Process, Manager
        manager = Manager()
        all_jns = manager.dict()
        all_jns_lock = multiprocessing.Lock()

        segments_queue = manager.list()
        segments_queue_lock = multiprocessing.Lock()

        for chrm, strand, region_start, region_stop in regions:
            # add all the regions to search for junctions in
            seg_len = min(5000, int(
                (region_stop - region_start + 1) / nthreads))
            pos = region_start
            while pos < region_stop:
                segments_queue.append((chrm, strand, pos, pos + seg_len))
                pos += seg_len
            # make sure the last region doesn't extend past the stop
            segments_queue[-1] = (chrm, strand, segments_queue[-1][2],
                                  region_stop)

        ps = []
        for i in xrange(nthreads):
            p = Process(target=load_junctions_worker,
                        args=(all_jns, all_jns_lock, segments_queue,
                              segments_queue_lock, reads))

            p.start()
            ps.append(p)

        if config.VERBOSE:
            config.log_statement("Waiting on jn finding children")
        while len(segments_queue) > 0:
            if config.VERBOSE:
                config.log_statement(
                    "Waiting on jn finding children (%i in queue)" %
                    len(segments_queue))
            time.sleep(0.5)

        if config.VERBOSE:
            config.log_statement("Waiting on jn finding children (0 in queue)")
        for p in ps:
            p.join()
        #while any( not p.is_alive() for p in ps ):

        if config.VERBOSE:
            config.log_statement("Merging junctions from threads")
        junctions = {}
        for key in all_jns.keys():
            junctions[key] = sorted(all_jns[key])
        return junctions
    assert False
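Both versions above split each region into segments of at most 5000 bp (or smaller, so every thread gets work) and clamp the final segment to the region's stop. That segmentation in isolation:

def segment_region(start, stop, nthreads, max_seg_len=5000):
    seg_len = min(max_seg_len, int((stop - start + 1) / nthreads))
    segments = []
    pos = start
    while pos < stop:
        segments.append((pos, pos + seg_len))
        pos += seg_len
    # make sure the last segment doesn't extend past the stop
    segments[-1] = (segments[-1][0], stop)
    return segments

print(segment_region(0, 10000, 3))
# [(0, 3333), (3333, 6666), (6666, 9999), (9999, 10000)]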
Example #21
    def init(self, reverse_read_strand=None, reads_are_stranded=None, 
                   pairs_are_opp_strand=None, reads_are_paired=None,
                   ref_genes=None):        
        assert self.is_indexed()

        read_pair_params = determine_read_pair_params(self)
        
        # set whether the reads are paired or not
        if reads_are_paired in ('auto', None):
            if 'paired' in read_pair_params:
                reads_are_paired = True 
            else:
                assert 'unpaired' in read_pair_params
                reads_are_paired = False
        
        if pairs_are_opp_strand in ('auto', None):
            if not reads_are_paired:
                pairs_are_opp_strand = None
            elif 'same_strand' in read_pair_params:
                pairs_are_opp_strand = False
            else:
                pairs_are_opp_strand = True

        if ( reads_are_stranded in ('auto', None) 
             or reverse_read_strand in ('auto', None) ):
            read_strand_attributes = determine_read_strand_params(
                self, ref_genes, pairs_are_opp_strand, 'internal_exon',
                300, 50, 10 )
            config.log_statement(
                "read_strand_attributes = %s" % (read_strand_attributes,), log=True)
            if 'unstranded' in read_strand_attributes:
                if reads_are_stranded in ('auto', None):
                    reads_are_stranded = False
            elif 'stranded' in read_strand_attributes:
                if reads_are_stranded in ('auto', None):
                    reads_are_stranded = True
            else:
                assert False
            if config.VERBOSE:
                config.log_statement(
                    "Set reads_are_stranded to '%s' for '%s'" % (
                        reads_are_stranded, self.filename), log=True )
                
            if reverse_read_strand in ('auto', None):
                if not reads_are_stranded: 
                    reverse_read_strand = None
                elif 'reverse_read_strand' in read_strand_attributes:
                    reverse_read_strand = True
                elif 'dont_reverse_read_strand' in read_strand_attributes:
                    reverse_read_strand = False
                else:
                    reverse_read_strand = None
                if config.VERBOSE:
                    config.log_statement(
                        "Set reverse_read_strand to '%s' for '%s'" % (
                            reverse_read_strand, self.filename), log=True )
        
        Reads.init(self, reads_are_paired, pairs_are_opp_strand, 
                         reads_are_stranded, reverse_read_strand )
        
        self._init_kwargs = {
            'reverse_read_strand': reverse_read_strand, 
            'reads_are_stranded': reads_are_stranded, 
            'pairs_are_opp_strand': pairs_are_opp_strand, 
            'reads_are_paired': reads_are_paired,
            'ref_genes': ref_genes
        }
        
        return self
Example #22
def parse_arguments():
    import argparse

    parser = argparse.ArgumentParser(\
        description='Produce simulated reads in a perfectly aligned BAM file.' )

    # gtf is the only required argument
    parser.add_argument( 'gtf', type=file, \
                             help='GTF file from which to produce simulated reads ' + \
                             '(Note: Only the first transcript from this file will ' + \
                             'be simulated)' )

    parser.add_argument('--assay',
                        choices=['RNAseq', 'RAMPAGE', 'CAGE', 'PASseq'],
                        default='RNAseq',
                        help='Which assay type to simulate from')

    # fragment length distribution options
    parser.add_argument( '--fl-dist-const', type=int, default=DEFAULT_FRAG_LENGTH, \
                             help='Constant length fragments. (default: ' + \
                             '%(default)s)' )
    parser.add_argument( '--fl-dist-norm', \
                             help='Mean and standard deviation (format "mn:sd") ' + \
                             'used to create normally distributed fragment lengths.' )

    # files providing quality and sequence information
    parser.add_argument( '--fasta', '-f', \
                             help='Fasta file from which to create reads ' + \
                             '(default: all sequences are "' + DEFAULT_BASE + \
                             '" * length of sequence)' )
    parser.add_argument( '--quality', '-q', \
                             help='Flat file containing one FASTQ quality score ' + \
                             'per line, created with get_quals.sh. (default: ' + \
                             'quality strings are "' + str(DEFAULT_QUALITY_SCORE) + \
                             '" * length of sequence.)' )

    # type and number of fragments requested
    parser.add_argument(
        '--num-frags',
        '-n',
        type=int,
        default=1000,
        help='Total number of fragments to create across all transcripts')
    parser.add_argument('--single-end',
                        action='store_true',
                        default=False,
                        help='Produce single-end reads.')
    parser.add_argument('--paired-end',
                        dest='single_end',
                        action='store_false',
                        help='Produce paired-end reads. (default)')
    # XXX not sure if this works
    #parser.add_argument(
    #    '--full-fragment', action='store_true', default=False,
    #    help='Produce reads spanning the entire fragment.')

    parser.add_argument( '--read-len', '-r', type=int, default=DEFAULT_READ_LENGTH, \
                             help='Length of reads to produce in base pairs ' + \
                             '(default: %(default)s)' )

    # output options
    parser.add_argument( '--out_prefix', '-o', default='simulated_reads', \
                             help='Prefix for output FASTQ/BAM file ' + \
                             '(default: %(default)s)' )
    parser.add_argument( '--verbose', '-v', default=False, action='store_true', \
                             help='Print status information.' )

    args = parser.parse_args()
    # set to false, but we may want to bring this option back
    args.full_fragment = False

    global VERBOSE
    VERBOSE = args.verbose

    if args.assay == 'CAGE':
        args.read_len = 28
        args.single_end = True

    # parse normal distribution argument
    if args.fl_dist_norm:
        try:
            mean, sd = args.fl_dist_norm.split(':')
            args.fl_dist_norm = [int(mean), int(sd)]
        except ValueError:
            args.fl_dist_norm = None
            config.log_statement(
              "WARNING: User input mean and sd are not formatted correctly.\n"+\
              "\tUsing default values.\n")

    return (args.gtf, args.fl_dist_const, args.fl_dist_norm, args.fasta,
            args.quality, args.num_frags, args.single_end, args.full_fragment,
            args.read_len, args.out_prefix, args.assay)
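The --fl-dist-norm handling above is a small parse-or-fallback step: split on ':', convert both halves to int, and fall back to None (with a warning) on malformed input. In isolation:

def parse_fl_dist_norm(arg):
    # 'mn:sd' -> [mean, sd]; None on malformed input
    try:
        mean, sd = arg.split(':')
        return [int(mean), int(sd)]
    except ValueError:
        return None

print(parse_fl_dist_norm('200:25'))  # [200, 25]
print(parse_fl_dist_norm('200'))     # None (malformed)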
Example #23
    num_unique_reads = 0.0
    
    #config.log_statement("Finding reads in %s" % str((chrm, strand, r_start, r_stop)))        
    for n_obs_reads, (read, rd_strand) in enumerate(reads.iter_reads_and_strand(
            chrm, r_start, r_stop+1)):
        # break once we have moved past the end of the region
        if read.pos > r_stop: break
        
        # -probability that the read originated in this location
        # if we can't find it, assume that it's uniform over alternate
        # mappings. If we can't find that, then assume that it's unique
        map_prb = get_rd_posterior_prb(read)

        if n_obs_reads > 0 and n_obs_reads%100000 == 0:
            config.log_statement("Processed %i reads in %s" % (
                n_obs_reads, str((chrm, strand, r_start, r_stop))))
        for jn in junctions.iter_jns_in_read(read):
            # skip jns whose start does not overlap this region; we subtract
            # one because the start refers to the first covered intron base,
            # and we are talking about covered regions
            if jn[0]-1 < r_start or jn[0]-1 > r_stop: continue
            jn_reads[rd_strand][jn] += 1
        # if this is an anti-strand read, then we only care about the jns
        if strand != '.' and rd_strand != strand: continue
        
        # extract the information we care about:
        # -strand, we already have this
        # -regions covered
        cov_regions = list(iter_coverage_intervals_for_read(read))
        # -read length
        read_len = read.inferred_length
Example #24
File: reads.py Project: nboley/grit
    reg_len = r_stop-r_start+1

    jn_reads = {'+': defaultdict(int), '-': defaultdict(int)}

    cov = {
        '+': numpy.zeros(reg_len, dtype=float),
        '-': numpy.zeros(reg_len, dtype=float)
    }

    pair1_reads = defaultdict(list)
    pair2_reads = defaultdict(list)

    num_unique_reads = 0.0

    config.log_statement("Finding reads in %s" % str((chrm, strand, r_start, r_stop)))
    for n_obs_reads, (read, rd_strand) in enumerate(reads.iter_reads_and_strand(
            chrm, r_start, r_stop+1)):
        # break once we have moved past the end of the region
        if read.pos > r_stop:
            break

        # -probability that the read originated in this location
        # if we can't find it, assume that it's uniform over alternate
        # mappings. If we can't find that, then assume that it's unique
        map_prb = get_rd_posterior_prb(read)

        if n_obs_reads > 0 and n_obs_reads%100000 == 0:
            config.log_statement("Processed %i reads in %s" % (
                n_obs_reads, str((chrm, strand, r_start, r_stop))))
        for jn in junctions.iter_jns_in_read(read):
Example #25
File: reads.py Project: nboley/grit
    def init(self, reverse_read_strand=None, reads_are_stranded=None,
                   pairs_are_opp_strand=None, reads_are_paired=None,
                   ref_genes=None):
        assert self.is_indexed()

        read_pair_params = determine_read_pair_params(self)

        # set whether the reads are paired or not
        if reads_are_paired in ('auto', None):
            if 'paired' in read_pair_params:
                reads_are_paired = True
            else:
                assert 'unpaired' in read_pair_params
                reads_are_paired = False

        if pairs_are_opp_strand in ('auto', None):
            if not reads_are_paired:
                pairs_are_opp_strand = None
            elif 'same_strand' in read_pair_params:
                pairs_are_opp_strand = False
            else:
                pairs_are_opp_strand = True

        if ( reads_are_stranded in ('auto', None)
             or reverse_read_strand in ('auto', None) ):
            read_strand_attributes = determine_read_strand_params(
                self, ref_genes, pairs_are_opp_strand, 'internal_exon',
                300, 50 )
            if 'unstranded' in read_strand_attributes:
                if reads_are_stranded in ('auto', None):
                    reads_are_stranded = False
            elif 'stranded' in read_strand_attributes:
                if reads_are_stranded in ('auto', None):
                    reads_are_stranded = True
            else:
                assert False
            if config.VERBOSE:
                config.log_statement(
                    "Set reads_are_stranded to '%s' for '%s'" % (
                        reads_are_stranded, self.filename), log=True )

            if reverse_read_strand in ('auto', None):
                if not reads_are_stranded:
                    reverse_read_strand = None
                elif 'reverse_read_strand' in read_strand_attributes:
                    reverse_read_strand = True
                elif 'dont_reverse_read_strand' in read_strand_attributes:
                    reverse_read_strand = False
                else:
                    assert False
                if config.VERBOSE:
                    config.log_statement(
                        "Set reverse_read_strand to '%s' for '%s'" % (
                            reverse_read_strand, self.filename), log=True )

        Reads.init(self, reads_are_paired, pairs_are_opp_strand,
                         reads_are_stranded, reverse_read_strand )

        self._init_kwargs = {
            'reverse_read_strand': reverse_read_strand,
            'reads_are_stranded': reads_are_stranded,
            'pairs_are_opp_strand': pairs_are_opp_strand,
            'reads_are_paired': reads_are_paired
        }

        return self
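All of these init methods share one convention: a parameter set to 'auto' or None is detected from the data, while an explicit True/False always wins. The convention in isolation:

def resolve(value, detect):
    # 'auto'/None mean "detect from the data"; explicit settings win
    return detect() if value in ('auto', None) else value

print(resolve(None, lambda: True))     # True  (detected)
print(resolve('auto', lambda: False))  # False (detected)
print(resolve(False, lambda: True))    # False (explicit setting wins)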