def find_all_orfs(genes, fasta_fn, gtf_ofp, fa_ofp, num_threads=1):
    """Queue every gene and run find_gene_orfs_worker over the queue.

    The genes are placed on a manager-backed queue. With num_threads == 1
    the worker is run directly in this process; otherwise num_threads
    child processes are started on the same queue and joined before
    returning. The worker receives (queue, gtf_ofp, fa_ofp, fasta_fn).
    """
    # create the queue that stores the input data
    queue_manager = multiprocessing.Manager()
    input_queue = queue_manager.Queue()

    if MIN_VERBOSE:
        config.log_statement('Processing all transcripts for ORFs.')

    # the whole work list is queued before any worker starts
    for gene in genes:
        input_queue.put(gene)

    worker_args = (input_queue, gtf_ofp, fa_ofp, fasta_fn)

    # single worker: no need to spawn anything
    if num_threads == 1:
        find_gene_orfs_worker(*worker_args)
        return

    # spawn the workers, then wait for all of them
    workers = []
    for _ in xrange(num_threads):
        worker = multiprocessing.Process(
            target=find_gene_orfs_worker, args=worker_args)
        worker.start()
        workers.append(worker)

    for worker in workers:
        worker.join()

    return
def merge_genes(all_sources_and_genes, ofp, sources_ofp):
    """Group overlapping genes and run the merge worker(s) over the groups.

    Writes a "track name=..." header to ofp, collects the groups produced
    by group_overlapping_genes into a manager-backed list, and then hands
    that list to merge_clustered_genes_worker -- in this process when
    config.NTHREADS == 1, otherwise in config.NTHREADS forked children.

    A forked child always terminates through os._exit, even when the worker
    raises, so it can never fall back into the caller's code. Children that
    exit with a non-zero status are reported through config.log_statement.
    """
    # write the gtf header
    ofp.write("track name=%s\n" % ofp.name)
    # flush before forking so that the children do not inherit (and
    # possibly re-emit) the buffered header
    ofp.flush()

    # group overlapping genes
    config.log_statement("Grouping genes", log=True)
    manager = multiprocessing.Manager()
    grpd_genes = manager.list()
    grpd_genes_lock = multiprocessing.Lock()
    for genes in group_overlapping_genes(all_sources_and_genes):
        grpd_genes.append(genes)

    # merge gene clustered transcripts
    config.log_statement("Merging transcripts", log=True)
    gene_id_cntr = multiprocessing.Value('i', 0)
    worker_args = (
        grpd_genes, grpd_genes_lock, ofp, sources_ofp, gene_id_cntr)

    if config.NTHREADS == 1:
        merge_clustered_genes_worker(*worker_args)
        return

    pids = []
    for _ in range(config.NTHREADS):
        pid = os.fork()
        if pid == 0:
            # child: whatever happens, leave through os._exit
            exit_code = 1
            try:
                merge_clustered_genes_worker(*worker_args)
                exit_code = 0
            except Exception:
                import traceback
                config.log_statement(
                    "Uncaught exception in merge worker\n"
                    + traceback.format_exc(), log=True)
            finally:
                os._exit(exit_code)
        pids.append(pid)

    failed_pids = []
    for pid in pids:
        _, status = os.waitpid(pid, 0)
        if status != 0:
            failed_pids.append(pid)
    if failed_pids:
        config.log_statement(
            "WARNING: merge worker(s) %s exited abnormally; "
            "the merged output may be incomplete."
            % ", ".join(str(pid) for pid in failed_pids), log=True)

    return
def iter_coverage_intervals_for_read(read):
    """Yield the (start, stop) reference intervals covered by a read.

    Walks read.cigar -- a sequence of (operation, length) pairs -- starting
    at read.pos, and yields one closed, 0-based interval for every aligned
    block, so that spliced (junction) reads produce one interval per
    block.

    Operation codes follow the BAM CIGAR numbering:
      0 (M), 7 (=), 8 (X)      aligned bases: yield an interval and advance
      2 (D), 3 (N)             reference only: advance without yielding
      1 (I), 4 (S), 5 (H), 6 (P)   no reference bases: ignored

    Any other code is logged through config.log_statement and ends the
    iteration.
    """
    # note that the bam files are 0 based
    start = read.pos
    for op, length in read.cigar:
        # alignment match, sequence match and sequence mismatch all cover
        # `length` reference bases
        if op in (0, 7, 8):
            yield (start, start + length - 1)
            start += length
        # move past reference deletions and skipped (intron) regions
        elif op in (2, 3):
            start += length
        # insertions, clipping and padding do not consume reference bases,
        # and read positions do not include clipped regions
        elif op in (1, 4, 5, 6):
            continue
        else:
            config.log_statement("Unrecognized cigar format:", read.cigar)
            return

    return
def find_gene_orfs_worker(input_queue, gtf_ofp, fa_ofp, fasta_fn):
    """Annotate genes taken from input_queue until it is empty.

    For every gene the transcripts returned by find_cds_for_gene are
    written to gtf_ofp as GTF lines. When fa_ofp is not None, each
    transcript's coding sequence is also written there under a ">id"
    header, split into lines by iter_x_char_lines.
    """
    # open fasta file in each thread separately
    fasta = Fastafile(fasta_fn)

    while not input_queue.empty():
        # the queue can be drained by another worker between the check
        # above and the get below
        try:
            gene = input_queue.get(block=False)
        except Queue.Empty:
            break

        if VERBOSE:
            config.log_statement('\tProcessing ' + gene.id)

        annotated = find_cds_for_gene(gene, fasta, ONLY_USE_LONGEST_ORF)

        gtf_records = [
            transcript.build_gtf_lines(gene.id, {})
            for transcript in annotated]
        gtf_ofp.write("\n".join(gtf_records) + "\n")

        if fa_ofp is not None:
            for transcript in annotated:
                fa_ofp.write(">%s\n" % transcript.id)
                for seq_line in iter_x_char_lines(
                        transcript.coding_sequence):
                    fa_ofp.write(seq_line + "\n")

        if VERBOSE:
            config.log_statement('\tFinished ' + gene.id)

    return
def get_read_group(r1, r2):
    """Return the read group shared by r1 and r2.

    A read's group is the value of its 'RG' tag when read.tags contains
    exactly one such tag, and the placeholder 'mean' otherwise. When the
    two reads disagree a warning is logged and None is returned.
    """
    def _group_of(read):
        groups = [value for tag, value in read.tags if tag == 'RG']
        if len(groups) != 1:
            return 'mean'
        return groups[0]

    r1_read_group = _group_of(r1)
    r2_read_group = _group_of(r2)

    if r1_read_group != r2_read_group:
        config.log_statement("WARNING: Read groups do not match.")
        return None

    return r1_read_group
def main():
    """Load the GTFs named on the command line and merge their genes.

    parse_arguments supplies the input GTF file objects (only their names
    are used here), the output stream and an optional sources stream. The
    output streams are closed even if loading or merging raises.
    """
    gtf_fps, ofp, sources_ofp = parse_arguments()
    try:
        gtf_fnames = [os.path.abspath(fp.name) for fp in gtf_fps]
        config.log_statement("Loading gtfs")
        all_genes_and_fnames = load_multiple_gtfs_into_pickled_files(
            gtf_fnames)
        merge_genes(all_genes_and_fnames, ofp, sources_ofp)
    finally:
        # close both outputs, even if closing the first one fails
        try:
            ofp.close()
        finally:
            if sources_ofp is not None:
                sources_ofp.close()
def main():
    """Load the GTFs named on the command line and merge their genes.

    parse_arguments supplies the input GTF file objects (only their names
    are used here), the output stream and an optional sources stream. The
    output streams are closed even if loading or merging raises.
    """
    gtf_fps, ofp, sources_ofp = parse_arguments()
    try:
        gtf_fnames = [os.path.abspath(fp.name) for fp in gtf_fps]
        config.log_statement("Loading gtfs")
        all_genes_and_fnames = load_multiple_gtfs_into_pickled_files(
            gtf_fnames)
        merge_genes(all_genes_and_fnames, ofp, sources_ofp)
    finally:
        # close both outputs, even if closing the first one fails
        try:
            ofp.close()
        finally:
            # identity test: None is a singleton
            if sources_ofp is not None:
                sources_ofp.close()
def _classify_read_pair_counts(paired_cnts, num_good_reads):
    """Return the read pair classification for the counts, or None.

    ('unpaired',) when at least 95% of the good reads had no mate;
    ('paired', 'same_strand') or ('paired', 'diff_strand') when one
    orientation outnumbers the other by more than 5 to 1; None otherwise.
    """
    if paired_cnts['no_mate'] >= 0.95*num_good_reads:
        return ('unpaired',)

    same_strand = float(paired_cnts['same_strand'])
    diff_strand = float(paired_cnts['diff_strand'])
    if same_strand/diff_strand > 5:
        return ('paired', 'same_strand')
    if diff_strand/same_strand > 5:
        return ('paired', 'diff_strand')

    return None


def determine_read_pair_params(bam_obj, min_num_reads_to_check=50000,
                               max_num_reads_to_check=100000):
    """Determine whether reads are paired and how mates are oriented.

    Iterates over at most max_num_reads_to_check reads from bam_obj. Paired
    reads whose mate is unmapped, and reads whose get_rd_posterior_prb is
    below 0.99, are ignored. The remaining reads are counted as unpaired,
    as pairs on different strands or as pairs on the same strand.

    Returns ('unpaired',), ('paired', 'same_strand') or
    ('paired', 'diff_strand').

    Raises ValueError when the reads fit none of those categories.
    """
    # keep track of which fraction are on the same strand; the strand
    # counts start just above zero so the ratios below are always defined
    paired_cnts = {'no_mate': 0, 'same_strand': 1e-4, 'diff_strand': 1e-4}
    num_good_reads = 0
    num_observed_reads = 0
    for read in bam_obj:
        num_observed_reads += 1
        if num_observed_reads > max_num_reads_to_check:
            break
        if read.is_paired and read.mate_is_unmapped:
            continue
        if get_rd_posterior_prb(read) < 0.99:
            continue

        if not read.is_paired:
            paired_cnts['no_mate'] += 1
        elif read.is_reverse != read.mate_is_reverse:
            paired_cnts['diff_strand'] += 1
        else:
            paired_cnts['same_strand'] += 1

        # keep collecting reads until we observe enough
        num_good_reads += 1
        # NOTE(review): with the strict '>' the first check happens at
        # twice min_num_reads_to_check -- confirm that this is intended
        if (num_good_reads > min_num_reads_to_check
                and num_good_reads % min_num_reads_to_check == 0):
            params = _classify_read_pair_counts(paired_cnts, num_good_reads)
            if params is not None:
                return params

    # if we have run out of reads, see if we can build the statistic
    params = _classify_read_pair_counts(paired_cnts, num_good_reads)
    if params is not None:
        return params

    config.log_statement(
        "Paired Cnts:", paired_cnts, "Num Reads", num_observed_reads)
    raise ValueError(
        "Reads appear to be a mix of unpaired and paired reads that are "
        "both on the same and different strands. (%s)" % (paired_cnts,))
def fork_and_wait(n_proc, target, args=()):
    """Fork n_proc processes, run target(*args) in each, and wait to finish.

    With n_proc == 1 the target is simply called in this process.

    Otherwise each child installs handle_interrupt_signal for SIGINT, runs
    the target, and always leaves through os._exit: os.EX_OK on success,
    os.EX_SOFTWARE if anything went wrong. An uncaught exception in a child
    is logged with its traceback.

    Raises OSError when a child returns a non-zero status. On that error,
    or on KeyboardInterrupt, the children still being waited on are sent
    SIGHUP before the exception is re-raised.
    """
    if n_proc == 1:
        target(*args)
        return

    pids = []
    for _ in range(n_proc):
        pid = os.fork()
        if pid == 0:
            # child: the finally clause guarantees that it can never fall
            # back into the parent's code, whatever goes wrong
            exit_code = os.EX_SOFTWARE
            try:
                signal.signal(signal.SIGINT, handle_interrupt_signal)
                target(*args)
                exit_code = os.EX_OK
            except Exception:
                config.log_statement(
                    "Uncaught exception in subprocess\n"
                    + traceback.format_exc(), log=True)
            finally:
                os._exit(exit_code)
        pids.append(pid)

    try:
        while len(pids) > 0:
            ret_pid, error_code = os.wait()
            if ret_pid in pids:
                pids.remove(ret_pid)
            if error_code != os.EX_OK:
                raise OSError(
                    "Process '{}' returned error code '{}'".format(
                        ret_pid, error_code))
    except (KeyboardInterrupt, OSError):
        # best effort: a child may already be gone when we signal it
        for pid in pids:
            try:
                os.kill(pid, signal.SIGHUP)
            except OSError:
                pass
        raise

    return
def init(self, reverse_read_strand, pairs_are_opp_strand=None,
         reads_are_paired=True, ref_genes=None):
    """Initialize paired, stranded RAMPAGE reads and return self.

    pairs_are_opp_strand -- when None it is derived from
        read_pairs_are_on_same_strand(self).
    reverse_read_strand -- True or False, or 'auto'/None to have it
        determined by determine_read_strand_params, which needs ref_genes.
    reads_are_paired -- must be true.

    The resolved settings are passed on to Reads.init and recorded in
    self._init_kwargs.

    Raises ValueError when reverse_read_strand has to be determined but no
    reference genes were supplied.
    """
    assert self.is_indexed()
    assert reads_are_paired, "GRIT can not use unpaired RAMPAGE reads."
    reads_are_stranded = True

    # reads strandedness
    if pairs_are_opp_strand is None:
        pairs_are_opp_strand = (not read_pairs_are_on_same_strand(self))

    if reverse_read_strand in ('auto', None):
        if ref_genes in ([], None):
            raise ValueError(
                "Determining reverse_read_strand requires reference genes")
        reverse_read_strand_params = determine_read_strand_params(
            self, ref_genes, pairs_are_opp_strand, 'tss_exon', 300, 50)
        assert 'stranded' in reverse_read_strand_params
        if 'reverse_read_strand' in reverse_read_strand_params:
            reverse_read_strand = True
        elif 'dont_reverse_read_strand' in reverse_read_strand_params:
            reverse_read_strand = False
        else:
            assert False, "Could not determine reverse_read_strand"
        if config.VERBOSE:
            config.log_statement(
                "Set reverse_read_strand to '%s' for '%s'" % (
                    reverse_read_strand, self.filename), log=True)

    Reads.init(self, reads_are_paired, pairs_are_opp_strand,
               reads_are_stranded, reverse_read_strand)
    self._init_kwargs = {
        'reverse_read_strand': reverse_read_strand,
        'pairs_are_opp_strand': pairs_are_opp_strand,
        'reads_are_paired': reads_are_paired
    }
    return self
def init(self, reverse_read_strand=None, pairs_are_opp_strand=None,
         reads_are_paired=False, ref_genes=None):
    """Initialize unpaired, stranded CAGE reads and return self.

    reverse_read_strand -- True or False, or 'auto'/None to have it
        determined by determine_read_strand_params, which needs ref_genes.
    pairs_are_opp_strand, reads_are_paired -- accepted but always replaced
        by False.

    The resolved settings are passed on to Reads.init and recorded in
    self._init_kwargs.

    Raises ValueError when reverse_read_strand has to be determined but no
    reference genes were supplied.
    """
    assert reverse_read_strand in ('auto', None, True, False), \
        "Invalid option for reverse read strand"

    # NOTE(review): the caller's values are overridden before the check
    # below, so that assertion can never fire -- confirm whether passing
    # reads_are_paired=True was meant to be rejected
    reads_are_paired = False
    pairs_are_opp_strand = False
    assert not reads_are_paired, "GRIT can not use paired CAGE reads."

    # CAGE reads are always stranded
    reads_are_stranded = True

    if reverse_read_strand in ('auto', None):
        if ref_genes in ([], None):
            raise ValueError(
                "Determining reverse_read_strand requires reference genes")
        reverse_read_strand_params = determine_read_strand_params(
            self, ref_genes, pairs_are_opp_strand, 'tss_exon', 300, 50)
        assert 'stranded' in reverse_read_strand_params
        if 'reverse_read_strand' in reverse_read_strand_params:
            reverse_read_strand = True
        elif 'dont_reverse_read_strand' in reverse_read_strand_params:
            reverse_read_strand = False
        else:
            assert False, "Could not determine reverse_read_strand"
        if config.VERBOSE:
            config.log_statement(
                "Set reverse_read_strand to '%s' for '%s'" % (
                    reverse_read_strand, self.filename), log=True)

    Reads.init(self, reads_are_paired, pairs_are_opp_strand,
               reads_are_stranded, reverse_read_strand)
    self._init_kwargs = {
        'reverse_read_strand': reverse_read_strand,
        'pairs_are_opp_strand': pairs_are_opp_strand,
        'reads_are_paired': reads_are_paired
    }
    return self
def iter_paired_reads(self, chrm, strand, start, stop):
    """Yield (read1, read2) mate pairs found by self.iter_reads.

    The region is read twice: first to index every read that is not
    read 1 by query name, then to walk the read 1s and look up their
    mates. A read 1 without an indexed mate is skipped (and reported when
    DEBUG is set).
    """
    chrm = clean_chr_name(chrm)

    # index the pair 2 reads by name; a later read with the same name
    # replaces an earlier one
    mates_by_name = {}
    for read in self.iter_reads(chrm, strand, start, stop):
        if read.is_read1:
            continue
        mates_by_name[read.qname] = read

    # iterate through the read pairs
    for read1 in self.iter_reads(chrm, strand, start, stop):
        if not read1.is_read1:
            continue

        # if there is no mate, skip this read
        if read1.qname not in mates_by_name:
            if DEBUG:
                config.log_statement(
                    "No mate: ", read1.pos, read1.aend-1)
            continue
        read2 = mates_by_name[read1.qname]

        # a read with a single cigar entry must span exactly its alignment
        for mate in (read1, read2):
            assert mate.query is None \
                or (mate.alen == mate.aend - mate.pos) \
                or (len(mate.cigar) > 1)

        yield read1, read2

    return
def compare(ref_fname, gtf_fname, build_maps, build_maps_stats,
            out_prefix, num_threads=1):
    """Compare a reference annotation to another GTF annotation.

    Loads both files, builds the element statistics, clusters the
    overlapping genes and matches their transcripts (match_all_transcripts
    also receives build_maps and out_prefix).

    With out_prefix None the statistics are logged. Otherwise they are
    written to "<out_prefix>.stats", followed by the class counts when
    build_maps_stats is set. num_threads is accepted but not used here.
    """
    # load the gtf files
    ref_genes = load_gtf(ref_fname)
    t_genes = load_gtf(gtf_fname)

    # collect the stats for the elements of both annotations
    output_stats = OutputStats(ref_fname, gtf_fname)
    build_element_stats(ref_genes, t_genes, output_stats)
    if VERBOSE:
        config.log_statement("Finished building element stats")

    clustered_transcripts = cluster_overlapping_genes((ref_genes, t_genes))
    if VERBOSE:
        n_clusters = sum(
            len(val) for val in clustered_transcripts.itervalues())
        config.log_statement(
            "Finished clustering genes into %i clusters." % n_clusters)

    # calculate transcript overlaps and class match counts
    trans_class_cnts = match_all_transcripts(
        clustered_transcripts, build_maps, build_maps_stats,
        out_prefix, output_stats)

    # without a prefix there is no stats file to write
    if out_prefix is None:
        config.log_statement(str(output_stats) + '\n')
        return

    # prepare the formatted stats output
    sections = [str(output_stats)]
    if build_maps_stats:
        sections.append(
            make_class_cnts_string(trans_class_cnts, ref_fname, gtf_fname))
    report = "\n".join(sections) + '\n'

    with open(out_prefix + ".stats", "w") as stats_fp:
        stats_fp.write(report)

    if VERBOSE:
        config.log_statement(report)

    return
def init(self, reverse_read_strand, pairs_are_opp_strand=None,
         reads_are_paired=True, ref_genes=None):
    """Initialize paired, stranded RAMPAGE reads and return self.

    pairs_are_opp_strand -- when None it is derived from
        read_pairs_are_on_same_strand(self).
    reverse_read_strand -- True or False, or 'auto'/None to have it
        determined by determine_read_strand_params, which needs ref_genes.
    reads_are_paired -- must be true.

    The resolved settings are passed on to Reads.init and recorded in
    self._init_kwargs.

    Raises ValueError when reverse_read_strand has to be determined but no
    reference genes were supplied.
    """
    assert self.is_indexed()
    assert reads_are_paired, "GRIT can not use unpaired RAMPAGE reads."
    reads_are_stranded = True

    # reads strandedness
    if pairs_are_opp_strand is None:
        pairs_are_opp_strand = (not read_pairs_are_on_same_strand(self))

    if reverse_read_strand in ('auto', None):
        if ref_genes in ([], None):
            raise ValueError(
                "Determining reverse_read_strand requires reference genes")
        reverse_read_strand_params = determine_read_strand_params(
            self, ref_genes, pairs_are_opp_strand, 'tss_exon', 300, 50)
        assert 'stranded' in reverse_read_strand_params
        if 'reverse_read_strand' in reverse_read_strand_params:
            reverse_read_strand = True
        elif 'dont_reverse_read_strand' in reverse_read_strand_params:
            reverse_read_strand = False
        else:
            assert False, "Could not determine reverse_read_strand"
        if config.VERBOSE:
            config.log_statement(
                "Set reverse_read_strand to '%s' for '%s'" % (
                    reverse_read_strand, self.filename), log=True)

    Reads.init(self, reads_are_paired, pairs_are_opp_strand,
               reads_are_stranded, reverse_read_strand)
    self._init_kwargs = {
        'reverse_read_strand': reverse_read_strand,
        'pairs_are_opp_strand': pairs_are_opp_strand,
        'reads_are_paired': reads_are_paired
    }
    return self
def init(self, reverse_read_strand=None, pairs_are_opp_strand=None,
         reads_are_paired=False, ref_genes=None):
    """Initialize unpaired, stranded CAGE reads and return self.

    reverse_read_strand -- True or False, or 'auto'/None to have it
        determined by determine_read_strand_params, which needs ref_genes.
    pairs_are_opp_strand, reads_are_paired -- accepted but always replaced
        by False.

    The resolved settings are passed on to Reads.init and recorded in
    self._init_kwargs.

    Raises ValueError when reverse_read_strand has to be determined but no
    reference genes were supplied.
    """
    assert reverse_read_strand in ('auto', None, True, False), \
        "Invalid option for reverse read strand"

    # NOTE(review): the caller's values are overridden before the check
    # below, so that assertion can never fire -- confirm whether passing
    # reads_are_paired=True was meant to be rejected
    reads_are_paired = False
    pairs_are_opp_strand = False
    assert not reads_are_paired, "GRIT can not use paired CAGE reads."

    # CAGE reads are always stranded
    reads_are_stranded = True

    if reverse_read_strand in ('auto', None):
        if ref_genes in ([], None):
            raise ValueError(
                "Determining reverse_read_strand requires reference genes")
        reverse_read_strand_params = determine_read_strand_params(
            self, ref_genes, pairs_are_opp_strand, 'tss_exon', 300, 50)
        assert 'stranded' in reverse_read_strand_params
        if 'reverse_read_strand' in reverse_read_strand_params:
            reverse_read_strand = True
        elif 'dont_reverse_read_strand' in reverse_read_strand_params:
            reverse_read_strand = False
        else:
            assert False, "Could not determine reverse_read_strand"
        if config.VERBOSE:
            config.log_statement(
                "Set reverse_read_strand to '%s' for '%s'" % (
                    reverse_read_strand, self.filename), log=True)

    Reads.init(self, reads_are_paired, pairs_are_opp_strand,
               reads_are_stranded, reverse_read_strand)
    self._init_kwargs = {
        'reverse_read_strand': reverse_read_strand,
        'pairs_are_opp_strand': pairs_are_opp_strand,
        'reads_are_paired': reads_are_paired
    }
    return self
def load_junctions_worker(all_jns, all_jns_lock,
                          segments_queue, segments_queue_lock, reads):
    """Find junctions for segments taken from a shared queue.

    Pops (chrm, strand, start, stop) segments off segments_queue until it
    is empty, collecting the junctions that extract_junctions_in_region
    returns for each one under the key (chrm, strand). Once the queue is
    drained, the collected junctions are appended to the lists stored in
    all_jns while all_jns_lock is held.
    """
    jns = defaultdict(list)
    while len(segments_queue) > 0:
        # only the pop is done under the lock; re-check the length because
        # the queue may have been emptied since the test above
        with segments_queue_lock:
            if len(segments_queue) == 0:
                break
            chrm, strand, start, stop = segments_queue.pop()

        if config.VERBOSE:
            config.log_statement("Finding jns in '%s:%s:%i:%i'" %
                                 (chrm, strand, start, stop))
        jns[(chrm, strand)].extend(
            extract_junctions_in_region(
                reads, chrm, strand, start, stop, True))

    # finally, block until we can offload the remaining junctions
    with all_jns_lock:
        for key, region_jns in jns.items():
            all_jns_key = all_jns[key] if key in all_jns else []
            all_jns_key.extend(region_jns)
            # store the list back rather than relying on the in-place
            # extend (presumably so that a manager-backed dict sees it)
            all_jns[key] = all_jns_key

    del jns
    return
def merge_genes(all_sources_and_genes, ofp, sources_ofp):
    """Group overlapping genes and run the merge worker(s) over the groups.

    Writes a "track name=..." header to ofp, collects the groups produced
    by group_overlapping_genes into a manager-backed list, and then hands
    that list to merge_clustered_genes_worker -- in this process when
    config.NTHREADS == 1, otherwise in config.NTHREADS forked children.

    A forked child always terminates through os._exit, even when the worker
    raises, so it can never fall back into the caller's code. Children that
    exit with a non-zero status are reported through config.log_statement.
    """
    # write the gtf header
    ofp.write("track name=%s\n" % ofp.name)
    # flush before forking so that the children do not inherit (and
    # possibly re-emit) the buffered header
    ofp.flush()

    # group overlapping genes
    config.log_statement("Grouping genes", log=True)
    manager = multiprocessing.Manager()
    grpd_genes = manager.list()
    grpd_genes_lock = multiprocessing.Lock()
    for genes in group_overlapping_genes(all_sources_and_genes):
        grpd_genes.append(genes)

    # merge gene clustered transcripts
    config.log_statement("Merging transcripts", log=True)
    gene_id_cntr = multiprocessing.Value('i', 0)
    worker_args = (
        grpd_genes, grpd_genes_lock, ofp, sources_ofp, gene_id_cntr)

    if config.NTHREADS == 1:
        merge_clustered_genes_worker(*worker_args)
        return

    pids = []
    for _ in range(config.NTHREADS):
        pid = os.fork()
        if pid == 0:
            # child: whatever happens, leave through os._exit
            exit_code = 1
            try:
                merge_clustered_genes_worker(*worker_args)
                exit_code = 0
            except Exception:
                import traceback
                config.log_statement(
                    "Uncaught exception in merge worker\n"
                    + traceback.format_exc(), log=True)
            finally:
                os._exit(exit_code)
        pids.append(pid)

    failed_pids = []
    for pid in pids:
        _, status = os.waitpid(pid, 0)
        if status != 0:
            failed_pids.append(pid)
    if failed_pids:
        config.log_statement(
            "WARNING: merge worker(s) %s exited abnormally; "
            "the merged output may be incomplete."
            % ", ".join(str(pid) for pid in failed_pids), log=True)

    return
def load_junctions_worker(all_jns, all_jns_lock,
                          segments_queue, segments_queue_lock, reads):
    """Find junctions for segments taken from a shared queue.

    Pops (chrm, strand, start, stop) segments off segments_queue until it
    is empty, collecting the junctions that extract_junctions_in_region
    returns for each one under the key (chrm, strand). Once the queue is
    drained, the collected junctions are appended to the lists stored in
    all_jns while all_jns_lock is held. When config.VERBOSE is set an
    empty statement is logged at the end.
    """
    jns = defaultdict(list)
    while len(segments_queue) > 0:
        # only the pop is done under the lock; re-check the length because
        # the queue may have been emptied since the test above
        with segments_queue_lock:
            if len(segments_queue) == 0:
                break
            chrm, strand, start, stop = segments_queue.pop()

        if config.VERBOSE:
            config.log_statement("Finding jns in '%s:%s:%i:%i'" %
                                 (chrm, strand, start, stop))
        jns[(chrm, strand)].extend(
            extract_junctions_in_region(
                reads, chrm, strand, start, stop, True))

    # finally, block until we can offload the remaining junctions
    with all_jns_lock:
        for key, region_jns in jns.items():
            all_jns_key = all_jns[key] if key in all_jns else []
            all_jns_key.extend(region_jns)
            # store the list back rather than relying on the in-place
            # extend (presumably so that a manager-backed dict sees it)
            all_jns[key] = all_jns_key

    del jns
    if config.VERBOSE:
        config.log_statement("")
    return
def load_junctions_in_bam(reads, regions=None, nthreads=1):
    """Extract the junctions found in reads, keyed by (chrm, strand).

    regions -- iterable of (chrm, strand, start, stop). When None, both
        strands of every contig reported by get_contigs_and_lens are used.
    nthreads -- with 1 the regions are processed in this process and a
        defaultdict of lists is returned. Otherwise every region is cut
        into segments of at most 5000 positions which are processed by
        nthreads load_junctions_worker processes, and a plain dict of
        sorted lists is returned.

    Raises RuntimeError when all workers have exited but segments are
    still left in the queue.
    """
    if regions is None:
        regions = []
        for contig, contig_len in zip(*get_contigs_and_lens([reads, ])):
            for strand in '+-':
                regions.append((contig, strand, 0, contig_len))

    if nthreads == 1:
        jns = defaultdict(list)
        for chrm, strand, region_start, region_stop in regions:
            jns[(chrm, strand)].extend(
                extract_junctions_in_region(
                    reads, chrm, strand, region_start, region_stop))
        return jns

    from multiprocessing import Process, Manager

    # add all the regions to search for junctions in
    segments = []
    for chrm, strand, region_start, region_stop in regions:
        # a region shorter than nthreads must still get a positive segment
        # length, otherwise the loop below would never advance
        seg_len = max(1, min(5000, int(
            (region_stop - region_start + 1) // nthreads)))
        pos = region_start
        while pos < region_stop:
            # make sure the last segment doesn't extend past the stop
            segments.append(
                (chrm, strand, pos, min(pos + seg_len, region_stop)))
            pos += seg_len

    manager = Manager()
    all_jns = manager.dict()
    all_jns_lock = multiprocessing.Lock()
    segments_queue = manager.list(segments)
    segments_queue_lock = multiprocessing.Lock()

    ps = []
    for _ in range(nthreads):
        p = Process(target=load_junctions_worker,
                    args=(all_jns, all_jns_lock,
                          segments_queue, segments_queue_lock, reads))
        p.start()
        ps.append(p)

    if config.VERBOSE:
        config.log_statement("Waiting on jn finding children")
    # stop polling once no worker is left to empty the queue
    while len(segments_queue) > 0 and any(p.is_alive() for p in ps):
        if config.VERBOSE:
            config.log_statement(
                "Waiting on jn finding children (%i in queue)"
                % len(segments_queue))
        time.sleep(0.5)

    if config.VERBOSE:
        config.log_statement("Waiting on jn finding children (0 in queue)")
    for p in ps:
        p.join()

    if len(segments_queue) > 0:
        raise RuntimeError(
            "Junction finding workers exited with %i segments unprocessed"
            % len(segments_queue))

    if config.VERBOSE:
        config.log_statement("Merging junctions from threads")
    junctions = {}
    for key in all_jns.keys():
        junctions[key] = sorted(all_jns[key])
    return junctions
def load_junctions_in_bam(reads, regions=None, nthreads=1):
    """Extract the junctions found in reads, keyed by (chrm, strand).

    regions -- iterable of (chrm, strand, start, stop). When None, both
        strands of every contig reported by get_contigs_and_lens are used.
    nthreads -- with 1 the regions are processed in this process and a
        defaultdict of lists is returned. Otherwise every region is cut
        into segments of at most 5000 positions which are processed by
        nthreads load_junctions_worker processes, and a plain dict of
        sorted lists is returned.

    Raises RuntimeError when all workers have exited but segments are
    still left in the queue.
    """
    if regions is None:
        regions = []
        for contig, contig_len in zip(*get_contigs_and_lens([reads, ])):
            for strand in '+-':
                regions.append((contig, strand, 0, contig_len))

    if nthreads == 1:
        jns = defaultdict(list)
        for chrm, strand, region_start, region_stop in regions:
            jns[(chrm, strand)].extend(
                extract_junctions_in_region(
                    reads, chrm, strand, region_start, region_stop))
        return jns

    from multiprocessing import Process, Manager

    # add all the regions to search for junctions in
    segments = []
    for chrm, strand, region_start, region_stop in regions:
        # a region shorter than nthreads must still get a positive segment
        # length, otherwise the loop below would never advance
        seg_len = max(1, min(5000, int(
            (region_stop - region_start + 1) // nthreads)))
        pos = region_start
        while pos < region_stop:
            # make sure the last segment doesn't extend past the stop
            segments.append(
                (chrm, strand, pos, min(pos + seg_len, region_stop)))
            pos += seg_len

    manager = Manager()
    all_jns = manager.dict()
    all_jns_lock = multiprocessing.Lock()
    segments_queue = manager.list(segments)
    segments_queue_lock = multiprocessing.Lock()

    ps = []
    for _ in range(nthreads):
        p = Process(target=load_junctions_worker,
                    args=(all_jns, all_jns_lock,
                          segments_queue, segments_queue_lock, reads))
        p.start()
        ps.append(p)

    if config.VERBOSE:
        config.log_statement("Waiting on jn finding children")
    # stop polling once no worker is left to empty the queue
    while len(segments_queue) > 0 and any(p.is_alive() for p in ps):
        if config.VERBOSE:
            config.log_statement(
                "Waiting on jn finding children (%i in queue)"
                % len(segments_queue))
        time.sleep(0.5)

    if config.VERBOSE:
        config.log_statement("Waiting on jn finding children (0 in queue)")
    for p in ps:
        p.join()

    if len(segments_queue) > 0:
        raise RuntimeError(
            "Junction finding workers exited with %i segments unprocessed"
            % len(segments_queue))

    if config.VERBOSE:
        config.log_statement("Merging junctions from threads")
    junctions = {}
    for key in all_jns.keys():
        junctions[key] = sorted(all_jns[key])
    return junctions
def init(self, reverse_read_strand=None, reads_are_stranded=None,
         pairs_are_opp_strand=None, reads_are_paired=None, ref_genes=None):
    """Resolve the read pairing and strand settings, then initialize.

    Every argument left as 'auto' or None is inferred: pairing and mate
    orientation from determine_read_pair_params(self), strandedness and
    read strand reversal from determine_read_strand_params. Explicitly
    supplied values are kept. The resolved settings are passed to
    Reads.init and, with ref_genes, recorded in self._init_kwargs.

    Returns self.
    """
    assert self.is_indexed()

    unset = ('auto', None)
    read_pair_params = determine_read_pair_params(self)

    # set whether the reads are paired or not
    if reads_are_paired in unset:
        if 'paired' not in read_pair_params:
            assert 'unpaired' in read_pair_params
            reads_are_paired = False
        else:
            reads_are_paired = True

    # mate orientation is only meaningful for paired reads
    if pairs_are_opp_strand in unset:
        if reads_are_paired:
            pairs_are_opp_strand = ('same_strand' not in read_pair_params)
        else:
            pairs_are_opp_strand = None

    infer_stranded = reads_are_stranded in unset
    infer_reverse = reverse_read_strand in unset
    if infer_stranded or infer_reverse:
        read_strand_attributes = determine_read_strand_params(
            self, ref_genes, pairs_are_opp_strand,
            'internal_exon', 300, 50, 10)
        config.log_statement(
            "read_strand_attributes = %s" % (read_strand_attributes,),
            log=True)

        # the attributes must say one way or the other, even when the
        # caller supplied reads_are_stranded
        if 'unstranded' in read_strand_attributes:
            if infer_stranded:
                reads_are_stranded = False
        elif 'stranded' in read_strand_attributes:
            if infer_stranded:
                reads_are_stranded = True
        else:
            assert False

        if config.VERBOSE:
            config.log_statement(
                "Set reads_are_stranded to '%s' for '%s'" % (
                    reads_are_stranded, self.filename), log=True)

        if infer_reverse:
            # None is used when the reads are unstranded, or when the
            # attributes name neither direction
            if not reads_are_stranded:
                reverse_read_strand = None
            elif 'reverse_read_strand' in read_strand_attributes:
                reverse_read_strand = True
            elif 'dont_reverse_read_strand' in read_strand_attributes:
                reverse_read_strand = False
            else:
                reverse_read_strand = None

            if config.VERBOSE:
                config.log_statement(
                    "Set reverse_read_strand to '%s' for '%s'" % (
                        reverse_read_strand, self.filename), log=True)

    Reads.init(self, reads_are_paired, pairs_are_opp_strand,
               reads_are_stranded, reverse_read_strand)

    self._init_kwargs = {
        'reverse_read_strand': reverse_read_strand,
        'reads_are_stranded': reads_are_stranded,
        'pairs_are_opp_strand': pairs_are_opp_strand,
        'reads_are_paired': reads_are_paired,
        'ref_genes': ref_genes
    }

    return self
def parse_arguments():
    """Parse the command line options of the read simulator.

    Also sets the module level VERBOSE flag. For the CAGE assay the read
    length is forced to 28 and single-end reads are forced on. A malformed
    --fl-dist-norm value is logged and replaced by None.

    Returns the tuple (gtf, fl_dist_const, fl_dist_norm, fasta, quality,
    num_frags, single_end, full_fragment, read_len, out_prefix, assay),
    where gtf is the opened GTF file, fl_dist_norm is [mean, sd] or None,
    and full_fragment is always False.
    """
    global VERBOSE
    import argparse

    parser = argparse.ArgumentParser(
        description='Produce simulated reads in a perfecty aligned BAM file.')

    # gtf is the only required argument; FileType replaces the `file`
    # builtin so that open errors are reported by argparse
    parser.add_argument(
        'gtf', type=argparse.FileType('r'),
        help='GTF file from which to produce simulated reads ' +
        '(Note: Only the first trascript from this file will ' +
        'be simulated)')
    parser.add_argument(
        '--assay', choices=['RNAseq', 'RAMPAGE', 'CAGE', 'PASseq'],
        default='RNAseq', help='Which assay type to simulate from')

    # fragment length distribution options
    parser.add_argument(
        '--fl-dist-const', type=int, default=DEFAULT_FRAG_LENGTH,
        help='Constant length fragments. (default: ' +
        '%(default)s)')
    parser.add_argument(
        '--fl-dist-norm',
        help='Mean and standard deviation (format "mn:sd") ' +
        'used to create normally distributed fragment lengths.')

    # files providing quality and sequence information
    parser.add_argument(
        '--fasta', '-f',
        help='Fasta file from which to create reads ' +
        '(default: all sequences are "' + DEFAULT_BASE +
        '" * length of sequence)')
    parser.add_argument(
        '--quality', '-q',
        help='Flat file containing one FASTQ quality score ' +
        'per line, created with get_quals.sh. (default: ' +
        'quality strings are "' + str(DEFAULT_QUALITY_SCORE) +
        '" * length of sequence.)')

    # type and number of fragments requested
    parser.add_argument(
        '--num-frags', '-n', type=int, default=1000,
        help='Total number of fragments to create across all trascripts')
    parser.add_argument(
        '--single-end', action='store_true', default=False,
        help='Produce single-end reads.')
    parser.add_argument(
        '--paired-end', dest='single_end', action='store_false',
        help='Produce paired-end reads. (default)')
    # XXX not sure if this works
    #parser.add_argument(
    #    '--full-fragment', action='store_true', default=False,
    #    help='Produce reads spanning the entire fragment.')
    parser.add_argument(
        '--read-len', '-r', type=int, default=DEFAULT_READ_LENGTH,
        help='Length of reads to produce in base pairs ' +
        '(default: %(default)s)')

    # output options
    parser.add_argument(
        '--out_prefix', '-o', default='simulated_reads',
        help='Prefix for output FASTQ/BAM file ' +
        '(default: %(default)s)')
    parser.add_argument(
        '--verbose', '-v', default=False, action='store_true',
        help='Print status information.')

    args = parser.parse_args()

    # set to false, but we may want to bring this option back
    args.full_fragment = False

    VERBOSE = args.verbose

    if args.assay == 'CAGE':
        args.read_len = 28
        args.single_end = True

    # parse normal distribution argument
    if args.fl_dist_norm:
        try:
            # a wrong number of fields also raises ValueError
            mean, sd = args.fl_dist_norm.split(':')
            args.fl_dist_norm = [int(mean), int(sd)]
        except ValueError:
            args.fl_dist_norm = None
            config.log_statement(
                "WARNING: User input mean and sd are not formatted correctly.\n" +
                "\tUsing default values.\n")

    return (args.gtf, args.fl_dist_const, args.fl_dist_norm,
            args.fasta, args.quality, args.num_frags,
            args.single_end, args.full_fragment,
            args.read_len, args.out_prefix, args.assay)
num_unique_reads = 0.0 #config.log_statement("Finding reads in %s" % str((chrm, strand, r_start, r_stop))) for n_obs_reads, (read, rd_strand) in enumerate(reads.iter_reads_and_strand( chrm, r_start, r_stop+1)): # break if we've surpassed the read if read.pos > r_stop: break # -probability that the read originated in this location # if we can't find it, assume that it's uniform over alternate # mappings. If we can't find that, then assume that it's unique map_prb = get_rd_posterior_prb(read) if n_obs_reads > 0 and n_obs_reads%100000 == 0: config.log_statement("Processed %i reads in %s" % ( n_obs_reads, str((chrm, strand, r_start, r_stop)))) for jn in junctions.iter_jns_in_read(read): # skip jns whose start does not overlap this region, we subtract one # because the start refers to the first covered intron base, and # we are talking about covered regions if jn[0]-1 < r_start or jn[0]-1 > r_stop: continue jn_reads[rd_strand][jn] += 1 # if this is an anti-strand read, then we only care about the jns if strand != '.' and rd_strand != strand: continue # extract the information we care about: # -strand, alread have this # -regions covered cov_regions = list(iter_coverage_intervals_for_read(read)) # -read length read_len = read.inferred_length
reg_len = r_stop-r_start+1 jn_reads = {'+': defaultdict(int), '-': defaultdict(int)} cov = { '+': numpy.zeros(reg_len, dtype=float), '-': numpy.zeros(reg_len, dtype=float) } pair1_reads = defaultdict(list) pair2_reads = defaultdict(list) num_unique_reads = 0.0 config.log_statement("Finding reads in %s" % str((chrm, strand, r_start, r_stop))) for n_obs_reads, (read, rd_strand) in enumerate(reads.iter_reads_and_strand( chrm, r_start, r_stop+1)): # break if we've surpassed the read if read.pos > r_stop: break # -probability that the read originated in this location # if we can't find it, assume that it's uniform over alternate # mappings. If we can't find that, then assume that it's unique map_prb = get_rd_posterior_prb(read) if n_obs_reads > 0 and n_obs_reads%100000 == 0: config.log_statement("Processed %i reads in %s" % ( n_obs_reads, str((chrm, strand, r_start, r_stop)))) for jn in junctions.iter_jns_in_read(read):
def init(self, reverse_read_strand=None, reads_are_stranded=None,
         pairs_are_opp_strand=None, reads_are_paired=None, ref_genes=None):
    """Resolve the read pairing and strand settings, then initialize.

    Every argument left as 'auto' or None is inferred: pairing and mate
    orientation from determine_read_pair_params(self), strandedness and
    read strand reversal from determine_read_strand_params. Explicitly
    supplied values are kept. The resolved settings are passed to
    Reads.init and recorded in self._init_kwargs.

    Returns self.
    """
    assert self.is_indexed()

    unset = ('auto', None)
    read_pair_params = determine_read_pair_params(self)

    # set whether the reads are paired or not
    if reads_are_paired in unset:
        if 'paired' not in read_pair_params:
            assert 'unpaired' in read_pair_params
            reads_are_paired = False
        else:
            reads_are_paired = True

    # mate orientation is only meaningful for paired reads
    if pairs_are_opp_strand in unset:
        if reads_are_paired:
            pairs_are_opp_strand = ('same_strand' not in read_pair_params)
        else:
            pairs_are_opp_strand = None

    infer_stranded = reads_are_stranded in unset
    infer_reverse = reverse_read_strand in unset
    if infer_stranded or infer_reverse:
        read_strand_attributes = determine_read_strand_params(
            self, ref_genes, pairs_are_opp_strand,
            'internal_exon', 300, 50)

        # the attributes must say one way or the other, even when the
        # caller supplied reads_are_stranded
        if 'unstranded' in read_strand_attributes:
            if infer_stranded:
                reads_are_stranded = False
        elif 'stranded' in read_strand_attributes:
            if infer_stranded:
                reads_are_stranded = True
        else:
            assert False

        if config.VERBOSE:
            config.log_statement(
                "Set reads_are_stranded to '%s' for '%s'" % (
                    reads_are_stranded, self.filename), log=True)

        if infer_reverse:
            # stranded reads must come with a direction
            if not reads_are_stranded:
                reverse_read_strand = None
            elif 'reverse_read_strand' in read_strand_attributes:
                reverse_read_strand = True
            elif 'dont_reverse_read_strand' in read_strand_attributes:
                reverse_read_strand = False
            else:
                assert False

            if config.VERBOSE:
                config.log_statement(
                    "Set reverse_read_strand to '%s' for '%s'" % (
                        reverse_read_strand, self.filename), log=True)

    Reads.init(self, reads_are_paired, pairs_are_opp_strand,
               reads_are_stranded, reverse_read_strand)

    self._init_kwargs = {
        'reverse_read_strand': reverse_read_strand,
        'reads_are_stranded': reads_are_stranded,
        'pairs_are_opp_strand': pairs_are_opp_strand,
        'reads_are_paired': reads_are_paired
    }

    return self