def nominate_spanning_reads(discordant_reads_fh, chimeras_fh, fastq_fh):
    """Write candidate spanning reads for chimera junctions to *fastq_fh*.

    The chimeras parsed from *chimeras_fh* are indexed by transcript name:
    the 5' index collects each ``mate5p.end`` and the 3' index collects each
    ``mate3p.start``.  Discordant fragments parsed from *discordant_reads_fh*
    are then checked against that index with ``check_fragment``.

    Fragments are processed in runs of consecutive records sharing the same
    ``qname``.  For each run, the first non-None value returned for read 1
    and for read 2 is kept, and both are written (one per line) when the run
    ends.  Fragments whose ``discordant_type.is_genome`` is true are skipped.

    NOTE(review): a function with this same name appears to be defined a
    second time later in this file; if both are at module level the later
    definition is the one that stays bound.

    :param discordant_reads_fh: input handed to ``parse_discordant_reads``
    :param chimeras_fh: input handed to ``Chimera.parse``
    :param fastq_fh: writable file-like object receiving the nominated reads
    """
    # build index of chimera candidates
    logging.info("Indexing chimera candidates")
    tx5p = collections.defaultdict(list)
    tx3p = collections.defaultdict(list)
    for chimera in Chimera.parse(chimeras_fh):
        tx5p[chimera.mate5p.tx_name].append(chimera.mate5p.end)
        tx3p[chimera.mate3p.tx_name].append(chimera.mate3p.start)
    # parse discordant reads
    logging.info("Nominating spanning reads")
    read1, read2 = None, None
    prev_qname = None
    for frag in parse_discordant_reads(discordant_reads_fh):
        if frag.discordant_type.is_genome:
            continue
        qname = frag.qname
        if prev_qname is not None and qname != prev_qname:
            # a new run of fragments starts: flush what the previous run found
            _write_spanning_reads(fastq_fh, read1, read2)
            read1, read2 = None, None
        # skip if reads already found
        if read1 is not None and read2 is not None:
            continue
        # update read fastq, keeping the first hit found for each mate
        r1, r2 = check_fragment(frag, tx5p, tx3p)
        if read1 is None:
            read1 = r1
        if read2 is None:
            read2 = r2
        prev_qname = qname
    # flush the final run
    _write_spanning_reads(fastq_fh, read1, read2)


def _write_spanning_reads(fastq_fh, read1, read2):
    """Write each of *read1* / *read2* that is not None to *fastq_fh*."""
    for read in (read1, read2):
        if read is not None:
            # same output as the Python 2 statement ``print >>fastq_fh, read``
            fastq_fh.write("%s\n" % (read,))
def filter_encompassing_chimeras(input_file, output_file, gene_file,
                                 max_multimap=1,
                                 multimap_cov_ratio=0.10,
                                 max_isize=1000,
                                 strand_pval=0.01,
                                 keep_overlap=False):
    """Filter the chimeras in *input_file* and write survivors to *output_file*.

    The work is done in two passes:

    1. Every chimera parsed from *input_file* is run through
       ``filter_multimapping``, ``filter_insert_size``, ``filter_overlapping``
       (unless *keep_overlap* is true) and ``filter_strand_balance``, in that
       order, stopping at the first filter that rejects it.  Chimeras that
       pass are written to a temporary ``.bedpe`` file created next to
       *output_file*.
    2. Junction "permiscuity" statistics are collected from the temporary
       file, and each surviving chimera is rewritten to *output_file* with
       ``mate5p.frac`` / ``mate3p.frac`` set from ``calc_permiscuity``.

    Each output record is the tab-joined ``str`` of ``chimera.to_list()``.
    The temporary file is removed even when a pass raises.

    :param input_file: path of the chimera file to filter
    :param output_file: path of the filtered result (overwritten)
    :param gene_file: path passed (as an open file) to
        ``build_gene_to_genome_map``
    :param max_multimap: forwarded to ``filter_multimapping``
    :param multimap_cov_ratio: forwarded to ``filter_multimapping``
    :param max_isize: forwarded to ``filter_insert_size``
    :param strand_pval: forwarded to ``filter_strand_balance``
    :param keep_overlap: when true, ``filter_overlapping`` is not applied
    """
    logging.debug("Filtering chimeras")
    logging.debug("Must have a read with <= %d multimaps", max_multimap)
    logging.debug("Coverage to reads ratio >= %f", multimap_cov_ratio)
    logging.debug("Insert size < %d", max_isize)
    logging.debug("Strand balance p-value > %f", strand_pval)
    tmpfile1 = make_temp(base_dir=os.path.dirname(output_file),
                         suffix='.bedpe')
    try:
        # first perform basic filtering
        with open(input_file) as in_fh:
            with open(tmpfile1, "w") as tmp_fh:
                for c in Chimera.parse(in_fh):
                    if not filter_multimapping(
                            c,
                            max_multimap=max_multimap,
                            multimap_cov_ratio=multimap_cov_ratio):
                        continue
                    if not filter_insert_size(c, max_isize):
                        continue
                    if not keep_overlap and not filter_overlapping(c):
                        continue
                    if not filter_strand_balance(c, strand_pval):
                        continue
                    tmp_fh.write('\t'.join(map(str, c.to_list())) + '\n')
        logging.debug("Building gene/genome index")
        # assumes the map is fully built before the call returns, so the
        # gene file can be closed right after -- TODO confirm
        with open(gene_file) as gene_fh:
            ggmap = build_gene_to_genome_map(gene_fh)
        logging.debug("Finding junction permiscuity")
        juncmap5p, juncmap3p = collect_permiscuity_stats(tmpfile1, ggmap)
        # second pass: annotate the survivors with their junction fractions
        with open(tmpfile1) as tmp_fh:
            with open(output_file, "w") as out_fh:
                for c in Chimera.parse(tmp_fh):
                    frac5p, frac3p = calc_permiscuity(c, juncmap5p,
                                                      juncmap3p, ggmap)
                    c.mate5p.frac = frac5p
                    c.mate3p.frac = frac3p
                    out_fh.write('\t'.join(map(str, c.to_list())) + '\n')
    finally:
        # delete tmp files, including when one of the passes failed
        if os.path.exists(tmpfile1):
            os.remove(tmpfile1)
def nominate_spanning_reads(discordant_reads_fh, chimeras_fh, fastq_fh):
    """Write candidate spanning reads for chimera junctions to *fastq_fh*.

    The chimeras parsed from *chimeras_fh* are indexed by transcript name:
    the 5' index collects each ``mate5p.end`` and the 3' index collects each
    ``mate3p.start``.  Discordant fragments parsed from *discordant_reads_fh*
    are then checked against that index with ``check_fragment``.

    Fragments are processed in runs of consecutive records sharing the same
    ``qname``.  For each run, the first non-None value returned for read 1
    and for read 2 is kept, and both are written (one per line) when the run
    ends.  Fragments whose ``discordant_type.is_genome`` is true are skipped.

    NOTE(review): a function with this same name appears to be defined
    earlier in this file as well; if both are at module level this later
    definition replaces the earlier one.

    :param discordant_reads_fh: input handed to ``parse_discordant_reads``
    :param chimeras_fh: input handed to ``Chimera.parse``
    :param fastq_fh: writable file-like object receiving the nominated reads
    """
    # build index of chimera candidates
    logging.info("Indexing chimera candidates")
    tx5p = collections.defaultdict(list)
    tx3p = collections.defaultdict(list)
    for chimera in Chimera.parse(chimeras_fh):
        tx5p[chimera.mate5p.tx_name].append(chimera.mate5p.end)
        tx3p[chimera.mate3p.tx_name].append(chimera.mate3p.start)
    # parse discordant reads
    logging.info("Nominating spanning reads")
    read1, read2 = None, None
    prev_qname = None
    for frag in parse_discordant_reads(discordant_reads_fh):
        if frag.discordant_type.is_genome:
            continue
        qname = frag.qname
        if prev_qname is not None and qname != prev_qname:
            # a new run of fragments starts: flush what the previous run found
            _write_spanning_reads(fastq_fh, read1, read2)
            read1, read2 = None, None
        # skip if reads already found
        if read1 is not None and read2 is not None:
            continue
        # update read fastq, keeping the first hit found for each mate
        r1, r2 = check_fragment(frag, tx5p, tx3p)
        if read1 is None:
            read1 = r1
        if read2 is None:
            read2 = r2
        prev_qname = qname
    # flush the final run
    _write_spanning_reads(fastq_fh, read1, read2)


def _write_spanning_reads(fastq_fh, read1, read2):
    """Write each of *read1* / *read2* that is not None to *fastq_fh*."""
    for read in (read1, read2):
        if read is not None:
            # same output as the Python 2 statement ``print >>fastq_fh, read``
            fastq_fh.write("%s\n" % (read,))
def collect_permiscuity_stats(input_file, ggmap):
    """Build the 5' and 3' junction permiscuity maps for a chimera file.

    Opens *input_file*, parses it with ``Chimera.parse`` and hands the
    resulting chimeras, together with *ggmap*, to
    ``build_junc_permiscuity_map``.  The file is closed once the maps have
    been returned.

    :param input_file: path of the chimera file to read
    :param ggmap: passed through unchanged to ``build_junc_permiscuity_map``
    :returns: the ``(juncmap5p, juncmap3p)`` pair produced by
        ``build_junc_permiscuity_map``
    """
    # break name into 5'/3' genes linked in a dictionary
    logging.debug("Building chimera permiscuity map")
    with open(input_file) as chimeras_fh:
        juncmap5p, juncmap3p = build_junc_permiscuity_map(
            Chimera.parse(chimeras_fh), ggmap)
    return juncmap5p, juncmap3p