def filter_encompassing_chimeras(input_file, output_file, gene_file,
                                 max_multimap=1,
                                 multimap_cov_ratio=0.10,
                                 max_isize=1000,
                                 strand_pval=0.01,
                                 keep_overlap=False):
    """Filter chimeras in `input_file` and write the survivors to `output_file`.

    The work is done in two passes:

    1. Every chimera parsed from `input_file` is run through the
       multimapping, insert size, (optionally) overlap and strand balance
       filters; those that pass all of them are written to a temporary
       ``.bedpe`` file created next to `output_file`.
    2. Junction permiscuity statistics are collected over the temporary
       file, then each surviving chimera gets its ``mate5p.frac`` and
       ``mate3p.frac`` attributes set from `calc_permiscuity` before being
       written to `output_file`.

    Parameters
    ----------
    input_file : path of the chimera file to read.
    output_file : path of the filtered file to write; its directory also
        hosts the temporary file.
    gene_file : path of the gene file handed to `build_gene_to_genome_map`.
    max_multimap : passed to `filter_multimapping` (logged as "must have a
        read with <= N multimaps").
    multimap_cov_ratio : passed to `filter_multimapping` (logged as
        "coverage to reads ratio >= R").
    max_isize : passed to `filter_insert_size` (logged as "insert size < N").
    strand_pval : passed to `filter_strand_balance` (logged as "strand
        balance p-value > P").
    keep_overlap : when false (the default) chimeras must additionally pass
        `filter_overlapping`.

    All files are opened through ``with`` so they are closed even when a
    filter or parser raises, and the temporary file is removed in a
    ``finally`` clause so it is not left behind on failure.
    """
    logging.debug("Filtering chimeras")
    logging.debug("Must have a read with <= %d multimaps", max_multimap)
    logging.debug("Coverage to reads ratio >= %f", multimap_cov_ratio)
    logging.debug("Insert size < %d", max_isize)
    logging.debug("Strand balance p-value > %f", strand_pval)
    tmpfile1 = make_temp(base_dir=os.path.dirname(output_file),
                         suffix='.bedpe')
    try:
        # first perform basic filtering
        with open(input_file) as infh:
            with open(tmpfile1, "w") as outfh:
                for c in Chimera.parse(infh):
                    # 'and' short-circuits, so later filters only run on
                    # chimeras that passed every earlier one
                    res = filter_multimapping(
                        c,
                        max_multimap=max_multimap,
                        multimap_cov_ratio=multimap_cov_ratio)
                    res = res and filter_insert_size(c, max_isize)
                    if not keep_overlap:
                        res = res and filter_overlapping(c)
                    res = res and filter_strand_balance(c, strand_pval)
                    if res:
                        outfh.write('\t'.join(map(str, c.to_list())) + '\n')
        logging.debug("Building gene/genome index")
        # NOTE(review): assumes build_gene_to_genome_map consumes the file
        # before returning, so the handle can be closed right after the call
        with open(gene_file) as genefh:
            ggmap = build_gene_to_genome_map(genefh)
        logging.debug("Finding junction permiscuity")
        juncmap5p, juncmap3p = collect_permiscuity_stats(tmpfile1, ggmap)
        # second pass: annotate each surviving chimera and emit it
        with open(tmpfile1) as infh:
            with open(output_file, "w") as outfh:
                for c in Chimera.parse(infh):
                    frac5p, frac3p = calc_permiscuity(c, juncmap5p,
                                                      juncmap3p, ggmap)
                    c.mate5p.frac = frac5p
                    c.mate3p.frac = frac3p
                    outfh.write('\t'.join(map(str, c.to_list())) + '\n')
    finally:
        # delete tmp files (guarded so a failure before the file exists
        # does not mask the original exception)
        if os.path.exists(tmpfile1):
            os.remove(tmpfile1)
def filter_spanning_chimeras(input_file, output_file, gene_file,
                             mate_pval, max_isize):
    """Filter spanning chimeras and keep the highest coverage isoforms.

    Chimeras parsed from `input_file` that pass `filter_insert_size` are
    written to a temporary ``.bedpe`` file created in the directory of
    `output_file`.  `choose_highest_coverage_chimeras` is then run over
    that temporary file and every chimera it yields is written to
    `output_file`, one tab-separated line each.

    Parameters
    ----------
    input_file : path of the spanning chimera file to read.
    output_file : path of the file to write.
    gene_file : path of the gene file handed to `build_gene_to_genome_map`.
    mate_pval : accepted for interface compatibility; not used by this
        function.
    max_isize : passed to `filter_insert_size`.

    NOTE(review): an identical definition of this function follows
    immediately after this one in the file; Python binds the name to the
    later definition, so this copy is shadowed.  Consider removing one.
    """
    tmpfile = make_temp(os.path.dirname(output_file), suffix='.bedpe')
    try:
        # apply more filters
        with open(input_file) as infh:
            with open(tmpfile, "w") as outfh:
                for c in SpanningChimera.parse(infh):
                    if filter_insert_size(c, max_isize):
                        outfh.write('\t'.join(map(str, c.to_list())) + '\n')
        # choose best isoform from remaining isoforms
        logging.debug("Building gene/genome index")
        # NOTE(review): assumes build_gene_to_genome_map consumes the file
        # before returning, so the handle can be closed right after the call
        with open(gene_file) as genefh:
            ggmap = build_gene_to_genome_map(genefh)
        logging.debug("Choosing highest coverage chimeras")
        with open(output_file, "w") as outfh:
            for c in choose_highest_coverage_chimeras(tmpfile, ggmap):
                outfh.write('\t'.join(map(str, c.to_list())) + '\n')
    finally:
        # remove temporary file (guarded so a failure before the file
        # exists does not mask the original exception)
        if os.path.exists(tmpfile):
            os.remove(tmpfile)
def filter_spanning_chimeras(input_file, output_file, gene_file,
                             mate_pval, max_isize):
    """Filter spanning chimeras and keep the highest coverage isoforms.

    Chimeras parsed from `input_file` that pass `filter_insert_size` are
    written to a temporary ``.bedpe`` file created in the directory of
    `output_file`.  `choose_highest_coverage_chimeras` is then run over
    that temporary file and every chimera it yields is written to
    `output_file`, one tab-separated line each.

    Parameters
    ----------
    input_file : path of the spanning chimera file to read.
    output_file : path of the file to write.
    gene_file : path of the gene file handed to `build_gene_to_genome_map`.
    mate_pval : accepted for interface compatibility; not used by this
        function.
    max_isize : passed to `filter_insert_size`.

    NOTE(review): an identical definition of this function appears
    immediately before this one in the file; being the later definition,
    this is the one the name resolves to.  Consider removing the
    duplicate.
    """
    tmpfile = make_temp(os.path.dirname(output_file), suffix='.bedpe')
    try:
        # apply more filters
        with open(input_file) as src:
            with open(tmpfile, "w") as dst:
                for c in SpanningChimera.parse(src):
                    if filter_insert_size(c, max_isize):
                        dst.write('\t'.join(map(str, c.to_list())) + '\n')
        # choose best isoform from remaining isoforms
        logging.debug("Building gene/genome index")
        # NOTE(review): assumes build_gene_to_genome_map consumes the file
        # before returning, so the handle can be closed right after the call
        with open(gene_file) as genefh:
            ggmap = build_gene_to_genome_map(genefh)
        logging.debug("Choosing highest coverage chimeras")
        with open(output_file, "w") as dst:
            for c in choose_highest_coverage_chimeras(tmpfile, ggmap):
                dst.write('\t'.join(map(str, c.to_list())) + '\n')
    finally:
        # remove temporary file (guarded so a failure before the file
        # exists does not mask the original exception)
        if os.path.exists(tmpfile):
            os.remove(tmpfile)