def filter_chimeras(input_file, output_file, index_dir, bam_file,
                    weighted_unique_frags, median_isize, max_isize,
                    isoform_fraction, false_pos_file):
    """Filter candidate chimeras and keep the highest coverage isoforms.

    The chimeras parsed from ``input_file`` go through four filters in
    order, stopping at the first one that rejects the candidate:

    1. ``filter_weighted_frags(c, weighted_unique_frags)``
    2. ``filter_inner_dist(c, max_isize)``
    3. the (5' transcript, 5' end, 3' transcript, 3' start) key must not
       appear in the false positive set
    4. ``filter_chimeric_isoform_fraction(c, isoform_fraction,
       median_isize, bamfh)``

    Survivors are written to a temporary file next to ``output_file``.
    ``get_highest_coverage_isoforms`` then picks the chimera names to keep,
    and only those records are copied to ``output_file``.

    NOTE(review): more than one definition named ``filter_chimeras`` is
    visible in this file; only the last one is bound at import time.

    Parameters:
        input_file: path of the chimera file read by ``Chimera.parse``.
        output_file: path of the final tab-delimited output; its directory
            also hosts the temporary file.
        index_dir: directory containing ``config.GENE_FEATURE_FILE``.
        bam_file: path of the BAM file opened with ``pysam.Samfile`` and
            passed to the isoform fraction filter.
        weighted_unique_frags: threshold passed to
            ``filter_weighted_frags`` (logged as a float).
        median_isize: median insert size passed to the isoform fraction
            filter (logged as an integer).
        max_isize: maximum insert size passed to ``filter_inner_dist``
            (logged as an integer).
        isoform_fraction: threshold passed to the isoform fraction filter
            (logged as a float).
        false_pos_file: path of a false positive chimera list, or ``None``
            or an empty string to skip that filter.

    Returns:
        ``config.JOB_SUCCESS``.
    """
    logging.debug("Filtering Parameters")
    logging.debug("\tweighted unique fragments: %f", weighted_unique_frags)
    logging.debug("\tmedian insert size: %d", median_isize)
    logging.debug("\tmax insert size allowed: %d", max_isize)
    logging.debug("\tfraction of wild-type isoform: %f", isoform_fraction)
    logging.debug("\tfalse positive chimeras file: %s", false_pos_file)
    # get false positive chimera list; a truthiness test covers both None
    # and "" without comparing identity against a string literal
    if false_pos_file:
        logging.debug("Parsing false positive chimeras")
        false_pos_pairs = read_false_pos_file(false_pos_file)
    else:
        false_pos_pairs = set()
    tmp_file = make_temp(os.path.dirname(output_file), suffix=".txt")
    try:
        num_chimeras = 0
        num_filtered_chimeras = 0
        # open BAM file for checking wild-type isoform
        bamfh = pysam.Samfile(bam_file, "rb")
        try:
            logging.debug("Checking chimeras")
            with open(input_file) as infh:
                with open(tmp_file, "w") as tmpfh:
                    for c in Chimera.parse(infh):
                        num_chimeras += 1
                        if not filter_weighted_frags(c, weighted_unique_frags):
                            continue
                        if not filter_inner_dist(c, max_isize):
                            continue
                        false_pos_key = (c.partner5p.tx_name,
                                         c.partner5p.end,
                                         c.partner3p.tx_name,
                                         c.partner3p.start)
                        if false_pos_key in false_pos_pairs:
                            continue
                        # BAM-backed check runs last, only for candidates
                        # that survived every other filter
                        if not filter_chimeric_isoform_fraction(
                                c, isoform_fraction, median_isize, bamfh):
                            continue
                        tmpfh.write('\t'.join(map(str, c.to_list())) + '\n')
                        num_filtered_chimeras += 1
        finally:
            # close the BAM handle even when a filter raises
            bamfh.close()
        logging.debug("Total chimeras: %d", num_chimeras)
        logging.debug("Filtered chimeras: %d", num_filtered_chimeras)
        # cleanup memory for false positive chimeras
        del false_pos_pairs
        # find highest coverage chimeras among isoforms
        gene_file = os.path.join(index_dir, config.GENE_FEATURE_FILE)
        kept_chimeras = get_highest_coverage_isoforms(tmp_file, gene_file)
        num_filtered_chimeras = 0
        with open(output_file, "w") as outfh:
            with open(tmp_file) as tmpfh:
                for c in Chimera.parse(tmpfh):
                    if c.name in kept_chimeras:
                        num_filtered_chimeras += 1
                        outfh.write('\t'.join(map(str, c.to_list())) + '\n')
        logging.debug("\tAfter choosing best isoform: %d",
                      num_filtered_chimeras)
    finally:
        # remove the temporary file on success and on failure alike
        if os.path.exists(tmp_file):
            os.remove(tmp_file)
    return config.JOB_SUCCESS
def filter_spanning_chimeras(input_file, output_file, gene_file,
                             mate_pval, max_isize):
    """Filter spanning chimeras by insert size, then keep the best isoforms.

    Records parsed from ``input_file`` that pass
    ``filter_insert_size(c, max_isize)`` are written to a temporary
    ``.bedpe`` file next to ``output_file``. A gene-to-genome map is built
    from ``gene_file``, and every chimera yielded by
    ``choose_highest_coverage_chimeras`` for the temporary file is written
    to ``output_file`` as one tab-delimited line.

    NOTE(review): more than one definition named
    ``filter_spanning_chimeras`` is visible in this file; only the last one
    is bound at import time.

    Parameters:
        input_file: path of the file read by ``SpanningChimera.parse``.
        output_file: path of the final output; its directory also hosts
            the temporary file.
        gene_file: path of the file handed to
            ``build_gene_to_genome_map``.
        mate_pval: not used by this function; kept so existing callers
            keep working.
        max_isize: threshold passed to ``filter_insert_size``.

    Returns:
        None.
    """
    tmpfile = make_temp(os.path.dirname(output_file), suffix='.bedpe')
    try:
        # apply more filters
        with open(input_file) as infh:
            with open(tmpfile, "w") as tmpfh:
                for c in SpanningChimera.parse(infh):
                    if filter_insert_size(c, max_isize):
                        tmpfh.write('\t'.join(map(str, c.to_list())) + '\n')
        # choose best isoform from remaining isoforms
        logging.debug("Building gene/genome index")
        # the gene file stays open until the map has been fully used, so
        # this is safe whether or not the builder reads it eagerly
        with open(gene_file) as genefh:
            ggmap = build_gene_to_genome_map(genefh)
            logging.debug("Choosing highest coverage chimeras")
            with open(output_file, "w") as outfh:
                for c in choose_highest_coverage_chimeras(tmpfile, ggmap):
                    outfh.write('\t'.join(map(str, c.to_list())) + '\n')
    finally:
        # remove temporary file, also when a step above failed
        if os.path.exists(tmpfile):
            os.remove(tmpfile)
def filter_spanning_chimeras(input_file, output_file, gene_file,
                             mate_pval, max_isize):
    """Filter spanning chimeras by insert size, then keep the best isoforms.

    Records parsed from ``input_file`` that pass
    ``filter_insert_size(c, max_isize)`` are written to a temporary
    ``.bedpe`` file next to ``output_file``. A gene-to-genome map is built
    from ``gene_file``, and every chimera yielded by
    ``choose_highest_coverage_chimeras`` for the temporary file is written
    to ``output_file`` as one tab-delimited line.

    NOTE(review): more than one definition named
    ``filter_spanning_chimeras`` is visible in this file; only the last one
    is bound at import time.

    Parameters:
        input_file: path of the file read by ``SpanningChimera.parse``.
        output_file: path of the final output; its directory also hosts
            the temporary file.
        gene_file: path of the file handed to
            ``build_gene_to_genome_map``.
        mate_pval: not used by this function; kept so existing callers
            keep working.
        max_isize: threshold passed to ``filter_insert_size``.

    Returns:
        None.
    """
    tmpfile = make_temp(os.path.dirname(output_file), suffix='.bedpe')
    try:
        # apply more filters
        with open(input_file) as infh:
            with open(tmpfile, "w") as tmpfh:
                for c in SpanningChimera.parse(infh):
                    if filter_insert_size(c, max_isize):
                        tmpfh.write('\t'.join(map(str, c.to_list())) + '\n')
        # choose best isoform from remaining isoforms
        logging.debug("Building gene/genome index")
        # the gene file stays open until the map has been fully used, so
        # this is safe whether or not the builder reads it eagerly
        with open(gene_file) as genefh:
            ggmap = build_gene_to_genome_map(genefh)
            logging.debug("Choosing highest coverage chimeras")
            with open(output_file, "w") as outfh:
                for c in choose_highest_coverage_chimeras(tmpfile, ggmap):
                    outfh.write('\t'.join(map(str, c.to_list())) + '\n')
    finally:
        # remove temporary file, also when a step above failed
        if os.path.exists(tmpfile):
            os.remove(tmpfile)
def filter_chimeras(input_file, output_file, index_dir, bam_file,
                    weighted_unique_frags, median_isize, max_isize,
                    isoform_fraction, false_pos_file):
    """Filter candidate chimeras and keep the highest coverage isoforms.

    The chimeras parsed from ``input_file`` go through four filters in
    order, stopping at the first one that rejects the candidate:

    1. ``filter_weighted_frags(c, weighted_unique_frags)``
    2. ``filter_inner_dist(c, max_isize)``
    3. the (5' transcript, 5' end, 3' transcript, 3' start) key must not
       appear in the false positive set
    4. ``filter_chimeric_isoform_fraction(c, isoform_fraction,
       median_isize, bamfh)``

    Survivors are written to a temporary file next to ``output_file``.
    ``get_highest_coverage_isoforms`` then picks the chimera names to keep,
    and only those records are copied to ``output_file``.

    NOTE(review): more than one definition named ``filter_chimeras`` is
    visible in this file; only the last one is bound at import time.

    Parameters:
        input_file: path of the chimera file read by ``Chimera.parse``.
        output_file: path of the final tab-delimited output; its directory
            also hosts the temporary file.
        index_dir: directory containing ``config.GENE_FEATURE_FILE``.
        bam_file: path of the BAM file opened with ``pysam.Samfile`` and
            passed to the isoform fraction filter.
        weighted_unique_frags: threshold passed to
            ``filter_weighted_frags`` (logged as a float).
        median_isize: median insert size passed to the isoform fraction
            filter (logged as an integer).
        max_isize: maximum insert size passed to ``filter_inner_dist``
            (logged as an integer).
        isoform_fraction: threshold passed to the isoform fraction filter
            (logged as a float).
        false_pos_file: path of a false positive chimera list, or ``None``
            or an empty string to skip that filter.

    Returns:
        ``config.JOB_SUCCESS``.
    """
    logging.debug("Filtering Parameters")
    logging.debug("\tweighted unique fragments: %f", weighted_unique_frags)
    logging.debug("\tmedian insert size: %d", median_isize)
    logging.debug("\tmax insert size allowed: %d", max_isize)
    logging.debug("\tfraction of wild-type isoform: %f", isoform_fraction)
    logging.debug("\tfalse positive chimeras file: %s", false_pos_file)
    # get false positive chimera list; a truthiness test covers both None
    # and "" without comparing identity against a string literal
    if false_pos_file:
        logging.debug("Parsing false positive chimeras")
        false_pos_pairs = read_false_pos_file(false_pos_file)
    else:
        false_pos_pairs = set()
    tmp_file = make_temp(os.path.dirname(output_file), suffix=".txt")
    try:
        num_chimeras = 0
        num_filtered_chimeras = 0
        # open BAM file for checking wild-type isoform
        bamfh = pysam.Samfile(bam_file, "rb")
        try:
            logging.debug("Checking chimeras")
            with open(input_file) as infh:
                with open(tmp_file, "w") as tmpfh:
                    for c in Chimera.parse(infh):
                        num_chimeras += 1
                        if not filter_weighted_frags(c, weighted_unique_frags):
                            continue
                        if not filter_inner_dist(c, max_isize):
                            continue
                        false_pos_key = (c.partner5p.tx_name,
                                         c.partner5p.end,
                                         c.partner3p.tx_name,
                                         c.partner3p.start)
                        if false_pos_key in false_pos_pairs:
                            continue
                        # BAM-backed check runs last, only for candidates
                        # that survived every other filter
                        if not filter_chimeric_isoform_fraction(
                                c, isoform_fraction, median_isize, bamfh):
                            continue
                        tmpfh.write('\t'.join(map(str, c.to_list())) + '\n')
                        num_filtered_chimeras += 1
        finally:
            # close the BAM handle even when a filter raises
            bamfh.close()
        logging.debug("Total chimeras: %d", num_chimeras)
        logging.debug("Filtered chimeras: %d", num_filtered_chimeras)
        # cleanup memory for false positive chimeras
        del false_pos_pairs
        # find highest coverage chimeras among isoforms
        gene_file = os.path.join(index_dir, config.GENE_FEATURE_FILE)
        kept_chimeras = get_highest_coverage_isoforms(tmp_file, gene_file)
        num_filtered_chimeras = 0
        with open(output_file, "w") as outfh:
            with open(tmp_file) as tmpfh:
                for c in Chimera.parse(tmpfh):
                    if c.name in kept_chimeras:
                        num_filtered_chimeras += 1
                        outfh.write('\t'.join(map(str, c.to_list())) + '\n')
        logging.debug("\tAfter choosing best isoform: %d",
                      num_filtered_chimeras)
    finally:
        # remove the temporary file on success and on failure alike
        if os.path.exists(tmp_file):
            os.remove(tmp_file)
    return config.JOB_SUCCESS