def rank_chimeras(input_file, output_file, empirical_prob):
    '''
    rank the chimeras according to the empirical distribution
    of encompassing read coverage, spanning read coverage,
    and junction promiscuity

    input_file: path of the chimera file read via SpanningChimera.parse
    output_file: path of the tab-delimited ranking that is written
    empirical_prob: percentile handed to scoreatpercentile (presumably
        on scipy's 0-100 scale); chimeras whose score exceeds the score
        at that percentile are not written

    each chimera is scored as 1 - p, where p is the value returned by
    hist_interp_prob for its ranking properties, and rows are written
    in ascending score order.  an empty input produces an output file
    containing only the header line.
    '''
    header_fields = ['#gene5p', 'start5p', 'end5p',
                     'gene3p', 'start3p', 'end3p',
                     'name', 'weighted_cov',
                     'strand5p', 'strand3p',
                     'type', 'distance',
                     'encompassing_reads',
                     'encompassing_reads_plus',
                     'encompassing_reads_minus',
                     'multimap_hist',
                     'isize5p', 'isize3p',
                     'exons5p', 'exons3p',
                     'junction_permiscuity5p',
                     'junction_permiscuity3p',
                     'encompassing_ids',
                     'encompassing_read1',
                     'encompassing_read2',
                     'junction_id', 'junction_pos',
                     'homology5p', 'homology3p',
                     'spanning_reads',
                     'encomp_and_spanning',
                     'total_reads',
                     'spanning_info',
                     'breakpoint_hist',
                     'empirical_prob']
    # read the input a single time; every chimera is needed again for
    # scoring and output, so there is no benefit in parsing it twice
    with open(input_file) as infh:
        chimeras = list(SpanningChimera.parse(infh))
    # profile the chimeras
    props_list = [get_ranking_props(c) for c in chimeras]
    sorted_chimera_scores = []
    prob_cutoff = None
    if chimeras:
        arr = np.array(props_list)
        # choose bin edges from the quantiles of each property
        maxbins = 500
        quantile_points = np.linspace(0, 1, maxbins)
        bins = [get_quantiles(arr[:, d], quantile_points)
                for d in range(arr.shape[1])]
        H, edges = np.histogramdd(arr, bins=bins)
        # now rank each chimera using the empirical distribution
        chimera_scores = [(1 - hist_interp_prob(H, edges, props), c)
                          for c, props in zip(chimeras, props_list)]
        # sort on the score only so chimera objects are never compared
        sorted_chimera_scores = sorted(chimera_scores,
                                       key=operator.itemgetter(0))
        empirical_probs = np.array([score for score, _ in
                                    sorted_chimera_scores])
        prob_cutoff = scoreatpercentile(empirical_probs, empirical_prob)
    with open(output_file, "w") as outfh:
        outfh.write('\t'.join(header_fields) + '\n')
        for score, c in sorted_chimera_scores:
            # scores are ascending, so everything after the first
            # score above the cutoff is above it as well
            if score > prob_cutoff:
                break
            anchor_hist = get_anchor_hist(c)
            hist_string = ','.join([str(round(x, 1)) for x in anchor_hist])
            fields = c.to_list() + [hist_string, score]
            outfh.write('\t'.join(map(str, fields)) + '\n')
def choose_highest_coverage_chimeras(input_file, ggmap):
    '''
    choose the highest coverage isoform pair using spanning reads,
    encompassing reads, and total reads as a measure.  ties will be
    broken by choosing a single gene pair arbitrarily

    input_file: path of the chimera file read via SpanningChimera.parse
    ggmap: passed through unchanged to build_junc_coverage_map

    this is a generator: it yields every chimera whose
    (mate5p.tx_name, mate3p.tx_name) pair is in the collection returned
    by build_junc_coverage_map.  the input file is read twice, once to
    build that collection and once to yield the kept chimeras.
    '''
    # break name into 5'/3' genes linked in a dictionary
    logging.debug("Building junction isoform coverage map")
    with open(input_file) as infh:
        kept_isoforms_set = build_junc_coverage_map(
            SpanningChimera.parse(infh), ggmap)
    # write results
    logging.debug("Returning highest coverage chimeras")
    # the handle is closed when the generator is exhausted or closed
    with open(input_file) as infh:
        for c in SpanningChimera.parse(infh):
            pairkey = (c.mate5p.tx_name, c.mate3p.tx_name)
            if pairkey in kept_isoforms_set:
                yield c
def rank_chimeras(input_file, output_file, empirical_prob):
    '''
    rank the chimeras according to the empirical distribution
    of encompassing read coverage, spanning read coverage,
    and junction promiscuity

    input_file: path of the chimera file read via SpanningChimera.parse
    output_file: path of the tab-delimited ranking that is written
    empirical_prob: percentile handed to scoreatpercentile (presumably
        on scipy's 0-100 scale); chimeras whose score exceeds the score
        at that percentile are not written

    each chimera is scored as 1 - p, where p is the value returned by
    hist_interp_prob for its ranking properties, and rows are written
    in ascending score order.  an empty input produces an output file
    containing only the header line.
    '''
    header_fields = ['#gene5p', 'start5p', 'end5p',
                     'gene3p', 'start3p', 'end3p',
                     'name', 'weighted_cov',
                     'strand5p', 'strand3p',
                     'type', 'distance',
                     'encompassing_reads',
                     'encompassing_reads_plus',
                     'encompassing_reads_minus',
                     'multimap_hist',
                     'isize5p', 'isize3p',
                     'exons5p', 'exons3p',
                     'junction_permiscuity5p',
                     'junction_permiscuity3p',
                     'encompassing_ids',
                     'encompassing_read1',
                     'encompassing_read2',
                     'junction_id', 'junction_pos',
                     'homology5p', 'homology3p',
                     'spanning_reads',
                     'encomp_and_spanning',
                     'total_reads',
                     'spanning_info',
                     'breakpoint_hist',
                     'empirical_prob']
    # read the input a single time; every chimera is needed again for
    # scoring and output, so there is no benefit in parsing it twice
    with open(input_file) as infh:
        chimeras = list(SpanningChimera.parse(infh))
    # profile the chimeras
    props_list = [get_ranking_props(c) for c in chimeras]
    sorted_chimera_scores = []
    prob_cutoff = None
    if chimeras:
        arr = np.array(props_list)
        # choose bin edges from the quantiles of each property
        maxbins = 500
        quantile_points = np.linspace(0, 1, maxbins)
        bins = [get_quantiles(arr[:, d], quantile_points)
                for d in range(arr.shape[1])]
        H, edges = np.histogramdd(arr, bins=bins)
        # now rank each chimera using the empirical distribution
        chimera_scores = [(1 - hist_interp_prob(H, edges, props), c)
                          for c, props in zip(chimeras, props_list)]
        # sort on the score only so chimera objects are never compared
        sorted_chimera_scores = sorted(chimera_scores,
                                       key=operator.itemgetter(0))
        empirical_probs = np.array([score for score, _ in
                                    sorted_chimera_scores])
        prob_cutoff = scoreatpercentile(empirical_probs, empirical_prob)
    with open(output_file, "w") as outfh:
        outfh.write('\t'.join(header_fields) + '\n')
        for score, c in sorted_chimera_scores:
            # scores are ascending, so everything after the first
            # score above the cutoff is above it as well
            if score > prob_cutoff:
                break
            anchor_hist = get_anchor_hist(c)
            hist_string = ','.join([str(round(x, 1)) for x in anchor_hist])
            fields = c.to_list() + [hist_string, score]
            outfh.write('\t'.join(map(str, fields)) + '\n')
def choose_highest_coverage_chimeras(input_file, ggmap):
    '''
    choose the highest coverage isoform pair using spanning reads,
    encompassing reads, and total reads as a measure.  ties will be
    broken by choosing a single gene pair arbitrarily

    input_file: path of the chimera file read via SpanningChimera.parse
    ggmap: passed through unchanged to build_junc_coverage_map

    this is a generator: it yields every chimera whose
    (mate5p.tx_name, mate3p.tx_name) pair is in the collection returned
    by build_junc_coverage_map.  the input file is read twice, once to
    build that collection and once to yield the kept chimeras.
    '''
    # break name into 5'/3' genes linked in a dictionary
    logging.debug("Building junction isoform coverage map")
    with open(input_file) as infh:
        kept_isoforms_set = build_junc_coverage_map(
            SpanningChimera.parse(infh), ggmap)
    # write results
    logging.debug("Returning highest coverage chimeras")
    # the handle is closed when the generator is exhausted or closed
    with open(input_file) as infh:
        for c in SpanningChimera.parse(infh):
            pairkey = (c.mate5p.tx_name, c.mate3p.tx_name)
            if pairkey in kept_isoforms_set:
                yield c
def filter_spanning_chimeras(input_file, output_file, gene_file,
                             mate_pval, max_isize):
    '''
    processes chimera isoforms and chooses the one with the
    highest coverage and omits the rest

    input_file: path of the chimera file read via SpanningChimera.parse
    output_file: path of the tab-delimited file of kept chimeras
    gene_file: path of the file handed to build_gene_to_genome_map
    mate_pval: not used by this function; kept so existing callers
        do not break
    max_isize: passed to filter_insert_size for every chimera

    chimeras accepted by filter_insert_size are first written to a
    temporary file next to output_file, which is then reduced by
    choose_highest_coverage_chimeras.  the temporary file is removed
    even when one of the steps raises.
    '''
    tmpfile = make_temp(os.path.dirname(output_file), suffix='.bedpe')
    try:
        # apply more filters
        with open(input_file) as infh:
            with open(tmpfile, "w") as tmpfh:
                for c in SpanningChimera.parse(infh):
                    if filter_insert_size(c, max_isize):
                        tmpfh.write('\t'.join(map(str, c.to_list())) + '\n')
        # choose best isoform from remaining isoforms
        logging.debug("Building gene/genome index")
        with open(gene_file) as genefh:
            ggmap = build_gene_to_genome_map(genefh)
        logging.debug("Choosing highest coverage chimeras")
        with open(output_file, "w") as outfh:
            for c in choose_highest_coverage_chimeras(tmpfile, ggmap):
                outfh.write('\t'.join(map(str, c.to_list())) + '\n')
    finally:
        # remove temporary file; the existence check keeps a failure
        # that happened before the file was created from being masked
        if os.path.exists(tmpfile):
            os.remove(tmpfile)
def filter_spanning_chimeras(input_file, output_file, gene_file,
                             mate_pval, max_isize):
    '''
    processes chimera isoforms and chooses the one with the
    highest coverage and omits the rest

    input_file: path of the chimera file read via SpanningChimera.parse
    output_file: path of the tab-delimited file of kept chimeras
    gene_file: path of the file handed to build_gene_to_genome_map
    mate_pval: not used by this function; kept so existing callers
        do not break
    max_isize: passed to filter_insert_size for every chimera

    chimeras accepted by filter_insert_size are first written to a
    temporary file next to output_file, which is then reduced by
    choose_highest_coverage_chimeras.  the temporary file is removed
    even when one of the steps raises.
    '''
    tmpfile = make_temp(os.path.dirname(output_file), suffix='.bedpe')
    try:
        # apply more filters
        with open(input_file) as infh:
            with open(tmpfile, "w") as tmpfh:
                for c in SpanningChimera.parse(infh):
                    if filter_insert_size(c, max_isize):
                        tmpfh.write('\t'.join(map(str, c.to_list())) + '\n')
        # choose best isoform from remaining isoforms
        logging.debug("Building gene/genome index")
        with open(gene_file) as genefh:
            ggmap = build_gene_to_genome_map(genefh)
        logging.debug("Choosing highest coverage chimeras")
        with open(output_file, "w") as outfh:
            for c in choose_highest_coverage_chimeras(tmpfile, ggmap):
                outfh.write('\t'.join(map(str, c.to_list())) + '\n')
    finally:
        # remove temporary file; the existence check keeps a failure
        # that happened before the file was created from being masked
        if os.path.exists(tmpfile):
            os.remove(tmpfile)