def get_hap_coverage(in_bam, ps_h5, chrom, start, stop, cov_quals):
    """Return a dataframe with coverage per haplotype.

    Args:
    - in_bam: reader for a position sorted bam
    - ps_h5: HDF5 with phase set coordinates (may be None)
    - chrom, start, stop: region to get coverage
    - cov_quals: Array of MAPQ cutoffs.

    Return value:
    A dataframe with one row per position in [start, stop) and columns:
    - chrom
    - pos
    - cov_q<M>_hap<H> for all M in cov_quals and for H in [0, 1, 2]: This is
      the coverage on haplotype H using reads of MAPQ >= M. Haplotype 2
      corresponds to unphased.
    - phase_set: -1 wherever no phase set row covers the position (in
      particular everywhere when ps_h5 is None).
    """
    n_pos = stop - start
    # One (n_pos x 3) matrix per MAPQ cutoff; columns are hap0, hap1, unphased.
    coverages = [np.zeros((n_pos, 3)) for _ in cov_quals]

    for read in in_bam.fetch(str(chrom), int(start), int(stop)):
        if (read.is_unmapped or read.aend is None or read.is_secondary
                or read.is_duplicate):
            continue
        hap = tk_io.get_read_haplotype(read)
        hap_idx = 2 if hap is None else hap - 1
        range_start = max(0, read.pos - start)
        range_stop = min(stop, read.aend) - start
        for qi, q in enumerate(cov_quals):
            if read.mapq >= q:
                # NOTE(review): the slice end is range_stop + 1, so the
                # position at min(stop, read.aend) itself is counted too. If
                # aend is an exclusive end this over-counts by one base --
                # confirm before changing, downstream numbers depend on it.
                coverages[qi][range_start:range_stop + 1, hap_idx] += 1

    base_df = pd.DataFrame({'chrom': chrom, 'pos': np.arange(start, stop)})
    # A real list (not a lazy map object) so pd.concat always gets a sequence.
    cov_dfs = [
        pd.DataFrame(
            cov,
            columns=['cov_q' + str(q) + '_hap' + str(i) for i in range(3)])
        for cov, q in zip(coverages, cov_quals)
    ]
    df = pd.concat([base_df, pd.concat(cov_dfs, axis=1)], axis=1)

    # The builtin int replaces the np.int alias, which newer NumPy no longer
    # provides; the resulting dtype is the same.
    phase_sets = -np.ones((n_pos, ), dtype=int)

    # This can be None if for example the input is unbarcoded.
    if ps_h5 is not None:
        ps_df = tk_hdf5.read_data_frame(ps_h5)
        # Keep only phase sets on this chromosome that overlap the region.
        ps_df = ps_df[np.logical_and(
            ps_df.chrom == chrom,
            np.logical_and(ps_df.end >= start, ps_df.start < stop))]
        for _, row in ps_df.iterrows():
            range_start = max(0, row.start - start)
            range_stop = min(stop, row.end) - start
            phase_sets[range_start:range_stop + 1] = row.phase_set

    df['phase_set'] = phase_sets
    return df
def _get_sliced_df(h5file, column_names, row_slices, id_column=None):
    """Build a DataFrame from the given row ranges of selected HDF5 columns.

    Args:
    - h5file: mapping from column name to a sliceable dataset.
    - column_names: names of the columns to load.
    - row_slices: list of (start, end) row ranges. It is sorted IN PLACE as
      soon as at least one column is loaded.
    - id_column: optional name of an extra column holding the row numbers.

    Return value:
    A DataFrame whose index holds the row numbers covered by row_slices.
    Columns for which get_levels() returns a lookup array are translated
    through it.
    """
    have_slices = len(row_slices) > 0

    column_specs = []
    for col_name in column_names:
        column_specs.append(
            (col_name, h5file[col_name], get_levels(h5file[col_name])))

    data = {}
    for col_name, dataset, levels in column_specs:
        if have_slices:
            row_slices.sort()
            pieces = [dataset[lo:hi] for (lo, hi) in row_slices]
            values = np.concatenate(pieces)
        else:
            # np.concatenate fails on an empty list, so build the empty
            # column directly; the result is then an empty data frame.
            values = np.array([], dtype=dataset.dtype)

        if levels is not None:
            values = levels[values]
        data[col_name] = values

    if have_slices:
        row_ids = np.concatenate(
            [np.arange(lo, hi) for (lo, hi) in row_slices])
    else:
        row_ids = np.array([], dtype=np.int32)

    if id_column is not None:
        data[id_column] = row_ids

    frame = p.DataFrame(data)
    frame.index = row_ids
    return frame
def read_data_frame_limited(fn, query_cols=[], max_rows=None):
    ''' Load a pandas DataFrame from an HDF5 file, optionally truncated.

    Only the columns selected by get_column_intersection(column_names,
    query_cols) are loaded. When max_rows is truthy, at most that many
    leading rows are read from each column. '''
    with h5py.File(fn, 'r') as h5:
        wanted = get_column_intersection(
            h5.attrs.get("column_names"), query_cols)

        n_rows = h5[wanted[0]].shape[0]
        if max_rows:
            n_rows = min(n_rows, max_rows)

        frame = p.DataFrame()

        # Columns are attached one at a time to keep peak memory low.
        for col in wanted:
            dataset = h5[col]
            if not has_levels(dataset):
                frame[col] = p.Series(dataset[:n_rows])
                continue

            codes = dataset[:n_rows]
            levels = get_levels(dataset)
            # Building the Categorical this way avoids copying the codes
            # array, which matters for big datasets.
            frame[col] = p.Categorical(
                codes, categories=levels, ordered=False, fastpath=True)

    return frame
def make_df_chunk(fragments, bcs):
    """Turn fragment and barcode records into a pair of DataFrames.

    Returns (None, None) when bcs is empty (the caller then writes an empty
    file). Otherwise returns (fragment_df, bc_df), with the numeric fragment
    columns down-cast to 32-bit types to reduce size.
    """
    # No BC results -- will write an empty file
    if len(bcs) == 0:
        return (None, None)

    fragment_df = p.DataFrame(fragments)
    bc_df = p.DataFrame(bcs)

    # Compact dtypes for the fragment table, applied in this order.
    compact_types = [
        ('start_pos', np.int32),
        ('end_pos', np.int32),
        ('obs_len', np.int32),
        ('est_len', np.int32),
        ('num_reads', np.int32),
        ('num_reads_se', np.int32),
        ('bc_num_reads', np.int32),
        ('bc_est_len', np.int32),
        ('bc_mean_reads_per_fragment', np.float32),
    ]
    for column, dtype in compact_types:
        fragment_df[column] = fragment_df[column].astype(dtype)

    return (fragment_df, bc_df)
def main(args, outs):
    """Collect read-1 positions in a region and annotate them with target info.

    Reads are fetched from args.possorted_bam over
    (args.chrom, args.start_pos, args.end_pos). Only primary, non-duplicate,
    mapped read-1 alignments with MAPQ >= args.mapq are kept. The frame
    written to outs.reads has one row per kept read with columns:
    - chrom, pos, bc, mol_qual
    - on_target: whether the read position appears in the `pos` column of
      the frame loaded from args.coverage.
    - frac_on_target: fraction of the bases strictly between the previous
      kept read and this one that are on target.

    The first and last kept positions are used as the extent of the
    on-target mask, so reads are assumed to arrive in position order.
    """
    in_bam = tk_bam.create_bam_infile(args.possorted_bam)
    chrom = args.chrom

    poses = []
    mol_qs = []
    bcs = []
    for read in in_bam.fetch(str(chrom), int(args.start_pos), int(args.end_pos)):
        if not read.is_secondary and not read.is_duplicate and read.is_read1 and \
           not read.is_unmapped and read.mapq >= args.mapq:
            poses.append(read.pos)
            mol_qs.append(tk_io.get_read_molecule_conf(read))
            bcs.append(tk_io.get_read_barcode(read))

    ret_df = pd.DataFrame({
        'chrom': chrom,
        'pos': poses,
        'bc': bcs,
        'mol_qual': mol_qs
    })

    if len(ret_df) > 0:
        start_pos = poses[0]
        end_pos = poses[-1]
        cov_df = tk_hdf5.read_data_frame_indexed(
            args.coverage, [(chrom, start_pos, end_pos + 1)])

        # Boolean array with length equal to the range of positions in ret_df.
        # The builtin bool replaces the np.bool alias, which newer NumPy no
        # longer provides; the resulting dtype is the same.
        on_target = np.zeros((end_pos - start_pos + 1, ), dtype=bool)
        on_target[cov_df.pos - start_pos] = True
        cum_on_target = np.cumsum(on_target)

        ret_df['on_target'] = on_target[ret_df.pos - start_pos]

        # Every entry starts out as on_target[0]; entries for i > 0 are
        # overwritten below.
        frac_on_target = np.ones((len(ret_df), )) * on_target[0]
        for i, pos in enumerate(poses):
            if i > 0:
                prev_pos = poses[i - 1]
                # Number of bases strictly between the two reads.
                nbp = float(pos - prev_pos - 1)
                if nbp > 0:
                    # cum[pos] - cum[prev] counts on-target bases in
                    # (prev, pos]; removing on_target[pos] leaves (prev, pos).
                    frac_on_target[i] = (
                        cum_on_target[pos - start_pos] -
                        cum_on_target[prev_pos - start_pos] -
                        int(on_target[pos - start_pos])) / nbp
                else:
                    # No bases in between: use the read's own status.
                    frac_on_target[i] = float(on_target[pos - start_pos])
        ret_df['frac_on_target'] = frac_on_target
    else:
        ret_df['on_target'] = False
        ret_df['frac_on_target'] = 0.0

    tk_hdf5.write_data_frame(outs.reads, ret_df)
def get_reads(in_bam, chrom, start, stop, in_read_df=None, min_mapq=30, max_reads=500000, blacklist_barcodes=None):
    """Collect barcoded read-1 alignments for a region, reusing earlier results.

    Args:
    - in_bam: bam reader providing fetch(chrom, start, stop).
    - chrom, start, stop: region passed to in_bam.fetch.
    - in_read_df: optional data frame from a previous call (needs a 'pos'
      column). The part of it that is still relevant is kept and only the
      remaining stretch is fetched again.
    - min_mapq: minimum MAPQ of a read to be kept.
    - max_reads: fetching stops once more than this many records have been
      iterated (filtered-out records count too).
    - blacklist_barcodes: optional container of barcodes to skip.

    Return value:
    A data frame with columns chrom, pos, aend, bc, sorted by (bc, pos).
    Only primary, non-duplicate, mapped, properly paired read-1 alignments
    that have a barcode are included.
    """
    poses = []
    ends = []
    bcs = []

    if not in_read_df is None and len(in_read_df) > 0:
        ret_df = in_read_df.sort('pos')
        old_poses = np.array(ret_df['pos'])
        # Subtracting the read length is roughly right, ideally we should sort
        # by aend.
        # Loci are considered in an ordered fashion, so we should never fetch
        # reads "earlier" in the bam.
        start = max(old_poses[0], max(0, start - MAX_READ_LEN))
        if start >= old_poses[0] and start <= old_poses[-1]:
            start_idx = bisect.bisect_left(old_poses, start)
            if stop >= old_poses[0] and stop <= old_poses[-1]:
                stop_idx = min(len(ret_df), bisect.bisect(old_poses, stop))
            else:
                stop_idx = len(ret_df)
            # Remove all positions that are smaller than the input start
            ret_df = ret_df.iloc[start_idx:stop_idx]
            # Set the new start to the end of the input data frame.
            # Add an overlap of READ_LEN to capture reads that were right on
            # the boundary between the old and new data frame.
            start = max(0, old_poses[stop_idx - 1] - MAX_READ_LEN)
            stop = max(start, stop)
    # NOTE(review): this else is read as belonging to the outer `if`, so that
    # ret_df is defined when no previous frame is given -- confirm against the
    # original formatting.
    else:
        ret_df = None

    for i, read in enumerate(in_bam.fetch(str(chrom), int(start), int(stop))):
        # Hard cap on the number of records looked at for this region.
        if i > max_reads:
            break
        bc = tk_io.get_read_barcode(read)
        # Skip reads starting before the (possibly adjusted) start.
        if read.pos < start:
            continue
        if not blacklist_barcodes is None and bc in blacklist_barcodes:
            continue
        if not read.is_secondary and not read.is_duplicate and read.is_read1 and \
           not read.is_unmapped and read.mapq >= min_mapq and read.is_proper_pair and \
           not bc is None:
            poses.append(read.pos)
            ends.append(read.aend)
            bcs.append(tk_io.get_read_barcode(read))

    tmp_ret_df = pd.DataFrame({'chrom':chrom, 'pos':poses, 'aend':ends, 'bc':bcs})
    # ret_df is None when there was no usable previous frame.
    ret_df = pd.concat([ret_df, tmp_ret_df], ignore_index=True)
    ret_df.sort(['bc', 'pos'], inplace=True)
    return ret_df
def merge_calls_and_gt(call_df, gt_df, call_to_gt):
    """Annotate each call with its matching ground-truth SVs.

    Args:
    - call_df: data frame of calls (name, info, start1/stop1, start2/stop2).
    - gt_df: ground-truth data frame or None. When given, its index is
      replaced IN PLACE by its 'name' column.
    - call_to_gt: mapping from call name to an iterable of matching
      ground-truth names. Ignored when gt_df is None.

    Return value:
    A data frame with one row per (call, match) pair and the extra columns
    orient, sv_type, match and match_dist. Calls without a match get a single
    row with match None and match_dist NaN. When gt_df is given, the result
    is outer-merged with it on match == name (ground-truth columns get the
    suffix '_gt') and the columns 'filters_gt' and 'dist' are dropped. The
    result is sorted by name.
    """
    have_gt = gt_df is not None
    if have_gt:
        gt_df.index = gt_df['name']
    else:
        call_to_gt = {}

    out_call_df = None
    for _, call in call_df.iterrows():
        call_type = tk_sv_io.get_sv_type(call.info)
        call['orient'] = tk_sv_io.get_break_orientation(call.info)

        # revert sv type name from DISTAL to TRANS to match ground truth
        # conventions
        call['sv_type'] = 'TRANS' if call_type == 'DISTAL' else call_type

        # One output row per match
        for gt_name in list(call_to_gt.get(call['name'], [None])):
            call['match'] = gt_name
            if gt_name is None or not have_gt:
                call['match_dist'] = float('NaN')
            else:
                truth = gt_df.loc[gt_name]
                mid1 = int((call.start1 + call.stop1) / 2)
                mid2 = int((call.start2 + call.stop2) / 2)
                # The worse of the two breakpoint distances.
                call['match_dist'] = max(
                    dist_to_breaks(mid1, truth.start1, truth.stop1),
                    dist_to_breaks(mid2, truth.start2, truth.stop2))
            out_call_df = pd.concat(
                [out_call_df, pd.DataFrame([call])], ignore_index=True)

    if have_gt:
        out_call_df = pd.merge(out_call_df, gt_df,
                               left_on='match', right_on='name',
                               how='outer', suffixes=['', '_gt'])
        out_call_df.drop(['filters_gt', 'dist'], axis=1, inplace=True)

    out_call_df.sort('name', inplace=True)
    return out_call_df
def join(args, outs, chunk_defs, chunk_outs):
    """Concatenate the per-chunk bait CSV files into outs.bait_csv.

    When args.baits_file_map or outs.bait_csv is falsy, nothing is written
    and outs.target_coverage is set to None instead.
    """
    if not (args.baits_file_map and outs.bait_csv):
        outs.target_coverage = None
        return

    # Combine the per-chunk coverage tables
    chunk_files = [chunk_out.bait_csv
                   for (_, chunk_out) in zip(chunk_defs, chunk_outs)]
    chunk_frames = [p.read_csv(path, index_col=None, header=0)
                    for path in chunk_files]

    # write csv
    p.concat(chunk_frames).to_csv(outs.bait_csv)
def get_reads(in_bam, chrom, start, stop, min_mapq=60):
    """Return positions and barcodes of the barcoded reads in a region.

    Reads with MAPQ below min_mapq, secondary alignments, duplicates and
    reads without a barcode are skipped. The returned data frame has the
    columns 'pos' and 'bc' and is sorted by 'bc'.
    """
    kept = []
    for aln in in_bam.fetch(str(chrom), start, stop):
        if aln.mapq < min_mapq or aln.is_secondary or aln.is_duplicate:
            continue
        barcode = tk_io.get_read_barcode(aln)
        if barcode is not None:
            kept.append((barcode, aln.pos))

    df = pd.DataFrame({
        'pos': [pos for (_, pos) in kept],
        'bc': [barcode for (barcode, _) in kept],
    })
    df.sort('bc', inplace=True)
    return df
def join(args, outs, chunk_defs, chunk_outs):
    """Merge the per-chunk SV BEDPE files into outs.sv_variants.

    Chunks whose sv_variants file does not exist, or for which the reader
    returns None, are skipped. The merged calls are renumbered (the 'name'
    column becomes 0..n-1) so that names are unique across chunks, then
    calls with qual below args.sv_min_qv are dropped. If no chunk
    contributes any calls, an empty BEDPE with the standard columns is
    written.
    """
    chunk_frames = []
    for chunk_out in chunk_outs:
        if not os.path.isfile(chunk_out.sv_variants):
            continue
        in_bedpe = tk_sv_io.read_sv_bedpe_to_df(chunk_out.sv_variants)
        if in_bedpe is not None:
            chunk_frames.append(in_bedpe)

    if chunk_frames:
        # Concatenate once instead of growing the frame chunk by chunk.
        out_bedpe = pd.concat(chunk_frames, ignore_index=True)
    else:
        col_names = ['chrom1', 'start1', 'stop1',
                     'chrom2', 'start2', 'stop2', 'name', 'qual',
                     'strand1', 'strand2', 'filters', 'info']
        out_bedpe = pd.DataFrame(columns=col_names)

    # This used to assign to the attribute `out_bedpe.names`, which only
    # creates a Python attribute on the frame and leaves the 'name' column
    # untouched; writing to the column is what renumbers the calls.
    out_bedpe['name'] = np.arange(len(out_bedpe))

    out_bedpe = out_bedpe[out_bedpe.qual >= args.sv_min_qv]
    tk_sv_io.write_sv_df_to_bedpe(out_bedpe, outs.sv_variants)
def read_data_frame_filtered(fn, filter_func, query_cols=[], chunk_size=5000000):
    ''' Load a pandas DataFrame from an HDF5 file, filtering rows while loading.

    If a column list is specified, only load the matching columns.
    filter_func should take a DataFrame and return a boolean vector of the
    rows to keep. Rows are loaded from the file and filtered in chunks of
    chunk_size rows to keep peak memory usage small.

    Returns the concatenation of the filtered chunks with a fresh 0..n-1
    index. If the file holds no rows, an empty DataFrame with the selected
    columns is returned. '''
    chunks = []

    # The context manager closes the file even if reading or filter_func
    # raises.
    with h5py.File(fn, 'r') as f:
        column_names = f.attrs.get("column_names")
        column_names = get_column_intersection(column_names, query_cols)
        column_index = p.Index(column_names)

        sz = f[column_names[0]].shape[0]
        starts = np.arange(0, sz, chunk_size)
        ends = np.minimum(sz, starts + chunk_size)

        for (start, end) in zip(starts, ends):
            cols = {}
            for name in column_names:
                ds = f[name]
                if has_levels(ds):
                    # Stored values are indices into the level table.
                    indices = ds[start:end]
                    uniques = get_levels(ds)
                    col = uniques[indices]
                else:
                    col = ds[start:end]
                cols[name] = col

            df = p.DataFrame(cols, columns=column_index)
            df = df[filter_func(df)]

            # Always keep the first chunk, even if empty, so the result
            # carries the column layout when every row is filtered out.
            if len(df) > 0 or len(chunks) == 0:
                chunks.append(df)

    if not chunks:
        # Zero-row file: p.concat([]) would raise.
        return p.DataFrame(columns=column_index)

    return p.concat(chunks, ignore_index=True)
def get_break_groups(bedpe_df, merge_win=10000, max_range=np.inf):
    """A simplified version of merge_breaks.

    Both breakpoints of every SV are clustered with cluster_loci, and SVs
    whose first breakpoints share a cluster and whose second breakpoints
    share a cluster are grouped together.

    bedpe_df is either a DataFrame or something that
    tk_sv_io.read_sv_bedpe_to_df can load. Returns the groups as lists of
    index labels of bedpe_df.
    """
    if isinstance(bedpe_df, pd.DataFrame):
        bedpe_df = pd.DataFrame(bedpe_df)
    else:
        bedpe_df = tk_sv_io.read_sv_bedpe_to_df(bedpe_df)

    # Two loci per SV, tagged with (row label, breakpoint number).
    breaks = []
    for label, sv in bedpe_df.iterrows():
        breaks.extend([
            (sv.chrom1, sv.start1, sv.stop1, (label, 1)),
            (sv.chrom2, sv.start2, sv.stop2, (label, 2)),
        ])

    _, mem_to_cluster, _ = cluster_loci(breaks, merge_win, max_range=max_range)

    cluster_pairs = defaultdict(list)
    for label in bedpe_df.index:
        key = (mem_to_cluster[(label, 1)], mem_to_cluster[(label, 2)])
        cluster_pairs[key].append(label)

    return cluster_pairs.values()
def get_depth_info(read_iter, chrom, cstart, cend):
    """Compute per-base read depth over the window [cstart, cend).

    Args:
    - read_iter: iterable of reads exposing `pos` (alignment start) and
      `aend` (alignment end, treated as exclusive).
    - chrom: value stored in the 'chrom' column of the result.
    - cstart, cend: window boundaries.

    Return value:
    A data frame with columns chrom, pos and coverage, one row per position
    in the window. Reads without an alignment end and reads that do not
    overlap the window contribute nothing.
    """
    depths = np.zeros(cend - cstart, np.int32)

    for read in read_iter:
        if read.aend is None:
            continue

        rstart = max(read.pos, cstart)
        # Increment to the end of the window or the end of the
        # alignment, whichever comes first
        rend = min(read.aend, cend)

        # A read ending before the window would give a negative slice end,
        # which numpy interprets relative to the end of the array and would
        # wrongly increment most of the window.
        if rend <= rstart:
            continue

        depths[(rstart - cstart):(rend - cstart)] += 1

    positions = np.arange(cstart, cend, dtype=np.int32)
    depth_df = pd.DataFrame({
        "chrom": chrom,
        "pos": positions,
        "coverage": depths
    })
    return depth_df
def main(args, outs):
    """Merge SV and CNV calls into one non-redundant BEDPE.

    Calls from args.sv_variants (tagged "SV") and args.cnv_variants (tagged
    "CNV") are pooled and renumbered. Within each (chrom1, chrom2) pair they
    are clustered with tk_sv_utils.get_break_groups using args.max_dist, and
    one representative per cluster is written to outs.sv_variants: the first
    "SV" member if the cluster has one, otherwise the member with the
    highest qual. ";SOURCE=<tags>" is appended to the representative's info.
    """
    sv_df = tk_sv_io.read_sv_bedpe_to_df(args.sv_variants)
    sv_df["info2"] = "SV"
    cnv_df = tk_sv_io.read_sv_bedpe_to_df(args.cnv_variants)
    cnv_df["info2"] = "CNV"

    sv_df = pd.concat([sv_df, cnv_df], ignore_index=True)
    sv_df['name'] = np.arange(len(sv_df))
    sv_df.sort(['chrom1', 'chrom2'], inplace=True)

    locus_order = ['chrom1', 'start1', 'stop1', 'chrom2', 'start2', 'stop2']

    res_df = None
    for _, tmp_df in sv_df.groupby(['chrom1', 'chrom2']):
        tmp_df.sort(locus_order, inplace=True)
        # cluster the loci in the group based on proximity
        groups = tk_sv_utils.get_break_groups(tmp_df, args.max_dist)

        out_df = pd.DataFrame(columns=sv_df.columns)
        for out_idx, member_labels in enumerate(groups):
            # Subset of tmp_df that belongs to this cluster.
            members = tmp_df.loc[member_labels]
            is_sv = members['info2'] == 'SV'

            # Default representative: the member with the max qual ...
            row = tmp_df.loc[members['qual'].idxmax()]
            # ... unless there is an SV call, in which case the first one
            # wins (idxmax of a boolean series is the first True).
            if is_sv.any():
                row = tmp_df.loc[is_sv.idxmax()]

            source = list(set(members['info2']))
            row['info'] += (";SOURCE=" + ",".join(source))
            out_df.loc[out_idx] = row

        out_df.sort(['start1', 'stop1', 'start2', 'stop2'], inplace=True)
        res_df = pd.concat([res_df, out_df], ignore_index=True)

    tk_sv_io.write_sv_df_to_bedpe(res_df, outs.sv_variants)
def test_basic(self):
    """Run the stage on chr1:0..10000000 and check its outputs.

    Covers four things: the phasing summary metrics in summary.json, the
    per-gene statistics in gene_stats.csv for two hand-checked genes, the
    counters produced by compute_snpindel_stats on a small synthetic variant
    table, and the short/long switch flags at known positions in variants.h5.
    """
    self.clear_directory()
    self.run_stage(self.args, "chr1:0..10000000")

    # Load all the output files
    variants = tenkit.hdf5.read_data_frame(os.path.join(job_dir, "variants.h5"))
    genes = p.read_csv(os.path.join(job_dir, "gene_stats.csv"))
    with open(os.path.join(job_dir, "summary.json")) as summary_file:
        summary = json.load(summary_file)

    print variants.head()

    # Check basic phasing metrics (verified with independent code)
    self.assertEqual(summary['N50_phase_block'], 362897)
    self.assertEqual(summary['longest_phase_block'], 1588263)
    np.testing.assert_almost_equal(summary['mean_phase_block'], 7067.99817851)

    # Hand calculated snp-weighted probabilities
    np.testing.assert_almost_equal(summary['fract_genes_phased'], 0.44155844155844154, decimal = 10)
    np.testing.assert_almost_equal(summary['fract_genes_completely_phased'], 0.1038961038961039, decimal = 10)
    self.assertEqual(summary['prob_snp_phased_in_gene'], 0.36608192963189135)
    self.assertEqual(summary['prob_snp_correct_in_gene'], 0.950884655460024)

    # Hand test some gene-level metrics (first gene)
    gene_name = "KIAA1751"
    gene = genes[genes.gene == gene_name]
    self.assertEqual(gene.pair_phased_rate.values[0], float(79*78)/(143*142))
    np.testing.assert_almost_equal(gene.pair_correct_rate.values[0], float(77*76/2)/(78*77/2))
    self.assertEqual(gene.start.values[0], 1884751)
    self.assertEqual(gene.end.values[0], 1935276)

    # Hand test some gene-level metrics (second gene)
    gene_name = "AJAP1"
    gene = genes[genes.gene == gene_name]
    np.testing.assert_almost_equal(gene.pair_phased_rate.values[0], float(149*148)/(184*183))
    np.testing.assert_almost_equal(gene.pair_correct_rate.values[0], float((143*142)/2 + 1) / (145*144/2))
    self.assertEqual(gene.start.values[0], 4715104)
    self.assertEqual(gene.end.values[0], 4843851)

    # Synthetic variant table: 16 rows combining observed / ground-truth
    # membership, variant type, filter flag and variant length.
    sample_variant_df = p.DataFrame({'in_obs':[ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ],
                                     'in_gt': [ 0 , 0 , 0 , 0 , 1 , 1 , 1 , 1 , 0 , 0 , 0 , 0 , 1 , 1 , 1 , 1 ],
                                     'variant_type':['S','S','D','I','S','C','D','I','D','S','I','I','D','S','C','D'],
                                     'FILTER': [ 0 , 1 , 0 , 1 , 0 , 1 , 0 , 1 , 0 , 1 , 0 , 1 , 0 , 1 , 0 , 1 ],
                                     'variant_length': [ 0 , 0 , -1, 1 , 0 , 1 , -1, 1 , -6, 0 , 6 , 7 , 5 , 0 , 6 , 1 ]})
    stats = {}
    filter_states = ["unfiltered", "filtered"]
    var_types = ["any", "snp", "insertion", "deletion", "complex", "insertion_lt5bp", "deletion_lt5bp"]
    # Fill `stats` for every combination of filter state and variant type.
    for filter_condition in filter_states:
        for var_type in var_types:
            compute_snpindel_stats(stats, filter_condition, var_type, None, sample_variant_df, True)
    print stats

    #not covering all, but covering each dimension at least once
    self.assertEqual(stats['tps_filtered'], 2)
    self.assertEqual(stats['tps_unfiltered'], 4)
    self.assertEqual(stats['tps_filtered_snp'], 1)
    self.assertEqual(stats['tps_filtered_deletion'], 1)
    self.assertEqual(stats['tps_filtered_complex'], 0)
    self.assertEqual(stats['sensitivity_unfiltered'], 0.5)
    self.assertEqual(stats['ppv_unfiltered'], 0.5)
    self.assertEqual(stats['sensitivity_filtered'], 0.25)
    self.assertEqual(stats['ppv_filtered'], 0.5)
    self.assertEqual(stats['fps_filtered'], 2)
    self.assertEqual(stats['fns_unfiltered_insertion_lt5bp'], 1)
    self.assertEqual(stats['tps_filtered_deletion_lt5bp'], 1)
    self.assertEqual(stats['ppv_unfiltered_snp'], 0.5)
    self.assertEqual(stats['sensitivity_unfiltered_complex'], 0.5)
    self.assertEqual(stats['fns_unfiltered'], 4)
    self.assertEqual(stats['fns_filtered_deletion_lt5bp'], 1)

    # Test that some blocks of short switches are caught correctly
    ss = variants[variants.pos.isin([1529457,1529511])]
    self.assertTrue(ss.short_switch.all())

    # Used to be a long switch - now should be nothing
    non_long = variants[variants.pos == 1529950]
    self.assertTrue((non_long.short_switch == False).all())
    self.assertTrue((non_long.long_switch == False).all())

    # In this range no long switch may remain and exactly 5 short switches
    # are expected.
    switch_fixes = variants[np.logical_and(variants.pos >= 3173228, variants.pos <= 3181098)]
    self.assertTrue((switch_fixes.long_switch == False).all())
    self.assertEqual(switch_fixes.short_switch.sum(), 5)
def coalescence_analysis(self, min_cluster_size=2, fpr=1.0):
    ''' Find clusters of barcodes that share too many fragments.

    Computes the BC-BC overlap matrix block by block, thresholds it, and
    reports the connected components with at least min_cluster_size
    barcodes. fpr is divided by the number of BC pairs to obtain the p-value
    cutoff. Sets self.num_frags and releases self.mat (left as None).

    Returns a DataFrame with columns bc, cluster_id and cluster_size. '''
    # Sizes
    (num_bcs, total_bins) = self.mat.shape

    # number of fragments on each bc
    self.num_frags = np.array((self.mat > 0.0).sum(axis=1)).flatten()
    mean_frags = self.num_frags.mean()
    martian.log_info("mean num frags: %f, median: %f" %
                     (mean_frags, np.median(self.num_frags)))

    # Initial coalescence threshold: start from the expected number of
    # overlaps between two BCs that both have the mean number of fragments ...
    expected_overlaps = (mean_frags ** 2) / self.effective_genome_bins

    # ... and put the cutoff at ~5 sigma above it, but never below 2.
    # note: this is more informative for GemCode than Chromium, because there
    # are many more fragments per BC and thus the expected number of overlaps
    # due to chance is much higher. for Chromium, the threshold will usually
    # be 2.
    five_sigma = round(expected_overlaps + 5 * np.sqrt(expected_overlaps))
    overlap_threshold = np.float32(max(2, five_sigma))
    martian.log_info(
        "expected overlaps: %f -- using overlap threshold: %f" %
        (expected_overlaps, overlap_threshold))

    # The matrix is processed in square blocks of bc_bin_size barcodes.
    bc_bin_size = 1000
    bc_bins = np.arange(0, num_bcs, bc_bin_size)

    # Choose a p-value that accounts for many comparisons
    n_comparisons = num_bcs**2
    pvalue_cut = fpr / n_comparisons
    martian.log_info("using pvalue cutoff: %e" % pvalue_cut)

    for x in bc_bins:
        martian.log_info("BC number: %d on x axis" % x)
        # calculation is symmetric -- only blocks on or above the diagonal
        for y in bc_bins[bc_bins >= x]:
            self.window_intersection_slices(x, y, bc_bin_size, pvalue_cut,
                                            overlap_threshold)

    # Delete the fragment matrix to save memory
    del self.mat
    self.mat = None

    martian.log_info("Finding connected components")
    clusters = self.clusters()

    martian.log_info("Making bad BC DataFrame")
    member_bcs = []
    member_cluster_ids = []
    member_cluster_sizes = []
    for (cluster_idx, (_, nodes)) in enumerate(clusters.iteritems()):
        n_nodes = len(nodes)
        if n_nodes < min_cluster_size:
            continue
        member_bcs.extend(nodes)
        member_cluster_ids.extend([cluster_idx] * n_nodes)
        member_cluster_sizes.extend([n_nodes] * n_nodes)

    return p.DataFrame({
        'bc': member_bcs,
        'cluster_id': member_cluster_ids,
        'cluster_size': member_cluster_sizes
    })
def merge_breaks(bedpe_df, out_bedpe, merge_win=10000, max_range=np.inf, max_nmates=np.inf, cluster_qual_factor=0.2):
    """Merges a set of SVs into a non-redundant set.

    Args:
    - bedpe_df: Either a bedpe file or a DataFrame like the one returned by
    tk_sv_io.read_sv_bedpe_to_df.
    - out_bedpe: Path to file where output will be written. Nothing is
    written if this is None.
    - merge_win: Breakpoints will be merged if they are within this distance
    from each other. Two SVs will be merged if both their breakpoints can be
    merged.
    - max_range: See max_range field of cluster_loci.
    - max_nmates: Two extra info fields will be added to the output BEDPE,
    NMATES1, and NMATES2. NMATES1 is the number of mate breakpoints (after
    merging, so breakpoint clusters), of the first breakpoint of an SV.
    SVs whose breakpoints both exceed the max_nmates cutoff will not be
    included in the output.
    - cluster_qual_factor: Calls of a cluster whose qual is at least this
    fraction of the cluster's best qual are used to compute the RESOLUTION
    info field.

    Return value:
    The output BEDPE (as a DataFrame).
    """
    if not isinstance(bedpe_df, pd.DataFrame):
        bedpe_df = tk_sv_io.read_sv_bedpe_to_df(bedpe_df)
    else:
        bedpe_df = pd.DataFrame(bedpe_df)

    # Columns are addressed by position below: 0-2 are the first breakpoint
    # (chrom, start, stop), 3-5 the second, and 6 is used as the call's
    # identifier in the cluster membership keys.
    breaks = []
    for i in range(bedpe_df.shape[0]):
        breaks.append((bedpe_df.iloc[i, 0], bedpe_df.iloc[i, 1], bedpe_df.iloc[i, 2], (bedpe_df.iloc[i, 6], 1)))
        breaks.append((bedpe_df.iloc[i, 3], bedpe_df.iloc[i, 4], bedpe_df.iloc[i, 5], (bedpe_df.iloc[i, 6], 2)))
    _, mem_to_cluster, _ = cluster_loci(breaks, merge_win, max_range=max_range)

    # Map (cluster of breakpoint 1, cluster of breakpoint 2) to the row
    # positions of all calls falling into that pair of clusters.
    cluster_pairs = {}
    for i in range(bedpe_df.shape[0]):
        name = bedpe_df.iloc[i]['name']
        cluster_idx1 = mem_to_cluster[(name, 1)]
        cluster_idx2 = mem_to_cluster[(name, 2)]
        if not (cluster_idx1, cluster_idx2) in cluster_pairs:
            cluster_pairs[(cluster_idx1, cluster_idx2)] = [i]
        else:
            old_pair = cluster_pairs[(cluster_idx1, cluster_idx2)][0]
            # Make sure the old and the new pair have breaks on the same chromosomes
            assert (bedpe_df.iloc[old_pair, 0] == bedpe_df.iloc[i, 0])
            assert (bedpe_df.iloc[old_pair, 3] == bedpe_df.iloc[i, 3])
            cluster_pairs[(cluster_idx1, cluster_idx2)].append(i)

    # Reduce every cluster pair to its best (max qual) call and compute the
    # value that is later reported as RESOLUTION.
    new_cluster_pairs = {}
    cluster_dist_ratio = {}
    for p, pos_list in cluster_pairs.iteritems():
        pos_arr = np.array(pos_list)
        tmp_df = get_dataframe_loc(bedpe_df, pos_arr)
        quals = np.array(tmp_df.qual)
        best_call = pos_arr[np.argmax(quals)]
        # Calls whose qual is close enough to the best one.
        close_calls = np.where(quals >= cluster_qual_factor * np.max(quals))[0]
        close_df = get_dataframe_loc(tmp_df, close_calls)

        same_chrom = bedpe_df.iloc[best_call]['chrom2'] == bedpe_df.iloc[best_call]['chrom1']
        min_break_dist = np.min(close_df.start2) - np.max(close_df.stop1)
        max_break_dist = bedpe_df.iloc[best_call]['start2'] - bedpe_df.iloc[best_call]['stop1']

        new_cluster_pairs[p] = best_call
        # '.' for inter-chromosomal or very distant breakpoints, NaN when the
        # close calls overlap, otherwise the ratio of the two distances.
        if not same_chrom or max_break_dist > MAX_FRAG_SIZE:
            cluster_dist_ratio[p] = '.'
        elif min_break_dist <= 0:
            cluster_dist_ratio[p] = float('NaN')
        else:
            cluster_dist_ratio[p] = float(max_break_dist) / min_break_dist

    cluster_pairs = new_cluster_pairs

    def clusters_close(i, j):
        # True if the calls at row positions i and j are on the same pair of
        # chromosomes and both of their breakpoints are within merge_win of
        # each other.
        chrom1, start1, stop1 = bedpe_df.iloc[i, 0], bedpe_df.iloc[i, 1], bedpe_df.iloc[i, 2]
        chrom2, start2, stop2 = bedpe_df.iloc[i, 3], bedpe_df.iloc[i, 4], bedpe_df.iloc[i, 5]
        next_chrom1, next_start1, next_stop1 = bedpe_df.iloc[j, 0], bedpe_df.iloc[j, 1], bedpe_df.iloc[j, 2]
        next_chrom2, next_start2, next_stop2 = bedpe_df.iloc[j, 3], bedpe_df.iloc[j, 4], bedpe_df.iloc[j, 5]
        dist1 = max(next_start1 - stop1, start1 - next_stop1)
        dist2 = max(next_start2 - stop2, start2 - next_stop2)
        return (chrom1 == next_chrom1 and chrom2 == next_chrom2 and dist1 <= merge_win and dist2 <= merge_win)

    # The "chain-breaking" in cluster_loci might still leave some redundancy.
    # In particular, we might leave some almost touching clusters that were
    # separated only because of chain-breaking. Do a second round of clustering
    # where you go through consecutive pairs of cluster and merge them if they're merge-able.
    new_cluster_pairs = {}
    for (cluster1, cluster2) in sorted(cluster_pairs.keys()):
        # -1 marks a cluster pair that was already merged into an earlier one.
        if cluster_pairs[(cluster1, cluster2)] == -1:
            continue
        # Consider all neighboring clusters after this cluster.
        # Notice that the cluster indices are sorted by genomic coordinates.
        neigh_clusters = [(cluster1, cluster2 + 1),
                          (cluster1 + 1, cluster2 - 1),
                          (cluster1 + 1, cluster2),
                          (cluster1 + 1, cluster2 + 1)]
        idx = cluster_pairs[(cluster1, cluster2)]
        # Best cluster among neighboring clusters
        max_cluster = ((cluster1, cluster2), idx)
        for next_cluster1, next_cluster2 in neigh_clusters:
            if not (next_cluster1, next_cluster2) in cluster_pairs:
                continue
            if cluster_pairs[(next_cluster1, next_cluster2)] == -1:
                continue
            next_idx = cluster_pairs[(next_cluster1, next_cluster2)]
            if clusters_close(idx, next_idx):
                cluster_pairs[(next_cluster1, next_cluster2)] = -1
                # The comparison is always against the current pair's own
                # call (idx), not against the best neighbour found so far.
                if bedpe_df.iloc[idx]['qual'] < bedpe_df.iloc[next_idx]['qual']:
                    max_cluster = ((next_cluster1, next_cluster2), next_idx)
        new_cluster_pairs[max_cluster[0]] = max_cluster[1]

    cluster_pairs = new_cluster_pairs

    # Now compute the number of mate breakpoints for each cluster
    num_mates = {}
    for (cluster1, cluster2) in cluster_pairs.keys():
        if not cluster1 in num_mates:
            num_mates[cluster1] = 0
        if not cluster2 in num_mates:
            num_mates[cluster2] = 0
        num_mates[cluster1] += 1
        # Do not count a pair twice when both breakpoints share a cluster.
        if cluster2 != cluster1:
            num_mates[cluster2] += 1

    sel_loc = []
    new_info_strs = []
    for (cluster1, cluster2) in sorted(cluster_pairs.keys()):
        sv_loc = cluster_pairs[(cluster1, cluster2)]
        # Drop the SV only if BOTH breakpoints have too many mates.
        if num_mates[cluster1] > max_nmates and num_mates[cluster2] > max_nmates:
            continue
        sel_loc.append(sv_loc)
        new_info_strs.append(tk_sv_io.update_info(bedpe_df.iloc[sv_loc]['info'],
                                                  ['NMATES1', 'NMATES2', 'RESOLUTION'],
                                                  [num_mates[cluster1], num_mates[cluster2],
                                                   cluster_dist_ratio[(cluster1, cluster2)]]))
    if len(sel_loc) > 0:
        bedpe_df = bedpe_df.iloc[sel_loc]
        bedpe_df['info'] = new_info_strs
    else:
        # Nothing selected: return an empty frame with the same columns.
        bedpe_df = pd.DataFrame(columns=bedpe_df.columns)
    if not out_bedpe is None:
        tk_sv_io.write_sv_df_to_bedpe(bedpe_df, out_bedpe)

    return bedpe_df
def summarize_barcode_data(misc_sm, qual_sms, barcode_whitelist):
    """Compute barcode summary metrics and build the barcode count table.

    Args:
    - misc_sm: summarizer collection providing the 'processed_bc_counts',
      'metrics' and 'raw_bc_counts' summarizers.
    - qual_sms: not used by this function.
    - barcode_whitelist: container of valid barcode sequences (may be
      empty/None).

    Barcode metrics are written into the 'metrics' summarizer. They are only
    computed when a whitelist is given and at least one raw barcode was
    counted; otherwise 'fraction_bcs_observed', 'correct_bc_rate',
    'effective_diversity' and 'fraction_bc_within_2x_median' are set to None.

    Return value:
    A DataFrame with columns bc_sequence, count and valid_bc. In the
    no-barcode case the table is empty and additionally carries the
    per-cutoff mapped/unmapped count columns for BC_QUAL_CUTOFFS.
    """
    processed_bc_counts = misc_sm.get_summarizer('processed_bc_counts').dict
    metrics = misc_sm.get_summarizer('metrics')
    raw_bc_counts = misc_sm.get_summarizer('raw_bc_counts').dict

    # Compute high-level stats on the barcodes. Only emit bc metrics if we
    # have a whitelist and have barcodes attached
    if barcode_whitelist and len(raw_bc_counts) > 0:
        # What fraction of the whitelist did we see?
        # Don't trust that the whitelist used during attach_bcs is the same as
        # the one used here. Must intersect with the raw_bc sequences. The
        # processed_bc keys have the GEM group prepended.
        observed_good = set(
            raw_bc_counts.keys()).intersection(barcode_whitelist)
        metrics['fraction_bcs_observed'] = \
            min(1.0, tk_stats.robust_divide(float(len(observed_good)),
                                            len(barcode_whitelist)))

        # What fraction of clusters had a correct barcode
        total_good_bc_observations = float(sum(processed_bc_counts.values()))
        metrics['correct_bc_rate'] = \
            tk_stats.robust_divide(
                total_good_bc_observations,
                (total_good_bc_observations + metrics['bad_bc_count']))

        # 'Effective diversity' of barcodes
        sum_sq = sum((v**2 for v in processed_bc_counts.values()))
        effective_diversity = tk_stats.robust_divide(
            (total_good_bc_observations**2.0), float(sum_sq))
        metrics['effective_diversity'] = effective_diversity

        def fraction_within_f(counts, f):
            # Fraction of entries within a factor f of the median.
            med_counts = np.median(counts)
            counts_in_range = np.logical_and(counts >= med_counts / f,
                                             counts <= med_counts * f)
            return np.mean(counts_in_range)

        # fraction of barcodes with abundance within 2x of median.
        # list(...) makes sure a numeric array is built even where
        # dict.values() returns a view instead of a list.
        count_array = np.array(list(processed_bc_counts.values()))
        metrics['fraction_bc_within_2x_median'] = fraction_within_f(
            count_array, 2.0)
        metrics['fraction_bc_within_1.15x_median'] = fraction_within_f(
            count_array, 1.15)

        metrics['barcode_count_cv'] = np.std(count_array) / np.mean(
            count_array)

        raw_count_vect = list(raw_bc_counts.items())
        raw_bc_list = [x[0] for x in raw_count_vect]
        raw_bc_count = np.array([x[1] for x in raw_count_vect],
                                dtype=np.int32)
        is_valid_bc = [bc in barcode_whitelist for bc in raw_bc_list]

        bc_table_cols = {
            'bc_sequence': raw_bc_list,
            'count': raw_bc_count,
            'valid_bc': is_valid_bc
        }
        bc_table = p.DataFrame(bc_table_cols)
    else:
        metrics['fraction_bcs_observed'] = None
        metrics['correct_bc_rate'] = None
        metrics['effective_diversity'] = None
        metrics['fraction_bc_within_2x_median'] = None

        # Builtin object/bool replace the np.object/np.bool aliases, which
        # newer NumPy no longer provides; the resulting dtypes are the same.
        dummy_bc_table = {
            'bc_sequence': np.array([], dtype=object),
            'count': np.array([], dtype=np.int32),
            'valid_bc': np.array([], dtype=bool)
        }

        for minqv in BC_QUAL_CUTOFFS:
            name = "mapped_minqv_%d_count" % minqv
            dummy_bc_table[name] = np.array([], dtype=np.int32)

            name = "unmapped_minqv_%d_count" % minqv
            dummy_bc_table[name] = np.array([], dtype=np.int32)

        bc_table = p.DataFrame(dummy_bc_table)

    return bc_table
def main(args, outs):
    """Find candidate low-coverage regions from per-haplotype coverage.

    For every locus in args.loci the MAPQ>=30 haplotype coverage is read
    from args.hap_coverage and averaged per (bin, phase_set), with bins of
    args.bin_size positions. A bin is flagged when its total coverage is
    below 0.8x the locus mean, or when the total is below the mean and one
    haplotype is below 0.8x the other. Runs of flagged bins spanning at
    least args.min_len are turned into a pair of windows around the run's
    start and end.

    Outputs:
    - outs.loci: pickled list of
      (chrom, start1, stop1, chrom, start2, stop2) tuples.
    - outs.cov_summary: tab-separated table with the mean total coverage of
      every input locus.
    """
    reader = tk_hdf5.DataFrameReader(args.hap_coverage)
    sel_cols = ['cov_q30_hap0', 'cov_q30_hap1', 'cov_q30_hap2']
    ext_cols = list(sel_cols)
    ext_cols.append('total_cov')

    out_loci = []
    summary_df = None
    for (chrom, start, stop) in (tk_io.get_locus_info(l) for l in args.loci):
        cov = reader.query((chrom, start, stop))
        # The builtin int replaces the np.int alias, which newer NumPy no
        # longer provides; the resulting dtype is the same.
        cov['bin'] = np.array(cov['pos'] / args.bin_size, dtype=int)
        cov['total_cov'] = cov[sel_cols].sum(axis=1)
        mean_cov = np.mean(cov['total_cov'])

        summary_df = pd.concat([
            summary_df,
            pd.DataFrame(
                {
                    'chrom': chrom,
                    'start': start,
                    'stop': stop,
                    'mean_cov': mean_cov
                },
                index=[0])
        ], ignore_index=True)

        # Remove very small phase sets. These tend to be single-SNP phase sets
        # and can result from erroneous SNPs.
        cov = cov.groupby('phase_set').filter(lambda x: len(x) > 1000)
        sum_df = cov.groupby(['bin', 'phase_set'])[ext_cols].mean().reset_index()

        sum_df['low'] = sum_df.total_cov < 0.8 * mean_cov
        sum_df['low_hap0'] = np.logical_and(
            sum_df.total_cov < mean_cov,
            sum_df.cov_q30_hap0 < 0.8 * sum_df.cov_q30_hap1)
        sum_df['low_hap1'] = np.logical_and(
            sum_df.total_cov < mean_cov,
            sum_df.cov_q30_hap1 < 0.8 * sum_df.cov_q30_hap0)

        if not sum_df.empty:
            any_low = np.logical_or(
                sum_df.low, np.logical_or(sum_df.low_hap1, sum_df.low_hap0))

            bins = np.array(sum_df['bin'])
            # Sentinel bin so that the end of the last run can be looked up.
            bins = np.concatenate([bins, [np.max(bins) + 1]])
            pos = 0

            # Get runs of 0s and 1s in any_low
            for bit, group in groupby(any_low):
                group_size = len(list(group))
                group_start = bins[pos] * args.bin_size
                group_stop = bins[pos + group_size] * args.bin_size
                region_len = group_stop - group_start
                if bit and region_len >= args.min_len:
                    # One window of +/- bin_size around each end of the run.
                    out_loci.append((chrom,
                                     max(0, group_start - args.bin_size),
                                     group_start + args.bin_size,
                                     chrom,
                                     max(0, group_stop - args.bin_size),
                                     group_stop + args.bin_size))
                pos += group_size

    with open(outs.loci, 'w') as f:
        cPickle.dump(out_loci, f)

    summary_df.to_csv(outs.cov_summary, sep='\t', header=True, index=False)
def join(args, outs, chunk_defs, chunk_outs):
    """Write the SV call outputs and the call-vs-ground-truth summary metrics.

    Steps, in order:
    1. Merge the chunk predictions (merge_predictions) and rename the TRANS
       type to DISTAL in the info field of every candidate.
    2. Write the feasible ground truth (if any), the candidate BEDPE, the
       sorted/gzipped/indexed VCF and the BEDPE of unfiltered calls.
    3. Annotate the calls with their ground-truth matches and write the
       per-call table (outs.call_tsv).
    4. Compute metrics for every combination of ground-truth filters
       (genic_breaks, target_dist, gt_sv_type, tier) and call filters
       (call_filtered, call_sv_type, match_dist) and write them to
       outs.summary_tsv.
    5. Write the condensed metrics, merged with args.call_summary if given,
       as JSON to outs.summary.
    """
    pred_to_match, _, pred_df, true_df, min_qv = merge_predictions(chunk_outs)

    # Change TRANS type to DISTAL. This change will only
    # affect the type reported not the names of the metrics.
    new_info = []
    for _, row in pred_df.iterrows():
        sv_type = tk_sv_io.get_sv_type(row.info)
        if sv_type == 'TRANS':
            sv_type = 'DISTAL'
        new_info.append(tk_sv_io.update_info(row.info, ['TYPE'], [sv_type]))
    pred_df['info'] = new_info

    if not true_df is None:
        true_df.to_csv(outs.feasible_gt, index=False, header=True,
                       sep='\t', na_rep='NaN')

    ##### Write BEDPE/VCF outputs
    tk_sv_io.write_sv_df_to_bedpe(pred_df, outs.sv_candidates)
    source_str = '10X/pipelines/stages/analyze_sv_calls {}'.format(martian.get_pipelines_version())
    sample_id = 'sample' if args.sample_id is None else args.sample_id
    # NOTE(review): str.strip('.gz') removes any leading/trailing run of the
    # characters '.', 'g' and 'z', not the literal '.gz' suffix. This works
    # for paths like '/x/svs.vcf.gz' but would mangle a path that starts
    # with one of these characters -- confirm the shape of outs.svs.
    tk_sv_io.bedpe_to_vcf(outs.sv_candidates, outs.svs.strip('.gz'), sample_id,
                          source_str, args.reference_path)
    # this will sort and gzip
    tk_sv_io.index_sv_vcf(outs.svs.strip(".gz"))
    outs.svs_index = outs.svs + '.tbi'
    # delete the non-gzipped file
    os.remove(outs.svs.strip('.gz'))

    # Calls are the candidates that passed all filters.
    if not pred_df.empty:
        call_df = pred_df[np.logical_or(pred_df['filters'] == '.', pred_df['filters'] == "PASS")]
    else:
        call_df = None
    tk_sv_io.write_sv_df_to_bedpe(call_df, outs.sv_calls)

    # Annotate each call with the matching ground truth svs. The resulting
    # dataframe might have multiple rows for the same call if there were multiple
    # matching ground truth svs.
    martian.log_info("merging calls and gt")
    if not pred_df.empty:
        pred_df = merge_calls_and_gt(pred_df, true_df, pred_to_match)

    martian.log_info("writing call_tsv")
    pred_df.to_csv(outs.call_tsv, index=False, header=True, sep='\t', na_rep='NaN')

    # Keep only rows that have a call name.
    pred_df = pred_df[np.logical_not(pd.isnull(pred_df['name']))]

    max_dists = sorted(np.array(args.detect_dists))

    gt_sv_types = get_all_sv_types(true_df)
    call_sv_types = get_all_sv_types(pred_df)

    if not true_df is None:
        # Use the default MAX_PPV_TIER unless this is greater than the maximum tier
        # present in the data.
        max_ppv_tier = min(MAX_PPV_TIER, np.max(true_df.tier))
        # Use the default unless this is smaller than the minimum tier present in
        # the data.
        max_sens_tier = max(MAX_SENS_TIER, np.min(true_df.tier))
    else:
        max_ppv_tier = 1
        max_sens_tier = 1

    tiers = [max_ppv_tier, max_sens_tier]

    # All combinations of filters in ground truth and call set
    if not args.targets is None and not args.target_dists is None:
        target_dists = list(sorted(np.array(args.target_dists, dtype=np.float)))
        # NaN stands for "no target distance filter".
        target_dists.append(float('NaN'))
    else:
        target_dists = [float('NaN')]

    combs = product([0, 1, 2, None], target_dists, gt_sv_types, tiers,
                    [True, False], call_sv_types, max_dists)

    # One list per output column; every combination appends one entry.
    metrics = defaultdict(list)

    gt_filters = ['genic_breaks', 'target_dist', 'gt_sv_type', 'tier']
    call_filters = ['call_filtered', 'call_sv_type', 'match_dist']

    for (genic_breaks, tdist, gt_sv_type, tier, is_filtered, call_sv_type, dist) in combs:
        # Skip combinations that mix two different concrete SV types.
        if gt_sv_type != 'NA' and call_sv_type != 'NA' and gt_sv_type != call_sv_type:
            continue

        metrics['genic_breaks'].append(genic_breaks)
        metrics['target_dist'].append(tdist)
        metrics['gt_sv_type'].append(gt_sv_type)
        metrics['tier'].append(tier)
        metrics['call_filtered'].append(is_filtered)
        metrics['call_sv_type'].append(call_sv_type)
        metrics['match_dist'].append(dist)

        # Select the ground truth for this combination.
        if true_df is None:
            sel_true_df = None
        else:
            sel_true_df = true_df
            if gt_sv_type != 'NA':
                sel_true_df = sel_true_df[sel_true_df.sv_type == gt_sv_type]
            if not np.isnan(tdist):
                sel_true_df = sel_true_df[sel_true_df.targ_dist <= tdist]
            sel_true_df = sel_true_df[sel_true_df.tier <= tier]
            # Restrict to genic or non-genic or take everything if this is None.
            if not genic_breaks is None:
                sel_true_df = sel_true_df[sel_true_df.genic_breaks == genic_breaks]

            if len(sel_true_df) == 0:
                sel_true_df = None

        # Select the calls for this combination.
        sel_pred_df = pred_df

        if is_filtered and not pred_df.empty:
            sel_pred_df = sel_pred_df[(sel_pred_df.filters == '.') | (sel_pred_df.filters == 'PASS')]
        if call_sv_type != 'NA' and not pred_df.empty:
            sel_pred_df = sel_pred_df[sel_pred_df.sv_type == call_sv_type]
        if not pred_df.empty and (args.min_rel_overlap is None or args.min_rel_overlap == 0):
            # Do not apply this filter if the matching is done based on overlap.
            sel_pred_df = sel_pred_df[np.logical_or(np.isnan(sel_pred_df.match_dist),
                                                    sel_pred_df.match_dist <= dist)]

        add_metrics(sel_pred_df, sel_true_df, metrics)

    # Column order: ground-truth filters, call filters, then everything that
    # add_metrics added. Note that column_names aliases gt_filters, so
    # gt_filters is extended too.
    column_names = gt_filters
    column_names.extend(call_filters)
    other_names = set(metrics.keys()).difference(set(column_names))
    column_names.extend(other_names)

    metric_df = pd.DataFrame(metrics)
    metric_df = metric_df[column_names]

    martian.log_info("writing summary tsv")
    metric_df.to_csv(outs.summary_tsv, index=False, header=True, sep='\t', na_rep='NaN')

    short_metrics = get_short_metrics(metric_df, other_names, max_ppv_tier, max_sens_tier, args)

    # Merge in the metrics of the upstream call summary, if there is one.
    if not args.call_summary is None:
        with open(args.call_summary, 'r') as in_summary_fn:
            in_summary = json.load(in_summary_fn)
            for key, val in in_summary.iteritems():
                short_metrics[key] = val

    short_metrics['min_qv'] = min_qv

    with open(outs.summary, 'w') as out_file:
        out_file.write(tenkit.safe_json.safe_jsonify(short_metrics, pretty=True))
def get_dataframe_loc(df, loc):
    """Select rows of ``df`` by integer position, tolerating an empty selection.

    Args:
    - df: data frame to take rows from.
    - loc: sized collection of integer row positions.

    Return value:
    ``df.iloc[loc]`` when ``loc`` has at least one entry; otherwise a new,
    empty data frame with the same columns as ``df``.
    """
    # The length is tested explicitly (not truthiness) so that any sized
    # container can be passed as ``loc``.
    if len(loc) == 0:
        return pd.DataFrame(columns=df.columns)
    return df.iloc[loc]
def main(args, outs):
    """Write per-bait mean coverage and GC content to ``outs.bait_csv``.

    For every ``(chrom, start, end)`` interval yielded by the BED iterator over
    ``args.bait_file``, the coverage tracks are queried from the coverage HDF5
    (``args.coverage``) and averaged over the interval, and ``get_gc`` is
    evaluated for the same interval.  One CSV row is written per interval with
    the columns ``chrom``, ``start``, ``end``, ``tag``, ``gc`` and one column
    per coverage track.

    Args:
    - args: stage arguments; reads ``coverage``, ``bait_file``,
      ``reference_path`` and ``tag``.
    - outs: stage outputs; ``bait_csv`` is the path of the CSV to write.  It
      is set to None when ``args.coverage`` or ``args.bait_file`` is None.

    The subsampled tracks (``coverage_subsampled``,
    ``coverage_deduped_subsampled``) are included only when both are present
    in the coverage file.  Nothing is written when ``coverage_deduped`` or
    ``mapq30_coverage_deduped`` is missing from it.
    """
    args.coerce_strings()
    outs.coerce_strings()

    if args.coverage is None or args.bait_file is None:
        outs.bait_csv = None
        return

    # Only dataset names are inspected, so open read-only: before h5py 3.0 the
    # default mode could open the file for writing or even create it.  The
    # context manager closes the handle even if the checks raise.
    with h5py.File(args.coverage, 'r') as f:
        has_basic = ('coverage_deduped' in f) and ('mapq30_coverage_deduped' in f)
        has_subsampling = ('coverage_subsampled' in f) and ('coverage_deduped_subsampled' in f)

    if not has_basic:
        # NOTE(review): outs.bait_csv is left untouched on this path, unlike
        # the early return above -- confirm that this is intended.
        return

    if has_subsampling:
        query_cols = [
            'coverage_deduped',
            'coverage_deduped_subsampled',
            'coverage_subsampled',
            'mapq30_coverage_deduped',
        ]
    else:
        query_cols = ['coverage_deduped', 'mapq30_coverage_deduped']

    fasta = tenkit.reference.open_reference(args.reference_path)
    coverage_reader = tenkit.hdf5.DataFrameReader(args.coverage)

    rows = []
    for chrom, start, end in tk_io.get_bed_iterator(args.bait_file):
        coverage = coverage_reader.query([chrom, start, end],
                                         query_cols=query_cols,
                                         coords=False)
        mean_cov = coverage.mean()
        row = {
            'chrom': chrom,
            'start': start,
            'end': end,
            'tag': args.tag,
            'gc': get_gc(chrom, (start, end), fasta),
        }
        for col in query_cols:
            row[col] = mean_cov[col]
        rows.append(row)

    # Build the frame once from the collected rows: growing it with
    # DataFrame.append copies the whole frame per row (quadratic overall), and
    # that method was removed in pandas 2.0.
    df = p.DataFrame(rows)
    df.to_csv(outs.bait_csv)
def _select_frags_at_locus(frags, chrom, start, stop, min_reads_per_frag,
                           min_frag_size):
    """Keep fragments overlapping (start, stop) that pass the read-count and size cutoffs, tagged with ``chrom``."""
    keep = ((frags.end_pos > start) &
            (frags.start_pos < stop) &
            (frags.num_reads > min_reads_per_frag) &
            (frags.end_pos - frags.start_pos > min_frag_size))
    # Copy the filtered rows so that adding a column below writes to an
    # independent frame instead of a slice (avoids SettingWithCopyWarning and
    # guarantees the two returned frames never share data).
    selected = frags[keep].copy()
    selected['chrom'] = chrom
    return selected


def get_frags_from_reads(in_bam, chrom1, start1, stop1, chrom2, start2, stop2,
                         min_mapq=60, min_sv_len=45000,
                         min_reads_per_frag=MIN_READS_PER_FRAG_TARGET,
                         min_frag_size=MIN_FRAG_SIZE_TARGET):
    """Reconstruct molecules around two loci.

    Reads are fetched with ``get_reads`` around each locus (extended by
    ``FRAG_EXTEND`` on both sides) and grouped by barcode; each group (or each
    gap-separated run of a group, see below) becomes one fragment described by
    its first position, last position and number of reads.

    Args:
    - in_bam: BAM reader handed to ``get_reads``.
    - chrom1, start1, stop1: first locus.
    - chrom2, start2, stop2: second locus.
    - min_mapq: forwarded to ``get_reads`` as its ``min_mapq`` argument.
    - min_sv_len: when the loci are close, consecutive read positions of one
      barcode that are more than this far apart start a new fragment.
    - min_reads_per_frag: fragments must have strictly more reads than this.
    - min_frag_size: fragments must span strictly more bases than this.

    Return value:
    A tuple ``(frags1, frags2)`` of data frames with columns ``bc``,
    ``start_pos``, ``end_pos``, ``num_reads`` and ``chrom``: the fragments
    overlapping the first and the second locus respectively.
    """
    if chrom1 == chrom2 and start2 - stop1 < MAX_FRAG_SIZE:
        # Hard case: The two loci are close enough that we could have molecules
        # spanning them, so fetch one region covering both and split each
        # barcode's reads wherever there is a large gap.
        frag_starts = []
        frag_stops = []
        frag_reads = []
        frag_bcs = []
        reads = get_reads(in_bam, chrom1, max(0, start1 - FRAG_EXTEND),
                          stop2 + FRAG_EXTEND, min_mapq=min_mapq)
        for bc, group in reads.groupby('bc'):
            # NOTE(review): the gap splitting assumes positions are in
            # ascending order within a barcode -- confirm against get_reads.
            poses = np.array(group.pos)
            # Indices i where the gap between read i and read i + 1 exceeds
            # min_sv_len; each such gap ends one fragment and starts the next.
            gap_idx = np.where(np.diff(poses) > min_sv_len)[0]
            first_idx = np.concatenate([np.array([0]), gap_idx + 1])
            last_idx = np.concatenate([gap_idx, np.array([len(poses) - 1])])
            frag_starts.extend(poses[first_idx])
            frag_stops.extend(poses[last_idx])
            frag_reads.extend(last_idx - first_idx + 1)
            frag_bcs.extend(repeat(bc, len(gap_idx) + 1))
        frags = pd.DataFrame({
            'bc': frag_bcs,
            'start_pos': frag_starts,
            'end_pos': frag_stops,
            'num_reads': frag_reads
        })
        # Remove spanning fragments
        frags = frags[(frags.start_pos > stop1) | (frags.end_pos < start2)]
        # Both loci draw from the same pool; the per-locus selection below
        # returns independent copies.
        frags1 = frags
        frags2 = frags
    else:
        reads1 = get_reads(in_bam, chrom1, max(0, start1 - FRAG_EXTEND),
                           stop1 + FRAG_EXTEND, min_mapq=min_mapq)
        reads2 = get_reads(in_bam, chrom2, max(0, start2 - FRAG_EXTEND),
                           stop2 + FRAG_EXTEND, min_mapq=min_mapq)
        # One fragment per barcode: first position, last position, read count.
        frags1 = reads1.groupby('bc').agg(['min', 'max', 'count'])['pos'].reset_index()
        frags2 = reads2.groupby('bc').agg(['min', 'max', 'count'])['pos'].reset_index()
        frags1.columns = ['bc', 'start_pos', 'end_pos', 'num_reads']
        frags2.columns = ['bc', 'start_pos', 'end_pos', 'num_reads']

    frags1 = _select_frags_at_locus(frags1, chrom1, start1, stop1,
                                    min_reads_per_frag, min_frag_size)
    frags2 = _select_frags_at_locus(frags2, chrom2, start2, stop2,
                                    min_reads_per_frag, min_frag_size)
    return frags1, frags2
def join(args, outs, chunk_defs, chunk_outs):
    """Build the barcode blacklist and write the barcode filtering summary.

    When ``args.barcodes`` is set, barcodes contributing the cumulative bottom
    1% of reads are dropped and the remaining ones are screened in two ways:
    a coalescence analysis (``coalescence.BcSimilarity``) over their fragments,
    and a check for barcodes with an outlying number of fragments or reads.
    Barcodes flagged by either screen are written to
    ``outs.barcode_blacklist`` as a tab-separated table; overloaded barcodes
    get ``cluster_id`` and ``cluster_size`` of -1.

    Args:
    - args: stage arguments; reads ``barcodes``, ``fragments`` and ``input``.
    - outs: stage outputs; ``barcode_blacklist`` (set to None when
      ``args.barcodes`` is not set) and ``filter_barcodes_results`` (JSON
      summary path, always written).
    - chunk_defs, chunk_outs: not used by this function.

    All summary metrics are null when ``args.barcodes`` is not set or when at
    most one barcode remains after the read-count screen; in the latter case
    an empty blacklist table is still written.
    """
    metric_names = [
        'fract_bcs_overloaded',
        'fract_bcs_in_clusters_all',
        'fract_bcs_in_clusters_eq_2',
        'fract_bcs_in_clusters_gt_2',
        'num_clusters_gt_8',
        'fract_bcs_in_clusters_all_no_hd2',
        'fract_bcs_in_clusters_eq_2_no_hd2',
        'fract_bcs_in_clusters_gt_2_no_hd2',
    ]
    # Every metric starts out null; only the full analysis path fills them in.
    summary = dict.fromkeys(metric_names)

    if not args.barcodes:
        outs.barcode_blacklist = None
    else:
        # Load BC data
        bc_df = tenkit.hdf5.read_data_frame(args.barcodes)
        fragment_df = tenkit.hdf5.read_data_frame(
            args.fragments, query_cols=['bc', 'chrom', 'start_pos'])

        bc_df.sort('bc_num_reads', inplace=True)
        bc_df['cum_reads'] = np.cumsum(bc_df.bc_num_reads)

        # Measure coalescence rate on all BCs that could conceivably be used
        # to call SVs - i.e. ignore BCs that contribute the cumulative bottom
        # 1% of reads
        n99_read_thresh = sum(bc_df.bc_num_reads) * 0.01
        n99_bcs = bc_df[bc_df.cum_reads > n99_read_thresh]
        num_screened = len(n99_bcs)

        martian.log_info("number of bcs to screen for coalescence: %d" % num_screened)
        martian.log_info("subsetting fragments to use")

        if num_screened <= 1:
            # Too few barcodes to analyse: emit a header-only blacklist.
            empty_df = p.DataFrame({'bc':[], 'cluster_id':[], 'cluster_size':[]})
            empty_df.to_csv(outs.barcode_blacklist, sep="\t", index=False)
        else:
            def fraction_of_screened(count):
                # Express a barcode count as a fraction of the screened set.
                return float(count) / num_screened

            selected_frags = fragment_df[fragment_df.bc.isin(n99_bcs.bc)]
            del fragment_df

            martian.log_info("Doing coalescence calculation")
            coa_calc = coalescence.BcSimilarity(selected_frags, set(n99_bcs.bc), args.input)
            coa_bc_tbl = coa_calc.coalescence_analysis()

            # Also add barcodes that are extreme outliers in the number of
            # fragments observed
            frag_counts = n99_bcs.bc_num_fragments
            bc_num_fragments_threshold = max(frag_counts.median() * 5.0,
                                             frag_counts.quantile(0.98))

            read_counts = n99_bcs.bc_num_reads
            bc_num_reads_threshold = max(read_counts.median() * 5.0,
                                         read_counts.quantile(0.98))

            is_overloaded = ((n99_bcs.bc_num_fragments > bc_num_fragments_threshold) |
                             (n99_bcs.bc_num_reads > bc_num_reads_threshold))
            overloaded_bcs = n99_bcs[is_overloaded]
            summary['fract_bcs_overloaded'] = fraction_of_screened(len(overloaded_bcs))

            # Remove bcs that are already in the blacklist
            nr_overloaded_bcs = overloaded_bcs[~overloaded_bcs.bc.isin(coa_bc_tbl.bc)]

            # Add overloaded bcs to blacklist
            overloaded_bc_tbl = p.DataFrame({'bc': nr_overloaded_bcs.bc,
                                             'cluster_id': -1,
                                             'cluster_size': -1})

            # Write barcode blacklist
            bad_bc_tbl = p.concat([coa_bc_tbl, overloaded_bc_tbl])
            bad_bc_tbl.to_csv(outs.barcode_blacklist, sep="\t", index=False)

            # Compute coalescence stats
            summary['fract_bcs_in_clusters_all'] = fraction_of_screened(len(coa_bc_tbl))
            summary['fract_bcs_in_clusters_eq_2'] = fraction_of_screened(
                (coa_bc_tbl.cluster_size == 2).sum())
            summary['fract_bcs_in_clusters_gt_2'] = fraction_of_screened(
                (coa_bc_tbl.cluster_size > 2).sum())
            summary['num_clusters_gt_8'] = (coa_bc_tbl.cluster_size > 8).sum()

            # Compute stats ignoring clusters of Hamming distance 2
            hd2_clusters = [
                cluster_id
                for cluster_id, members in coa_bc_tbl.groupby('cluster_id')
                if all_within_hamming_distance(members.bc.values, 2)
            ]
            coa_tbl_no_hd2 = coa_bc_tbl[~coa_bc_tbl.cluster_id.isin(hd2_clusters)]
            summary['fract_bcs_in_clusters_all_no_hd2'] = fraction_of_screened(
                len(coa_tbl_no_hd2))
            summary['fract_bcs_in_clusters_eq_2_no_hd2'] = fraction_of_screened(
                (coa_tbl_no_hd2.cluster_size == 2).sum())
            summary['fract_bcs_in_clusters_gt_2_no_hd2'] = fraction_of_screened(
                (coa_tbl_no_hd2.cluster_size > 2).sum())

    # Write summary to json
    with open(outs.filter_barcodes_results, 'w') as results_file:
        tenkit.safe_json.dump_numpy(summary, results_file, pretty=True)
def get_stats_at_breaks(self, breakpoints, win_left, win_right,
                        min_reads = 100, min_mapq = 60,
                        method = BINOM_EMP_BC_COUNT_BC_FREQ,
                        outward_only = False):
    """Compute barcode-overlap statistics for a list of breakpoint pairs.

    For each breakpoint the barcodes around both loci are collected with
    ``self.get_bcs_around_break`` and the overlap between the two barcode
    sets is scored.

    Args:
    - breakpoints: sized sequence of breakpoints; the first six entries of
      each are (chrom1, start1, stop1, chrom2, start2, stop2).  The two loci
      are swapped if the first sorts after the second.
    - win_left, win_right: window sizes forwarded to
      ``get_bcs_around_break``.
    - min_reads, min_mapq: forwarded to ``get_bcs_around_break``.
    - method: ``BINOM_EMP_BC_COUNT_BC_FREQ`` (or 5) scores the overlap with
      ``log10_emp_pval``; ``BINOM`` scores it with ``log10_binom_pval``.  Any
      other value is reported through ``martian.throw``.
    - outward_only: if True, the right window of the first locus and the left
      window of the second locus are set to 0.

    Return value:
    A data frame with one row per breakpoint (indexed 0..len-1) and columns
    chrom1, start1, stop1, extStart1, extStop1, chrom2, start2, stop2,
    extStart2, extStop2, bcOv, nbcs1, nbcs2, readOv, nreads1, nreads2,
    binomQual, qual, bcs, bcFreqs.  ``qual`` and ``binomQual`` are 0 when the
    two loci share no barcode.
    """
    columns = ['chrom1', 'start1', 'stop1', 'extStart1', 'extStop1',
               'chrom2', 'start2', 'stop2', 'extStart2', 'extStop2',
               'bcOv', 'nbcs1', 'nbcs2', 'readOv', 'nreads1', 'nreads2',
               'binomQual', 'qual', 'bcs', 'bcFreqs']
    stat_df = pd.DataFrame(columns = columns, index = np.arange(len(breakpoints)))

    def distinct_reads(bc_counts):
        # Union of the per-barcode read collections.  Uses keys() + indexing
        # (rather than iteritems) so it runs on both Python 2 and 3.
        reads = set()
        for bc in bc_counts.keys():
            reads.update(bc_counts[bc])
        return reads

    nbcs = len(self.bc_map)
    in_bam = tk_bam.create_bam_infile(self.bam)
    # try/finally so the BAM handle is released even if a breakpoint fails
    # (e.g. an unsupported method is reported below).
    try:
        for bidx, brk in enumerate(breakpoints):
            chrom1, start1, stop1, chrom2, start2, stop2 = brk[0:6]
            # Normalise so that locus 1 is the one that sorts first.
            if (chrom1, start1, stop1) > (chrom2, start2, stop2):
                chrom1, start1, stop1, chrom2, start2, stop2 = \
                    chrom2, start2, stop2, chrom1, start1, stop1

            bc1_counts, ext_start1, ext_stop1 = self.get_bcs_around_break(
                in_bam, chrom1, start1, stop1,
                win_left, (0 if outward_only else win_right),
                min_reads = min_reads, min_mapq = min_mapq)
            bc2_counts, ext_start2, ext_stop2 = self.get_bcs_around_break(
                in_bam, chrom2, start2, stop2,
                (0 if outward_only else win_left), win_right,
                min_reads = min_reads, min_mapq = min_mapq)

            bc_ov = set(bc1_counts.keys()).intersection(set(bc2_counts.keys()))
            nbcs1, nbcs2 = len(bc1_counts), len(bc2_counts)

            read_set1 = distinct_reads(bc1_counts)
            read_set2 = distinct_reads(bc2_counts)
            nreads1 = len(read_set1)
            nreads2 = len(read_set2)
            read_ov = len(read_set1.intersection(read_set2))

            if len(bc_ov) > 0:
                bc_idx = np.array([self.bc_map[b] for b in bc_ov]).flatten()
                # Builtin int is what the removed np.int alias stood for.
                win_idx = np.zeros(bc_idx.shape, dtype = int)
                # NOTE(review): the literal 5 is accepted as an alias of the
                # empirical method -- confirm which constant it stands for.
                if method == BINOM_EMP_BC_COUNT_BC_FREQ or method == 5:
                    pov = log10_emp_pval(win_idx, bc_idx, max(nbcs1, nbcs2), self.bc_freq)
                elif method == BINOM:
                    pov = log10_binom_pval(len(bc_ov), nbcs1, nbcs2, nbcs)
                else:
                    martian.throw('Unsupported method for quality computation.')
                qual = pval_to_qual(pov)
                # The binomial quality is always reported, whatever the method.
                binom_qual = pval_to_qual(log10_binom_pval(len(bc_ov), nbcs1, nbcs2, nbcs))
            else:
                qual = 0
                binom_qual = 0

            bcs = ','.join(list(bc_ov))
            bc_freqs = ','.join(['{:.2f}'.format(-np.log10(self.bc_freq[self.bc_map[b]]))
                                 for b in bc_ov])
            stat_df.loc[bidx] = [chrom1, int(start1), int(stop1), ext_start1, ext_stop1,
                                 chrom2, int(start2), int(stop2), ext_start2, ext_stop2,
                                 len(bc_ov), nbcs1, nbcs2, read_ov, nreads1, nreads2,
                                 binom_qual, qual, bcs, bc_freqs]
    finally:
        in_bam.close()
    return stat_df