def get_hap_coverage(in_bam, ps_h5, chrom, start, stop, cov_quals):
    """Return a dataframe with coverage per haplotype.

    Args:
    - in_bam: reader for a position sorted bam
    - ps_h5: HDF5 with phase set coordinates
    - chrom, start, stop: region to get coverage
    - cov_quals: Array of MAPQ cutoffs.

    Return value:
    A dataframe with columns:
    - chrom
    - pos
    - cov_q<M>_hap<H> for all M in cov_quals and for H in [0, 1, 2]: This is
      the coverage on haplotype H using reads of MAPQ >= M. Haplotype 2
      corresponds to unphased.
    - phase_set: null if ps_h5 is missing.
    """
    coverages = [np.zeros((stop - start, 3)) for _ in cov_quals]

    for _, read in enumerate(in_bam.fetch(str(chrom), int(start), int(stop))):
        if not read.is_unmapped and not read.aend is None and not read.is_secondary and not read.is_duplicate:
            hap = tk_io.get_read_haplotype(read)
            hap_idx = 2 if hap is None else hap - 1
            range_start = max(0, read.pos - start)
            range_stop = min(stop, read.aend) - start
            for qi, q in enumerate(cov_quals):
                if read.mapq >= q:
                    coverages[qi][range_start:range_stop + 1, hap_idx] += 1

    base_df = pd.DataFrame({'chrom': chrom, 'pos': np.arange(start, stop)})
    dfs = map(
        lambda x: pd.DataFrame(
            x[0],
            columns=['cov_q' + str(x[1]) + '_hap' + str(i) for i in range(3)]),
        zip(coverages, cov_quals))
    df = pd.concat([base_df, pd.concat(dfs, axis=1)], axis=1)

    phase_sets = -np.ones((stop - start, ), dtype=np.int)
    # This can be None if for example the input is unbarcoded.
    if not ps_h5 is None:
        ps_df = tk_hdf5.read_data_frame(ps_h5)
        ps_df = ps_df[np.logical_and(
            ps_df.chrom == chrom,
            np.logical_and(ps_df.end >= start, ps_df.start < stop))]
        for _, row in ps_df.iterrows():
            range_start = max(0, row.start - start)
            range_stop = min(stop, row.end) - start
            phase_sets[range_start:range_stop + 1] = row.phase_set

    df['phase_set'] = phase_sets
    return df
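# Usage sketch for get_hap_coverage (illustrative only, not part of the
# original module). The names `bam` and `ps_h5_path` are assumptions: an
# open pysam alignment file and a phase-set HDF5 path supplied by the caller.
def _example_hap_coverage_usage(bam, ps_h5_path):
    cov_df = get_hap_coverage(bam, ps_h5_path, 'chr1', 1000000, 1001000, [0, 30])
    # With cov_quals=[0, 30], the MAPQ>=30 columns are cov_q30_hap0/1/2,
    # where hap2 holds unphased coverage.
    return cov_df[['cov_q30_hap0', 'cov_q30_hap1', 'cov_q30_hap2']].sum(axis=1)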
def join(args, outs, chunk_defs, chunk_outs):
    out_calls = None
    out_pileups = None
    for c in chunk_outs:
        if not os.path.isfile(c.sv_calls):
            continue
        calls = tk_sv_io.read_sv_bedpe_to_df(c.sv_calls)
        pileups = tk_sv_io.read_sv_bedpe_to_df(c.pileups)
        out_calls = pd.concat([out_calls, calls], ignore_index=True)
        out_pileups = pd.concat([out_pileups, pileups], ignore_index=True)

    tk_sv_io.write_sv_df_to_bedpe(out_calls, outs.sv_calls)
    tk_sv_io.write_sv_df_to_bedpe(out_pileups, outs.pileups)
def join(args, outs, chunk_defs, chunk_outs):
    join_df = None
    non_pass_join_df = None
    for chunk in chunk_outs:
        df = tk_sv_io.read_sv_bedpe_to_df(chunk.sv_calls)
        non_pass_df = tk_sv_io.read_sv_bedpe_to_df(chunk.non_pass_sv_calls)
        join_df = pd.concat([join_df, df], ignore_index=True)
        non_pass_join_df = pd.concat([non_pass_join_df, non_pass_df], ignore_index=True)

    join_df['name'] = np.arange(len(join_df))
    tk_sv_io.write_sv_df_to_bedpe(join_df, outs.sv_calls)

    non_pass_join_df['name'] = np.arange(len(join_df), len(join_df) + len(non_pass_join_df))
    tk_sv_io.write_sv_df_to_bedpe(non_pass_join_df, outs.non_pass_sv_calls)
def join(args, outs, chunk_defs, chunk_outs):
    out_df = None
    for chunk in chunk_outs:
        tmp_df = tk_sv_io.read_sv_bedpe_to_df(chunk.del_candidates)
        out_df = pd.concat([out_df, tmp_df], ignore_index=True)

    out_df['name'] = np.arange(len(out_df))
    tk_sv_io.write_sv_df_to_bedpe(out_df, outs.del_candidates)
def read_bedpes(args):
    sv_df = tk_sv_io.read_sv_bedpe_to_df(args.sv_calls)
    if not args.sv_calls2 is None:
        sv_df = pd.concat(
            [sv_df, tk_sv_io.read_sv_bedpe_to_df(args.sv_calls2)],
            ignore_index=True)
    sv_df['name'] = np.arange(len(sv_df))
    return sv_df
def make_output_dataframes(bcs_frags_in):
    fragments = []
    bcs = []
    bc_dfs = []
    fragment_dfs = []

    for (bc_stats, frags) in bcs_frags_in:
        # Denormalize selected bc columns into the fragments dataframe
        for (k, v) in bc_stats.items():
            if k in ['bc', 'bc_num_reads', 'bc_mean_reads_per_fragment',
                     'bc_est_len', 'bc_num_unmapped_reads']:
                for frag in frags:
                    frag[k] = v

        fragments.extend(frags)
        bcs.append(bc_stats)

        if len(fragments) > 2e6:
            (frag_df, bc_df) = make_df_chunk(fragments, bcs)
            fragment_dfs.append(frag_df)
            fragments = []
            bc_dfs.append(bc_df)
            bcs = []

    (frag_df, bc_df) = make_df_chunk(fragments, bcs)
    fragment_dfs.append(frag_df)
    bc_dfs.append(bc_df)

    frag_dfs = [x for x in fragment_dfs if x is not None]
    bc_dfs = [x for x in bc_dfs if x is not None]

    if len(bc_dfs) > 0:
        frag_df = p.concat(frag_dfs)
        bc_df = p.concat(bc_dfs)
    else:
        frag_df = None
        bc_df = None

    return (frag_df, bc_df)
def join(args, outs, chunk_defs, chunk_outs):
    join_df = None
    for chunk in chunk_outs:
        bedpe_df = tk_sv_io.read_sv_bedpe_to_df(chunk.sv_variants)
        join_df = pd.concat([join_df, bedpe_df], ignore_index=True)

    if not args.best_only:
        join_df['name'] = np.arange(len(join_df))
    tk_sv_io.write_sv_df_to_bedpe(join_df, outs.sv_variants)
def main(args, outs):
    sv_df = tk_sv_io.read_sv_bedpe_to_df(args.sv_variants)
    sv_df["info2"] = "SV"
    cnv_df = tk_sv_io.read_sv_bedpe_to_df(args.cnv_variants)
    cnv_df["info2"] = "CNV"
    sv_df = pd.concat([sv_df, cnv_df], ignore_index=True)
    sv_df['name'] = np.arange(len(sv_df))
    sv_df.sort(['chrom1', 'chrom2'], inplace=True)

    res_df = None
    for _, tmp_df in sv_df.groupby(['chrom1', 'chrom2']):
        tmp_df.sort(['chrom1', 'start1', 'stop1', 'chrom2', 'start2', 'stop2'], inplace=True)
        # cluster the loci in the group based on proximity
        groups = tk_sv_utils.get_break_groups(tmp_df, args.max_dist)

        # for each cluster, get the row with max qual
        # tmp_df.loc[g] gets the subset of tmp_df in the cluster.
        # then idxmax gets the max index
        out_df = pd.DataFrame(columns=sv_df.columns)
        idx = 0
        for g in groups:
            row = tmp_df.loc[tmp_df.loc[g]['qual'].idxmax()]
            if (tmp_df.loc[g]['info2'] == 'SV').any():
                row = tmp_df.loc[(tmp_df.loc[g]['info2'] == 'SV').idxmax()]
            source = list(set(tmp_df.loc[g]['info2']))
            row['info'] += (";SOURCE=" + ",".join(source))
            out_df.loc[idx] = row
            idx += 1

        out_df.sort(['start1', 'stop1', 'start2', 'stop2'], inplace=True)
        res_df = pd.concat([res_df, out_df], ignore_index=True)

    tk_sv_io.write_sv_df_to_bedpe(res_df, outs.sv_variants)
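# Tiny self-contained illustration (not from the original module) of the
# idxmax pattern used above: Series.idxmax() returns the index label of the
# maximum value, so df.loc[that label] retrieves the whole winning row.
def _example_idxmax_pick():
    df = pd.DataFrame({'qual': [10, 30, 20]}, index=['a', 'b', 'c'])
    return df.loc[df['qual'].idxmax()]  # the row labeled 'b'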
def get_reads(in_bam, chrom, start, stop, in_read_df=None, min_mapq=30,
              max_reads=500000, blacklist_barcodes=None):
    poses = []
    ends = []
    bcs = []

    if not in_read_df is None and len(in_read_df) > 0:
        ret_df = in_read_df.sort('pos')
        old_poses = np.array(ret_df['pos'])
        # Subtracting the read length is roughly right, ideally we should sort
        # by aend.
        # Loci are considered in an ordered fashion, so we should never fetch
        # reads "earlier" in the bam.
        start = max(old_poses[0], max(0, start - MAX_READ_LEN))
        if start >= old_poses[0] and start <= old_poses[-1]:
            start_idx = bisect.bisect_left(old_poses, start)
            if stop >= old_poses[0] and stop <= old_poses[-1]:
                stop_idx = min(len(ret_df), bisect.bisect(old_poses, stop))
            else:
                stop_idx = len(ret_df)
            # Remove all positions that are smaller than the input start
            ret_df = ret_df.iloc[start_idx:stop_idx]
            # Set the new start to the end of the input data frame.
            # Add an overlap of READ_LEN to capture reads that were right on
            # the boundary between the old and new data frame.
            start = max(0, old_poses[stop_idx - 1] - MAX_READ_LEN)
            stop = max(start, stop)
    else:
        ret_df = None

    for i, read in enumerate(in_bam.fetch(str(chrom), int(start), int(stop))):
        if i > max_reads:
            break
        bc = tk_io.get_read_barcode(read)
        if read.pos < start:
            continue
        if not blacklist_barcodes is None and bc in blacklist_barcodes:
            continue
        if not read.is_secondary and not read.is_duplicate and read.is_read1 and \
           not read.is_unmapped and read.mapq >= min_mapq and read.is_proper_pair and \
           not bc is None:
            poses.append(read.pos)
            ends.append(read.aend)
            bcs.append(tk_io.get_read_barcode(read))

    tmp_ret_df = pd.DataFrame({'chrom': chrom, 'pos': poses, 'aend': ends, 'bc': bcs})
    ret_df = pd.concat([ret_df, tmp_ret_df], ignore_index=True)
    ret_df.sort(['bc', 'pos'], inplace=True)
    return ret_df
def merge_calls_and_gt(call_df, gt_df, call_to_gt):
    if not gt_df is None:
        gt_df.index = gt_df['name']
    else:
        call_to_gt = {}

    out_call_df = None
    for _, row in call_df.iterrows():
        sv_type = tk_sv_io.get_sv_type(row.info)
        orient = tk_sv_io.get_break_orientation(row.info)
        row['orient'] = orient
        # revert sv type name from DISTAL to TRANS to match ground truth
        # conventions
        if sv_type == 'DISTAL':
            sv_type = 'TRANS'
        row['sv_type'] = sv_type

        matches = list(call_to_gt.get(row['name'], [None]))
        # One output row per match
        for m in matches:
            row['match'] = m
            if not m is None and not gt_df is None:
                x = gt_df.loc[m]
                row['match_dist'] = max(
                    dist_to_breaks(int((row.start1 + row.stop1) / 2), x.start1, x.stop1),
                    dist_to_breaks(int((row.start2 + row.stop2) / 2), x.start2, x.stop2))
            else:
                row['match_dist'] = float('NaN')
            out_call_df = pd.concat(
                [out_call_df, pd.DataFrame([row])], ignore_index=True)

    if not gt_df is None:
        out_call_df = pd.merge(out_call_df, gt_df, left_on='match',
                               right_on='name', how='outer', suffixes=['', '_gt'])
        out_call_df.drop(['filters_gt', 'dist'], axis=1, inplace=True)

    out_call_df.sort('name', inplace=True)
    return out_call_df
def join(args, outs, chunk_defs, chunk_outs):
    # Combine the coverage hdf5 files
    frame = p.DataFrame()
    list_ = []
    if args.baits_file_map and outs.bait_csv:
        in_files = [out.bait_csv for (cdef, out) in zip(chunk_defs, chunk_outs)]
        for file_ in in_files:
            df = p.read_csv(file_, index_col=None, header=0)
            list_.append(df)
        frame = p.concat(list_)
        # write csv
        frame.to_csv(outs.bait_csv)
    else:
        outs.target_coverage = None
def join(args, outs, chunk_defs, chunk_outs):
    out_bedpe = None
    for c in chunk_outs:
        if not os.path.isfile(c.sv_variants):
            continue
        in_bedpe = sv_io.read_sv_bedpe_to_df(c.sv_variants)
        if not in_bedpe is None:
            out_bedpe = pd.concat([out_bedpe, in_bedpe], ignore_index=True)

    if not out_bedpe is None:
        out_bedpe['name'] = np.arange(len(out_bedpe))
    sv_io.write_sv_df_to_bedpe(out_bedpe, outs.sv_variants)

    if chunk_outs[0] is not None and os.path.exists(chunk_outs[0].summary):
        shutil.copyfile(chunk_outs[0].summary, outs.summary)
    else:
        outs.summary = None
def join(args, outs, chunk_defs, chunk_outs):
    out_loci = []
    summary_df = None
    for chunk in chunk_outs:
        with open(chunk.loci, 'r') as f:
            out_loci.extend(cPickle.load(f))
        summary_df = pd.concat([
            summary_df,
            pd.read_csv(chunk.cov_summary, sep='\t', header=0, index_col=None)
        ], ignore_index=True)

    # There might be some overlapping loci due to the overlap between chunks
    # but we hope that subsequent stages will deal with this.
    with open(outs.loci, 'w') as f:
        cPickle.dump(out_loci, f)

    summary_df.to_csv(outs.cov_summary, sep='\t', header=True, index=False)
def merge_multiple_breaks(in_bedpes, out_bedpe, merge_win=10000, max_range=np.inf):
    assert (len(in_bedpes) > 0)

    in_bedpe_df = None
    for bi, bedpe in enumerate(in_bedpes):
        bedpe_df = tk_sv_io.read_sv_bedpe_to_df(bedpe)
        assert (bedpe_df.shape[1] > 11)
        bedpe_df = bedpe_df.iloc[:, 0:12]
        # Make sure that all names from all files are unique
        bedpe_df['name'] = [str(n) + '_' + str(bi) for n in bedpe_df['name']]
        in_bedpe_df = pd.concat([in_bedpe_df, bedpe_df], ignore_index=True)

    return merge_breaks(in_bedpe_df, out_bedpe, merge_win=merge_win, max_range=max_range)
def join(args, outs, chunk_defs, chunk_outs):
    out_bedpe = None
    for c in chunk_outs:
        if not os.path.isfile(c.sv_variants):
            continue
        in_bedpe = tk_sv_io.read_sv_bedpe_to_df(c.sv_variants)
        if not in_bedpe is None:
            out_bedpe = pd.concat([out_bedpe, in_bedpe], ignore_index=True)

    if out_bedpe is None:
        col_names = ['chrom1', 'start1', 'stop1', 'chrom2', 'start2', 'stop2',
                     'name', 'qual', 'strand1', 'strand2', 'filters', 'info']
        out_bedpe = pd.DataFrame(columns=col_names)

    # Assign the 'name' column (not the DataFrame attribute) so the names are
    # actually written out with the calls.
    out_bedpe['name'] = np.arange(len(out_bedpe))
    out_bedpe = out_bedpe[out_bedpe.qual >= args.sv_min_qv]
    tk_sv_io.write_sv_df_to_bedpe(out_bedpe, outs.sv_variants)
def read_data_frame_filtered(fn, filter_func, query_cols=[], chunk_size=5000000):
    ''' Load a pandas DataFrame from an HDF5 file. If a column list is
    specified, only load the matching columns. filter_func should take a
    DataFrame and return a boolean vector of the rows to keep. Rows are loaded
    from the file and filtered in chunks to keep peak memory usage small. '''
    f = h5py.File(fn, 'r')

    column_names = f.attrs.get("column_names")
    column_names = get_column_intersection(column_names, query_cols)
    column_index = p.Index(column_names)

    sz = f[column_names[0]].shape[0]
    starts = np.arange(0, sz, chunk_size)
    ends = np.minimum(sz, starts + chunk_size)

    chunks = []
    for (start, end) in zip(starts, ends):
        cols = {}
        for name in column_names:
            ds = f[name]
            if has_levels(ds):
                indices = ds[start:end]
                uniques = get_levels(ds)
                col = uniques[indices]
            else:
                col = ds[start:end]
            cols[name] = col
        df = p.DataFrame(cols, columns=column_index)
        df = df[filter_func(df)]
        if len(df) > 0 or len(chunks) == 0:
            chunks.append(df)

    f.close()

    result = p.concat(chunks, ignore_index=True)
    return result
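# Illustrative usage of read_data_frame_filtered (a sketch, not part of the
# original module). The filter function receives each chunk as a DataFrame and
# returns a boolean vector; the 'chrom' and 'pos' columns used here are
# assumptions about the file's contents, not guaranteed by this code.
def _example_filtered_read(fn):
    keep_chr1 = lambda df: df['chrom'] == 'chr1'
    return read_data_frame_filtered(fn, keep_chr1, query_cols=['chrom', 'pos'])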
def merge_overlapping(callsets, selection_fun=select_first()):
    """Merge overlapping calls and remove redundancies based on the specified function.

    - callsets: list of call dataframes
    - selection_fun: given a group of overlapping calls, this function decides
      which of these calls should be output. Default is to pick the first in
      each group.
    """
    # Make sure there are no inter-chromosomal calls
    assert not any(has_inter_chromosomal(callset) for callset in callsets)

    callset = pd.concat(callsets, ignore_index=True)
    # The selection functions might break if this is empty
    if callset.empty:
        return None

    callset.sort(['chrom1', 'start1', 'start2'], inplace=True)
    groups = group_overlapping(callset)
    # agg returns a multilevel dataframe. reset_index removes one level and
    # adds a group column, which we drop.
    return groups.agg(selection_fun).reset_index().drop('group', axis=1)
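# Minimal sketch of calling merge_overlapping (not part of the original
# module). `calls_a` and `calls_b` stand in for BEDPE-style call dataframes
# with chrom1/start1/start2 columns and no inter-chromosomal rows;
# select_first() is the module's own default selector.
def _example_merge_overlapping(calls_a, calls_b):
    merged = merge_overlapping([calls_a, calls_b], selection_fun=select_first())
    # merge_overlapping returns None when the combined callset is empty.
    return merged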
def read_data_frame_indexed(fn, queries, query_cols=[], coords=True):
    ''' Read rows from the HDF5 data frame that match each tabix query in the
    queries list. A tabix query is in the form ('chr1', 100, 200). query_cols
    is a list of columns you want to return. If coords is True, then it will
    return coordinates regardless of query_cols. If coords is False, it will
    only return the columns specified in query_cols.
    Returns a concatenated pandas DataFrame.

    WARNING: If the queries overlap in coordinates, the same region will
    appear more than once. In such cases, use read_data_frame_indexed_no_concat().'''
    dfs = read_data_frame_indexed_no_concat(fn, queries, query_cols, coords)

    if len(dfs) == 1:
        d = dfs[0]
    else:
        # Return the union of the queries
        d = p.concat(dfs)

    d.reset_index(drop=True, inplace=True)
    return d
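# Illustrative tabix-style query against an indexed HDF5 data frame (a sketch,
# not part of the original module). Queries are (chrom, start, stop) tuples;
# the 'pos' column requested here is an assumption about the file's schema.
# Note that overlapping queries duplicate rows, as warned in the docstring.
def _example_indexed_read(fn):
    queries = [('chr1', 100, 200), ('chr2', 5000, 6000)]
    return read_data_frame_indexed(fn, queries, query_cols=['pos'], coords=True)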
def merge_predictions(chunk_outs):
    join_pred_to_match = {}
    join_true_to_match = defaultdict(set)
    join_pred_df = None
    feasible_gt = None

    for ci, chunk in enumerate(chunk_outs):
        with open(re.sub('.json', '.pickle', chunk.summary)) as f:
            pred_to_match = cPickle.load(f)
            true_to_match = cPickle.load(f)
            pred_df, min_qv = cPickle.load(f)

        join_pred_df = pd.concat([join_pred_df, pred_df], ignore_index=True)

        for pred, matches in pred_to_match.iteritems():
            join_pred_to_match[pred] = matches
        for sv, matches in true_to_match.iteritems():
            join_true_to_match[sv] = join_true_to_match[sv].union(matches)

        if ci == 0 and os.path.exists(chunk.feasible_gt):
            feasible_gt = pd.read_csv(chunk.feasible_gt, header=0, index_col=None, sep='\t')

    return (join_pred_to_match, join_true_to_match, join_pred_df, feasible_gt, min_qv)
def join(args, outs, chunk_defs, chunk_outs):
    join_df = None
    read_counts = {}
    read_counts['split'] = defaultdict(int)
    read_counts['pair'] = defaultdict(int)

    for chunk in chunk_outs:
        bedpe_df = tk_sv_io.read_sv_bedpe_to_df(chunk.sv_calls)
        join_df = pd.concat([join_df, bedpe_df], ignore_index=True)

        if not os.path.isfile(chunk.discordant_read_counts):
            continue
        with open(chunk.discordant_read_counts, 'r') as f:
            counts = json.load(f)
        for t, c in counts['split'].iteritems():
            read_counts['split'][t] += c
        for t, c in counts['pair'].iteritems():
            read_counts['pair'][t] += c

    join_df['name'] = [str(i) for i in np.arange(len(join_df))]
    tk_sv_io.write_sv_df_to_bedpe(join_df, outs.sv_calls)

    read_counts['split'] = dict(read_counts['split'])
    read_counts['pair'] = dict(read_counts['pair'])

    with open(args.basic_summary, 'r') as f:
        num_reads = float(json.load(f)['num_reads']) / 2.0

    read_counts['frac_split'] = {}
    read_counts['frac_pair'] = {}
    for t, c in read_counts['split'].iteritems():
        read_counts['frac_split'][t] = c / num_reads
    for t, c in read_counts['pair'].iteritems():
        read_counts['frac_pair'][t] = c / num_reads

    with open(outs.discordant_read_counts, 'w') as f:
        f.write(tenkit.safe_json.safe_jsonify(read_counts))
def main(args, outs):
    sv_df = read_sv_bedpe_to_df(args.gt_variants)
    sv_df = get_dataframe_loc(sv_df, list(range(args.start_idx, args.stop_idx)))

    if not isfile(args.fragments) or not isfile(args.fragment_histogram) or not isfile(args.barcodes) \
       or not isfile(args.barcode_blacklist) or not isfile(args.coverage):
        sv_df['qual'] = 0
        if 'info' in sv_df.columns:
            info_strs = [s for s in sv_df['info']]
        else:
            info_strs = ['.' for i in range(len(sv_df))]
        for i in range(len(info_strs)):
            info_strs[i] = update_info(info_strs[i],
                                       ['BCOV', 'NBCS1', 'NBCS2', 'NOOV'],
                                       [0, 0, 0, 0])
        sv_df['info'] = info_strs
        sv_df['strand1'] = '.'
        sv_df['strand2'] = '.'
        sv_df['filters'] = '.'
        write_sv_df_to_bedpe(sv_df, outs.summary)
        martian.log_info('One or more files needed for computing quality scores are missing.')
        return

    input_bam = tk_bam.create_bam_infile(args.input)
    genome_size = np.sum(np.array(input_bam.lengths))
    input_bam.close()

    frag_file = args.fragments
    frag_hist_file = args.fragment_histogram
    barcode_file = args.barcodes
    barcode_blacklist_file = args.barcode_blacklist

    if args.targets is None:
        target_coverage = None
        corr_factor = 1.0
        min_frag_size = MIN_FRAG_SIZE_WGS
        min_reads_per_frag = MIN_READS_PER_FRAG_WGS
        link_distance = SV_FRAGMENT_LINK_DISTANCE_WGS
    else:
        target_regions = bed_to_region_map(args.targets, merge=True)
        target_coverage = region_cum_coverage_map(target_regions, TARGET_COV_BIN)

        with open(args.coverage, 'r') as f:
            cov_sum = json.load(f)['target_info']
        if 'on_target_bases' in cov_sum:
            prob_off_target = 1 - cov_sum['on_target_bases'] / float(cov_sum['total_bases'])
        else:
            prob_off_target = 0.001
        corr_factor = off_target_amp_corr_factor(target_regions, prob_off_target,
                                                 genome_size=genome_size)
        min_frag_size = MIN_FRAG_SIZE_TARGET
        min_reads_per_frag = MIN_READS_PER_FRAG_TARGET
        link_distance = SV_FRAGMENT_LINK_DISTANCE_TARGET

    frag_sizes, frag_counts, frag_prc, blacklist_barcodes, frag_filter_fun = get_frag_data(
        frag_hist_file, barcode_file, barcode_blacklist_file, nx=args.nx,
        min_frag_size=min_frag_size, min_reads_per_frag=min_reads_per_frag)

    min_sv_len = link_distance
    prob_store = {}

    required_cols = ['bc', 'bc_num_reads', 'bc_est_len', 'bc_mean_reads_per_fragment',
                     'chrom', 'start_pos', 'end_pos', 'num_reads', 'obs_len', 'est_len']
    fragment_reader = tk_hdf5.DataFrameReader(frag_file)

    out_df = None

    for (chrom1, chrom2), g in sv_df.groupby(['chrom1', 'chrom2']):
        query1 = (chrom1, max(0, np.min(g['start1']) - MAX_FRAG_SIZE),
                  np.max(g['stop1']) + MAX_FRAG_SIZE)
        filt = frag_filter_fun(*query1)
        frags1 = filt(fragment_reader.query(query1, query_cols=required_cols,
                                            id_column='fragment_id'))

        query2 = (chrom2, max(0, np.min(g['start2']) - MAX_FRAG_SIZE),
                  np.max(g['stop2']) + MAX_FRAG_SIZE)
        filt = frag_filter_fun(*query2)
        frags2 = filt(fragment_reader.query(query2, query_cols=required_cols,
                                            id_column='fragment_id'))

        lrs = np.zeros((len(g), ), dtype=np.int)
        if 'info' in g.columns:
            info_strs = [s for s in g['info']]
        else:
            info_strs = ['.' for i in range(len(g))]

        for i, (s1, e1, s2, e2) in enumerate(zip(g['start1'], g['stop1'],
                                                 g['start2'], g['stop2'])):
            if chrom1 == chrom2:
                middle = 0.5 * (e1 + s2)
                locus1 = (chrom1, max(0, s1 - args.extend_win), min(middle, e1 + args.extend_win))
                locus2 = (chrom2, max(middle, s2 - args.extend_win), e2 + args.extend_win)
            else:
                locus1 = (chrom1, max(0, s1 - args.extend_win), e1 + args.extend_win)
                locus2 = (chrom2, max(0, s2 - args.extend_win), e2 + args.extend_win)

            tmp_frags1 = overlap_frags(max(0, s1 - args.extend_win), e1 + args.extend_win, frags1)
            tmp_frags2 = overlap_frags(max(0, s2 - args.extend_win), e2 + args.extend_win, frags2)

            (lr, pb, nov, ndiscordant, new_start, new_stop, _) = get_lr_from_frag_df(
                frags1, frags2, locus1, locus2, prob_store, frag_sizes, frag_counts,
                blacklist_barcodes, target_coverage, corr_factor,
                genome_size=genome_size, min_dist=min_sv_len)
            lrs[i] = lr

            if new_start[0] is None or new_start[1] is None:
                new_start = (s1, e1)
            if new_stop[0] is None or new_stop[1] is None:
                new_stop = (s2, e2)

            info_strs[i] = update_info(
                info_strs[i],
                ['BCOV', 'NBCS1', 'NBCS2', 'NOOV', 'P_SV', 'START', 'STOP'],
                [nov, len(set(tmp_frags1.bc)), len(set(tmp_frags2.bc)), ndiscordant, pb,
                 '-'.join([str(p) for p in new_start]),
                 '-'.join([str(p) for p in new_stop])])

        g_cp = g.copy()
        g_cp['qual'] = np.maximum(lrs, 0)
        g_cp['info'] = info_strs
        g_cp['strand1'] = '+'
        g_cp['strand2'] = '+'
        g_cp['filters'] = '.'
        out_df = pd.concat([out_df, g_cp])

    write_sv_df_to_bedpe(out_df, outs.summary)
def join(args, outs, chunk_defs, chunk_outs):
    summary = {}

    # Compute high-level BC summary metrics
    # Load BC data
    if args.barcodes:
        bc_df = tenkit.hdf5.read_data_frame(args.barcodes)
        fragment_df = tenkit.hdf5.read_data_frame(args.fragments,
                                                  query_cols=['bc', 'chrom', 'start_pos'])

        bc_df.sort('bc_num_reads', inplace=True)
        bc_df['cum_reads'] = np.cumsum(bc_df.bc_num_reads)

        # Measure coalescence rate on all BCs that could conceivably be used
        # to call SVs - i.e. ignore BCs that contribute the cumulative bottom 1% of reads
        n99_read_thresh = sum(bc_df.bc_num_reads) * 0.01
        n99_bcs = bc_df[bc_df.cum_reads > n99_read_thresh]

        martian.log_info("number of bcs to screen for coalescence: %d" % len(n99_bcs))
        martian.log_info("subsetting fragments to use")

        if len(n99_bcs) > 1:
            selected_frags = fragment_df[fragment_df.bc.isin(n99_bcs.bc)]
            del fragment_df

            martian.log_info("Doing coalescence calculation")
            coa_calc = coalescence.BcSimilarity(selected_frags, set(n99_bcs.bc), args.input)
            coa_bc_tbl = coa_calc.coalescence_analysis()

            # Also add barcodes that are extreme outliers in the number of fragments observed
            med_frags_per_bc = n99_bcs.bc_num_fragments.median()
            high_quantile = n99_bcs.bc_num_fragments.quantile(0.98)
            bc_num_fragments_threshold = max(med_frags_per_bc * 5.0, high_quantile)

            med_reads_per_bc = n99_bcs.bc_num_reads.median()
            high_quantile = n99_bcs.bc_num_reads.quantile(0.98)
            bc_num_reads_threshold = max(med_reads_per_bc * 5.0, high_quantile)

            overloaded_bcs = n99_bcs[(n99_bcs.bc_num_fragments > bc_num_fragments_threshold) |
                                     (n99_bcs.bc_num_reads > bc_num_reads_threshold)]
            summary['fract_bcs_overloaded'] = float(len(overloaded_bcs)) / len(n99_bcs)

            # Remove bcs that are already in the blacklist
            nr_overloaded_bcs = overloaded_bcs[~overloaded_bcs.bc.isin(coa_bc_tbl.bc)]

            # Add overloaded bcs to blacklist
            overloaded_bc_tbl = p.DataFrame({'bc': nr_overloaded_bcs.bc,
                                             'cluster_id': -1,
                                             'cluster_size': -1})

            # Write barcode blacklist
            bad_bc_tbl = p.concat([coa_bc_tbl, overloaded_bc_tbl])
            bad_bc_tbl.to_csv(outs.barcode_blacklist, sep="\t", index=False)

            # Compute coalescence stats
            summary['fract_bcs_in_clusters_all'] = float(len(coa_bc_tbl)) / len(n99_bcs)
            summary['fract_bcs_in_clusters_eq_2'] = float((coa_bc_tbl.cluster_size == 2).sum()) / len(n99_bcs)
            summary['fract_bcs_in_clusters_gt_2'] = float((coa_bc_tbl.cluster_size > 2).sum()) / len(n99_bcs)
            summary['num_clusters_gt_8'] = (coa_bc_tbl.cluster_size > 8).sum()

            # Compute stats ignoring clusters of Hamming distance 2
            hd2_clusters = []
            for cluster in coa_bc_tbl.groupby('cluster_id'):
                if all_within_hamming_distance(cluster[1].bc.values, 2):
                    hd2_clusters.append(cluster[0])

            coa_tbl_no_hd2 = coa_bc_tbl[~coa_bc_tbl.cluster_id.isin(hd2_clusters)]
            summary['fract_bcs_in_clusters_all_no_hd2'] = float(len(coa_tbl_no_hd2)) / len(n99_bcs)
            summary['fract_bcs_in_clusters_eq_2_no_hd2'] = float((coa_tbl_no_hd2.cluster_size == 2).sum()) / len(n99_bcs)
            summary['fract_bcs_in_clusters_gt_2_no_hd2'] = float((coa_tbl_no_hd2.cluster_size > 2).sum()) / len(n99_bcs)
        else:
            empty_df = p.DataFrame({'bc': [], 'cluster_id': [], 'cluster_size': []})
            empty_df.to_csv(outs.barcode_blacklist, sep="\t", index=False)

            # null coalescence stats
            summary['fract_bcs_overloaded'] = None
            summary['fract_bcs_in_clusters_all'] = None
            summary['fract_bcs_in_clusters_eq_2'] = None
            summary['fract_bcs_in_clusters_gt_2'] = None
            summary['num_clusters_gt_8'] = None
            summary['fract_bcs_in_clusters_all_no_hd2'] = None
            summary['fract_bcs_in_clusters_eq_2_no_hd2'] = None
            summary['fract_bcs_in_clusters_gt_2_no_hd2'] = None
    else:
        outs.barcode_blacklist = None

        summary['fract_bcs_overloaded'] = None
        summary['fract_bcs_in_clusters_all'] = None
        summary['fract_bcs_in_clusters_eq_2'] = None
        summary['fract_bcs_in_clusters_gt_2'] = None
        summary['num_clusters_gt_8'] = None
        summary['fract_bcs_in_clusters_all_no_hd2'] = None
        summary['fract_bcs_in_clusters_eq_2_no_hd2'] = None
        summary['fract_bcs_in_clusters_gt_2_no_hd2'] = None

    # Write summary to json
    with open(outs.filter_barcodes_results, 'w') as results_file:
        tenkit.safe_json.dump_numpy(summary, results_file, pretty=True)
def join(args, outs, chunk_defs, chunk_outs):
    out_bedpe = None
    for c in chunk_outs:
        in_bedpe = read_sv_bedpe_to_df(c.summary)
        out_bedpe = pd.concat([out_bedpe, in_bedpe], ignore_index=True)
    write_sv_df_to_bedpe(out_bedpe, outs.summary)
def call_distal(sv_model, c1, s1, e1, c2, s2, e2, in_bam, frag_phasing,
                blacklist_barcodes, args):
    """Evaluate a pair of loci (c1, s1, e1), (c2, s2, e2) for distal SV calls.

    Return value: a list of SvCall objects.
    """
    # Set the random seed so if we run this for the same locus multiple times we
    # always get the same answer.
    seed(1)

    # Fetch reads from each of the two loci
    reads1 = get_reads(in_bam, c1, max(0, s1 - FRAG_EXTEND), e1 + FRAG_EXTEND,
                       min_mapq=args.min_mapq, max_reads=MAX_READS_TO_READ,
                       blacklist_barcodes=blacklist_barcodes)
    reads1 = reads1.groupby('bc').filter(lambda x: len(x) <= MAX_READS_PER_FRAG and \
                                         len(x) >= MIN_READS_PER_FRAG / 2 and \
                                         np.max(x.pos) - np.min(x.pos) > MIN_FRAG_SIZE / 2)

    reads2 = get_reads(in_bam, c2, max(0, s2 - FRAG_EXTEND), e2 + FRAG_EXTEND,
                       min_mapq=args.min_mapq, max_reads=MAX_READS_TO_READ,
                       blacklist_barcodes=blacklist_barcodes)
    reads2 = reads2.groupby('bc').filter(lambda x: len(x) <= MAX_READS_PER_FRAG and \
                                         len(x) >= MIN_READS_PER_FRAG / 2 and \
                                         np.max(x.pos) - np.min(x.pos) > MIN_FRAG_SIZE / 2)

    common_bcs = (set(reads1.bc).intersection(set(reads2.bc)))
    if len(common_bcs) < args.min_bcs:
        return None

    common_reads1 = reads1[np.array([bc in common_bcs for bc in reads1.bc], dtype=np.bool)]
    common_reads2 = reads2[np.array([bc in common_bcs for bc in reads2.bc], dtype=np.bool)]

    # Split each of the loci into small windows and count how many
    # molecules end or start in each region of the resulting grid.
    ticks1 = np.arange(np.min(common_reads1.pos), np.max(common_reads1.pos), args.grid_len)
    ticks2 = np.arange(np.min(common_reads2.pos), np.max(common_reads2.pos), args.grid_len)
    tick_counts = np.zeros((len(ticks1), len(ticks2)), dtype=np.int)
    nticks = len(ticks1) * len(ticks2)
    # barcode x cell, 1 if there is barcode overlap in the cell for that barcode
    bc_tick_counts = np.zeros((len(common_bcs), nticks), dtype=np.bool)

    common_reads1 = common_reads1.groupby('bc')
    common_reads2 = common_reads2.groupby('bc')
    common_bcs = list(common_bcs)

    for i, bc in enumerate(common_bcs):
        r1 = common_reads1.get_group(bc)
        r2 = common_reads2.get_group(bc)
        x = (np.array([np.min(r1.pos), np.max(r1.pos)], dtype=np.int) - ticks1[0]) / args.grid_len
        y = (np.array([np.min(r2.pos), np.max(r2.pos)], dtype=np.int) - ticks2[0]) / args.grid_len
        x = x[np.logical_and(x >= 0, x < len(ticks1))]
        y = y[np.logical_and(y >= 0, y < len(ticks2))]
        vals = list(set(product(x, y)))
        a = np.array([v[0] for v in vals], dtype=np.int)
        b = np.array([v[1] for v in vals], dtype=np.int)
        tick_counts[a, b] += 1
        bc_tick_counts[i, np.ravel_multi_index((a, b), (len(ticks1), len(ticks2)))] = True

    rounds = 0
    final_res = []

    ps1 = sv_utils.get_phase_set(frag_phasing, c1, s1, e1)
    ps2 = sv_utils.get_phase_set(frag_phasing, c2, s2, e2)

    # Get the cell of the grid with the maximum barcode overlap, get the
    # barcodes/molecules involved, figure out the direction of the signal,
    # get a better estimate of the breakpoints, and compute the SV score at
    # these breakpoints. Then, subtract the contribution of the involved
    # barcodes from the barcode overlap grid and repeat.
    # In practice, we only do it once.
    while np.max(tick_counts) > 2 and len(common_reads1) > 2 and len(common_reads2) > 2 and rounds < 1:
        rounds += 1

        # Get region of maximum barcode overlap. m is flat index
        m = np.argmax(tick_counts)
        x, y = np.unravel_index(m, (len(ticks1), len(ticks2)))
        new_start1 = (x * args.grid_len + ticks1[0])
        new_start2 = (y * args.grid_len + ticks2[0])

        # Get the fragments overlapping the region of max barcode overlap
        common_reads1 = common_reads1.filter(
            lambda x: not (np.max(x.pos) < new_start1 - 5000 or np.min(x.pos) > new_start1 + 5000))
        common_reads2 = common_reads2.filter(
            lambda x: not (np.max(x.pos) < new_start2 - 5000 or np.min(x.pos) > new_start2 + 5000))

        # Get fragments with common barcodes that overlap the selected cell
        # This is a superset of np.where(bc_tick_counts[:, m])[0] because it doesn't require that
        # the fragment starts or ends within the cell.
        new_common_bcs = set(common_reads1.bc).intersection(set(common_reads2.bc))

        # Infer orientation based on ends of fragments
        sv_types, cand_breaks1, cand_breaks2 = get_orient(common_reads1.groupby('bc'),
                                                          common_reads2.groupby('bc'),
                                                          new_start1, new_start2, new_common_bcs,
                                                          max_cand_breaks=args.max_cand_breaks)

        if len(cand_breaks1) == 0 or len(cand_breaks2) == 0 or len(sv_types) == 0:
            return None

        svt = sv_types[0]

        # Use the inferred orientation of the signal to remove barcodes that are
        # clearly irrelevant.
        tmp_read_groups1 = reads1.groupby('bc').filter(filter_irrelevant_frags1(svt, cand_breaks1))
        tmp_read_groups1['break'] = 1
        tmp_read_groups2 = reads2.groupby('bc').filter(filter_irrelevant_frags2(svt, cand_breaks2))
        tmp_read_groups2['break'] = 2
        read_groups = pd.concat([tmp_read_groups1, tmp_read_groups2], ignore_index=True)

        if len(read_groups) == 0:
            return None

        sel_bcs = set(read_groups.bc)
        if len(sel_bcs) > MAX_READ_GROUPS:
            sel_bcs = list(sel_bcs)
            sel_bcs = set([sel_bcs[i] for i in choice(len(sel_bcs), MAX_READ_GROUPS, replace=False)])
            read_groups = read_groups[[_b in sel_bcs for _b in read_groups.bc]]

        read_groups.sort('bc', inplace=True)
        bc_set = set(read_groups.bc)
        read_groups = read_groups.groupby('bc')

        bc_phase_set_dict1 = sv_utils.get_barcode_phase_probs(frag_phasing, c1, cand_breaks1[0],
                                                              cand_breaks1[-1], bc_set, in_ps=ps1)
        bc_phase_set_dict2 = sv_utils.get_barcode_phase_probs(frag_phasing, c2, cand_breaks2[0],
                                                              cand_breaks2[-1], bc_set, in_ps=ps2)

        loci_pairs = [(_a, _b) for _a, _b in product(cand_breaks1, cand_breaks2)]

        read_group_dict = {}
        for bc, read_group in read_groups:
            read_group_dict[bc] = tk_sv_read_model.ReadInfo(read_group.chrom, read_group.pos,
                                                            group_ids=read_group['break'])

        res = sv_model.em_it_away(loci_pairs, read_group_dict, ps1, ps2,
                                  bc_phase_set_dict1, bc_phase_set_dict2,
                                  em_iters=5, proximal=False)
        (max_lrs, max_locus, sv_type, zygosity, max_hap, support, posterior_probs) = res

        sv_call = tk_sv_call.SvCall.from_em_results(c1, c2, ps1, ps2, max_lrs, max_locus,
                                                    sv_type, zygosity, max_hap, support,
                                                    posterior_probs)
        final_res.append(sv_call)

    return final_res
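# Hedged usage sketch for call_distal (not part of the original module).
# `model`, `bam`, `phasing`, `blacklist` and `args` stand in for objects the
# surrounding pipeline builds elsewhere; only the call and return shape are
# illustrated here.
def _example_call_distal(model, bam, phasing, blacklist, args):
    calls = call_distal(model, 'chr1', 1000000, 1010000, 'chr5', 2000000, 2010000,
                        bam, phasing, blacklist, args)
    # call_distal returns None when the two loci share too few barcodes;
    # otherwise it returns a (possibly empty) list of SvCall objects.
    return [] if calls is None else calls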
def join(args, outs, chunk_defs, chunk_outs):
    bam_in = tk_bam.create_bam_infile(args.input)

    # Combine fragment h5 files
    in_files = [out.fragments for out in chunk_outs
                if out.fragments and os.path.exists(out.fragments)]

    nfrags = 0
    chrom_partition = partition_chroms(bam_in.references, bam_in.lengths)

    if len(in_files) > 0:
        readers = [tenkit.hdf5.DataFrameReader(f) for f in in_files]

        for chrom_set in chrom_partition:
            chunks = [r.query_chroms(chrom_set) for r in readers]
            chunk = p.concat(chunks)
            chunk.sort(['chrom', 'start_pos', 'bc'], inplace=True)

            # Always save the BC as categorical
            chunk['bc'] = chunk['bc'].astype('category')
            chunk['molecule_id'] = np.arange(len(chunk), dtype=np.int32) + nfrags
            nfrags += len(chunk)
            tenkit.hdf5.append_data_frame(outs.fragments, chunk)

        for r in readers:
            r.close()

        tenkit.hdf5.create_tabix_index(outs.fragments, 'chrom', 'start_pos', 'end_pos')
    else:
        outs.fragments = None

    # Combine BC h5 files
    in_files = [out.barcodes for out in chunk_outs
                if out.barcodes and os.path.exists(out.barcodes)]
    if len(in_files) > 0:
        tenkit.hdf5.combine_data_frame_files(outs.barcodes, in_files)
    else:
        outs.barcodes = None

    summary = {}

    # Compute high-level BC summary metrics
    # Load BC data
    if outs.barcodes:
        bc_df = tenkit.hdf5.read_data_frame(outs.barcodes)
        fragment_df = tenkit.hdf5.read_data_frame(
            outs.fragments,
            query_cols=['bc', 'num_reads', 'est_len', 'chrom', 'start_pos'])

        bc_df.sort('bc_num_reads', inplace=True)

        # bin the bc counts and write a json histogram file
        n_reads = bc_df.bc_num_reads.values
        max_val = np.percentile(n_reads, 99.99) * 1.3
        min_val = n_reads.min()
        num_bins = 400
        step = math.ceil((max_val - min_val) / num_bins)
        bins = np.arange(min_val, max_val, step)
        (hist, edges) = np.histogram(n_reads, bins=bins)
        bc_count_hist = {int(edges[i]): hist[i] for i in range(len(bins) - 1)}

        # Summarize properties of n50 and n90 BC set
        bc_df['cum_reads'] = np.cumsum(bc_df.bc_num_reads)

        n50_read_thresh = sum(bc_df.bc_num_reads) * 0.5
        n50_bcs = bc_df[bc_df.cum_reads > n50_read_thresh]
        n50_fra = fragment_df[fragment_df.bc.isin(n50_bcs.bc)]
        n50_stats = high_level_stats("n50", n50_fra, n50_bcs)
        del n50_fra

        n90_read_thresh = sum(bc_df.bc_num_reads) * 0.1
        n90_bcs = bc_df[bc_df.cum_reads > n90_read_thresh]
        n90_fra = fragment_df[fragment_df.bc.isin(n90_bcs.bc)]
        n90_stats = high_level_stats("n90", n90_fra, n90_bcs)
        del n90_fra

        for (k, v) in n50_stats.iteritems():
            summary[k] = v
        for (k, v) in n90_stats.iteritems():
            summary[k] = v

        # Generate a fragment length histogram
        fragment_df['len_bin'] = np.floor_divide(
            fragment_df.est_len.values,
            FRAG_LEN_HIST_BIN_SIZE).astype(int) * FRAG_LEN_HIST_BIN_SIZE

        multi_read_frags = fragment_df[fragment_df.num_reads > 1]
        len_bins = multi_read_frags.groupby(['len_bin']).apply(len)
        del multi_read_frags

        len_hist = {k: v for (k, v) in len_bins.iteritems()}

        # Write fragment length hist to json
        with open(outs.fragment_size, 'w') as fragment_size_file:
            tenkit.safe_json.dump_numpy(len_hist, fragment_size_file)

        # Estimate total DNA per partition by looking at hottest 1000 GEMs
        # or GEMs w/ bc_mean_reads_per_fragment > 2, whichever is fewer
        hot_bcs = bc_df[np.logical_and(bc_df.bc_mean_reads_per_fragment > 2.0,
                                       bc_df.bc_num_reads > 25)]
        hot_bcs.sort('bc_mean_reads_per_fragment', inplace=True)
        if len(hot_bcs) > 50:
            hot_bcs = hot_bcs[-NUM_BCS_LOADING_ESTIMATE:]
            summary['estimated_dna_per_partition'] = round(
                scipy.stats.tmean(hot_bcs.bc_est_len,
                                  scipy.percentile(hot_bcs.bc_est_len, (1, 99))))
        else:
            summary['estimated_dna_per_partition'] = None

        # Read-based effective diversity
        reads = bc_df.bc_num_reads.values
        sum_sq = (reads**2.0).sum()
        effective_diversity = tk_stats.robust_divide((reads.sum()**2.0), float(sum_sq))
        summary['effective_diversity_reads'] = effective_diversity

        # Fragment-based effective diversity
        fragments = bc_df.bc_num_fragments.values
        sum_sq = (fragments**2.0).sum()
        effective_diversity = tk_stats.robust_divide((fragments.sum()**2.0), float(sum_sq))
        summary['effective_diversity_fragments'] = effective_diversity
    else:
        # No fragment_size file emitted
        outs.fragment_size = None

        n50_stats = high_level_stats("n50", None, None)
        n90_stats = high_level_stats("n90", None, None)

        for (k, v) in n50_stats.iteritems():
            summary[k] = v
        for (k, v) in n90_stats.iteritems():
            summary[k] = v

        bc_count_hist = {}

        summary['estimated_dna_per_partition'] = None
        summary['effective_diversity_reads'] = None
        summary['effective_diversity_fragments'] = None

    with open(outs.barcode_histogram, 'w') as barcode_hist_file:
        tenkit.safe_json.dump_numpy(bc_count_hist, barcode_hist_file)

    # Write summary to json
    with open(outs.single_partition, 'w') as summary_file:
        tenkit.safe_json.dump_numpy(summary, summary_file, pretty=True)
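# Small illustration (not from the original module) of the effective
# diversity statistic used above: it is the inverse Simpson index
# (sum x)^2 / sum(x^2), which equals the number of barcodes when reads are
# spread evenly across them and shrinks toward 1 as a few barcodes dominate.
def _example_effective_diversity():
    even = np.array([100.0, 100.0, 100.0, 100.0])
    even_div = (even.sum() ** 2.0) / (even ** 2.0).sum()        # == 4.0
    skewed = np.array([370.0, 10.0, 10.0, 10.0])
    skewed_div = (skewed.sum() ** 2.0) / (skewed ** 2.0).sum()  # ~= 1.17
    return even_div, skewed_div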
def main(args, outs):
    pred_df = tk_sv_io.read_sv_bedpe_to_df(args.sv_calls)
    pred_df = tk_sv_utils.get_dataframe_loc(
        pred_df, list(range(args.start_idx, args.stop_idx)))

    in_bam = tk_bam.create_bam_infile(args.possorted_bam)
    cov_reader = tk_hdf5.DataFrameReader(args.hap_coverage)
    sel_cols = ['cov_q30_hap0', 'cov_q30_hap1', 'cov_q30_hap2']

    has_pileups = np.zeros((len(pred_df), ), dtype=np.bool)

    for i, (_, row) in enumerate(pred_df.iterrows()):
        has_clipped1 = has_too_many_clipped(
            in_bam, row.chrom1, max(0, row.start1 - BREAK_EXT), row.stop1 + BREAK_EXT,
            max_clipped_frac=args.max_clipped_frac)
        has_clipped2 = has_too_many_clipped(
            in_bam, row.chrom2, max(0, row.start2 - BREAK_EXT), row.stop2 + BREAK_EXT,
            max_clipped_frac=args.max_clipped_frac)
        has_clipped = has_clipped1 and has_clipped2

        if row.chrom1 != row.chrom2 or row.start2 - row.stop1 > MAX_FRAG_SIZE:
            has_pileups[i] = has_clipped
            continue

        cov = cov_reader.query(
            (row.chrom1, max(0, row.start1 - BREAK_EXT), row.stop2 + BREAK_EXT))
        cov['bin'] = np.array(cov['pos'] / BIN_WIN, dtype=np.int)
        if not 'coverage_deduped' in cov.columns:
            cov['coverage_deduped'] = cov[sel_cols].sum(axis=1)

        cov_arr = np.array(cov.groupby('bin').mean()['coverage_deduped'])
        median_cov = np.median(cov_arr)

        # Rescue for deletions or duplications with breakpoints on the pileups
        sv_len = row.stop2 - row.start1
        side_cov = cov_reader.query(
            (row.chrom1, max(0, row.start1 - BREAK_EXT - sv_len / 2), row.start1 - BREAK_EXT))
        side_cov = pd.concat([
            side_cov,
            cov_reader.query((row.chrom2, row.stop2 + BREAK_EXT,
                              row.stop2 + BREAK_EXT + sv_len / 2))
        ], ignore_index=True)
        if not 'coverage_deduped' in side_cov.columns:
            side_cov['coverage_deduped'] = side_cov[sel_cols].sum(axis=1)

        # Ignore pileups, enough evidence for a large-scale copy number variant
        if np.median(cov.coverage_deduped) < DEL_REL_COV * np.median(side_cov.coverage_deduped):
            continue
        if np.median(cov.coverage_deduped) > DUP_REL_COV * np.median(side_cov.coverage_deduped):
            continue

        # Filter out the call if there are pileups very close to the breakpoints
        has_pileups[i] = len(cov_arr) > 4 and np.any(
            cov_arr[[0, 1, -2, -1]] > args.min_rel_depth * median_cov)
        has_pileups[i] = has_pileups[i] or has_clipped

    pileups = pred_df[has_pileups]
    pred_df = pred_df[np.logical_not(has_pileups)]
    tk_sv_io.write_sv_df_to_bedpe(pred_df, outs.sv_calls)
    tk_sv_io.write_sv_df_to_bedpe(pileups, outs.pileups)
def main(args, outs):
    reader = tk_hdf5.DataFrameReader(args.hap_coverage)
    sel_cols = ['cov_q30_hap0', 'cov_q30_hap1', 'cov_q30_hap2']
    ext_cols = list(sel_cols)
    ext_cols.append('total_cov')

    out_loci = []
    summary_df = None

    for (chrom, start, stop) in (tk_io.get_locus_info(l) for l in args.loci):
        cov = reader.query((chrom, start, stop))
        cov['bin'] = np.array(cov['pos'] / args.bin_size, dtype=np.int)
        cov['total_cov'] = cov[sel_cols].sum(axis=1)
        mean_cov = np.mean(cov['total_cov'])

        summary_df = pd.concat([
            summary_df,
            pd.DataFrame({'chrom': chrom, 'start': start, 'stop': stop,
                          'mean_cov': mean_cov}, index=[0])
        ], ignore_index=True)

        # Remove very small phase sets. These tend to be single-SNP phase sets
        # and can result from erroneous SNPs.
        cov = cov.groupby('phase_set').filter(lambda x: len(x) > 1000)

        sum_df = cov.groupby(['bin', 'phase_set'])[ext_cols].mean().reset_index()
        sum_df['low'] = sum_df.total_cov < 0.8 * mean_cov
        sum_df['low_hap0'] = np.logical_and(
            sum_df.total_cov < mean_cov,
            sum_df.cov_q30_hap0 < 0.8 * sum_df.cov_q30_hap1)
        sum_df['low_hap1'] = np.logical_and(
            sum_df.total_cov < mean_cov,
            sum_df.cov_q30_hap1 < 0.8 * sum_df.cov_q30_hap0)

        if not sum_df.empty:
            any_low = np.logical_or(sum_df.low,
                                    np.logical_or(sum_df.low_hap1, sum_df.low_hap0))
            bins = np.array(sum_df['bin'])
            bins = np.concatenate([bins, [np.max(bins) + 1]])

            pos = 0
            # Get runs of 0s and 1s in any_low
            for bit, group in groupby(any_low):
                group_size = len(list(group))
                group_start = bins[pos] * args.bin_size
                group_stop = bins[pos + group_size] * args.bin_size
                region_len = group_stop - group_start
                if bit and region_len >= args.min_len:
                    out_loci.append((chrom, max(0, group_start - args.bin_size),
                                     group_start + args.bin_size,
                                     chrom, max(0, group_stop - args.bin_size),
                                     group_stop + args.bin_size))
                pos += group_size

    with open(outs.loci, 'w') as f:
        cPickle.dump(out_loci, f)

    summary_df.to_csv(outs.cov_summary, sep='\t', header=True, index=False)
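# Small self-contained illustration (not part of the original module) of the
# run-length grouping used above: itertools.groupby collapses consecutive
# equal values, so each (bit, group) pair is one run of low/normal bins.
def _example_run_lengths():
    from itertools import groupby
    any_low = [False, True, True, True, False, True]
    runs = [(bit, len(list(group))) for bit, group in groupby(any_low)]
    # runs == [(False, 1), (True, 3), (False, 1), (True, 1)]
    return runs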