def split(args):
    """Rename the calls, persist them, and carve them into index-range chunks.

    The calls in ``args.variants`` are renamed ``call_<row number>`` and
    written to ``variants.bedpe`` in the current working directory.  The
    ground truth in ``args.gt_variants`` is loaded only so that it can be
    handed to ``tk_sv_io.check_sv_names``.

    Returns:
        A dict with the chunk definitions under ``'chunks'`` and a memory
        request for the join under ``'join'``.  Every chunk carries the path
        of the renamed BEDPE plus a ``start_idx``/``stop_idx`` row range.
        One chunk (with an empty ``0..0`` range) is returned even when there
        are no calls.
    """
    calls = tk_sv_io.read_sv_bedpe_to_df(args.variants)
    truth = tk_sv_io.read_sv_bedpe_to_df(args.gt_variants)
    tk_sv_io.check_sv_names(truth)

    calls["name"] = ["call_%d" % row_num for row_num in range(len(calls))]
    renamed_path = os.path.join(os.getcwd(), "variants.bedpe")
    tk_sv_io.write_sv_df_to_bedpe(calls, renamed_path)

    total = calls.shape[0]
    # avoid overchunking: at least 100 calls per chunk, at most 32 chunks
    per_chunk = max(100, int(np.ceil(total / 32.0)))
    num_chunks = int(np.ceil(total / float(per_chunk)))

    def make_chunk(first, last):
        return {
            'renamed_variants': renamed_path,
            'start_idx': first,
            'stop_idx': last,
            '__mem_gb': 12
        }

    chunks = [
        make_chunk(k * per_chunk, min(total, (k + 1) * per_chunk))
        for k in range(num_chunks)
    ]
    if not chunks:
        chunks = [make_chunk(0, 0)]
    return {'chunks': chunks, 'join': {'__mem_gb': 16}}
def read_bedpes(args):
    """Load ``args.sv_calls`` (plus ``args.sv_calls2`` when given) as one frame.

    When a second file is supplied the two tables are stacked with a fresh
    integer index.  In both cases the ``name`` column is overwritten with
    consecutive integers starting at 0.
    """
    frames = [tk_sv_io.read_sv_bedpe_to_df(args.sv_calls)]
    if args.sv_calls2 is not None:
        frames.append(tk_sv_io.read_sv_bedpe_to_df(args.sv_calls2))

    if len(frames) > 1:
        combined = pd.concat(frames, ignore_index=True)
    else:
        combined = frames[0]

    combined['name'] = np.arange(len(combined))
    return combined
def join(args, outs, chunk_defs, chunk_outs):
    """Concatenate per-chunk calls and pileups into the stage outputs.

    A chunk is skipped entirely (both its calls and its pileups) when its
    ``sv_calls`` path is not an existing file.
    """
    kept_calls = None
    kept_pileups = None

    present = (chunk for chunk in chunk_outs if os.path.isfile(chunk.sv_calls))
    for chunk in present:
        chunk_calls = tk_sv_io.read_sv_bedpe_to_df(chunk.sv_calls)
        chunk_pileups = tk_sv_io.read_sv_bedpe_to_df(chunk.pileups)
        kept_calls = pd.concat([kept_calls, chunk_calls], ignore_index=True)
        kept_pileups = pd.concat([kept_pileups, chunk_pileups],
                                 ignore_index=True)

    tk_sv_io.write_sv_df_to_bedpe(kept_calls, outs.sv_calls)
    tk_sv_io.write_sv_df_to_bedpe(kept_pileups, outs.pileups)
def prepare_loci(args):
    """Merge and sort input lists of candidate loci.

    Candidates are gathered from up to three optional sources, in this order:

    * ``args.overlap_loci``: a pickle holding a sequence of loci based on
      barcode overlaps.  The first six fields of each entry are kept and the
      SV type is ``None`` (unknown).
    * ``args.low_depth_loci``: a BEDPE of low depth loci.  These carry the
      type ``'DEL'``.
    * ``args.rp_calls``: a BEDPE of read-pair calls.  The type is a
      one-element list when it is DEL, INV or DUP, and ``None`` otherwise.

    Returns:
        A tuple ``(sorted_loci, input_names)``.  ``sorted_loci`` is the list
        of 7-tuples ``(chrom1, start1, stop1, chrom2, start2, stop2,
        sv_type)`` sorted by the six coordinate fields.  ``input_names``
        holds the BEDPE names in that same order when ``args.rp_calls`` is
        the only source, and is ``None`` otherwise.
    """
    candidates = []
    rp_calls = None

    # Loci based on barcode overlaps. Type of SV is unknown.
    if args.overlap_loci is not None:
        # NOTE(review): unpickling can execute arbitrary code; this is only
        # safe if the file comes from a trusted upstream stage -- confirm.
        with open(args.overlap_loci, 'rb') as f:
            loci = cPickle.load(f)
        candidates.extend(
            [(x[0], x[1], x[2], x[3], x[4], x[5], None) for x in loci])

    # Low depth loci. These will only be evaluated for deletions.
    # NOTE(review): the type here is the bare string 'DEL' while read-pair
    # loci below carry a list; verify that consumers accept both forms.
    if args.low_depth_loci is not None:
        del_calls = tk_sv_io.read_sv_bedpe_to_df(args.low_depth_loci)
        for _, row in del_calls.iterrows():
            candidates.append((row.chrom1, row.start1, row.stop1,
                               row.chrom2, row.start2, row.stop2, 'DEL'))

    # Loci based on read-pair support. These will only be evaluated for the
    # type of SV supported by the readpairs.
    if args.rp_calls is not None:
        rp_calls = tk_sv_io.read_sv_bedpe_to_df(args.rp_calls)
        for _, row in rp_calls.iterrows():
            sv_type = tk_sv_io.get_sv_type(row.info)
            sv_type = [sv_type] if sv_type in ['DEL', 'INV', 'DUP'] else None
            candidates.append((row.chrom1, row.start1, row.stop1,
                               row.chrom2, row.start2, row.stop2, sv_type))

    # Sort once, by position, keeping the permutation.  sorted() is stable,
    # so applying the permutation gives exactly the list that sorting the
    # loci themselves with the same key would give.
    order = sorted(range(len(candidates)),
                   key=lambda idx: candidates[idx][:6])
    sorted_loci = [candidates[idx] for idx in order]

    # If there is a single source of candidate loci, coming from a BEDPE, then
    # keep track of the names in the BEDPE, so you can annotate SV-calls made
    # with the BEDPE line from which they came.  The frame loaded above is
    # reused instead of reading the file a second time.
    only_read_pairs = (args.overlap_loci is None and
                       args.low_depth_loci is None and
                       rp_calls is not None)
    if only_read_pairs:
        names = list(rp_calls['name'])
        input_names = [names[idx] for idx in order]
    else:
        input_names = None

    return sorted_loci, input_names
def join(args, outs, chunk_defs, chunk_outs):
    """Stack passing and non-passing calls from all chunks and renumber them.

    Passing calls are named ``0..P-1``.  Non-passing calls continue the same
    numbering (``P..P+Q-1``), so a name never appears in both outputs.
    """
    passing = None
    failing = None
    for chunk_out in chunk_outs:
        chunk_pass = tk_sv_io.read_sv_bedpe_to_df(chunk_out.sv_calls)
        chunk_fail = tk_sv_io.read_sv_bedpe_to_df(chunk_out.non_pass_sv_calls)
        passing = pd.concat([passing, chunk_pass], ignore_index=True)
        failing = pd.concat([failing, chunk_fail], ignore_index=True)

    passing['name'] = np.arange(len(passing))
    tk_sv_io.write_sv_df_to_bedpe(passing, outs.sv_calls)

    # Non-passing names start right after the last passing name.
    offset = len(passing)
    failing['name'] = np.arange(offset, offset + len(failing))
    tk_sv_io.write_sv_df_to_bedpe(failing, outs.non_pass_sv_calls)
def join(args, outs, chunk_defs, chunk_outs):
    """Concatenate the ``sv_calls`` BEDPE of every chunk into ``outs.sv_calls``.

    Rows keep the order of ``chunk_outs`` and get a fresh integer index.
    """
    combined = None
    for calls_path in (chunk_out.sv_calls for chunk_out in chunk_outs):
        part = sv_io.read_sv_bedpe_to_df(calls_path)
        combined = pd.concat([combined, part], ignore_index=True)
    sv_io.write_sv_df_to_bedpe(combined, outs.sv_calls)
def split(args):
    """Turn candidate SVs into padded BED intervals, grouped into chunks.

    Each candidate spans ``start1..stop2``.  It is padded on both sides by
    ``round(size * args.padding_fract + args.padding_abs)`` and the padded
    start is clipped at 0.  Candidates whose padded span is 5000 or longer
    are dropped.  The remaining ones are sorted by ``chrom1``/``start1`` and
    written as ``chrom1, new_start, new_stop`` (tab separated, no header) to
    ``chunk_<i>.bed`` files in the current working directory.

    Returns:
        ``{'chunks': [...]}``.  Each chunk has a ``chunk_bed`` path and a
        ``__mem_gb`` request.  When there is nothing to process -- no
        candidates at all, or none left after the size filter -- a single
        chunk with ``chunk_bed`` set to ``None`` is returned.
    """
    sv_df = sv_io.read_sv_bedpe_to_df(args.candidates)
    if len(sv_df) == 0:
        return {'chunks': [{'chunk_bed': None}]}

    sv_df["size"] = sv_df["stop2"] - sv_df["start1"]
    sv_df["padding"] = np.round(
        sv_df["size"] * args.padding_fract + args.padding_abs).astype(np.int32)
    sv_df["new_start"] = np.maximum(0, sv_df["start1"] - sv_df["padding"])
    sv_df["new_stop"] = sv_df["stop2"] + sv_df["padding"]
    sv_df["new_size"] = sv_df["new_stop"] - sv_df["new_start"]

    # Filter and sort into a new frame rather than sorting a filtered slice
    # in place.
    sv_df = sv_df[sv_df["new_size"] < 5000].sort_values(
        by=['chrom1', 'start1'])

    nsvs = sv_df.shape[0]
    if nsvs == 0:
        # Everything was filtered out.  Emit the same placeholder chunk as
        # for empty input instead of an empty chunk list.
        return {'chunks': [{'chunk_bed': None}]}

    # At least 500 candidates per chunk, at most 128 chunks.
    nsvs_per_chunk = max(500, int(np.ceil(nsvs / 128.0)))
    nchunks = int(np.ceil(nsvs / float(nsvs_per_chunk)))

    chunk_defs = []
    for i in range(nchunks):
        chunk_start = i * nsvs_per_chunk
        chunk_end = min(nsvs, (i + 1) * nsvs_per_chunk)
        subset = sv_df[chunk_start:chunk_end][
            ["chrom1", "new_start", "new_stop"]]
        fn = os.path.join(os.getcwd(), "chunk_%d.bed" % i)
        subset.to_csv(fn, header=False, sep="\t", index=False)
        chunk_defs.append({'chunk_bed': fn, "__mem_gb": 3.0})
    return {'chunks': chunk_defs}
def join(args, outs, chunk_defs, chunk_outs):
    """Merge the per-chunk deletion candidates and give them integer names.

    The ``name`` column of the merged table is overwritten with ``0..N-1``
    before it is written to ``outs.del_candidates``.
    """
    merged = None
    for chunk_out in chunk_outs:
        merged = pd.concat(
            [merged, tk_sv_io.read_sv_bedpe_to_df(chunk_out.del_candidates)],
            ignore_index=True)

    merged['name'] = np.arange(len(merged))
    tk_sv_io.write_sv_df_to_bedpe(merged, outs.del_candidates)
def main(args, outs):
    """Merge up to three optional call sets into ``outs.merged``.

    Whichever of ``args.calls1``, ``args.calls2`` and ``args.calls3`` are not
    ``None`` are loaded, in that order, and handed to ``merge_overlapping``
    together with the selector returned by ``select_widest()``.
    """
    provided = [
        path for path in (args.calls1, args.calls2, args.calls3)
        if path is not None
    ]
    callsets = [sv_io.read_sv_bedpe_to_df(path) for path in provided]

    # Overlapping calls are resolved by the select_widest() strategy.
    merged = merge_overlapping(callsets, select_widest())
    sv_io.write_sv_df_to_bedpe(merged, outs.merged)
def join(args, outs, chunk_defs, chunk_outs):
    """Concatenate the chunk variants into ``outs.sv_variants``.

    Names are replaced by ``0..N-1`` unless ``args.best_only`` is set, in
    which case the names coming out of the chunks are kept.
    """
    combined = None
    for chunk_out in chunk_outs:
        chunk_df = tk_sv_io.read_sv_bedpe_to_df(chunk_out.sv_variants)
        combined = pd.concat([combined, chunk_df], ignore_index=True)

    renumber = not args.best_only
    if renumber:
        combined['name'] = np.arange(len(combined))
    tk_sv_io.write_sv_df_to_bedpe(combined, outs.sv_variants)
def overlap_breaks(pred_loci, true_loci, min_rel_overlap=0.5):
    """Match predicted calls to ground-truth calls by reciprocal overlap.

    Every call is treated as one interval running from ``start1`` to
    ``stop2`` and is looked up by ``chrom1`` only.  A prediction matches a
    truth entry when their overlap, divided by the prediction length AND
    divided by the truth length, is greater than ``min_rel_overlap`` in both
    cases.

    Args:
        pred_loci: BEDPE path or DataFrame with the predicted calls.
        true_loci: BEDPE path or DataFrame with the ground truth.  NOTE: when
            a DataFrame is passed, its index is replaced in place by its
            ``name`` column.
        min_rel_overlap: Minimum overlap fraction (exclusive).

    Returns:
        ``(pred_to_matching_true, true_to_matching_pred, {})``.  The first
        two are ``defaultdict(set)`` objects mapping names to the set of
        matching names on the other side; the third is always an empty dict.
    """
    if isinstance(pred_loci, pd.DataFrame):
        pred_df = pred_loci
    else:
        pred_df = tk_sv_io.read_sv_bedpe_to_df(pred_loci)

    if isinstance(true_loci, pd.DataFrame):
        gt_df = true_loci
    else:
        gt_df = tk_sv_io.read_sv_bedpe_to_df(true_loci)

    # Index by name so the region names returned below can be used with .loc.
    gt_df.index = gt_df['name']
    gt_map = bedpe_df_to_named_region_map(gt_df)

    pred_to_matching_true = defaultdict(set)
    true_to_matching_pred = defaultdict(set)

    for _, row in pred_df.iterrows():
        if row.chrom1 not in gt_map:
            continue
        matches = gt_map[row.chrom1].overlapping_region_names(
            row.start1, row.stop2)
        if len(matches) == 0:
            continue

        this_len = row.stop2 - row.start1
        match_df = gt_df.loc[list(matches)]
        # Builtin float instead of the removed np.float alias (same dtype).
        lengths = np.array(match_df.stop2 - match_df.start1, dtype=float)
        ov = np.minimum(match_df.stop2, row.stop2) - np.maximum(
            match_df.start1, row.start1)
        good_matches = np.logical_and(
            ov / float(this_len) > min_rel_overlap,
            ov / lengths > min_rel_overlap)
        if np.any(good_matches):
            for matching_true in match_df[good_matches]['name']:
                true_to_matching_pred[matching_true].add(row['name'])
                pred_to_matching_true[row['name']].add(matching_true)

    return (pred_to_matching_true, true_to_matching_pred, {})
def main(args, outs):
    """Merge SV and CNV calls, keeping one representative per cluster.

    The calls in ``args.sv_variants`` (tagged ``SV``) and
    ``args.cnv_variants`` (tagged ``CNV``) are pooled and renamed
    ``0..N-1``.  Within each ``(chrom1, chrom2)`` group they are clustered
    with ``tk_sv_utils.get_break_groups`` using ``args.max_dist``.  For every
    cluster one row is written to ``outs.sv_variants``:

    * the first row in the cluster that came from the SV file, if there is
      one;
    * otherwise the row with the highest ``qual``.

    The chosen row gets ``;SOURCE=<tags>`` appended to its ``info`` field,
    listing the distinct tags present in the cluster.  The temporary
    ``info2`` tag column is part of the frame passed to the writer.
    """
    sv_df = tk_sv_io.read_sv_bedpe_to_df(args.sv_variants)
    sv_df["info2"] = "SV"
    cnv_df = tk_sv_io.read_sv_bedpe_to_df(args.cnv_variants)
    cnv_df["info2"] = "CNV"

    sv_df = pd.concat([sv_df, cnv_df], ignore_index=True)
    sv_df['name'] = np.arange(len(sv_df))
    # DataFrame.sort() no longer exists in pandas; sort_values() replaces it.
    sv_df = sv_df.sort_values(by=['chrom1', 'chrom2'])

    res_df = None
    for _, tmp_df in sv_df.groupby(['chrom1', 'chrom2']):
        # Sort into a new frame instead of sorting the groupby slice in place.
        tmp_df = tmp_df.sort_values(
            by=['chrom1', 'start1', 'stop1', 'chrom2', 'start2', 'stop2'])

        # cluster the loci in the group based on proximity
        groups = tk_sv_utils.get_break_groups(tmp_df, args.max_dist)

        out_df = pd.DataFrame(columns=sv_df.columns)
        for idx, g in enumerate(groups):
            cluster = tmp_df.loc[g]
            from_sv = cluster['info2'] == 'SV'
            if from_sv.any():
                # idxmax on a boolean Series gives the label of the first True.
                chosen = from_sv.idxmax()
            else:
                chosen = cluster['qual'].idxmax()

            # Work on a copy so the annotation never writes back into tmp_df.
            row = tmp_df.loc[chosen].copy()
            source = list(set(cluster['info2']))
            row['info'] = row['info'] + (";SOURCE=" + ",".join(source))
            out_df.loc[idx] = row

        out_df = out_df.sort_values(by=['start1', 'stop1', 'start2', 'stop2'])
        res_df = pd.concat([res_df, out_df], ignore_index=True)

    tk_sv_io.write_sv_df_to_bedpe(res_df, outs.sv_variants)
def join(args, outs, chunk_defs, chunk_outs):
    """Concatenate chunk variants and forward the first chunk's summary.

    Chunks whose ``sv_variants`` file is missing, or for which the reader
    returns ``None``, contribute nothing.  If at least one chunk contributed,
    the merged table is renamed ``0..N-1`` and written to
    ``outs.sv_variants``.  The summary of the first chunk is copied to
    ``outs.summary`` when it exists; otherwise ``outs.summary`` is set to
    ``None``.
    """
    combined = None
    for chunk_out in chunk_outs:
        if os.path.isfile(chunk_out.sv_variants):
            chunk_df = sv_io.read_sv_bedpe_to_df(chunk_out.sv_variants)
            if chunk_df is not None:
                combined = pd.concat([combined, chunk_df], ignore_index=True)

    if combined is not None:
        combined['name'] = np.arange(len(combined))
        sv_io.write_sv_df_to_bedpe(combined, outs.sv_variants)

    first = chunk_outs[0]
    has_summary = first is not None and os.path.exists(first.summary)
    if not has_summary:
        outs.summary = None
    else:
        shutil.copyfile(first.summary, outs.summary)
def join(args, outs, chunk_defs, chunk_outs):
    """Concatenate chunk variants, renumber them, and apply the qual cutoff.

    Chunks whose ``sv_variants`` file is missing, or for which the reader
    returns ``None``, contribute nothing.  If no chunk contributed, an empty
    table with the standard twelve BEDPE columns is used.  Calls are renamed
    ``0..N-1`` before filtering, and only calls with
    ``qual >= args.sv_min_qv`` are written to ``outs.sv_variants``.
    """
    out_bedpe = None
    for c in chunk_outs:
        if not os.path.isfile(c.sv_variants):
            continue
        in_bedpe = tk_sv_io.read_sv_bedpe_to_df(c.sv_variants)
        if in_bedpe is not None:
            out_bedpe = pd.concat([out_bedpe, in_bedpe], ignore_index=True)

    if out_bedpe is None:
        col_names = ['chrom1', 'start1', 'stop1',
                     'chrom2', 'start2', 'stop2',
                     'name', 'qual', 'strand1', 'strand2',
                     'filters', 'info']
        out_bedpe = pd.DataFrame(columns=col_names)

    # Assign the 'name' column by item access.  Setting a non-existent
    # attribute (``out_bedpe.names = ...``) only attaches a plain Python
    # attribute to the frame and leaves the column untouched.
    out_bedpe['name'] = np.arange(len(out_bedpe))

    out_bedpe = out_bedpe[out_bedpe.qual >= args.sv_min_qv]
    tk_sv_io.write_sv_df_to_bedpe(out_bedpe, outs.sv_variants)
def merge_multiple_breaks(in_bedpes, out_bedpe, merge_win=10000, max_range=np.inf):
    """Pool several BEDPE files and merge them with ``merge_breaks``.

    Only the first 12 columns of each file are kept.  Every name gets the
    zero-based position of its file appended (``<name>_<file index>``) so
    that names stay unique across files.

    Args:
        in_bedpes: Non-empty list of BEDPE inputs; each must have at least
            12 columns.
        out_bedpe, merge_win, max_range: Passed through to ``merge_breaks``.

    Returns:
        Whatever ``merge_breaks`` returns for the pooled table.
    """
    assert (len(in_bedpes) > 0)

    pooled = None
    for file_idx, path in enumerate(in_bedpes):
        frame = tk_sv_io.read_sv_bedpe_to_df(path)
        assert (frame.shape[1] > 11)
        frame = frame.iloc[:, 0:12]
        # Make sure that all names from all files are unique
        suffix = '_' + str(file_idx)
        frame['name'] = [str(n) + suffix for n in frame['name']]
        pooled = pd.concat([pooled, frame], ignore_index=True)

    return merge_breaks(pooled, out_bedpe,
                        merge_win=merge_win, max_range=max_range)
def get_break_groups(bedpe_df, merge_win=10000, max_range=np.inf):
    """A simplified version of merge_breaks.

    Both breakpoints of every call are clustered with ``cluster_loci``.
    Calls whose first breakpoints share a cluster and whose second
    breakpoints share a cluster end up in the same group.

    Args:
        bedpe_df: BEDPE path or DataFrame.
        merge_win, max_range: Passed through to ``cluster_loci``.

    Returns:
        The groups, each a list of DataFrame index labels.
    """
    if isinstance(bedpe_df, pd.DataFrame):
        bedpe_df = pd.DataFrame(bedpe_df)
    else:
        bedpe_df = tk_sv_io.read_sv_bedpe_to_df(bedpe_df)

    # Two loci per call, tagged (index label, breakpoint number).
    breaks = []
    for label, row in bedpe_df.iterrows():
        breaks.extend([
            (row.chrom1, row.start1, row.stop1, (label, 1)),
            (row.chrom2, row.start2, row.stop2, (label, 2)),
        ])

    _, mem_to_cluster, _ = cluster_loci(breaks, merge_win, max_range=max_range)

    cluster_pairs = defaultdict(list)
    for label in bedpe_df.index:
        key = (mem_to_cluster[(label, 1)], mem_to_cluster[(label, 2)])
        cluster_pairs[key].append(label)
    return cluster_pairs.values()
def split(args):
    """Split the calls in ``args.sv_variants`` into row-range chunks.

    Returns:
        ``{'chunks': [...]}`` where each chunk holds ``start_idx``,
        ``stop_idx`` and a 16 GB memory request.  With no calls a single
        ``0..0`` chunk without a memory request is returned.
    """
    total = tk_sv_io.read_sv_bedpe_to_df(args.sv_variants).shape[0]

    # avoid overchunking: at least 20 calls per chunk, at most 100 chunks
    per_chunk = max(20, int(np.ceil(total / 100.0)))
    num_chunks = int(np.ceil(total / float(per_chunk)))

    chunks = [
        {
            'start_idx': first,
            'stop_idx': min(total, first + per_chunk),
            '__mem_gb': 16
        }
        for first in range(0, num_chunks * per_chunk, per_chunk)
    ]
    if not chunks:
        chunks = [{'start_idx': 0, 'stop_idx': 0}]
    return {'chunks': chunks}
def join(args, outs, chunk_defs, chunk_outs):
    """Merge chunk calls and total up the discordant read counts.

    The calls of all chunks are concatenated, renamed with the strings
    ``'0'..'N-1'`` and written to ``outs.sv_calls``.  The per-type ``split``
    and ``pair`` counts found in each chunk's ``discordant_read_counts`` JSON
    (chunks without that file are skipped for counting) are summed.  Each
    total is also divided by half of ``num_reads`` from
    ``args.basic_summary`` and stored under ``frac_split`` / ``frac_pair``.
    The result is written as JSON to ``outs.discordant_read_counts``.
    """
    join_df = None
    totals = {'split': defaultdict(int), 'pair': defaultdict(int)}

    for chunk_out in chunk_outs:
        chunk_df = tk_sv_io.read_sv_bedpe_to_df(chunk_out.sv_calls)
        join_df = pd.concat([join_df, chunk_df], ignore_index=True)

        if os.path.isfile(chunk_out.discordant_read_counts):
            with open(chunk_out.discordant_read_counts, 'r') as f:
                counts = json.load(f)
            for kind in ('split', 'pair'):
                for sv_type, count in counts[kind].iteritems():
                    totals[kind][sv_type] += count

    join_df['name'] = [str(i) for i in np.arange(len(join_df))]
    tk_sv_io.write_sv_df_to_bedpe(join_df, outs.sv_calls)

    read_counts = {}
    read_counts['split'] = dict(totals['split'])
    read_counts['pair'] = dict(totals['pair'])

    with open(args.basic_summary, 'r') as f:
        basic_summary = json.load(f)
    # Half of the read count, used as the denominator for the fractions.
    num_reads = float(basic_summary['num_reads']) / 2.0

    read_counts['frac_split'] = {}
    read_counts['frac_pair'] = {}
    for kind in ('split', 'pair'):
        fractions = read_counts['frac_' + kind]
        for sv_type, count in read_counts[kind].iteritems():
            fractions[sv_type] = count / num_reads

    with open(outs.discordant_read_counts, 'w') as f:
        f.write(tenkit.safe_json.safe_jsonify(read_counts))
def main(args, outs):
    """Separate a slice of calls by the value of ``get_frac_mapq_changed``.

    Rows ``args.start_idx .. args.stop_idx - 1`` of ``args.sv_calls`` are
    processed.  For each call ``get_frac_mapq_changed`` is evaluated on both
    breakpoints, each extended by ``BREAK_EXT`` on either side (starts
    clipped at 0), with ``min_mapq=60``.  Calls whose value exceeds
    ``args.max_frac_low_mapq`` are written to ``outs.pileups``; calls at or
    below it are written to ``outs.sv_calls``.
    """
    pred_df = tk_sv_io.read_sv_bedpe_to_df(args.sv_calls)
    pred_df = tk_sv_utils.get_dataframe_loc(
        pred_df, list(range(args.start_idx, args.stop_idx)))

    # Builtin float instead of the removed np.float alias (same dtype).
    frac_changed = np.zeros((len(pred_df), ), dtype=float)

    in_bam = tk_bam.create_bam_infile(args.possorted_bam)
    try:
        for i, (_, row) in enumerate(pred_df.iterrows()):
            frac_changed[i] = get_frac_mapq_changed(
                in_bam,
                row.chrom1, max(0, row.start1 - BREAK_EXT),
                row.stop1 + BREAK_EXT,
                row.chrom2, max(0, row.start2 - BREAK_EXT),
                row.stop2 + BREAK_EXT,
                min_mapq=60)
    finally:
        # Release the BAM handle even if a query fails.
        in_bam.close()

    # Two explicit comparisons (rather than one mask and its negation) so a
    # NaN value lands in neither output, exactly as before.
    pileups = pred_df[frac_changed > args.max_frac_low_mapq]
    pred_df = pred_df[frac_changed <= args.max_frac_low_mapq]

    tk_sv_io.write_sv_df_to_bedpe(pred_df, outs.sv_calls)
    tk_sv_io.write_sv_df_to_bedpe(pileups, outs.pileups)
def prepare_gt(args):
    """Load the ground-truth SVs and annotate them for evaluation.

    Returns ``None`` when ``args.gt_variants`` is ``None``.  Otherwise the
    BEDPE is loaded and these columns are added:

    * ``dist``: value from ``tk_sv_io.get_sv_df_dists``.
    * ``break1_genic`` / ``break2_genic``: 1 if the breakpoint overlaps a
      region from ``tk_reference.load_gene_boundaries``, else 0;
      ``genic_breaks`` is their sum.
    * ``tier``: tier parsed from ``info``; entries with tier 0 are moved to
      ``max tier + 1``.
    * ``sv_type`` / ``orient``: parsed from ``info`` (``DISTAL`` is reported
      as ``TRANS``).  Without an ``info`` column they are ``'UNK'`` and
      ``'..'``, and the tier is treated as 0.
    * ``targ_dist``: if both ``args.targets`` and ``args.target_dists`` are
      given, the larger of the two distances returned by
      ``get_df_region_dist``; breakpoints are then also replaced by the
      returned target intervals whenever both breakpoints of a call got
      one.  Otherwise 0.
    * ``feasible``: ``dist >= args.min_sv_len``; all ``True`` when
      ``args.min_sv_len`` is ``None``.
    """
    if args.gt_variants is None:
        return None

    true_df = tk_sv_io.read_sv_bedpe_to_df(args.gt_variants)

    # Length of ground truth sv
    true_df['dist'] = tk_sv_io.get_sv_df_dists(true_df)

    sv_types = []
    orients = []
    tiers = []

    # Mark genic SVs.  Builtin int/bool replace the removed np.int/np.bool
    # aliases (identical dtypes).
    is_genic1 = np.zeros((len(true_df), ), dtype=int)
    is_genic2 = np.zeros((len(true_df), ), dtype=int)
    gene_regions = tk_reference.load_gene_boundaries(args.reference_path,
                                                     protein_coding=False)

    has_info = 'info' in true_df.columns
    for row_idx, (_, row) in enumerate(true_df.iterrows()):
        if not has_info:
            sv_types.append('UNK')
            orients.append('..')
            tiers.append(0)
        else:
            sv_type = tk_sv_io.get_sv_type(row.info)
            if sv_type == 'DISTAL':
                sv_type = 'TRANS'
            sv_types.append(sv_type)
            orients.append(tk_sv_io.get_break_orientation(row.info))
            tiers.append(tk_sv_io.get_tier(row.info))

        is_genic1[row_idx] = int(
            row.chrom1 in gene_regions and
            bool(gene_regions[row.chrom1].overlapping_regions(
                row.start1, row.stop1)))
        is_genic2[row_idx] = int(
            row.chrom2 in gene_regions and
            bool(gene_regions[row.chrom2].overlapping_regions(
                row.start2, row.stop2)))

    true_df['break1_genic'] = is_genic1
    true_df['break2_genic'] = is_genic2
    # number of breakpoints overlapping genes
    true_df['genic_breaks'] = is_genic1 + is_genic2

    # put all the un-tiered entries into the last tier
    tiers = np.array(tiers)
    total_tiers = 0 if len(tiers) == 0 else np.max(tiers)
    tiers[tiers == 0] = total_tiers + 1

    true_df['tier'] = tiers
    true_df['sv_type'] = sv_types
    true_df['orient'] = orients

    if args.min_sv_len is not None:
        # Select only svs that have a minimum distance between breakpoints
        is_feasible = np.array(true_df['dist'] >= args.min_sv_len, dtype=bool)
    else:
        # No length cutoff: every SV is feasible.  Without this branch
        # is_feasible would be unbound at the assignment further down.
        is_feasible = np.ones((len(true_df), ), dtype=bool)

    if args.targets is not None and args.target_dists is not None:
        target_regions = tk_sv_utils.bed_to_region_map(args.targets,
                                                       merge=True)
        res = get_df_region_dist(true_df, target_regions, use_orient=True)
        targ_dists1, targ_dists2, targs1, targs2, _, _ = res

        new_starts1 = np.array(true_df.start1)
        new_stops1 = np.array(true_df.stop1)
        new_starts2 = np.array(true_df.start2)
        new_stops2 = np.array(true_df.stop2)

        # Replace the breakpoints only when both of them got a target.
        for i, (t1, t2) in enumerate(zip(targs1, targs2)):
            if t1[0] is not None and t2[0] is not None:
                new_starts1[i], new_stops1[i] = t1
                new_starts2[i], new_stops2[i] = t2

        true_df['start1'] = new_starts1
        true_df['stop1'] = new_stops1
        true_df['start2'] = new_starts2
        true_df['stop2'] = new_stops2

        true_df['targ_dist'] = np.maximum(np.array(targ_dists1),
                                          np.array(targ_dists2))
    else:
        true_df['targ_dist'] = np.zeros((len(true_df), ), dtype=int)

    true_df['feasible'] = is_feasible
    return true_df
def main(args, outs):
    """Re-score a slice of SV calls using discordant read-pair evidence.

    Rows ``args.start_idx .. args.stop_idx - 1`` of ``args.sv_variants`` are
    processed.  For each call, read pairs are fetched around both
    breakpoints (extended by ``args.break_extend``), filtered with
    ``filter_fun``, capped at ``MAX_READPAIRS`` by random sampling, and
    passed to ``tk_readpairs.get_sv_lr``.

    * If no result comes back, the call is emitted unchanged.
    * Otherwise one row is emitted per result (only the result with the
      highest first field when ``args.best_only``).  A result with at least
      ``args.min_reads_to_call`` supporting reads, a likelihood ratio of at
      least ``args.min_lr_to_call`` and two support ranges replaces the
      breakpoints by those ranges and adds ``args.rp_lr_multiplier * lr`` to
      the quality; any other result keeps the original coordinates and
      quality.  ``NPAIRS``, ``NSPLIT``, ``RP_LR``, ``RP_TYPE`` and ``TYPE``
      are updated in ``info`` in both cases.

    Output names are the input names when ``args.best_only`` and
    consecutive integers otherwise.
    """
    bedpe_df = tk_sv_io.read_sv_bedpe_to_df(args.sv_variants)
    bedpe_df = tk_sv_utils.get_dataframe_loc(
        bedpe_df, list(range(int(args.start_idx), int(args.stop_idx))))

    max_insert, ins_logsf_fun = tk_sv_utils.get_insert_size_info(
        args.insert_sizes)
    if max_insert is None:
        martian.throw('No Q60 reads')

    with open(args.basic_summary, 'r') as f:
        summary = json.load(f)

    chimera_rate_del = summary['far_chimera_rate']
    chimera_rate_inv = (summary['far_chimera_rate'] +
                        summary['same_dir_chimera_rate'])
    chimera_rate_trans = summary['far_chimera_rate']
    chimera_rate_dup = (summary['far_chimera_rate'] +
                        summary['outward_dir_chimera_rate'])

    chimera_rates = {
        tk_readpairs.DEL_STR: chimera_rate_del,
        tk_readpairs.INV_STR: chimera_rate_inv,
        tk_readpairs.TDUP_STR: chimera_rate_dup,
        tk_readpairs.TRANS_FF_STR: chimera_rate_trans,
        tk_readpairs.TRANS_FR_STR: chimera_rate_trans,
        tk_readpairs.TRANS_RR_STR: chimera_rate_trans,
        tk_readpairs.TRANS_RF_STR: chimera_rate_trans
    }

    out_quals = []
    out_infos = []
    out_chroms1 = []
    out_starts1 = []
    out_stops1 = []
    out_chroms2 = []
    out_starts2 = []
    out_stops2 = []

    def add_call(row, qual, range1, range2, info):
        # Append one output row; range1/range2 are (start, stop) pairs.
        out_quals.append(qual)
        out_chroms1.append(row.chrom1)
        out_starts1.append(range1[0])
        out_stops1.append(range1[1])
        out_chroms2.append(row.chrom2)
        out_starts2.append(range2[0])
        out_stops2.append(range2[1])
        out_infos.append(info)

    in_bam = tk_bam.create_bam_infile(args.input)
    try:
        for _, row in bedpe_df.iterrows():
            in_svt = tk_sv_io.get_sv_type(row.info)

            if row.chrom1 == row.chrom2:
                # Extend towards the other breakpoint by at most a third of
                # the distance between the breakpoints.
                max_ext = min(args.break_extend,
                              int((row.start2 - row.stop1) / 3.0))
                r1 = (max(0, row.start1 - args.break_extend),
                      row.stop1 + max_ext)
                r2 = (max(0, row.start2 - max_ext),
                      row.stop2 + args.break_extend)
                if r1[1] > r2[0]:
                    # The two windows overlap: fetch them as one region.
                    starts = [r1[0]]
                    stops = [r2[1]]
                    chroms = [row.chrom1]
                else:
                    starts = [r1[0], r2[0]]
                    stops = [r1[1], r2[1]]
                    chroms = [row.chrom1, row.chrom2]
            else:
                r1 = (max(0, row.start1 - args.break_extend),
                      row.stop1 + args.break_extend)
                r2 = (max(0, row.start2 - args.break_extend),
                      row.stop2 + args.break_extend)
                starts = [r1[0], r2[0]]
                stops = [r1[1], r2[1]]
                chroms = [row.chrom1, row.chrom2]

            readpairs = tk_readpairs.get_readpairs2(in_bam, chroms, starts,
                                                    stops,
                                                    max_insert=max_insert,
                                                    min_mapq=args.min_mapq)

            # Distal readpairs across the breakpoints.  A list is built
            # explicitly so len() and indexing work whether or not filter()
            # returns a list.
            keep = filter_fun(in_bam, row.chrom1, row.chrom2, r1, r2)
            dist_readpairs = [rp for rp in readpairs if keep(rp)]

            if len(dist_readpairs) > MAX_READPAIRS:
                # NOTE(review): choice() samples with replacement by default,
                # so a read pair can be selected more than once -- confirm
                # this is intended.
                sel = np.random.choice(len(dist_readpairs), MAX_READPAIRS)
            else:
                sel = np.arange(len(dist_readpairs))
            dist_readpairs = [dist_readpairs[ridx] for ridx in sel]

            res_arr = tk_readpairs.get_sv_lr(dist_readpairs, ins_logsf_fun,
                                             max_insert, chimera_rates)

            if len(res_arr) == 0:
                add_call(row, row.qual, (row.start1, row.stop1),
                         (row.start2, row.stop2), row['info'])
                continue

            if args.best_only:
                res_arr = sorted(res_arr, key=lambda x: x[0], reverse=True)
                res_arr = [res_arr[0]]

            for (lr, num_split, num_pairs, sv_len, support_range, svt,
                 support_readpairs) in res_arr:
                range1, range2 = support_range
                supported = (
                    num_split + num_pairs >= args.min_reads_to_call and
                    lr >= args.min_lr_to_call and
                    range1 is not None and range2 is not None)
                if supported:
                    qual = row.qual + args.rp_lr_multiplier * lr
                    new_range1 = (range1[0], range1[1])
                    new_range2 = (range2[0], range2[1])
                    # A disagreeing read-pair type downgrades the input type;
                    # the downgrade persists for the remaining results of
                    # this call.
                    if svt != in_svt and in_svt != 'TRANS':
                        in_svt = 'UNK'
                else:
                    qual = row.qual
                    new_range1 = (row.start1, row.stop1)
                    new_range2 = (row.start2, row.stop2)

                info = tk_sv_io.update_info(
                    row['info'],
                    ['NPAIRS', 'NSPLIT', 'RP_LR', 'RP_TYPE', 'TYPE'],
                    [num_pairs, num_split, lr, svt, in_svt])
                add_call(row, qual, new_range1, new_range2, info)
    finally:
        in_bam.close()

    if args.best_only:
        # Exactly one output row per input call, so input names line up.
        out_names = list(bedpe_df['name'])
    else:
        # One name per emitted row: a call can yield several rows here, so
        # the count is taken from the outputs, not from the input frame.
        out_names = np.arange(len(out_quals))

    out_df = tk_sv_io.create_sv_df(out_chroms1, out_starts1, out_stops1,
                                   out_chroms2, out_starts2, out_stops2,
                                   out_names, out_quals, out_infos)
    tk_sv_io.write_sv_df_to_bedpe(out_df, outs.sv_variants)
def main(args, outs):
    """Match a slice of calls to the ground truth and apply the filters.

    Rows ``args.start_idx .. args.stop_idx - 1`` of ``args.renamed_variants``
    are processed.  When ``args.gt_variants`` is given, the prepared ground
    truth is also written as a tab-separated table to ``outs.feasible_gt``.

    Three objects are pickled, in this order, to the path obtained by
    replacing the trailing ``.json`` of ``outs.summary`` with ``.pickle``:
    the call-to-truth matches, the truth-to-call matches, and the tuple
    ``(filtered calls, min_qv)`` returned by ``add_filters``.
    """
    pred_df = tk_sv_io.read_sv_bedpe_to_df(args.renamed_variants)
    pred_df = tk_sv_utils.get_dataframe_loc(
        pred_df, list(range(args.start_idx, args.stop_idx)))

    if args.gt_variants is not None:
        true_df = prepare_gt(args)
        true_df.to_csv(outs.feasible_gt, index=False, header=True, sep='\t',
                       na_rep='NaN')
    else:
        true_df = None

    #### Get matches between this chunk of the calls and the ground truth
    max_detect_dist = np.max(np.array(args.detect_dists))
    res = get_matches(pred_df, true_df, max_detect_dist, args.min_rel_overlap)
    pred_to_match, true_to_match, _ = res

    #### Apply filters
    if len(pred_df) > 0:
        # Loading all these files can take awhile. Don't do it if there are
        # no SVs to analyze.
        #
        # The blacklist shipped with the reference is used when it exists.
        # Otherwise lr_gt.get_genomic_track resolves the file from the
        # sv_blacklist_regions argument, the blacklist mode and the
        # reference path.  The segdups file below follows the same pattern.
        ref_blacklist = tk_reference.get_sv_blacklist(args.reference_path)
        if os.path.exists(ref_blacklist):
            blacklist_file = ref_blacklist
        else:
            blacklist_file = lr_gt.get_genomic_track(
                args.sv_blacklist_regions, args.blacklist_mode,
                args.reference_path, "default_blacklist.bed")

        # This will merge overlapping blacklist regions
        black_regions = tk_sv_utils.bed_to_region_map(blacklist_file,
                                                      merge=True)
        # Match each region in black_regions to a set of entries from the bed
        # file that overlap it. This is done so we can output the names of
        # entries that were used to blacklist each sv.
        black_region_names = get_region_names(blacklist_file, black_regions)
        # compute the distance between the breakpoints and the blacklist
        # elements. Get the distance together with the names of the closest
        # blacklist elements.
        res = get_df_region_dist(pred_df, black_regions, black_region_names)
        black_dists1, black_dists2, _, _, black_names1, black_names2 = res

        ref_segdups = tk_reference.get_segdups(args.reference_path)
        if os.path.exists(ref_segdups):
            seg_dups_file = ref_segdups
        else:
            seg_dups_file = lr_gt.get_genomic_track(
                args.seg_dups, args.segdup_mode, args.reference_path,
                "default_segdups.bedpe")

        # from call to matching seg dups
        seg_dup_calls, _, _ = tk_sv_utils.compare_breaks(
            pred_df, seg_dups_file, max_dist=args.seg_dup_min_dist)
        seg_dup_regions = tk_sv_utils.bedpe_to_region_map(seg_dups_file,
                                                          merge=True)
        all_bad_regions = tk_sv_utils.merge_region_maps(black_regions,
                                                        seg_dup_regions)
    else:
        black_dists1 = None
        black_dists2 = None
        black_names1 = None
        black_names2 = None
        seg_dup_calls = {}
        all_bad_regions = None

    pred_df, min_qv = add_filters(pred_df, pred_to_match,
                                  black_dists1, black_dists2,
                                  black_names1, black_names2,
                                  seg_dup_calls, all_bad_regions, args)

    # Escape the dot and anchor at the end: the bare pattern '.json' matches
    # any character followed by "json" anywhere in the path (for example a
    # directory called "myjson") and would rewrite that as well.
    pickle_path = re.sub(r'\.json$', '.pickle', outs.summary)
    with open(pickle_path, 'wb') as f:
        cPickle.dump(pred_to_match, f)
        cPickle.dump(true_to_match, f)
        cPickle.dump((pred_df, min_qv), f)
def compare_multiple_breaks(in_bedpes, sample_names, out_bedpe, merge_win=0, max_range=np.inf):
    """Compares multiple BEDPE files.

    Args:
    - in_bedpes: A list of BEDPE files to compare.
    - sample_names: A list of the same size with unique names for the input
    samples.
    - out_bedpe: Passed to merge_multiple_breaks as the output location for
    the union BEDPE.
    - merge_win: Window used both for merging the inputs and for matching
    each input against the union.
    - max_range: Passed to merge_multiple_breaks.

    Return value:
    A DataFrame with the union of calls and information about which
    calls are present in which input files. This DataFrame will have one
    entry per call in the union and will include (among other things) columns
    <sample>_matches, <sample>_qual, <sample>_filtered, <sample>_correct, and
    <sample>_dist for each of the input BEDPEs. Its qual is the maximum
    <sample>_qual, its filters the union of the matched entries' filters, and
    its info holds only the collected MATCHES. Rows are sorted by decreasing
    qual, then by position.
    """
    assert (len(sample_names) == len(in_bedpes))

    # Merge all the input files. This will get rid of redundant entries.
    merged_df = merge_multiple_breaks(in_bedpes, out_bedpe,
                                      merge_win=merge_win,
                                      max_range=max_range)
    num_merged = len(merged_df)

    # Map the name of each entry in the union to its index in the DataFrame.
    name_to_ind = {}
    for i, n in enumerate(merged_df['name']):
        name_to_ind[n] = i

    new_filters = [set([]) for _ in range(num_merged)]
    new_matches = [set([]) for _ in range(num_merged)]

    # For each of the input BEDPEs find which of the entries in the union it
    # overlaps. This is somewhat duplicated work, but it's simpler this way.
    for sample, bedpe in zip(sample_names, in_bedpes):
        in_df = tk_sv_io.read_sv_bedpe_to_df(bedpe)
        name_to_ind2 = {}
        for i, n in enumerate(in_df['name']):
            name_to_ind2[n] = i

        # Builtin int/bool replace the removed np.int/np.bool aliases
        # (identical dtypes).
        matched_qual = np.zeros((num_merged, ), dtype=int)
        is_correct = np.zeros((num_merged, ), dtype=bool)
        is_filtered = np.zeros((num_merged, ), dtype=bool)
        tmp_dist = np.zeros((num_merged, ), dtype=int)
        matched_names = ['' for _ in range(num_merged)]

        # merged_to_this will be a dictionary from a name in the union to a
        # set of names in the input bedpe
        merged_to_this, _, _ = compare_breaks(merged_df, bedpe,
                                              max_dist=merge_win)
        for name1, name2_set in merged_to_this.items():
            ind1 = name_to_ind[name1]
            matched_names[ind1] = ';'.join([str(s) for s in name2_set])
            for name2 in name2_set:
                # Look the matched input row up once.
                in_row = in_df.iloc[name_to_ind2[name2]]

                matched_qual[ind1] = max(matched_qual[ind1], in_row['qual'])

                match = tk_sv_io.extract_sv_info(in_row['info'],
                                                 ['MATCHES'])[0]
                is_match_correct = (match != '.' and match != '' and
                                    match is not None)
                if is_match_correct:
                    new_matches[ind1].add(match)
                    # Never set back to False if it was set to true.
                    is_correct[ind1] = True

                # Unlike is_correct, this reflects the last matched entry.
                has_filters = in_row['filters'] != '.'
                is_filtered[ind1] = has_filters
                if has_filters:
                    new_filters[ind1] = new_filters[ind1].union(
                        set(in_row['filters'].split(';')))

                if in_row['chrom1'] != in_row['chrom2']:
                    tmp_dist[ind1] = -1
                else:
                    tmp_dist[ind1] = in_row['start2'] - in_row['stop1']

        merged_df[str(sample) + '_matches'] = matched_names
        merged_df[str(sample) + '_qual'] = matched_qual
        merged_df[str(sample) + '_correct'] = is_correct
        merged_df[str(sample) + '_filtered'] = is_filtered
        merged_df[str(sample) + '_dist'] = tmp_dist

    info_strs = ['.' for _ in range(num_merged)]
    filter_strs = ['.' for _ in range(num_merged)]
    for i in range(num_merged):
        match_str = ','.join(new_matches[i]) if len(new_matches[i]) > 0 else '.'
        info_strs[i] = tk_sv_io.update_info('.', ['MATCHES'], [match_str])
        filter_strs[i] = (';'.join(new_filters[i])
                          if len(new_filters[i]) > 0 else '.')

    # The quality in the output is the maximum quality across all samples.
    merged_df['qual'] = np.array(
        np.max(merged_df[[str(s) + '_qual' for s in sample_names]], axis=1),
        dtype=int)
    merged_df['filters'] = filter_strs
    merged_df['info'] = info_strs

    # DataFrame.sort() no longer exists in pandas; sort_values() replaces it.
    merged_df.sort_values(
        by=['qual', 'chrom1', 'start1', 'stop1', 'chrom2', 'start2', 'stop2'],
        ascending=[False, True, True, True, True, True, True],
        inplace=True)

    return merged_df
def merge_breaks(bedpe_df, out_bedpe, merge_win=10000, max_range=np.inf, max_nmates=np.inf, cluster_qual_factor=0.2):
    """Merges a set of SVs into a non-redundant set.

    Args:
    - bedpe_df: Either a bedpe file or a DataFrame like the one returned by
    tk_sv_io.read_sv_bedpe_to_df. Columns are accessed by position, so the
    first seven are presumably chrom1, start1, stop1, chrom2, start2, stop2,
    name -- TODO confirm against the reader. Names are used as cluster
    member keys, so they are assumed to be unique.
    - out_bedpe: Path to file where output will be written. Nothing is
    written when this is None.
    - merge_win: Breakpoints will be merged if they are within this distance
    from each other. Two SVs will be merged if both their
    breakpoints can be merged.
    - max_range: See max_range field of cluster_loci.
    - max_nmates: Two extra info fields will be added to the output BEDPE,
    NMATES1, and NMATES2. NMATES1 is the number of mate breakpoints (after
    merging, so breakpoint clusters), of the first breakpoint of an SV.
    SVs whose breakpoints both exceed the max_nmates cutoff will not be
    included in the output.
    - cluster_qual_factor: Calls in a cluster whose qual is at least this
    fraction of the cluster's best qual are used to compute the RESOLUTION
    info field.

    Return value:
    A DataFrame with the selected representative calls (one per merged
    cluster pair), with NMATES1, NMATES2 and RESOLUTION added to info. An
    empty DataFrame with the same columns if nothing is selected.
    """
    if not isinstance(bedpe_df, pd.DataFrame):
        bedpe_df = tk_sv_io.read_sv_bedpe_to_df(bedpe_df)
    else:
        bedpe_df = pd.DataFrame(bedpe_df)

    # One locus per breakpoint, tagged (name, 1) or (name, 2).
    breaks = []
    for i in range(bedpe_df.shape[0]):
        breaks.append((bedpe_df.iloc[i, 0], bedpe_df.iloc[i, 1],
                       bedpe_df.iloc[i, 2], (bedpe_df.iloc[i, 6], 1)))
        breaks.append((bedpe_df.iloc[i, 3], bedpe_df.iloc[i, 4],
                       bedpe_df.iloc[i, 5], (bedpe_df.iloc[i, 6], 2)))
    _, mem_to_cluster, _ = cluster_loci(breaks, merge_win, max_range=max_range)

    # Group row positions by the pair of clusters their breakpoints fall in.
    cluster_pairs = {}
    for i in range(bedpe_df.shape[0]):
        name = bedpe_df.iloc[i]['name']
        cluster_idx1 = mem_to_cluster[(name, 1)]
        cluster_idx2 = mem_to_cluster[(name, 2)]
        if not (cluster_idx1, cluster_idx2) in cluster_pairs:
            cluster_pairs[(cluster_idx1, cluster_idx2)] = [i]
        else:
            old_pair = cluster_pairs[(cluster_idx1, cluster_idx2)][0]
            # Make sure the old and the new pair have breaks on the same chromosomes
            assert (bedpe_df.iloc[old_pair, 0] == bedpe_df.iloc[i, 0])
            assert (bedpe_df.iloc[old_pair, 3] == bedpe_df.iloc[i, 3])
            cluster_pairs[(cluster_idx1, cluster_idx2)].append(i)

    # Reduce every cluster pair to its highest-qual call and compute the
    # value that will be reported as RESOLUTION.
    new_cluster_pairs = {}
    cluster_dist_ratio = {}
    for p, pos_list in cluster_pairs.iteritems():
        pos_arr = np.array(pos_list)
        tmp_df = get_dataframe_loc(bedpe_df, pos_arr)
        quals = np.array(tmp_df.qual)
        best_call = pos_arr[np.argmax(quals)]
        # Positions (within tmp_df) of calls whose qual is close to the best.
        close_calls = np.where(quals >= cluster_qual_factor * np.max(quals))[0]
        close_df = get_dataframe_loc(tmp_df, close_calls)
        same_chrom = bedpe_df.iloc[best_call]['chrom2'] == bedpe_df.iloc[
            best_call]['chrom1']
        # Smallest gap between breakpoints over the close calls vs. the gap
        # of the best call.
        min_break_dist = np.min(close_df.start2) - np.max(close_df.stop1)
        max_break_dist = bedpe_df.iloc[best_call]['start2'] - bedpe_df.iloc[
            best_call]['stop1']
        new_cluster_pairs[p] = best_call
        # '.' for calls on different chromosomes or farther apart than
        # MAX_FRAG_SIZE; NaN when the smallest gap is not positive.
        if not same_chrom or max_break_dist > MAX_FRAG_SIZE:
            cluster_dist_ratio[p] = '.'
        elif min_break_dist <= 0:
            cluster_dist_ratio[p] = float('NaN')
        else:
            cluster_dist_ratio[p] = float(max_break_dist) / min_break_dist
    # From here on cluster_pairs maps a cluster pair to ONE row position.
    cluster_pairs = new_cluster_pairs

    def clusters_close(i, j):
        # True if rows i and j are on the same chromosomes and both of their
        # breakpoints are within merge_win of each other.
        chrom1, start1, stop1 = bedpe_df.iloc[i, 0], bedpe_df.iloc[
            i, 1], bedpe_df.iloc[i, 2]
        chrom2, start2, stop2 = bedpe_df.iloc[i, 3], bedpe_df.iloc[
            i, 4], bedpe_df.iloc[i, 5]
        next_chrom1, next_start1, next_stop1 = bedpe_df.iloc[
            j, 0], bedpe_df.iloc[j, 1], bedpe_df.iloc[j, 2]
        next_chrom2, next_start2, next_stop2 = bedpe_df.iloc[
            j, 3], bedpe_df.iloc[j, 4], bedpe_df.iloc[j, 5]
        dist1 = max(next_start1 - stop1, start1 - next_stop1)
        dist2 = max(next_start2 - stop2, start2 - next_stop2)
        return (chrom1 == next_chrom1 and chrom2 == next_chrom2 and
                dist1 <= merge_win and dist2 <= merge_win)

    # The "chain-breaking" in cluster_loci might still leave some redundancy.
    # In particular, we might leave some almost touching clusters that were
    # separated only because of chain-breaking. Do a second round of clustering
    # where you go through consecutive pairs of cluster and merge them if they're merge-able.
    new_cluster_pairs = {}
    for (cluster1, cluster2) in sorted(cluster_pairs.keys()):
        # -1 marks a cluster pair that was already absorbed by an earlier one.
        if cluster_pairs[(cluster1, cluster2)] == -1:
            continue
        # Consider all neighboring clusters after this cluster.
        # Notice that the cluster indices are sorted by genomic coordinates.
        neigh_clusters = [
            (cluster1, cluster2 + 1), (cluster1 + 1, cluster2 - 1),
            (cluster1 + 1, cluster2), (cluster1 + 1, cluster2 + 1)
        ]
        idx = cluster_pairs[(cluster1, cluster2)]
        # Best cluster among neighboring clusters
        max_cluster = ((cluster1, cluster2), idx)
        for next_cluster1, next_cluster2 in neigh_clusters:
            if not (next_cluster1, next_cluster2) in cluster_pairs:
                continue
            if cluster_pairs[(next_cluster1, next_cluster2)] == -1:
                continue
            next_idx = cluster_pairs[(next_cluster1, next_cluster2)]
            if clusters_close(idx, next_idx):
                # Absorb the neighbor; keep it as representative only if its
                # qual is higher than that of the current cluster's call.
                cluster_pairs[(next_cluster1, next_cluster2)] = -1
                if bedpe_df.iloc[idx]['qual'] < bedpe_df.iloc[next_idx]['qual']:
                    max_cluster = ((next_cluster1, next_cluster2), next_idx)
        new_cluster_pairs[max_cluster[0]] = max_cluster[1]

    cluster_pairs = new_cluster_pairs

    # Now compute the number of mate breakpoints for each cluster
    num_mates = {}
    for (cluster1, cluster2) in cluster_pairs.keys():
        if not cluster1 in num_mates:
            num_mates[cluster1] = 0
        if not cluster2 in num_mates:
            num_mates[cluster2] = 0
        num_mates[cluster1] += 1
        # Do not count a self-pair twice.
        if cluster2 != cluster1:
            num_mates[cluster2] += 1

    # Select the surviving rows (in sorted cluster-pair order) and build
    # their updated info strings.
    sel_loc = []
    new_info_strs = []
    for (cluster1, cluster2) in sorted(cluster_pairs.keys()):
        sv_loc = cluster_pairs[(cluster1, cluster2)]
        if num_mates[cluster1] > max_nmates and num_mates[
                cluster2] > max_nmates:
            continue
        sel_loc.append(sv_loc)
        new_info_strs.append(
            tk_sv_io.update_info(bedpe_df.iloc[sv_loc]['info'],
                                 ['NMATES1', 'NMATES2', 'RESOLUTION'], [
                                     num_mates[cluster1], num_mates[cluster2],
                                     cluster_dist_ratio[(cluster1, cluster2)]
                                 ]))
    if len(sel_loc) > 0:
        bedpe_df = bedpe_df.iloc[sel_loc]
        bedpe_df['info'] = new_info_strs
    else:
        bedpe_df = pd.DataFrame(columns=bedpe_df.columns)
    if not out_bedpe is None:
        tk_sv_io.write_sv_df_to_bedpe(bedpe_df, out_bedpe)

    return bedpe_df
def compare_breaks(pred_loci, true_loci=None, max_dist=100, window_loci=None):
    """Match predicted SV calls to a reference set of SVs by breakpoint overlap.

    Each SV contributes two breakpoint tuples (chrom, start, stop, name). The
    two tuples of an SV are ordered so that the smaller one (by tuple
    comparison) is the "first" breakpoint. Predicted breakpoints are widened
    by max_dist on both sides (start clamped at 0); true breakpoints are used
    as given. A predicted call matches a true SV when get_region_overlaps
    reports the true SV for both the first and the second breakpoint of the
    predicted call.

    Args:
        pred_loci: BEDPE file with SV calls, or a pandas DataFrame as returned
            by tk_sv_io.read_sv_bedpe_to_df.
        true_loci: BEDPE file or DataFrame with the ground truth variants (or
            any other set of variants to compare pred_loci against).
        max_dist: maximum distance between a true and a predicted breakpoint
            for them to be considered overlapping.
        window_loci: list of tuples (chrom, starts, stops), where chrom is a
            chromosome name and starts/stops are lists/arrays of start and
            end positions. It is passed to loci_to_region_map; a true SV
            whose two breakpoints are on the same chromosome and overlap a
            common region of that map is reported as "filtered" (i.e. not
            detectable) and is left out of the matching.

    Returns:
        Tuple (pred_to_matching_true, true_to_matching_pred, filtered_svs):
        a dict from predicted name to the set of matching true names, the
        reverse dict, and the set of names of filtered true SVs. If either
        pred_loci or true_loci is None, all three are empty.
    """
    if pred_loci is None or true_loci is None:
        return ({}, {}, set())

    def _load(loci):
        # Accept either an already-loaded DataFrame or a BEDPE path.
        if isinstance(loci, pd.DataFrame):
            return loci
        return tk_sv_io.read_sv_bedpe_to_df(loci)

    # Predicted breakpoints, padded by max_dist.
    pred_df = _load(pred_loci)
    pred_first = []
    pred_second = []
    for _, call in pred_df.iterrows():
        call_name = call['name']
        left = (call.chrom1, max(0, call.start1 - max_dist),
                call.stop1 + max_dist, call_name)
        right = (call.chrom2, max(0, call.start2 - max_dist),
                 call.stop2 + max_dist, call_name)
        if left > right:
            left, right = right, left
        pred_first.append(left)
        pred_second.append(right)
    pred_regions1 = loci_to_named_region_map(pred_first, singletons=True)
    pred_regions2 = loci_to_named_region_map(pred_second, singletons=True)

    # Windows used to decide which true SVs are non-detectable.
    regions = loci_to_region_map(window_loci)

    # True breakpoints (not padded); filtered SVs are set aside.
    true_df = _load(true_loci)
    filtered_svs = set()
    true_first = []
    true_second = []
    for _, sv in true_df.iterrows():
        name = sv['name']
        chrom1, start1, stop1 = sv.chrom1, sv.start1, sv.stop1
        chrom2, start2, stop2 = sv.chrom2, sv.start2, sv.stop2
        left = (chrom1, start1, stop1, name)
        right = (chrom2, start2, stop2, name)
        if left > right:
            left, right = right, left

        if regions is not None and chrom1 == chrom2 and chrom1 in regions:
            # The SV is filtered if both its breakpoints fall on the same window.
            windows1 = regions[chrom1].overlapping_regions(start1, stop1)
            windows2 = regions[chrom2].overlapping_regions(start2, stop2)
            if set(windows1) & set(windows2):
                filtered_svs.add(name)
                continue

        true_first.append(left)
        true_second.append(right)
    true_regions1 = loci_to_named_region_map(true_first, singletons=True)
    true_regions2 = loci_to_named_region_map(true_second, singletons=True)

    # Overlaps between predicted and true breakpoints, per breakpoint side.
    mapping_break1 = get_region_overlaps(pred_regions1, true_regions1)
    mapping_break2 = get_region_overlaps(pred_regions2, true_regions2)

    pred_to_matching_true = {}
    true_to_matching_pred = {}
    for pred_name, first_hits in mapping_break1.iteritems():
        if pred_name not in mapping_break2:
            # Only one of the breakpoints of this predicted SV had a match.
            continue
        second_hits = mapping_break2[pred_name]
        for true_name in first_hits:
            if true_name not in second_hits:
                continue
            pred_to_matching_true.setdefault(pred_name, set()).add(true_name)
            true_to_matching_pred.setdefault(true_name, set()).add(pred_name)

    return (pred_to_matching_true, true_to_matching_pred, filtered_svs)
def main(args, outs):
    """Split one chunk of SV calls into kept calls and pileup-flagged calls.

    Reads the calls in args.sv_calls, restricts them to the positions
    [args.start_idx, args.stop_idx) via tk_sv_utils.get_dataframe_loc, and
    decides for every call whether it should be flagged:

    * Both breakpoints (extended by BREAK_EXT) are checked with
      has_too_many_clipped; the call counts as "clipped" only if both are.
    * Calls across chromosomes, or whose breakpoints are more than
      MAX_FRAG_SIZE apart, are flagged based on the clipping check alone.
    * Otherwise, coverage over the whole event is compared with coverage in
      flanking regions of half the event length on each side. If the median
      inside is below DEL_REL_COV or above DUP_REL_COV times the flanking
      median, the call is kept regardless of pileups or clipping.
    * Otherwise the call is flagged if it is clipped, or if any of the two
      first / two last binned coverage values exceeds args.min_rel_depth
      times the median binned coverage (requires more than 4 bins).

    Args:
        args: stage arguments; this function reads sv_calls, start_idx,
            stop_idx, possorted_bam, hap_coverage, max_clipped_frac and
            min_rel_depth.
        outs: stage outputs; unflagged calls are written to outs.sv_calls
            and flagged calls to outs.pileups, both as BEDPE.
    """
    pred_df = tk_sv_io.read_sv_bedpe_to_df(args.sv_calls)
    pred_df = tk_sv_utils.get_dataframe_loc(
        pred_df, list(range(args.start_idx, args.stop_idx)))

    in_bam = tk_bam.create_bam_infile(args.possorted_bam)
    cov_reader = tk_hdf5.DataFrameReader(args.hap_coverage)
    # Per-haplotype columns summed when 'coverage_deduped' is not present.
    sel_cols = ['cov_q30_hap0', 'cov_q30_hap1', 'cov_q30_hap2']

    # The np.bool / np.int aliases were removed in NumPy 1.24; the builtins
    # are exactly what those aliases pointed to.
    has_pileups = np.zeros((len(pred_df), ), dtype=bool)

    for i, (_, row) in enumerate(pred_df.iterrows()):
        has_clipped1 = has_too_many_clipped(
            in_bam,
            row.chrom1,
            max(0, row.start1 - BREAK_EXT),
            row.stop1 + BREAK_EXT,
            max_clipped_frac=args.max_clipped_frac)
        has_clipped2 = has_too_many_clipped(
            in_bam,
            row.chrom2,
            max(0, row.start2 - BREAK_EXT),
            row.stop2 + BREAK_EXT,
            max_clipped_frac=args.max_clipped_frac)
        has_clipped = has_clipped1 and has_clipped2

        # No coverage analysis for inter-chromosomal or very distal calls:
        # the clipping evidence alone decides.
        if row.chrom1 != row.chrom2 or row.start2 - row.stop1 > MAX_FRAG_SIZE:
            has_pileups[i] = has_clipped
            continue

        cov = cov_reader.query(
            (row.chrom1, max(0, row.start1 - BREAK_EXT),
             row.stop2 + BREAK_EXT))
        cov['bin'] = np.array(cov['pos'] / BIN_WIN, dtype=int)
        if 'coverage_deduped' not in cov.columns:
            cov['coverage_deduped'] = cov[sel_cols].sum(axis=1)
        # Select the column before averaging: same values as averaging every
        # column first, but does not trip over non-numeric columns.
        cov_arr = np.array(cov.groupby('bin')['coverage_deduped'].mean())
        median_cov = np.median(cov_arr)

        # Rescue for deletions or duplications with breakpoints on the pileups:
        # compare coverage inside the event with coverage in the flanks.
        sv_len = row.stop2 - row.start1
        half_len = sv_len / 2
        left_flank = cov_reader.query(
            (row.chrom1, max(0, row.start1 - BREAK_EXT - half_len),
             row.start1 - BREAK_EXT))
        right_flank = cov_reader.query(
            (row.chrom2, row.stop2 + BREAK_EXT,
             row.stop2 + BREAK_EXT + half_len))
        side_cov = pd.concat([left_flank, right_flank], ignore_index=True)
        if 'coverage_deduped' not in side_cov.columns:
            side_cov['coverage_deduped'] = side_cov[sel_cols].sum(axis=1)

        inner_median = np.median(cov.coverage_deduped)
        side_median = np.median(side_cov.coverage_deduped)

        # Ignore pileups, enough evidence for a large-scale copy number variant.
        if inner_median < DEL_REL_COV * side_median:
            continue
        if inner_median > DUP_REL_COV * side_median:
            continue

        # Filter out the call if there are pileups very close to the
        # breakpoints (two outermost bins on either end of the event).
        near_break_pileup = len(cov_arr) > 4 and np.any(
            cov_arr[[0, 1, -2, -1]] > args.min_rel_depth * median_cov)
        has_pileups[i] = near_break_pileup or has_clipped

    pileups = pred_df[has_pileups]
    pred_df = pred_df[np.logical_not(has_pileups)]

    tk_sv_io.write_sv_df_to_bedpe(pred_df, outs.sv_calls)
    tk_sv_io.write_sv_df_to_bedpe(pileups, outs.pileups)