def get_df_region_dist(sv_df, regions, region_names=None, use_orient=False):
    """Computes the distance between the breakpoints of the BEDPE (read as
    dataframe) and a set of regions. region_names is a dict mapping
    (start, stop) -> collection of names."""
    dists1 = np.inf * np.ones((len(sv_df), ))
    dists2 = np.inf * np.ones((len(sv_df), ))
    regions1 = [(None, None) for i in range(len(sv_df))]
    regions2 = [(None, None) for i in range(len(sv_df))]
    matched_names1 = ['.' for i in range(len(sv_df))]
    matched_names2 = ['.' for i in range(len(sv_df))]

    for i, (_, row) in enumerate(sv_df.iterrows()):
        if use_orient:
            sv_type = tk_sv_io.get_sv_type(row.info)
            if sv_type == 'DEL':
                orient = '-+'
            elif sv_type == 'DUP':
                orient = '+-'
            elif sv_type == 'INV':
                orient = '..'
            else:
                orient = tk_sv_io.get_break_orientation(row.info)
            if (orient == '..' and sv_type != 'INV') or sv_type == 'UNK' or sv_type == 'INS':
                continue
            orient1 = tk_regions.Dirs.from_str(orient[0])
            orient2 = tk_regions.Dirs.from_str(orient[1])
        else:
            orient1, orient2 = (None, None)

        chrom1, chrom2 = row.chrom1, row.chrom2

        if chrom1 in regions:
            s1, e1, d1 = regions[chrom1].get_closest_region_to_region(
                row.start1, row.stop1, direction=orient1)
            if s1 is not None:
                d1 = int(d1)
                regions1[i] = (s1, e1)
                if region_names is not None and (s1, e1) in region_names:
                    matched_names1[i] = ','.join(list(region_names[(s1, e1)]))
        else:
            d1 = np.inf

        if chrom2 in regions:
            s2, e2, d2 = regions[chrom2].get_closest_region_to_region(
                row.start2, row.stop2, direction=orient2)
            if s2 is not None:
                d2 = int(d2)
                regions2[i] = (s2, e2)
                if region_names is not None and (s2, e2) in region_names:
                    matched_names2[i] = ','.join(list(region_names[(s2, e2)]))
        else:
            d2 = np.inf

        dists1[i] = d1
        dists2[i] = d2

    return (dists1, dists2, regions1, regions2, matched_names1, matched_names2)
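
# A toy illustration (not the tenkit implementation) of the contract assumed
# above: regions[chrom].get_closest_region_to_region(start, stop, direction=d)
# returns (region_start, region_stop, distance), with distance 0 on overlap.
# Direction handling is omitted here; the region bounds are made up.
def _closest_region_sketch(region_list, start, stop):
    """region_list: list of (start, stop) tuples. Returns (s, e, dist)."""
    best = (None, None, float('inf'))
    for s, e in region_list:
        if e <= start:
            d = start - e    # region entirely to the left of the query
        elif s >= stop:
            d = s - stop     # region entirely to the right of the query
        else:
            d = 0            # region overlaps the query interval
        if d < best[2]:
            best = (s, e, d)
    return best

# Example: _closest_region_sketch([(0, 100), (200, 300)], 150, 160)
# returns (200, 300, 40): the right-hand region is the closer one, 40bp away.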

def prepare_loci(args):
    """Merge and sort input lists of candidate loci."""
    overlap_loci = []

    # Loci based on barcode overlaps. Type of SV is unknown.
    if args.overlap_loci is not None:
        with open(args.overlap_loci, 'rb') as f:
            loci = cPickle.load(f)
        overlap_loci.extend([(x[0], x[1], x[2], x[3], x[4], x[5], None)
                             for x in loci])

    # Low depth loci. These will only be evaluated for deletions.
    if args.low_depth_loci is not None:
        del_calls = tk_sv_io.read_sv_bedpe_to_df(args.low_depth_loci)
        for _, row in del_calls.iterrows():
            overlap_loci.append((row.chrom1, row.start1, row.stop1,
                                 row.chrom2, row.start2, row.stop2, 'DEL'))

    # Loci based on read-pair support. These will only be evaluated for the
    # type of SV supported by the readpairs.
    if args.rp_calls is not None:
        rp_calls = tk_sv_io.read_sv_bedpe_to_df(args.rp_calls)
        for _, row in rp_calls.iterrows():
            sv_type = tk_sv_io.get_sv_type(row.info)
            if sv_type not in ['DEL', 'INV', 'DUP']:
                sv_type = None
            else:
                sv_type = [sv_type]
            overlap_loci.append((row.chrom1, row.start1, row.stop1,
                                 row.chrom2, row.start2, row.stop2, sv_type))

    # Sort by position and also get the sorted indices.
    sorted_overlap_loci = sorted(
        overlap_loci, key=lambda x: (x[0], x[1], x[2], x[3], x[4], x[5]))
    sorted_overlap_loci_idx = sorted(
        range(len(overlap_loci)),
        key=lambda x: (overlap_loci[x][0], overlap_loci[x][1],
                       overlap_loci[x][2], overlap_loci[x][3],
                       overlap_loci[x][4], overlap_loci[x][5]))

    # If there is a single source of candidate loci, coming from a BEDPE, then
    # keep track of the names in the BEDPE, so you can annotate SV-calls made
    # with the BEDPE line from which they came.
    if args.overlap_loci is None and args.low_depth_loci is None and args.rp_calls is not None:
        input_calls = tk_sv_io.read_sv_bedpe_to_df(args.rp_calls)
        input_names = list(input_calls['name'])
        input_names = [input_names[n] for n in sorted_overlap_loci_idx]
    else:
        input_names = None

    return sorted_overlap_loci, input_names
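
# prepare_loci sorts the loci twice so that sorted_overlap_loci_idx records
# the permutation the sort applied; that permutation is what lets it reorder
# the parallel list of BEDPE names. A self-contained sketch of the pattern:
def _sort_with_index_sketch():
    loci = [('chr2', 10, 20), ('chr1', 5, 15), ('chr1', 0, 9)]
    order = sorted(range(len(loci)), key=lambda i: loci[i])
    sorted_loci = [loci[i] for i in order]
    # order == [2, 1, 0]; any parallel metadata list (e.g. BEDPE names) can
    # be reordered consistently with [names[i] for i in order].
    return sorted_loci, order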

def merge_calls_and_gt(call_df, gt_df, call_to_gt):
    if gt_df is not None:
        gt_df.index = gt_df['name']
    else:
        call_to_gt = {}

    out_call_df = None
    for _, row in call_df.iterrows():
        sv_type = tk_sv_io.get_sv_type(row.info)
        orient = tk_sv_io.get_break_orientation(row.info)
        row['orient'] = orient
        # Revert sv type name from DISTAL to TRANS to match ground truth
        # conventions.
        if sv_type == 'DISTAL':
            sv_type = 'TRANS'
        row['sv_type'] = sv_type

        matches = list(call_to_gt.get(row['name'], [None]))
        # One output row per match.
        for m in matches:
            row['match'] = m
            if m is not None and gt_df is not None:
                x = gt_df.loc[m]
                row['match_dist'] = max(
                    dist_to_breaks(int((row.start1 + row.stop1) / 2),
                                   x.start1, x.stop1),
                    dist_to_breaks(int((row.start2 + row.stop2) / 2),
                                   x.start2, x.stop2))
            else:
                row['match_dist'] = float('NaN')
            out_call_df = pd.concat([out_call_df, pd.DataFrame([row])],
                                    ignore_index=True)

    if gt_df is not None:
        out_call_df = pd.merge(out_call_df, gt_df, left_on='match',
                               right_on='name', how='outer',
                               suffixes=['', '_gt'])
        out_call_df.drop(['filters_gt', 'dist'], axis=1, inplace=True)
    out_call_df.sort('name', inplace=True)
    return out_call_df
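
# dist_to_breaks is defined elsewhere in this module. merge_calls_and_gt only
# relies on it returning the distance from the call's breakpoint midpoint to
# the ground-truth breakpoint interval (0 inside the interval). A plausible
# minimal version, for reference only; the real definition may differ:
def _dist_to_breaks_sketch(pos, start, stop):
    if start <= pos <= stop:
        return 0
    return min(abs(pos - start), abs(pos - stop))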

def prepare_gt(args):
    if args.gt_variants is None:
        return None

    true_df = tk_sv_io.read_sv_bedpe_to_df(args.gt_variants)
    # Length of ground truth sv.
    true_df['dist'] = tk_sv_io.get_sv_df_dists(true_df)

    sv_types = []
    orients = []
    tiers = []

    # Mark genic SVs.
    is_genic1 = np.zeros((len(true_df), ), dtype=np.int)
    is_genic2 = np.zeros((len(true_df), ), dtype=np.int)
    gene_regions = tk_reference.load_gene_boundaries(args.reference_path,
                                                     protein_coding=False)

    for row_idx, (_, row) in enumerate(true_df.iterrows()):
        if 'info' not in true_df.columns:
            sv_types.append('UNK')
            orients.append('..')
            tiers.append(0)
        else:
            sv_type = tk_sv_io.get_sv_type(row.info)
            if sv_type == 'DISTAL':
                sv_type = 'TRANS'
            sv_types.append(sv_type)
            orients.append(tk_sv_io.get_break_orientation(row.info))
            tiers.append(tk_sv_io.get_tier(row.info))

        is_genic1[row_idx] = int(
            row.chrom1 in gene_regions and
            bool(gene_regions[row.chrom1].overlapping_regions(row.start1, row.stop1)))
        is_genic2[row_idx] = int(
            row.chrom2 in gene_regions and
            bool(gene_regions[row.chrom2].overlapping_regions(row.start2, row.stop2)))

    true_df['break1_genic'] = is_genic1
    true_df['break2_genic'] = is_genic2
    # Number of breakpoints overlapping genes.
    true_df['genic_breaks'] = is_genic1 + is_genic2

    # Put all the un-tiered entries into the last tier.
    tiers = np.array(tiers)
    if len(tiers) == 0:
        total_tiers = 0
    else:
        total_tiers = np.max(tiers)
    tiers[tiers == 0] = total_tiers + 1
    true_df['tier'] = tiers
    true_df['sv_type'] = sv_types
    true_df['orient'] = orients

    if args.min_sv_len is not None:
        # Keep only svs with at least min_sv_len between their breakpoints.
        is_feasible = np.array(true_df['dist'] >= args.min_sv_len, dtype=np.bool)
    else:
        # Without a length cutoff, every ground truth sv is feasible.
        is_feasible = np.ones((len(true_df), ), dtype=np.bool)

    if args.targets is not None and args.target_dists is not None:
        target_regions = tk_sv_utils.bed_to_region_map(args.targets, merge=True)
        res = get_df_region_dist(true_df, target_regions, use_orient=True)
        targ_dists1, targ_dists2, targs1, targs2, _, _ = res

        # Move breakpoints onto the closest targets when both sides have one.
        new_starts1 = np.array(true_df.start1)
        new_stops1 = np.array(true_df.stop1)
        new_starts2 = np.array(true_df.start2)
        new_stops2 = np.array(true_df.stop2)
        for i, (t1, t2) in enumerate(zip(targs1, targs2)):
            if t1[0] is not None and t2[0] is not None:
                new_starts1[i], new_stops1[i] = t1
                new_starts2[i], new_stops2[i] = t2
        true_df['start1'] = new_starts1
        true_df['stop1'] = new_stops1
        true_df['start2'] = new_starts2
        true_df['stop2'] = new_stops2
        true_df['targ_dist'] = np.maximum(np.array(targ_dists1),
                                          np.array(targ_dists2))
    else:
        true_df['targ_dist'] = np.zeros((len(true_df), ), dtype=np.int)

    true_df['feasible'] = is_feasible
    return true_df
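
# prepare_gt re-buckets "untiered" ground-truth entries (tier 0) into a new
# last tier so that later `tier <= cutoff` selections never pick them up by
# accident. A self-contained sketch of that reassignment:
def _tier_bucket_sketch():
    import numpy as np
    tiers = np.array([1, 2, 0, 1, 0])
    total_tiers = np.max(tiers) if len(tiers) > 0 else 0
    tiers[tiers == 0] = total_tiers + 1
    return tiers  # array([1, 2, 3, 1, 3]): untiered entries sort last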

def add_filters(pred_df, pred_to_match, black_dists1, black_dists2,
                black_names1, black_names2, seg_dup_calls, all_bad_regions,
                args):
    if args.targets is not None:
        min_call_qv = args.min_call_qv_target
    else:
        min_call_qv = args.min_call_qv_wgs

    if args.coverage is None:
        # Used for WGS.
        max_bc_cov = SV_DEFAULT_MAX_BC_COV
        bc_mean_depth = 200
    else:
        # Used for exome.
        with open(args.coverage, 'r') as f:
            cov_res = json.load(f)
        bc_summary_depth_info = cov_res['summary_bc_depth_info']
        bc_mean_depth, _, _ = get_depth_info_json(bc_summary_depth_info)
        max_bc_cov = args.max_bc_cov_factor * bc_mean_depth

    if args.keep_filters:
        filter_strs = [s for s in pred_df.filters]
    else:
        filter_strs = ['.' for i in range(len(pred_df))]
    info_strs = [s for s in pred_df['info']]
    rps = np.zeros((len(pred_df), ), dtype=np.int)

    def get_cov_frac(black_regions, chrom, start, stop):
        regions = tk_sv_utils.strictly_overlapping_regions(
            black_regions, chrom, start, stop)
        tot_black = np.sum([r[1] - r[0] for r in regions])
        tot_len = float(stop - start)
        black_frac = tk_stats.robust_divide(tot_black, tot_len)
        return black_frac

    for i, (_, row) in enumerate(pred_df.iterrows()):
        npairs = tk_sv_io.get_npairs(row['info'])
        nsplit = tk_sv_io.get_nsplit(row['info'])
        rps[i] = npairs + nsplit

        sv_type = tk_sv_io.get_sv_type(row['info'])
        name = row['name']
        qual = row.qual

        ####### Filtering for read-pair calls #######
        frac_on_hap = tk_sv_io.extract_sv_info(row.info, ['FRAC_HAP_SUPPORT'])[0]
        allelic_frac = tk_sv_io.extract_sv_info(row.info, ['HAP_ALLELIC_FRAC'])[0]
        if allelic_frac != '':
            allelic_frac = float(allelic_frac)

        if args.is_germline is None:
            if qual < min_call_qv:
                filter_strs[i] = tk_sv_io.update_filters(filter_strs[i], 'LOWQ', None)
            if args.min_allelic_frac is not None and frac_on_hap is not None and \
               frac_on_hap != '' and float(frac_on_hap) < args.min_allelic_frac:
                filter_strs[i] = tk_sv_io.update_filters(filter_strs[i], 'LOWQ', None)
            if args.min_allelic_frac is not None and allelic_frac != '' and \
               float(allelic_frac) < args.min_allelic_frac:
                filter_strs[i] = tk_sv_io.update_filters(filter_strs[i], 'LOWQ', None)
        elif args.targets is None:
            if args.is_germline:
                martian.log_info('Mean barcode depth {}'.format(bc_mean_depth))
                min_call_qv = max(min_call_qv, bc_mean_depth / 10.0)
                martian.log_info('Support cutoff: {} barcodes'.format(min_call_qv))
                enough_bcs = qual >= min_call_qv
                is_good = allelic_frac > 0.8 or (sv_type == 'INV' and allelic_frac > 0.6)
                is_good = is_good and enough_bcs
                if not is_good:
                    filter_strs[i] = tk_sv_io.update_filters(filter_strs[i], 'LOWQ', None)
            else:
                min_call_qv = max(min_call_qv, 4)
                is_good = allelic_frac > 0.6 and qual >= min_call_qv
                if not is_good:
                    filter_strs[i] = tk_sv_io.update_filters(filter_strs[i], 'LOWQ', None)
        else:
            if args.is_germline:
                # Harder to get confident support in exome.
                min_call_qv = max(min_call_qv, bc_mean_depth / 10.0)
                martian.log_info('Support cutoff: {} barcodes'.format(min_call_qv))
                # Apply a very lenient filter on allelic fraction because lots
                # of barcodes can be unphased.
                is_good = qual >= min_call_qv and allelic_frac > 0.05
                af = tk_sv_io.extract_sv_info(row.info, ['ALLELIC_FRAC'])[0]
                if af != '':
                    af = float(af)
                    is_good = is_good and af > 0.04
            else:
                min_call_qv = max(min_call_qv, 4)
                is_good = qual >= min_call_qv
            if not is_good:
                filter_strs[i] = tk_sv_io.update_filters(filter_strs[i], 'LOWQ', None)

        if black_dists1 is not None:
            chrom1, chrom2 = row.chrom1, row.chrom2
            black_dist1, black_dist2 = black_dists1[i], black_dists2[i]
            if chrom1 == chrom2:
                if chrom1 in all_bad_regions:
                    black_frac = get_cov_frac(all_bad_regions, chrom1,
                                              row.stop1, row.start2)
                else:
                    black_frac = 0.0
            else:
                black_frac = float('NaN')
        else:
            black_dist1 = np.inf
            black_dist2 = np.inf
            black_frac = float('NaN')

        filter_strs[i] = tk_sv_io.update_filters(filter_strs[i], 'BLACK_DIST',
                                                 min(black_dist1, black_dist2),
                                                 args.min_dist_from_black)
        filter_strs[i] = tk_sv_io.update_filters(filter_strs[i], 'BLACK_FRAC',
                                                 black_frac, 0,
                                                 args.max_frac_black)

        bname1 = '.'
        bname2 = '.'
        if black_dist1 < args.min_dist_from_black or re.search('BLACK_FRAC', filter_strs[i]):
            bname1 = black_names1[i]
        if black_dist2 < args.min_dist_from_black or re.search('BLACK_FRAC', filter_strs[i]):
            bname2 = black_names2[i]

        if name in seg_dup_calls:
            filter_strs[i] = tk_sv_io.update_filters(filter_strs[i], 'SEG_DUP', None)
            seg_dup_match = ','.join(list(seg_dup_calls[name]))
        else:
            seg_dup_match = '.'

        nbcs1 = tk_sv_io.get_nbcs1(row.info)
        nbcs2 = tk_sv_io.get_nbcs2(row.info)
        if nbcs1 is not None and nbcs2 is not None and (nbcs1 > max_bc_cov or nbcs2 > max_bc_cov):
            filter_strs[i] = tk_sv_io.update_filters(filter_strs[i], 'HIGH_BC_COV', None)

        filter_strs[i] = tk_sv_io.update_filters(filter_strs[i], 'READ_SUPPORT',
                                                 npairs + nsplit,
                                                 min_val=args.min_read_support)

        match_str = ','.join([str(s) for s in pred_to_match.get(name, '.')])

        if args.targets is not None:
            # Disable orientation reporting in exome.
            info_strs[i] = tk_sv_io.update_info(info_strs[i], ['ORIENT'], [None])

        info_strs[i] = tk_sv_io.update_info(
            info_strs[i],
            ['BLACK_DIST1', 'BLACK_DIST2', 'BLACK_FRAC', 'BLACK1', 'BLACK2',
             'MATCHES', 'SEG_DUP'],
            [black_dist1, black_dist2, black_frac, bname1, bname2, match_str,
             seg_dup_match])

    pred_df['filters'] = filter_strs
    pred_df['info'] = info_strs
    pred_df['read_support'] = rps

    return pred_df, min_call_qv
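
# tk_sv_io.update_filters is only ever used above to append filter names to a
# semicolon-separated filter string ('.' meaning "no filters yet"). A
# hypothetical minimal version matching the call sites above (value=None means
# the caller already decided the filter fires; NaN values are left
# unfiltered); the real tenkit implementation may differ:
def _update_filters_sketch(filter_str, name, value=None, min_val=None, max_val=None):
    import numpy as np
    failed = value is None
    if value is not None and not (isinstance(value, float) and np.isnan(value)):
        if min_val is not None and value < min_val:
            failed = True
        if max_val is not None and value > max_val:
            failed = True
    if not failed:
        return filter_str
    return name if filter_str == '.' else filter_str + ';' + name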

def join(args, outs, chunk_defs, chunk_outs):
    pred_to_match, _, pred_df, true_df, min_qv = merge_predictions(chunk_outs)

    # Change TRANS type to DISTAL. This change will only affect the type
    # reported, not the names of the metrics.
    new_info = []
    for _, row in pred_df.iterrows():
        sv_type = tk_sv_io.get_sv_type(row.info)
        if sv_type == 'TRANS':
            sv_type = 'DISTAL'
        new_info.append(tk_sv_io.update_info(row.info, ['TYPE'], [sv_type]))
    pred_df['info'] = new_info

    if true_df is not None:
        true_df.to_csv(outs.feasible_gt, index=False, header=True, sep='\t',
                       na_rep='NaN')

    ##### Write BEDPE/VCF outputs
    tk_sv_io.write_sv_df_to_bedpe(pred_df, outs.sv_candidates)
    source_str = '10X/pipelines/stages/analyze_sv_calls {}'.format(
        martian.get_pipelines_version())
    sample_id = 'sample' if args.sample_id is None else args.sample_id
    tk_sv_io.bedpe_to_vcf(outs.sv_candidates, outs.svs.strip('.gz'), sample_id,
                          source_str, args.reference_path)
    # This will sort and gzip.
    tk_sv_io.index_sv_vcf(outs.svs.strip('.gz'))
    outs.svs_index = outs.svs + '.tbi'
    # Delete the non-gzipped file.
    os.remove(outs.svs.strip('.gz'))

    if not pred_df.empty:
        call_df = pred_df[np.logical_or(pred_df['filters'] == '.',
                                        pred_df['filters'] == 'PASS')]
    else:
        call_df = None

    tk_sv_io.write_sv_df_to_bedpe(call_df, outs.sv_calls)

    # Annotate each call with the matching ground truth svs. The resulting
    # dataframe might have multiple rows for the same call if there were
    # multiple matching ground truth svs.
    martian.log_info("merging calls and gt")
    if not pred_df.empty:
        pred_df = merge_calls_and_gt(pred_df, true_df, pred_to_match)

    martian.log_info("writing call_tsv")
    pred_df.to_csv(outs.call_tsv, index=False, header=True, sep='\t',
                   na_rep='NaN')

    pred_df = pred_df[np.logical_not(pd.isnull(pred_df['name']))]

    max_dists = sorted(np.array(args.detect_dists))

    gt_sv_types = get_all_sv_types(true_df)
    call_sv_types = get_all_sv_types(pred_df)

    if true_df is not None:
        # Use the default MAX_PPV_TIER unless this is greater than the maximum
        # tier present in the data.
        max_ppv_tier = min(MAX_PPV_TIER, np.max(true_df.tier))
        # Use the default unless this is smaller than the minimum tier present
        # in the data.
        max_sens_tier = max(MAX_SENS_TIER, np.min(true_df.tier))
    else:
        max_ppv_tier = 1
        max_sens_tier = 1

    tiers = [max_ppv_tier, max_sens_tier]

    # All combinations of filters in ground truth and call set.
    if args.targets is not None and args.target_dists is not None:
        target_dists = list(sorted(np.array(args.target_dists, dtype=np.float)))
        target_dists.append(float('NaN'))
    else:
        target_dists = [float('NaN')]

    combs = product([0, 1, 2, None], target_dists, gt_sv_types, tiers,
                    [True, False], call_sv_types, max_dists)

    metrics = defaultdict(list)

    gt_filters = ['genic_breaks', 'target_dist', 'gt_sv_type', 'tier']
    call_filters = ['call_filtered', 'call_sv_type', 'match_dist']

    for (genic_breaks, tdist, gt_sv_type, tier, is_filtered, call_sv_type,
         dist) in combs:
        if gt_sv_type != 'NA' and call_sv_type != 'NA' and gt_sv_type != call_sv_type:
            continue

        metrics['genic_breaks'].append(genic_breaks)
        metrics['target_dist'].append(tdist)
        metrics['gt_sv_type'].append(gt_sv_type)
        metrics['tier'].append(tier)
        metrics['call_filtered'].append(is_filtered)
        metrics['call_sv_type'].append(call_sv_type)
        metrics['match_dist'].append(dist)

        if true_df is None:
            sel_true_df = None
        else:
            sel_true_df = true_df
            if gt_sv_type != 'NA':
                sel_true_df = sel_true_df[sel_true_df.sv_type == gt_sv_type]
            if not np.isnan(tdist):
                sel_true_df = sel_true_df[sel_true_df.targ_dist <= tdist]
            sel_true_df = sel_true_df[sel_true_df.tier <= tier]
            # Restrict to genic or non-genic, or take everything if this is
            # None.
            if genic_breaks is not None:
                sel_true_df = sel_true_df[sel_true_df.genic_breaks == genic_breaks]

            if len(sel_true_df) == 0:
                sel_true_df = None

        sel_pred_df = pred_df
        if is_filtered and not pred_df.empty:
            sel_pred_df = sel_pred_df[(sel_pred_df.filters == '.') |
                                      (sel_pred_df.filters == 'PASS')]
        if call_sv_type != 'NA' and not pred_df.empty:
            sel_pred_df = sel_pred_df[sel_pred_df.sv_type == call_sv_type]
        if not pred_df.empty and (args.min_rel_overlap is None or
                                  args.min_rel_overlap == 0):
            # Do not apply this filter if the matching is done based on
            # overlap.
            sel_pred_df = sel_pred_df[np.logical_or(
                np.isnan(sel_pred_df.match_dist),
                sel_pred_df.match_dist <= dist)]

        add_metrics(sel_pred_df, sel_true_df, metrics)

    column_names = gt_filters
    column_names.extend(call_filters)
    other_names = set(metrics.keys()).difference(set(column_names))
    column_names.extend(other_names)

    metric_df = pd.DataFrame(metrics)
    metric_df = metric_df[column_names]

    martian.log_info("writing summary tsv")
    metric_df.to_csv(outs.summary_tsv, index=False, header=True, sep='\t',
                     na_rep='NaN')

    short_metrics = get_short_metrics(metric_df, other_names, max_ppv_tier,
                                      max_sens_tier, args)

    if args.call_summary is not None:
        with open(args.call_summary, 'r') as in_summary_fn:
            in_summary = json.load(in_summary_fn)
            for key, val in in_summary.iteritems():
                short_metrics[key] = val

    short_metrics['min_qv'] = min_qv

    with open(outs.summary, 'w') as out_file:
        out_file.write(tenkit.safe_json.safe_jsonify(short_metrics, pretty=True))
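
# add_metrics and get_short_metrics are defined elsewhere; the sweep in join()
# just re-evaluates them on every combination of ground-truth and call-set
# filters. The core quantities are the standard ones; a self-contained sketch
# of sensitivity and PPV from matched call/truth counts (names illustrative):
def _sens_ppv_sketch(num_calls, num_matched_calls, num_truth, num_detected_truth):
    sens = num_detected_truth / float(num_truth) if num_truth else float('NaN')
    ppv = num_matched_calls / float(num_calls) if num_calls else float('NaN')
    return sens, ppv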

def main(args, outs):
    sv_df = read_bedpes(args)
    sv_df = tk_sv_utils.get_dataframe_loc(
        sv_df, list(range(int(args.start_idx), int(args.stop_idx))))

    max_insert, ins_logsf_fun = tk_sv_utils.get_insert_size_info(args.insert_sizes)
    print >> sys.stderr, 'max insert', max_insert
    if max_insert is None:
        tk_sv_io.write_sv_df_to_bedpe(None, outs.sv_calls)
        tk_sv_io.write_sv_df_to_bedpe(sv_df, outs.non_pass_sv_calls)
        return

    with open(args.basic_summary, 'r') as f:
        summary = json.load(f)

    chimera_rate_del = summary['far_chimera_rate']
    chimera_rate_inv = summary['far_chimera_rate'] + summary['same_dir_chimera_rate']
    chimera_rate_dup = summary['far_chimera_rate'] + summary['outward_dir_chimera_rate']

    chimera_rates = {
        tk_readpairs.DEL_STR: chimera_rate_del,
        tk_readpairs.INV_STR: chimera_rate_inv,
        tk_readpairs.TDUP_STR: chimera_rate_dup,
        tk_readpairs.TRANS_STR: summary['far_chimera_rate']
    }

    in_bam = tk_bam.create_bam_infile(args.possorted_bam)
    frag_phasing = tk_tabix.create_tabix_infile(args.fragment_phasing)

    pass_calls = []
    non_pass_calls = []

    for i, (_, row) in enumerate(sv_df.iterrows()):
        sv_type = tk_sv_io.get_sv_type(row.info)
        middle = int(0.5 * (row.stop1 + row.start2))

        # Bail out on all non-deletions.
        if sv_type != tk_readpairs.DEL_STR:
            continue

        if row.chrom1 == row.chrom2:
            r1 = (max(0, row.start1 - args.break_pad),
                  min(middle, row.stop1 + args.break_pad))
            r2 = (max(middle, row.start2 - args.break_pad),
                  row.stop2 + args.break_pad)
            if row.start2 - row.stop1 > 4 * args.break_pad:
                starts = [r1[0], r2[0]]
                stops = [r1[1], r2[1]]
                chroms = [row.chrom1, row.chrom2]
            else:
                # Breakpoint windows are close: fetch one merged region.
                starts = [r1[0]]
                stops = [r2[1]]
                chroms = [row.chrom1]
        else:
            r1 = (max(0, row.start1 - args.break_pad), row.stop1 + args.break_pad)
            r2 = (max(0, row.start2 - args.break_pad), row.stop2 + args.break_pad)
            starts = [r1[0], r2[0]]
            stops = [r1[1], r2[1]]
            chroms = [row.chrom1, row.chrom2]

        bc_cov1 = len(get_frag_coverage(frag_phasing, row.chrom1, r1[0], r1[1]))
        bc_cov2 = len(get_frag_coverage(frag_phasing, row.chrom2, r2[0], r2[1]))
        if sv_type == tk_readpairs.DEL_STR and max(bc_cov1, bc_cov2) > MAX_DEL_BC_DEPTH:
            print >> sys.stderr, 'Too many barcodes in DEL candidate', row.chrom1, row.start1, row.stop2
            continue

        readpairs = tk_readpairs.get_readpairs(in_bam, chroms, starts, stops,
                                               max_insert=max_insert,
                                               min_mapq=args.min_mapq)

        normal_readpairs = [rp for rp in readpairs
                            if rp.sv_type == tk_readpairs.NORMAL_STR]
        if len(normal_readpairs) > MAX_DEL_READPAIRS:
            sel = np.random.choice(len(normal_readpairs), MAX_DEL_READPAIRS)
        else:
            sel = np.arange(len(normal_readpairs))
        normal_readpairs = [normal_readpairs[ridx] for ridx in sel]

        # Distal readpairs across the breakpoints.
        dist_readpairs = [
            rp for rp in readpairs
            if rp.sv_type == sv_type and
            ((tk_readpairs.pos_overlaps(rp.read1.pos, r1) and
              tk_readpairs.pos_overlaps(rp.read2.pos, r2)) or
             (tk_readpairs.pos_overlaps(rp.read1.pos, r2) and
              tk_readpairs.pos_overlaps(rp.read2.pos, r1)))
        ]
        if len(dist_readpairs) > MAX_DEL_READPAIRS:
            sel = np.random.choice(len(dist_readpairs), MAX_DEL_READPAIRS)
        else:
            sel = np.arange(len(dist_readpairs))
        dist_readpairs = [dist_readpairs[ridx] for ridx in sel]

        dist_readpairs.extend(normal_readpairs)

        if sv_type == tk_readpairs.DEL_STR and len(starts) == 2:
            more_readpairs = tk_readpairs.get_readpairs(in_bam, [row.chrom1],
                                                        [r1[1] + 1], [r2[0] - 1],
                                                        max_insert=max_insert,
                                                        min_mapq=args.min_mapq,
                                                        normal_only=True)
            if len(more_readpairs) > MAX_DEL_READPAIRS:
                sel = np.random.choice(len(more_readpairs), MAX_DEL_READPAIRS)
            else:
                sel = np.arange(len(more_readpairs))
            dist_readpairs.extend([more_readpairs[ridx] for ridx in sel
                                   if more_readpairs[ridx].sv_type == tk_readpairs.NORMAL_STR])

        # groupby only merges adjacent equal keys, so group over the
        # barcode-sorted list.
        readpairs = sorted(dist_readpairs, key=lambda x: x.barcode)
        read_groups = {}
        for bc, read_group_iter in groupby(readpairs, lambda x: x.barcode):
            read_groups[bc] = list(read_group_iter)
        bc_set = set(read_groups.keys())
        bc_list = sorted(read_groups.keys())

        phase_set1 = tk_sv_utils.get_phase_set(frag_phasing, row.chrom1, r1[0], r1[1])
        phase_set2 = tk_sv_utils.get_phase_set(frag_phasing, row.chrom2, r2[0], r2[1])

        if len(bc_list) < 1:
            print >> sys.stderr, 'Not enough barcodes. Skipping'
            continue

        bc_phase_sets1 = tk_sv_utils.get_barcode_phase_probs(
            frag_phasing, row.chrom1, r1[0], r1[1], bc_set, in_ps=phase_set1)
        bc_phase_sets2 = tk_sv_utils.get_barcode_phase_probs(
            frag_phasing, row.chrom2, r2[0], r2[1], bc_set, in_ps=phase_set2)

        cand_breaks1 = np.arange(r1[0], r1[1] + 1, 5)
        cand_breaks2 = np.arange(r2[0], r2[1] + 1, 5)

        res = tk_readpairs.eval_sv_em(read_groups, cand_breaks1, cand_breaks2,
                                      sv_type, chimera_rates, phase_set1,
                                      phase_set2, bc_phase_sets1,
                                      bc_phase_sets2, max_insert,
                                      ins_logsf_fun, em_iters=args.em_iters)
        ((no_sv_max, sv_max, het_sv_max), max_locus, zygosity, max_hap,
         prior_hap_probs, hap_probs, support) = res

        lr = sv_max - no_sv_max if max_hap is None else het_sv_max - no_sv_max

        hap_probs1 = hap_probs[:, 0:2]
        hap_probs2 = hap_probs[:, 2:]

        new_call = sv_call.SvCall.from_em_results(
            row.chrom1, row.chrom2, phase_set1, phase_set2,
            (no_sv_max, sv_max, het_sv_max), max_locus,
            sv_call._SvType(sv_type, ('.', '.')), zygosity, max_hap, support,
            (hap_probs1, hap_probs2, None))

        # The break interval is inclusive.
        if lr >= args.min_lr and new_call.qual >= args.min_qv and \
           new_call.break2[0] - new_call.break1[1] + 1 >= args.min_sv_len:
            pass_calls.append(new_call)
        else:
            # Leave breakpoints unchanged.
            new_call.break1 = (row.start1, row.stop1)
            new_call.break2 = (row.start2, row.stop2)
            non_pass_calls.append(new_call)

    out_df = sv_call.SvCall.svs_to_dataframe(pass_calls)
    tk_sv_io.write_sv_df_to_bedpe(out_df, outs.sv_calls)

    out_df = sv_call.SvCall.svs_to_dataframe(non_pass_calls)
    tk_sv_io.write_sv_df_to_bedpe(out_df, outs.non_pass_sv_calls)

    in_bam.close()
    frag_phasing.close()
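
# itertools.groupby only merges *adjacent* equal keys, which is why the
# readpairs are sorted by barcode before grouping above. A minimal sketch of
# the same sort-then-group pattern:
def _group_by_barcode_sketch(readpairs):
    from itertools import groupby
    readpairs = sorted(readpairs, key=lambda rp: rp.barcode)
    return dict((bc, list(group))
                for bc, group in groupby(readpairs, key=lambda rp: rp.barcode))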

def main(args, outs):
    bedpe_df = tk_sv_io.read_sv_bedpe_to_df(args.sv_variants)
    bedpe_df = tk_sv_utils.get_dataframe_loc(
        bedpe_df, list(range(int(args.start_idx), int(args.stop_idx))))

    max_insert, ins_logsf_fun = tk_sv_utils.get_insert_size_info(args.insert_sizes)
    if max_insert is None:
        martian.throw('No Q60 reads')

    with open(args.basic_summary, 'r') as f:
        summary = json.load(f)

    chimera_rate_del = summary['far_chimera_rate']
    chimera_rate_inv = summary['far_chimera_rate'] + summary['same_dir_chimera_rate']
    chimera_rate_trans = summary['far_chimera_rate']
    chimera_rate_dup = summary['far_chimera_rate'] + summary['outward_dir_chimera_rate']

    chimera_rates = {
        tk_readpairs.DEL_STR: chimera_rate_del,
        tk_readpairs.INV_STR: chimera_rate_inv,
        tk_readpairs.TDUP_STR: chimera_rate_dup,
        tk_readpairs.TRANS_FF_STR: chimera_rate_trans,
        tk_readpairs.TRANS_FR_STR: chimera_rate_trans,
        tk_readpairs.TRANS_RR_STR: chimera_rate_trans,
        tk_readpairs.TRANS_RF_STR: chimera_rate_trans
    }

    in_bam = tk_bam.create_bam_infile(args.input)

    out_quals = []
    out_infos = []
    out_chroms1 = []
    out_starts1 = []
    out_stops1 = []
    out_chroms2 = []
    out_starts2 = []
    out_stops2 = []

    for i, (_, row) in enumerate(bedpe_df.iterrows()):
        in_svt = tk_sv_io.get_sv_type(row.info)

        if row.chrom1 == row.chrom2:
            max_ext = min(args.break_extend, int((row.start2 - row.stop1) / 3.0))
            r1 = (max(0, row.start1 - args.break_extend), row.stop1 + max_ext)
            r2 = (max(0, row.start2 - max_ext), row.stop2 + args.break_extend)
            if r1[1] > r2[0]:
                # Breakpoint windows overlap: fetch one merged region.
                starts = [r1[0]]
                stops = [r2[1]]
                chroms = [row.chrom1]
            else:
                starts = [r1[0], r2[0]]
                stops = [r1[1], r2[1]]
                chroms = [row.chrom1, row.chrom2]
        else:
            r1 = (max(0, row.start1 - args.break_extend), row.stop1 + args.break_extend)
            r2 = (max(0, row.start2 - args.break_extend), row.stop2 + args.break_extend)
            starts = [r1[0], r2[0]]
            stops = [r1[1], r2[1]]
            chroms = [row.chrom1, row.chrom2]

        readpairs = tk_readpairs.get_readpairs2(in_bam, chroms, starts, stops,
                                                max_insert=max_insert,
                                                min_mapq=args.min_mapq)

        # Distal readpairs across the breakpoints.
        dist_readpairs = filter(filter_fun(in_bam, row.chrom1, row.chrom2, r1, r2),
                                readpairs)

        if len(dist_readpairs) > MAX_READPAIRS:
            sel = np.random.choice(len(dist_readpairs), MAX_READPAIRS)
        else:
            sel = np.arange(len(dist_readpairs))
        dist_readpairs = [dist_readpairs[ridx] for ridx in sel]

        res_arr = tk_readpairs.get_sv_lr(dist_readpairs, ins_logsf_fun,
                                         max_insert, chimera_rates)

        if len(res_arr) == 0:
            out_quals.append(row.qual)
            out_chroms1.append(row.chrom1)
            out_starts1.append(row.start1)
            out_stops1.append(row.stop1)
            out_chroms2.append(row.chrom2)
            out_starts2.append(row.start2)
            out_stops2.append(row.stop2)
            out_infos.append(row['info'])
        else:
            if args.best_only:
                res_arr = sorted(res_arr, key=lambda x: x[0], reverse=True)
                res_arr = [res_arr[0]]

            for (lr, num_split, num_pairs, sv_len, support_range, svt,
                 support_readpairs) in res_arr:
                range1, range2 = support_range
                if num_split + num_pairs >= args.min_reads_to_call and \
                   lr >= args.min_lr_to_call and range1 is not None and range2 is not None:
                    out_quals.append(row.qual + args.rp_lr_multiplier * lr)
                    out_chroms1.append(row.chrom1)
                    out_starts1.append(range1[0])
                    out_stops1.append(range1[1])
                    out_chroms2.append(row.chrom2)
                    out_starts2.append(range2[0])
                    out_stops2.append(range2[1])
                    if svt != in_svt and in_svt != 'TRANS':
                        in_svt = 'UNK'
                else:
                    out_quals.append(row.qual)
                    out_chroms1.append(row.chrom1)
                    out_starts1.append(row.start1)
                    out_stops1.append(row.stop1)
                    out_chroms2.append(row.chrom2)
                    out_starts2.append(row.start2)
                    out_stops2.append(row.stop2)
                out_infos.append(tk_sv_io.update_info(
                    row['info'],
                    ['NPAIRS', 'NSPLIT', 'RP_LR', 'RP_TYPE', 'TYPE'],
                    [num_pairs, num_split, lr, svt, in_svt]))

    in_bam.close()

    if args.best_only:
        out_names = [n for n in bedpe_df['name']]
    else:
        # One fresh name per output row (a single input can emit multiple
        # calls when best_only is off).
        out_names = np.arange(len(out_quals))

    out_df = tk_sv_io.create_sv_df(out_chroms1, out_starts1, out_stops1,
                                   out_chroms2, out_starts2, out_stops2,
                                   out_names, out_quals, out_infos)
    tk_sv_io.write_sv_df_to_bedpe(out_df, outs.sv_variants)
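
# tk_sv_io.update_info is assumed, from the call sites above, to rewrite
# KEY=VALUE pairs in the BEDPE info column, with ';' separating pairs, '.'
# standing for an empty field, and a None value deleting the key. A
# hypothetical minimal version (key order not preserved); the real tenkit
# implementation may differ:
def _update_info_sketch(info, keys, values):
    fields = {}
    if info != '.':
        fields = dict(f.split('=', 1) for f in info.split(';') if '=' in f)
    for key, value in zip(keys, values):
        if value is None:
            fields.pop(key, None)
        else:
            fields[key] = str(value)
    return ';'.join('%s=%s' % kv for kv in sorted(fields.items())) or '.'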