Example #1
def main(args, outs):
    # Read the SV calls and restrict them to this chunk's row range.
    pred_df = tk_sv_io.read_sv_bedpe_to_df(args.sv_calls)
    pred_df = tk_sv_utils.get_dataframe_loc(
        pred_df, list(range(args.start_idx, args.stop_idx)))

    in_bam = tk_bam.create_bam_infile(args.possorted_bam)

    # Per-call fraction of low-MAPQ reads around the extended breakpoints.
    frac_changed = np.zeros((len(pred_df), ), dtype=float)

    for i, (_, row) in enumerate(pred_df.iterrows()):
        frac_changed[i] = get_frac_mapq_changed(in_bam,
                                                row.chrom1,
                                                max(0, row.start1 - BREAK_EXT),
                                                row.stop1 + BREAK_EXT,
                                                row.chrom2,
                                                max(0, row.start2 - BREAK_EXT),
                                                row.stop2 + BREAK_EXT,
                                                min_mapq=60)

    # Calls with too many low-MAPQ reads are set aside as likely pileups.
    pileups = pred_df[frac_changed > args.max_frac_low_mapq]
    pred_df = pred_df[frac_changed <= args.max_frac_low_mapq]

    tk_sv_io.write_sv_df_to_bedpe(pred_df, outs.sv_calls)
    tk_sv_io.write_sv_df_to_bedpe(pileups, outs.pileups)
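The helper get_frac_mapq_changed is defined elsewhere in the pipeline. As a rough illustration of the quantity it plausibly computes, the fraction of reads in the two extended breakpoint windows mapping below the MAPQ threshold, here is a minimal sketch assuming in_bam is the pysam.AlignmentFile returned by tk_bam.create_bam_infile; the function name and exact counting are assumptions.

def frac_low_mapq_sketch(in_bam, chrom1, start1, stop1,
                         chrom2, start2, stop2, min_mapq=60):
    # Hypothetical stand-in for get_frac_mapq_changed: fraction of mapped
    # reads in the two breakpoint windows with MAPQ below min_mapq.
    total, low = 0, 0
    for chrom, start, stop in ((chrom1, start1, stop1),
                               (chrom2, start2, stop2)):
        for read in in_bam.fetch(chrom, start, stop):
            if read.is_unmapped:
                continue
            total += 1
            if read.mapping_quality < min_mapq:
                low += 1
    return low / float(total) if total > 0 else 0.0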
Example #2
def main(args, outs):
    pred_df = tk_sv_io.read_sv_bedpe_to_df(args.renamed_variants)
    pred_df = tk_sv_utils.get_dataframe_loc(
        pred_df, list(range(args.start_idx, args.stop_idx)))

    if args.gt_variants is not None:
        true_df = prepare_gt(args)
        true_df.to_csv(outs.feasible_gt,
                       index=False,
                       header=True,
                       sep='\t',
                       na_rep='NaN')
    else:
        true_df = None

    #### Get matches between this chunk of the calls and the ground truth
    max_detect_dist = np.max(np.array(args.detect_dists))
    res = get_matches(pred_df, true_df, max_detect_dist, args.min_rel_overlap)
    pred_to_match, true_to_match, _ = res

    #### Apply filters
    if len(pred_df) > 0:
        # Loading all these files can take a while. Don't do it if there are no SVs to analyze.

        # blacklist and segdups files can come from 3 places, in this order of precedence:
        # 1. mro argument sv_blacklist_regions
        # 2. <reference_path>/regions/sv_blacklist.bed (or segdups.bed)
        # 3. <tenkit install>/sv_data/<genome>/default_sv_blacklist.bed (accessed by tenkit.constants.find_sv_blacklist)

        if os.path.exists(tk_reference.get_sv_blacklist(args.reference_path)):
            blacklist_file = tk_reference.get_sv_blacklist(args.reference_path)
        else:
            blacklist_file = lr_gt.get_genomic_track(args.sv_blacklist_regions,
                                                     args.blacklist_mode,
                                                     args.reference_path,
                                                     "default_blacklist.bed")

        # This will merge overlapping blacklist regions
        black_regions = tk_sv_utils.bed_to_region_map(blacklist_file,
                                                      merge=True)
        # Match each region in black_regions to a set of entries from the bed
        # file that overlap it. This is done so we can output the names of
        # entries that were used to blacklist each sv.
        black_region_names = get_region_names(blacklist_file, black_regions)
        # compute the distance between the breakpoints and the blacklist
        # elements. Get the distance together with the names of the closest
        # blacklist elements.
        res = get_df_region_dist(pred_df, black_regions, black_region_names)
        black_dists1, black_dists2, _, _, black_names1, black_names2 = res

        if os.path.exists(tk_reference.get_segdups(args.reference_path)):
            seg_dups_file = tk_reference.get_segdups(args.reference_path)
        else:
            seg_dups_file = lr_gt.get_genomic_track(args.seg_dups,
                                                    args.segdup_mode,
                                                    args.reference_path,
                                                    "default_segdups.bedpe")

        # from call to matching seg dups
        seg_dup_calls, _, _ = tk_sv_utils.compare_breaks(
            pred_df, seg_dups_file, max_dist=args.seg_dup_min_dist)
        seg_dup_regions = tk_sv_utils.bedpe_to_region_map(seg_dups_file,
                                                          merge=True)
        all_bad_regions = tk_sv_utils.merge_region_maps(
            black_regions, seg_dup_regions)
    else:
        black_dists1 = None
        black_dists2 = None
        black_names1 = None
        black_names2 = None
        seg_dup_calls = {}
        all_bad_regions = None

    pred_df, min_qv = add_filters(pred_df, pred_to_match, black_dists1,
                                  black_dists2, black_names1, black_names2,
                                  seg_dup_calls, all_bad_regions, args)

    with open(re.sub(r'\.json$', '.pickle', outs.summary), 'wb') as f:
        cPickle.dump(pred_to_match, f)
        cPickle.dump(true_to_match, f)
        cPickle.dump((pred_df, min_qv), f)
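tk_sv_utils.bed_to_region_map(blacklist_file, merge=True) is used above without its definition. A self-contained sketch of the merge it performs, collapsing overlapping BED intervals per chromosome, might look like the following; the function name and the plain-dict return shape are assumptions, since the real helper returns tenkit's own region container.

from collections import defaultdict

def merged_region_map_sketch(bed_path):
    # Hypothetical stand-in for bed_to_region_map(..., merge=True): read a
    # BED file and merge overlapping intervals per chromosome.
    regions = defaultdict(list)
    with open(bed_path) as f:
        for line in f:
            if not line.strip() or line.startswith(('#', 'track')):
                continue
            fields = line.rstrip('\n').split('\t')
            regions[fields[0]].append((int(fields[1]), int(fields[2])))
    merged = {}
    for chrom, intervals in regions.items():
        intervals.sort()
        out = [list(intervals[0])]
        for start, stop in intervals[1:]:
            if start <= out[-1][1]:
                # Overlapping or abutting interval: extend the previous one.
                out[-1][1] = max(out[-1][1], stop)
            else:
                out.append([start, stop])
        merged[chrom] = [tuple(iv) for iv in out]
    return merged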
Example #3
def main(args, outs):
    pred_df = tk_sv_io.read_sv_bedpe_to_df(args.sv_calls)
    pred_df = tk_sv_utils.get_dataframe_loc(
        pred_df, list(range(args.start_idx, args.stop_idx)))

    in_bam = tk_bam.create_bam_infile(args.possorted_bam)

    cov_reader = tk_hdf5.DataFrameReader(args.hap_coverage)
    sel_cols = ['cov_q30_hap0', 'cov_q30_hap1', 'cov_q30_hap2']

    # Per-call flag: True when the call looks like a read-pileup artifact.
    has_pileups = np.zeros((len(pred_df), ), dtype=bool)

    for i, (_, row) in enumerate(pred_df.iterrows()):
        has_clipped1 = has_too_many_clipped(
            in_bam,
            row.chrom1,
            max(0, row.start1 - BREAK_EXT),
            row.stop1 + BREAK_EXT,
            max_clipped_frac=args.max_clipped_frac)
        has_clipped2 = has_too_many_clipped(
            in_bam,
            row.chrom2,
            max(0, row.start2 - BREAK_EXT),
            row.stop2 + BREAK_EXT,
            max_clipped_frac=args.max_clipped_frac)
        has_clipped = has_clipped1 and has_clipped2

        # Inter-chromosomal or very distal calls: the coverage-based rescue
        # below does not apply, so rely on the clipping test alone.
        if row.chrom1 != row.chrom2 or row.start2 - row.stop1 > MAX_FRAG_SIZE:
            has_pileups[i] = has_clipped
            continue

        cov = cov_reader.query(
            (row.chrom1, max(0,
                             row.start1 - BREAK_EXT), row.stop2 + BREAK_EXT))
        cov['bin'] = np.array(cov['pos'] // BIN_WIN, dtype=int)
        if 'coverage_deduped' not in cov.columns:
            cov['coverage_deduped'] = cov[sel_cols].sum(axis=1)
        cov_arr = np.array(cov.groupby('bin').mean()['coverage_deduped'])
        median_cov = np.median(cov_arr)

        # Rescue for deletions or duplications with breakpoints on the pileups
        sv_len = row.stop2 - row.start1
        side_cov = cov_reader.query(
            (row.chrom1, max(0, row.start1 - BREAK_EXT - sv_len // 2),
             row.start1 - BREAK_EXT))
        side_cov = pd.concat([
            side_cov,
            cov_reader.query((row.chrom2, row.stop2 + BREAK_EXT,
                              row.stop2 + BREAK_EXT + sv_len // 2))
        ],
                             ignore_index=True)
        if 'coverage_deduped' not in side_cov.columns:
            side_cov['coverage_deduped'] = side_cov[sel_cols].sum(axis=1)

        # Skip the pileup test if coverage relative to the flanks already
        # gives enough evidence for a large-scale copy number variant.
        if np.median(cov.coverage_deduped) < DEL_REL_COV * np.median(
                side_cov.coverage_deduped):
            continue
        if np.median(cov.coverage_deduped) > DUP_REL_COV * np.median(
                side_cov.coverage_deduped):
            continue

        # Filter out the call if there are pileups very close to the breakpoints
        has_pileups[i] = len(cov_arr) > 4 and np.any(
            cov_arr[[0, 1, -2, -1]] > args.min_rel_depth * median_cov)
        has_pileups[i] = has_pileups[i] or has_clipped

    pileups = pred_df[has_pileups]
    pred_df = pred_df[np.logical_not(has_pileups)]

    tk_sv_io.write_sv_df_to_bedpe(pred_df, outs.sv_calls)
    tk_sv_io.write_sv_df_to_bedpe(pileups, outs.pileups)
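has_too_many_clipped is likewise defined elsewhere. A sketch of the test it plausibly applies, flagging a window where too many reads carry a long soft or hard clip (a common pileup signature), again assuming a pysam handle; min_clip_len and the counting details are assumptions.

SOFT_CLIP, HARD_CLIP = 4, 5  # pysam CIGAR operation codes

def too_many_clipped_sketch(in_bam, chrom, start, stop,
                            max_clipped_frac=0.1, min_clip_len=10):
    # Hypothetical stand-in for has_too_many_clipped: True if more than
    # max_clipped_frac of the window's reads have a clip of at least
    # min_clip_len bases.
    total, clipped = 0, 0
    for read in in_bam.fetch(chrom, start, stop):
        if read.is_unmapped or read.cigartuples is None:
            continue
        total += 1
        if any(op in (SOFT_CLIP, HARD_CLIP) and length >= min_clip_len
               for op, length in read.cigartuples):
            clipped += 1
    return total > 0 and float(clipped) / total > max_clipped_frac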
Example #4
def main(args, outs):
    sv_df = read_bedpes(args)
    sv_df = tk_sv_utils.get_dataframe_loc(
        sv_df, list(range(int(args.start_idx), int(args.stop_idx))))

    max_insert, ins_logsf_fun = tk_sv_utils.get_insert_size_info(
        args.insert_sizes)
    print >> sys.stderr, 'max insert', max_insert

    if max_insert is None:
        tk_sv_io.write_sv_df_to_bedpe(None, outs.sv_calls)
        tk_sv_io.write_sv_df_to_bedpe(sv_df, outs.non_pass_sv_calls)
        return

    with open(args.basic_summary, 'r') as f:
        summary = json.load(f)

    chimera_rate_del = summary['far_chimera_rate']
    chimera_rate_inv = summary['far_chimera_rate'] + summary[
        'same_dir_chimera_rate']
    chimera_rate_dup = summary['far_chimera_rate'] + summary[
        'outward_dir_chimera_rate']
    chimera_rates = {
        tk_readpairs.DEL_STR: chimera_rate_del,
        tk_readpairs.INV_STR: chimera_rate_inv,
        tk_readpairs.TDUP_STR: chimera_rate_dup,
        tk_readpairs.TRANS_STR: summary['far_chimera_rate']
    }

    in_bam = tk_bam.create_bam_infile(args.possorted_bam)
    frag_phasing = tk_tabix.create_tabix_infile(args.fragment_phasing)

    pass_calls = []
    non_pass_calls = []

    for i, (_, row) in enumerate(sv_df.iterrows()):
        sv_type = tk_sv_io.get_sv_type(row.info)

        middle = int(0.5 * (row.stop1 + row.start2))

        # Bail out on all non-deletions
        if sv_type != tk_readpairs.DEL_STR:
            continue

        if row.chrom1 == row.chrom2:
            r1 = (max(0, row.start1 - args.break_pad),
                  min(middle, row.stop1 + args.break_pad))
            r2 = (max(middle,
                      row.start2 - args.break_pad), row.stop2 + args.break_pad)

            if row.start2 - row.stop1 > 4 * args.break_pad:
                starts = [r1[0], r2[0]]
                stops = [r1[1], r2[1]]
                chroms = [row.chrom1, row.chrom2]
            else:
                starts = [r1[0]]
                stops = [r2[1]]
                chroms = [row.chrom1]
        else:
            r1 = (max(0,
                      row.start1 - args.break_pad), row.stop1 + args.break_pad)
            r2 = (max(0,
                      row.start2 - args.break_pad), row.stop2 + args.break_pad)
            starts = [r1[0], r2[0]]
            stops = [r1[1], r2[1]]
            chroms = [row.chrom1, row.chrom2]

        bc_cov1 = len(get_frag_coverage(frag_phasing, row.chrom1, r1[0],
                                        r1[1]))
        bc_cov2 = len(get_frag_coverage(frag_phasing, row.chrom2, r2[0],
                                        r2[1]))
        if sv_type == tk_readpairs.DEL_STR and max(bc_cov1,
                                                   bc_cov2) > MAX_DEL_BC_DEPTH:
            print >> sys.stderr, 'Too many barcodes in DEL candidate', row.chrom1, row.start1, row.stop2
            continue

        readpairs = tk_readpairs.get_readpairs(in_bam,
                                               chroms,
                                               starts,
                                               stops,
                                               max_insert=max_insert,
                                               min_mapq=args.min_mapq)

        normal_readpairs = [
            rp for rp in readpairs if rp.sv_type == tk_readpairs.NORMAL_STR
        ]
        if len(normal_readpairs) > MAX_DEL_READPAIRS:
            # Downsample without replacement to avoid duplicate read pairs.
            sel = np.random.choice(len(normal_readpairs), MAX_DEL_READPAIRS,
                                   replace=False)
        else:
            sel = np.arange(len(normal_readpairs))
        normal_readpairs = [normal_readpairs[ridx] for ridx in sel]

        # Distal readpairs across the breakpoints
        dist_readpairs = [
            rp for rp in readpairs if rp.sv_type == sv_type and (
                (tk_readpairs.pos_overlaps(rp.read1.pos, r1)
                 and tk_readpairs.pos_overlaps(rp.read2.pos, r2)) or
                (tk_readpairs.pos_overlaps(rp.read1.pos, r2)
                 and tk_readpairs.pos_overlaps(rp.read2.pos, r1)))
        ]
        if len(dist_readpairs) > MAX_DEL_READPAIRS:
            sel = np.random.choice(len(dist_readpairs), MAX_DEL_READPAIRS,
                                   replace=False)
        else:
            sel = np.arange(len(dist_readpairs))
        dist_readpairs = [dist_readpairs[ridx] for ridx in sel]

        dist_readpairs.extend(normal_readpairs)
        if sv_type == tk_readpairs.DEL_STR and len(starts) == 2:
            more_readpairs = tk_readpairs.get_readpairs(in_bam, [row.chrom1],
                                                        [r1[1] + 1],
                                                        [r2[0] - 1],
                                                        max_insert=max_insert,
                                                        min_mapq=args.min_mapq,
                                                        normal_only=True)
            if len(more_readpairs) > MAX_DEL_READPAIRS:
                sel = np.random.choice(len(more_readpairs),
                                       MAX_DEL_READPAIRS, replace=False)
            else:
                sel = np.arange(len(more_readpairs))
            dist_readpairs.extend([
                more_readpairs[ridx] for ridx in sel
                if more_readpairs[ridx].sv_type == tk_readpairs.NORMAL_STR
            ])

        # itertools.groupby needs its input sorted by the grouping key.
        dist_readpairs = sorted(dist_readpairs, key=lambda x: x.barcode)
        read_groups = {}
        for bc, read_group_iter in groupby(dist_readpairs,
                                           lambda x: x.barcode):
            read_groups[bc] = list(read_group_iter)

        bc_set = set(read_groups.keys())
        bc_list = sorted(read_groups.keys())
        phase_set1 = tk_sv_utils.get_phase_set(frag_phasing, row.chrom1, r1[0],
                                               r1[1])
        phase_set2 = tk_sv_utils.get_phase_set(frag_phasing, row.chrom2, r2[0],
                                               r2[1])

        if len(bc_list) < 1:
            print >> sys.stderr, 'Not enough barcodes. Skipping'
            continue

        bc_phase_sets1 = tk_sv_utils.get_barcode_phase_probs(frag_phasing,
                                                             row.chrom1,
                                                             r1[0],
                                                             r1[1],
                                                             bc_set,
                                                             in_ps=phase_set1)
        bc_phase_sets2 = tk_sv_utils.get_barcode_phase_probs(frag_phasing,
                                                             row.chrom2,
                                                             r2[0],
                                                             r2[1],
                                                             bc_set,
                                                             in_ps=phase_set2)

        cand_breaks1 = np.arange(r1[0], r1[1] + 1, 5)
        cand_breaks2 = np.arange(r2[0], r2[1] + 1, 5)

        res = tk_readpairs.eval_sv_em(read_groups,
                                      cand_breaks1,
                                      cand_breaks2,
                                      sv_type,
                                      chimera_rates,
                                      phase_set1,
                                      phase_set2,
                                      bc_phase_sets1,
                                      bc_phase_sets2,
                                      max_insert,
                                      ins_logsf_fun,
                                      em_iters=args.em_iters)

        ((no_sv_max, sv_max, het_sv_max), max_locus, zygosity, max_hap,
         prior_hap_probs, hap_probs, support) = res

        lr = sv_max - no_sv_max if max_hap is None else het_sv_max - no_sv_max

        hap_probs1 = hap_probs[:, 0:2]
        hap_probs2 = hap_probs[:, 2:]

        new_call = sv_call.SvCall.from_em_results(
            row.chrom1, row.chrom2, phase_set1, phase_set2,
            (no_sv_max, sv_max, het_sv_max), max_locus,
            sv_call._SvType(sv_type, ('.', '.')), zygosity, max_hap, support,
            (hap_probs1, hap_probs2, None))

        # The break interval is inclusive, hence the +1 in the span.
        sv_span = new_call.break2[0] - new_call.break1[1] + 1
        if (lr >= args.min_lr and new_call.qual >= args.min_qv
                and sv_span >= args.min_sv_len):
            pass_calls.append(new_call)
        else:
            # Leave breakpoints unchanged
            new_call.break1 = (row.start1, row.stop1)
            new_call.break2 = (row.start2, row.stop2)
            non_pass_calls.append(new_call)

    out_df = sv_call.SvCall.svs_to_dataframe(pass_calls)
    tk_sv_io.write_sv_df_to_bedpe(out_df, outs.sv_calls)

    out_df = sv_call.SvCall.svs_to_dataframe(non_pass_calls)
    tk_sv_io.write_sv_df_to_bedpe(out_df, outs.non_pass_sv_calls)
    in_bam.close()
    frag_phasing.close()
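The cap-and-subsample pattern around MAX_DEL_READPAIRS appears three times in this function. A small helper would make the intent explicit; this is a refactoring sketch, not part of the pipeline:

import numpy as np

def downsample(items, max_items):
    # Return at most max_items elements, sampled uniformly without
    # replacement; small inputs are returned unchanged.
    if len(items) <= max_items:
        return list(items)
    sel = np.random.choice(len(items), max_items, replace=False)
    return [items[i] for i in sel]

Each block then reduces to a single call, e.g. normal_readpairs = downsample(normal_readpairs, MAX_DEL_READPAIRS).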
Example #5
def main(args, outs):

    bedpe_df = tk_sv_io.read_sv_bedpe_to_df(args.sv_variants)
    bedpe_df = tk_sv_utils.get_dataframe_loc(
        bedpe_df, list(range(int(args.start_idx), int(args.stop_idx))))

    max_insert, ins_logsf_fun = tk_sv_utils.get_insert_size_info(
        args.insert_sizes)
    if max_insert is None:
        martian.throw('No Q60 reads')

    with open(args.basic_summary, 'r') as f:
        summary = json.load(f)

    chimera_rate_del = summary['far_chimera_rate']
    chimera_rate_inv = summary['far_chimera_rate'] + summary[
        'same_dir_chimera_rate']
    chimera_rate_trans = summary['far_chimera_rate']
    chimera_rate_dup = summary['far_chimera_rate'] + summary[
        'outward_dir_chimera_rate']

    chimera_rates = {
        tk_readpairs.DEL_STR: chimera_rate_del,
        tk_readpairs.INV_STR: chimera_rate_inv,
        tk_readpairs.TDUP_STR: chimera_rate_dup,
        tk_readpairs.TRANS_FF_STR: chimera_rate_trans,
        tk_readpairs.TRANS_FR_STR: chimera_rate_trans,
        tk_readpairs.TRANS_RR_STR: chimera_rate_trans,
        tk_readpairs.TRANS_RF_STR: chimera_rate_trans
    }

    in_bam = tk_bam.create_bam_infile(args.input)

    out_quals = []
    out_infos = []
    out_chroms1 = []
    out_starts1 = []
    out_stops1 = []
    out_chroms2 = []
    out_starts2 = []
    out_stops2 = []

    for i, (_, row) in enumerate(bedpe_df.iterrows()):
        in_svt = tk_sv_io.get_sv_type(row.info)

        if row.chrom1 == row.chrom2:
            max_ext = min(args.break_extend, int(
                (row.start2 - row.stop1) / 3.0))
            r1 = (max(0, row.start1 - args.break_extend), row.stop1 + max_ext)
            r2 = (max(0, row.start2 - max_ext), row.stop2 + args.break_extend)
            if r1[1] > r2[0]:
                starts = [r1[0]]
                stops = [r2[1]]
                chroms = [row.chrom1]
            else:
                starts = [r1[0], r2[0]]
                stops = [r1[1], r2[1]]
                chroms = [row.chrom1, row.chrom2]
        else:
            r1 = (max(0, row.start1 - args.break_extend),
                  row.stop1 + args.break_extend)
            r2 = (max(0, row.start2 - args.break_extend),
                  row.stop2 + args.break_extend)
            starts = [r1[0], r2[0]]
            stops = [r1[1], r2[1]]
            chroms = [row.chrom1, row.chrom2]

        readpairs = tk_readpairs.get_readpairs2(in_bam,
                                                chroms,
                                                starts,
                                                stops,
                                                max_insert=max_insert,
                                                min_mapq=args.min_mapq)

        # Distal readpairs across the breakpoints
        dist_readpairs = list(
            filter(filter_fun(in_bam, row.chrom1, row.chrom2, r1, r2),
                   readpairs))

        if len(dist_readpairs) > MAX_READPAIRS:
            sel = np.random.choice(len(dist_readpairs), MAX_READPAIRS,
                                   replace=False)
        else:
            sel = np.arange(len(dist_readpairs))
        dist_readpairs = [dist_readpairs[ridx] for ridx in sel]

        res_arr = tk_readpairs.get_sv_lr(dist_readpairs, ins_logsf_fun,
                                         max_insert, chimera_rates)

        if len(res_arr) == 0:
            out_quals.append(row.qual)
            out_chroms1.append(row.chrom1)
            out_starts1.append(row.start1)
            out_stops1.append(row.stop1)
            out_chroms2.append(row.chrom2)
            out_starts2.append(row.start2)
            out_stops2.append(row.stop2)
            out_infos.append(row['info'])
        else:
            if args.best_only:
                res_arr = sorted(res_arr, key=lambda x: x[0], reverse=True)
                res_arr = [res_arr[0]]

            for (lr, num_split, num_pairs, sv_len, support_range, svt,
                 support_readpairs) in res_arr:
                range1, range2 = support_range
                if (num_split + num_pairs >= args.min_reads_to_call
                        and lr >= args.min_lr_to_call
                        and range1 is not None and range2 is not None):
                    out_quals.append(row.qual + args.rp_lr_multiplier * lr)
                    out_chroms1.append(row.chrom1)
                    out_starts1.append(range1[0])
                    out_stops1.append(range1[1])
                    out_chroms2.append(row.chrom2)
                    out_starts2.append(range2[0])
                    out_stops2.append(range2[1])
                    if svt != in_svt and in_svt != 'TRANS':
                        # Read-pair evidence contradicts the called type.
                        in_svt = 'UNK'
                else:
                    out_quals.append(row.qual)
                    out_chroms1.append(row.chrom1)
                    out_starts1.append(row.start1)
                    out_stops1.append(row.stop1)
                    out_chroms2.append(row.chrom2)
                    out_starts2.append(row.start2)
                    out_stops2.append(row.stop2)

                out_infos.append(
                    tk_sv_io.update_info(
                        row['info'],
                        ['NPAIRS', 'NSPLIT', 'RP_LR', 'RP_TYPE', 'TYPE'],
                        [num_pairs, num_split, lr, svt, in_svt]))

    in_bam.close()

    if args.best_only:
        out_names = [n for n in bedpe_df['name']]
    else:
        out_names = np.arange(len(bedpe_df))

    out_df = tk_sv_io.create_sv_df(out_chroms1, out_starts1, out_stops1,
                                   out_chroms2, out_starts2, out_stops2,
                                   out_names, out_quals, out_infos)
    tk_sv_io.write_sv_df_to_bedpe(out_df, outs.sv_variants)
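tk_sv_io.update_info rewrites the BEDPE info field with new key/value pairs. Assuming the conventional semicolon-separated KEY=VALUE encoding (an assumption; the authoritative format lives in tk_sv_io), a minimal sketch:

from collections import OrderedDict

def update_info_sketch(info, keys, values):
    # Hypothetical stand-in for tk_sv_io.update_info: overwrite or append
    # KEY=VALUE pairs in a semicolon-separated info string, preserving the
    # order of existing keys.
    table = OrderedDict()
    for field in info.split(';'):
        if '=' in field:
            key, value = field.split('=', 1)
            table[key] = value
    for key, value in zip(keys, values):
        table[key] = str(value)
    return ';'.join('%s=%s' % (k, v) for k, v in table.items())

For example, update_info_sketch('TYPE=DEL', ['RP_LR', 'TYPE'], [3.2, 'UNK']) yields 'TYPE=UNK;RP_LR=3.2'.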