Code Example #1
def split(args):
    sv_df = tk_sv_io.read_sv_bedpe_to_df(args.variants)
    gt_df = tk_sv_io.read_sv_bedpe_to_df(args.gt_variants)
    tk_sv_io.check_sv_names(gt_df)

    sv_df["name"] = ["call_%d" % idx for idx in range(len(sv_df))]

    variants_bedpe = os.path.join(os.getcwd(), "variants.bedpe")
    tk_sv_io.write_sv_df_to_bedpe(sv_df, variants_bedpe)

    nsvs = sv_df.shape[0]
    nbreaks_per_chunk = max(100,
                            int(np.ceil(nsvs / 32.0)))  # avoid overchunking
    nchunks = int(np.ceil(nsvs / float(nbreaks_per_chunk)))
    chunk_defs = []

    for i in range(nchunks):
        chunk_start = i * nbreaks_per_chunk
        chunk_end = min(nsvs, (i + 1) * nbreaks_per_chunk)
        chunk_defs.append({
            'renamed_variants': variants_bedpe,
            'start_idx': chunk_start,
            'stop_idx': chunk_end,
            '__mem_gb': 12
        })

    if len(chunk_defs) == 0:
        chunk_defs = [{
            'renamed_variants': variants_bedpe,
            'start_idx': 0,
            'stop_idx': 0,
            '__mem_gb': 12
        }]

    return {'chunks': chunk_defs, 'join': {'__mem_gb': 16}}
Code Example #2
def read_bedpes(args):
    sv_df = tk_sv_io.read_sv_bedpe_to_df(args.sv_calls)
    if not args.sv_calls2 is None:
        sv_df = pd.concat(
            [sv_df, tk_sv_io.read_sv_bedpe_to_df(args.sv_calls2)],
            ignore_index=True)
        sv_df['name'] = np.arange(len(sv_df))
    return sv_df
Code Example #3
File: __init__.py  Project: umccr/longranger
def join(args, outs, chunk_defs, chunk_outs):
    out_calls = None
    out_pileups = None
    for c in chunk_outs:
        if not os.path.isfile(c.sv_calls):
            continue
        calls = tk_sv_io.read_sv_bedpe_to_df(c.sv_calls)
        pileups = tk_sv_io.read_sv_bedpe_to_df(c.pileups)
        out_calls = pd.concat([out_calls, calls], ignore_index=True)
        out_pileups = pd.concat([out_pileups, pileups], ignore_index=True)

    tk_sv_io.write_sv_df_to_bedpe(out_calls, outs.sv_calls)
    tk_sv_io.write_sv_df_to_bedpe(out_pileups, outs.pileups)
Code Example #4
File: __init__.py  Project: umccr/longranger
def prepare_loci(args):
    """Merge and sort input lists of candidate loci."""

    overlap_loci = []

    # Loci based on barcode overlaps. Type of SV is unknown.
    if not args.overlap_loci is None:
        with open(args.overlap_loci, 'rb') as f:
            loci = cPickle.load(f)
        overlap_loci.extend([(x[0], x[1], x[2], x[3], x[4], x[5], None) for x in loci])

    # Low depth loci. These will only be evaluated for deletions.
    if not args.low_depth_loci is None:
        del_calls = tk_sv_io.read_sv_bedpe_to_df(args.low_depth_loci)
        for _, row in del_calls.iterrows():
            overlap_loci.append((row.chrom1, row.start1, row.stop1,
                                 row.chrom2, row.start2, row.stop2, 'DEL'))

    # Loci based on read-pair support. These will only be evaluated for the
    # type of SV supported by the readpairs.
    if not args.rp_calls is None:
        rp_calls = tk_sv_io.read_sv_bedpe_to_df(args.rp_calls)
        for _, row in rp_calls.iterrows():
            sv_type = tk_sv_io.get_sv_type(row.info)
            if not sv_type in ['DEL', 'INV', 'DUP']:
                sv_type = None
            else:
                sv_type = [sv_type]
            overlap_loci.append((row.chrom1, row.start1, row.stop1,
                                 row.chrom2, row.start2, row.stop2, sv_type))

    # Sort by position and also get the sorted indices.
    sorted_overlap_loci = sorted(overlap_loci,
                                 key=lambda x: (x[0], x[1], x[2], x[3], x[4], x[5]))
    sorted_overlap_loci_idx = sorted(range(len(overlap_loci)),
                                     key=lambda x: (overlap_loci[x][0], overlap_loci[x][1],
                                                    overlap_loci[x][2], overlap_loci[x][3],
                                                    overlap_loci[x][4], overlap_loci[x][5]))

    # If there is a single source of candidate loci, coming from a BEDPE, then
    # keep track of the names in the BEDPE, so you can annotate SV-calls made
    # with the BEDPE line from which they came.
    if args.overlap_loci is None and args.low_depth_loci is None and not args.rp_calls is None:
        input_calls = tk_sv_io.read_sv_bedpe_to_df(args.rp_calls)
        input_names = list(input_calls['name'])
        input_names = [input_names[n] for n in sorted_overlap_loci_idx]
    else:
        input_names = None

    return sorted_overlap_loci, input_names
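A minimal standalone sketch of the sort-plus-sorted-indices pattern used above in prepare_loci, with toy data (none of these values come from the stage's actual inputs):

# Sort loci by coordinate while also recording the permutation, so a parallel
# list (e.g. the BEDPE names) can be reordered the same way.
loci = [('chr2', 500, 600), ('chr1', 100, 200), ('chr1', 50, 80)]
names = ['call_a', 'call_b', 'call_c']          # parallel to loci
sorted_loci = sorted(loci)
sorted_idx = sorted(range(len(loci)), key=lambda i: loci[i])
sorted_names = [names[i] for i in sorted_idx]   # ['call_c', 'call_b', 'call_a']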
Code Example #5
def join(args, outs, chunk_defs, chunk_outs):
    join_df = None
    non_pass_join_df = None
    for chunk in chunk_outs:
        df = tk_sv_io.read_sv_bedpe_to_df(chunk.sv_calls)
        non_pass_df = tk_sv_io.read_sv_bedpe_to_df(chunk.non_pass_sv_calls)
        join_df = pd.concat([join_df, df], ignore_index=True)
        non_pass_join_df = pd.concat([non_pass_join_df, non_pass_df],
                                     ignore_index=True)

    join_df['name'] = np.arange(len(join_df))
    tk_sv_io.write_sv_df_to_bedpe(join_df, outs.sv_calls)
    non_pass_join_df['name'] = np.arange(len(join_df),
                                         len(join_df) + len(non_pass_join_df))
    tk_sv_io.write_sv_df_to_bedpe(non_pass_join_df, outs.non_pass_sv_calls)
Code Example #6
def join(args, outs, chunk_defs, chunk_outs):
    join_df = None
    for chunk in chunk_outs:
        df = sv_io.read_sv_bedpe_to_df(chunk.sv_calls)
        join_df = pd.concat([join_df, df], ignore_index=True)

    sv_io.write_sv_df_to_bedpe(join_df, outs.sv_calls)
Code Example #7
def split(args):
    sv_df = sv_io.read_sv_bedpe_to_df(args.candidates)

    if len(sv_df) == 0:
        chunk_defs = [{'chunk_bed': None}]
        return {'chunks': chunk_defs}

    sv_df["size"] = sv_df["stop2"] - sv_df["start1"]
    sv_df["padding"] = np.round(sv_df["size"] * args.padding_fract + args.padding_abs).astype(np.int32)
    sv_df["new_start"] = np.maximum(0, sv_df["start1"] - sv_df["padding"])
    sv_df["new_stop"] = sv_df["stop2"] + sv_df["padding"]
    sv_df["new_size"] = sv_df["new_stop"] - sv_df["new_start"]
    sv_df = sv_df[(sv_df["new_size"] < 5000)]

    sv_df.sort_values(by=['chrom1', 'start1'], inplace=True)

    nsvs = sv_df.shape[0]
    nsvs_per_chunk = max(500, int(np.ceil(nsvs / 128.0)))
    nchunks = int(np.ceil(nsvs / float(nsvs_per_chunk)))
    chunk_defs = []

    for i in range(nchunks):
        chunk_start = i * nsvs_per_chunk
        chunk_end = min(nsvs, (i + 1) * nsvs_per_chunk)
        subset = sv_df[chunk_start:chunk_end][["chrom1", "new_start", "new_stop"]]

        # Figure out correct padding
        fn = os.path.join(os.getcwd(), "chunk_%d.bed" % i)
        subset.to_csv(fn, header=False, sep="\t", index=False)
        chunk_defs.append({'chunk_bed': fn, "__mem_gb": 3.0})


    return {'chunks': chunk_defs}
Code Example #8
File: __init__.py  Project: umccr/longranger
def join(args, outs, chunk_defs, chunk_outs):
    out_df = None
    for chunk in chunk_outs:
        tmp_df = tk_sv_io.read_sv_bedpe_to_df(chunk.del_candidates)
        out_df = pd.concat([out_df, tmp_df], ignore_index=True)

    out_df['name'] = np.arange(len(out_df))
    tk_sv_io.write_sv_df_to_bedpe(out_df, outs.del_candidates)
Code Example #9
File: __init__.py  Project: umccr/longranger
def main(args, outs):
    callsets = []

    if args.calls1 is not None:
        c = sv_io.read_sv_bedpe_to_df(args.calls1)
        callsets.append(c)

    if args.calls2 is not None:
        c = sv_io.read_sv_bedpe_to_df(args.calls2)
        callsets.append(c)

    if args.calls3 is not None:
        c = sv_io.read_sv_bedpe_to_df(args.calls3)
        callsets.append(c)

    # Merge overlapping calls, keeping the widest call from each overlap group
    merged = merge_overlapping(callsets, select_widest())
    sv_io.write_sv_df_to_bedpe(merged, outs.merged)
Code Example #10
File: __init__.py  Project: umccr/longranger
def join(args, outs, chunk_defs, chunk_outs):
    join_df = None
    for chunk in chunk_outs:
        bedpe_df = tk_sv_io.read_sv_bedpe_to_df(chunk.sv_variants)
        join_df = pd.concat([join_df, bedpe_df], ignore_index=True)

    if not args.best_only:
        join_df['name'] = np.arange(len(join_df))

    tk_sv_io.write_sv_df_to_bedpe(join_df, outs.sv_variants)
Code Example #11
File: utils.py  Project: umccr/longranger
def overlap_breaks(pred_loci, true_loci, min_rel_overlap=0.5):
    if not isinstance(pred_loci, pd.DataFrame):
        pred_df = tk_sv_io.read_sv_bedpe_to_df(pred_loci)
    else:
        pred_df = pred_loci

    if not isinstance(true_loci, pd.DataFrame):
        gt_df = tk_sv_io.read_sv_bedpe_to_df(true_loci)
    else:
        gt_df = true_loci

    gt_df.index = gt_df['name']
    gt_map = bedpe_df_to_named_region_map(gt_df)

    pred_to_matching_true = defaultdict(set)
    true_to_matching_pred = defaultdict(set)

    for _, row in pred_df.iterrows():
        if not row.chrom1 in gt_map:
            continue
        matches = gt_map[row.chrom1].overlapping_region_names(
            row.start1, row.stop2)
        this_len = row.stop2 - row.start1
        if len(matches) > 0:
            match_df = gt_df.loc[list(matches)]
            lengths = np.array(match_df.stop2 - match_df.start1,
                               dtype=np.float)
            ov = np.minimum(match_df.stop2, row.stop2) - np.maximum(
                match_df.start1, row.start1)
            good_matches = np.logical_and(
                ov / float(this_len) > min_rel_overlap,
                ov / lengths > min_rel_overlap)

            if np.any(good_matches):
                true_names = match_df[good_matches]['name']
                for matching_true in true_names:
                    true_to_matching_pred[matching_true].add(row['name'])
                    pred_to_matching_true[row['name']].add(matching_true)

    return (pred_to_matching_true, true_to_matching_pred, {})
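A hypothetical usage sketch for overlap_breaks; the file paths are placeholders and the call assumes the tk_sv_io helpers shown above are importable:

# Compare predicted loci against a truth BEDPE, requiring 50% reciprocal
# overlap between the spanned regions.
pred_to_true, true_to_pred, _ = overlap_breaks('calls.bedpe', 'truth.bedpe',
                                               min_rel_overlap=0.5)
print('%d calls matched a true event' % len(pred_to_true))
print('%d true events were recovered' % len(true_to_pred))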
Code Example #12
def main(args, outs):

    sv_df = tk_sv_io.read_sv_bedpe_to_df(args.sv_variants)
    sv_df["info2"] = "SV"

    cnv_df = tk_sv_io.read_sv_bedpe_to_df(args.cnv_variants)
    cnv_df["info2"] = "CNV"

    sv_df = pd.concat([sv_df, cnv_df], ignore_index=True)
    sv_df['name'] = np.arange(len(sv_df))
    sv_df.sort(['chrom1', 'chrom2'], inplace=True)

    res_df = None
    for _, tmp_df in sv_df.groupby(['chrom1', 'chrom2']):
        tmp_df.sort(['chrom1', 'start1', 'stop1', 'chrom2', 'start2', 'stop2'],
                    inplace=True)
        # cluster the loci in the group based on proximity
        groups = tk_sv_utils.get_break_groups(tmp_df, args.max_dist)

        # for each cluster, get the row with max qual
        # tmp_df.loc[g] gets the subset of tmp_df in the cluster.
        # then idxmax gets the max index

        out_df = pd.DataFrame(columns=sv_df.columns)
        idx = 0
        for g in groups:
            row = tmp_df.loc[tmp_df.loc[g]['qual'].idxmax()]
            if (tmp_df.loc[g]['info2'] == 'SV').any():
                row = tmp_df.loc[(tmp_df.loc[g]['info2'] == 'SV').idxmax()]

            source = list(set(tmp_df.loc[g]['info2']))
            row['info'] += (";SOURCE=" + ",".join(source))
            out_df.loc[idx] = row
            idx += 1

        out_df.sort(['start1', 'stop1', 'start2', 'stop2'], inplace=True)
        res_df = pd.concat([res_df, out_df], ignore_index=True)

    tk_sv_io.write_sv_df_to_bedpe(res_df, outs.sv_variants)
Code Example #13
def join(args, outs, chunk_defs, chunk_outs):
    out_bedpe = None
    for c in chunk_outs:
        if not os.path.isfile(c.sv_variants):
            continue
        in_bedpe = sv_io.read_sv_bedpe_to_df(c.sv_variants)
        if not in_bedpe is None:
            out_bedpe = pd.concat([out_bedpe, in_bedpe], ignore_index=True)

    if not out_bedpe is None:
        out_bedpe['name'] = np.arange(len(out_bedpe))
    sv_io.write_sv_df_to_bedpe(out_bedpe, outs.sv_variants)

    if chunk_outs[0] is not None and os.path.exists(chunk_outs[0].summary):
        shutil.copyfile(chunk_outs[0].summary, outs.summary)
    else:
        outs.summary = None
Code Example #14
File: __init__.py  Project: umccr/longranger
def join(args, outs, chunk_defs, chunk_outs):
    out_bedpe = None
    for c in chunk_outs:
        if not os.path.isfile(c.sv_variants):
            continue
        in_bedpe = tk_sv_io.read_sv_bedpe_to_df(c.sv_variants)
        if not in_bedpe is None:
            out_bedpe = pd.concat([out_bedpe, in_bedpe], ignore_index=True)

    if out_bedpe is None:
        col_names = ['chrom1', 'start1', 'stop1',
                     'chrom2', 'start2', 'stop2', 'name', 'qual',
                     'strand1', 'strand2', 'filters', 'info']
        out_bedpe = pd.DataFrame(columns=col_names)
    out_bedpe['name'] = np.arange(len(out_bedpe))

    out_bedpe = out_bedpe[out_bedpe.qual >= args.sv_min_qv]
    tk_sv_io.write_sv_df_to_bedpe(out_bedpe, outs.sv_variants)
Code Example #15
File: utils.py  Project: umccr/longranger
def merge_multiple_breaks(in_bedpes,
                          out_bedpe,
                          merge_win=10000,
                          max_range=np.inf):
    assert (len(in_bedpes) > 0)
    in_bedpe_df = None
    for bi, bedpe in enumerate(in_bedpes):
        bedpe_df = tk_sv_io.read_sv_bedpe_to_df(bedpe)
        assert (bedpe_df.shape[1] > 11)
        bedpe_df = bedpe_df.iloc[:, 0:12]
        # Make sure that all names from all files are unique
        bedpe_df['name'] = [str(n) + '_' + str(bi) for n in bedpe_df['name']]
        in_bedpe_df = pd.concat([in_bedpe_df, bedpe_df], ignore_index=True)

    return merge_breaks(in_bedpe_df,
                        out_bedpe,
                        merge_win=merge_win,
                        max_range=max_range)
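A hypothetical usage sketch for merge_multiple_breaks; the input and output paths are placeholders:

# Merge three call sets into one non-redundant BEDPE, merging breakpoints that
# fall within 10 kb of each other on both ends.
merged_df = merge_multiple_breaks(['run1.bedpe', 'run2.bedpe', 'run3.bedpe'],
                                  'merged.bedpe', merge_win=10000)
print('%d calls after merging' % len(merged_df))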
Code Example #16
File: utils.py  Project: umccr/longranger
def get_break_groups(bedpe_df, merge_win=10000, max_range=np.inf):
    """A simplified version of merge_breaks"""

    if not isinstance(bedpe_df, pd.DataFrame):
        bedpe_df = tk_sv_io.read_sv_bedpe_to_df(bedpe_df)
    else:
        bedpe_df = pd.DataFrame(bedpe_df)

    breaks = []
    for i, (n, row) in enumerate(bedpe_df.iterrows()):
        breaks.append((row.chrom1, row.start1, row.stop1, (n, 1)))
        breaks.append((row.chrom2, row.start2, row.stop2, (n, 2)))
    _, mem_to_cluster, _ = cluster_loci(breaks, merge_win, max_range=max_range)

    cluster_pairs = defaultdict(list)
    for i, (n, row) in enumerate(bedpe_df.iterrows()):
        cluster_idx1 = mem_to_cluster[(n, 1)]
        cluster_idx2 = mem_to_cluster[(n, 2)]
        cluster_pairs[(cluster_idx1, cluster_idx2)].append(n)
    return cluster_pairs.values()
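A hypothetical usage sketch for get_break_groups; the input path is a placeholder:

# Cluster calls whose breakpoints lie within 1 kb on both ends, then inspect
# each cluster of row labels.
bedpe_df = tk_sv_io.read_sv_bedpe_to_df('calls.bedpe')
for group in get_break_groups(bedpe_df, merge_win=1000):
    cluster = bedpe_df.loc[group]
    print('cluster of %d calls, best qual %s' % (len(group), cluster['qual'].max()))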
Code Example #17
File: __init__.py  Project: umccr/longranger
def split(args):
    sv_df = tk_sv_io.read_sv_bedpe_to_df(args.sv_variants)

    nsvs = sv_df.shape[0]
    nbreaks_per_chunk = max(20,
                            int(np.ceil(nsvs / 100.0)))  # avoid overchunking
    nchunks = int(np.ceil(nsvs / float(nbreaks_per_chunk)))
    chunk_defs = []

    for i in range(nchunks):
        chunk_start = i * nbreaks_per_chunk
        chunk_end = min(nsvs, (i + 1) * nbreaks_per_chunk)
        chunk_defs.append({
            'start_idx': chunk_start,
            'stop_idx': chunk_end,
            '__mem_gb': 16
        })
    if len(chunk_defs) == 0:
        chunk_defs = [{'start_idx': 0, 'stop_idx': 0}]
    return {'chunks': chunk_defs}
Code Example #18
File: __init__.py  Project: umccr/longranger
def join(args, outs, chunk_defs, chunk_outs):
    join_df = None
    read_counts = {}
    read_counts['split'] = defaultdict(int)
    read_counts['pair'] = defaultdict(int)

    for chunk in chunk_outs:
        bedpe_df = tk_sv_io.read_sv_bedpe_to_df(chunk.sv_calls)
        join_df = pd.concat([join_df, bedpe_df], ignore_index=True)

        if not os.path.isfile(chunk.discordant_read_counts):
            continue
        with open(chunk.discordant_read_counts, 'r') as f:
            counts = json.load(f)
        for t, c in counts['split'].iteritems():
            read_counts['split'][t] += c
        for t, c in counts['pair'].iteritems():
            read_counts['pair'][t] += c

    join_df['name'] = [str(i) for i in np.arange(len(join_df))]
    tk_sv_io.write_sv_df_to_bedpe(join_df, outs.sv_calls)

    read_counts['split'] = dict(read_counts['split'])
    read_counts['pair'] = dict(read_counts['pair'])

    with open(args.basic_summary, 'r') as f:
        num_reads = float(json.load(f)['num_reads']) / 2.0

    read_counts['frac_split'] = {}
    read_counts['frac_pair'] = {}
    for t, c in read_counts['split'].iteritems():
        read_counts['frac_split'][t] = c / num_reads
    for t, c in read_counts['pair'].iteritems():
        read_counts['frac_pair'][t] = c / num_reads

    with open(outs.discordant_read_counts, 'w') as f:
        f.write(tenkit.safe_json.safe_jsonify(read_counts))
Code Example #19
File: __init__.py  Project: umccr/longranger
def main(args, outs):
    pred_df = tk_sv_io.read_sv_bedpe_to_df(args.sv_calls)
    pred_df = tk_sv_utils.get_dataframe_loc(
        pred_df, list(range(args.start_idx, args.stop_idx)))

    in_bam = tk_bam.create_bam_infile(args.possorted_bam)

    frac_changed = np.zeros((len(pred_df), ), dtype=np.float)

    for i, (_, row) in enumerate(pred_df.iterrows()):
        frac_changed[i] = get_frac_mapq_changed(in_bam,
                                                row.chrom1,
                                                max(0, row.start1 - BREAK_EXT),
                                                row.stop1 + BREAK_EXT,
                                                row.chrom2,
                                                max(0, row.start2 - BREAK_EXT),
                                                row.stop2 + BREAK_EXT,
                                                min_mapq=60)

    pileups = pred_df[frac_changed > args.max_frac_low_mapq]
    pred_df = pred_df[frac_changed <= args.max_frac_low_mapq]

    tk_sv_io.write_sv_df_to_bedpe(pred_df, outs.sv_calls)
    tk_sv_io.write_sv_df_to_bedpe(pileups, outs.pileups)
Code Example #20
def prepare_gt(args):
    if args.gt_variants is None:
        return None

    true_df = tk_sv_io.read_sv_bedpe_to_df(args.gt_variants)

    # Length of ground truth sv
    true_df['dist'] = tk_sv_io.get_sv_df_dists(true_df)

    sv_types = []
    orients = []
    tiers = []

    # Mark genic SVs
    is_genic1 = np.zeros((len(true_df), ), dtype=np.int)
    is_genic2 = np.zeros((len(true_df), ), dtype=np.int)
    gene_regions = tk_reference.load_gene_boundaries(args.reference_path,
                                                     protein_coding=False)

    for row_idx, (_, row) in enumerate(true_df.iterrows()):
        if not 'info' in true_df.columns:
            sv_types.append('UNK')
            orients.append('..')
            tiers.append(0)
        else:
            sv_type = tk_sv_io.get_sv_type(row.info)
            if sv_type == 'DISTAL':
                sv_type = 'TRANS'
            sv_types.append(sv_type)
            orients.append(tk_sv_io.get_break_orientation(row.info))
            tiers.append(tk_sv_io.get_tier(row.info))

        is_genic1[row_idx] = int(
            row.chrom1 in gene_regions
            and bool(gene_regions[row.chrom1].overlapping_regions(
                row.start1, row.stop1)))
        is_genic2[row_idx] = int(
            row.chrom2 in gene_regions
            and bool(gene_regions[row.chrom2].overlapping_regions(
                row.start2, row.stop2)))

    true_df['break1_genic'] = is_genic1
    true_df['break2_genic'] = is_genic2
    # number of breakpoints overlapping genes
    true_df['genic_breaks'] = is_genic1 + is_genic2

    # put all the un-tiered entries into the last tier
    tiers = np.array(tiers)
    if len(tiers) == 0:
        total_tiers = 0
    else:
        total_tiers = np.max(tiers)

    tiers[tiers == 0] = total_tiers + 1

    true_df['tier'] = tiers
    true_df['sv_type'] = sv_types
    true_df['orient'] = orients

    if not args.min_sv_len is None:
        # Select only intra-chromosomal or svs that have a minimum distance between breakpoints
        is_feasible = np.array(true_df['dist'] >= args.min_sv_len,
                               dtype=np.bool)
    else:
        # No minimum SV length specified: treat all ground truth SVs as feasible.
        is_feasible = np.ones((len(true_df), ), dtype=np.bool)

    if not args.targets is None and not args.target_dists is None:
        target_regions = tk_sv_utils.bed_to_region_map(args.targets,
                                                       merge=True)
        res = get_df_region_dist(true_df, target_regions, use_orient=True)
        targ_dists1, targ_dists2, targs1, targs2, _, _ = res

        new_starts1 = np.array(true_df.start1)
        new_stops1 = np.array(true_df.stop1)
        new_starts2 = np.array(true_df.start2)
        new_stops2 = np.array(true_df.stop2)

        for i, (t1, t2) in enumerate(zip(targs1, targs2)):
            if not t1[0] is None and not t2[0] is None:
                new_starts1[i], new_stops1[i] = t1
                new_starts2[i], new_stops2[i] = t2

        true_df['start1'] = new_starts1
        true_df['stop1'] = new_stops1
        true_df['start2'] = new_starts2
        true_df['stop2'] = new_stops2

        true_df['targ_dist'] = np.maximum(np.array(targ_dists1),
                                          np.array(targ_dists2))
    else:
        true_df['targ_dist'] = np.zeros((len(true_df), ), dtype=np.int)

    true_df['feasible'] = is_feasible

    return true_df
Code Example #21
File: __init__.py  Project: umccr/longranger
def main(args, outs):

    bedpe_df = tk_sv_io.read_sv_bedpe_to_df(args.sv_variants)
    bedpe_df = tk_sv_utils.get_dataframe_loc(
        bedpe_df, list(range(int(args.start_idx), int(args.stop_idx))))

    max_insert, ins_logsf_fun = tk_sv_utils.get_insert_size_info(
        args.insert_sizes)
    if max_insert is None:
        martian.throw('No Q60 reads')

    with open(args.basic_summary, 'r') as f:
        summary = json.load(f)

    chimera_rate_del = summary['far_chimera_rate']
    chimera_rate_inv = summary['far_chimera_rate'] + summary[
        'same_dir_chimera_rate']
    chimera_rate_trans = summary['far_chimera_rate']
    chimera_rate_dup = summary['far_chimera_rate'] + summary[
        'outward_dir_chimera_rate']

    chimera_rates = {
        tk_readpairs.DEL_STR: chimera_rate_del,
        tk_readpairs.INV_STR: chimera_rate_inv,
        tk_readpairs.TDUP_STR: chimera_rate_dup,
        tk_readpairs.TRANS_FF_STR: chimera_rate_trans,
        tk_readpairs.TRANS_FR_STR: chimera_rate_trans,
        tk_readpairs.TRANS_RR_STR: chimera_rate_trans,
        tk_readpairs.TRANS_RF_STR: chimera_rate_trans
    }

    in_bam = tk_bam.create_bam_infile(args.input)

    out_quals = []
    out_infos = []
    out_chroms1 = []
    out_starts1 = []
    out_stops1 = []
    out_chroms2 = []
    out_starts2 = []
    out_stops2 = []

    for i, (_, row) in enumerate(bedpe_df.iterrows()):
        in_svt = tk_sv_io.get_sv_type(row.info)

        if row.chrom1 == row.chrom2:
            max_ext = min(args.break_extend, int(
                (row.start2 - row.stop1) / 3.0))
            r1 = (max(0, row.start1 - args.break_extend), row.stop1 + max_ext)
            r2 = (max(0, row.start2 - max_ext), row.stop2 + args.break_extend)
            if r1[1] > r2[0]:
                starts = [r1[0]]
                stops = [r2[1]]
                chroms = [row.chrom1]
            else:
                starts = [r1[0], r2[0]]
                stops = [r1[1], r2[1]]
                chroms = [row.chrom1, row.chrom2]
        else:
            r1 = (max(0, row.start1 - args.break_extend),
                  row.stop1 + args.break_extend)
            r2 = (max(0, row.start2 - args.break_extend),
                  row.stop2 + args.break_extend)
            starts = [r1[0], r2[0]]
            stops = [r1[1], r2[1]]
            chroms = [row.chrom1, row.chrom2]

        readpairs = tk_readpairs.get_readpairs2(in_bam,
                                                chroms,
                                                starts,
                                                stops,
                                                max_insert=max_insert,
                                                min_mapq=args.min_mapq)

        # Distal readpairs across the breakpoints
        dist_readpairs = filter(
            filter_fun(in_bam, row.chrom1, row.chrom2, r1, r2), readpairs)

        if len(dist_readpairs) > MAX_READPAIRS:
            sel = np.random.choice(len(dist_readpairs), MAX_READPAIRS)
        else:
            sel = np.arange(len(dist_readpairs))
        dist_readpairs = [dist_readpairs[ridx] for ridx in sel]

        res_arr = tk_readpairs.get_sv_lr(dist_readpairs, ins_logsf_fun,
                                         max_insert, chimera_rates)

        if len(res_arr) == 0:
            out_quals.append(row.qual)
            out_chroms1.append(row.chrom1)
            out_starts1.append(row.start1)
            out_stops1.append(row.stop1)
            out_chroms2.append(row.chrom2)
            out_starts2.append(row.start2)
            out_stops2.append(row.stop2)
            out_infos.append(row['info'])
        else:
            if args.best_only:
                res_arr = sorted(res_arr, key=lambda x: x[0], reverse=True)
                res_arr = [res_arr[0]]

            for (lr, num_split, num_pairs, sv_len, support_range, svt,
                 support_readpairs) in res_arr:
                range1, range2 = support_range
                if num_split + num_pairs >= args.min_reads_to_call and lr >= args.min_lr_to_call and not range1 is None and not range2 is None:
                    out_quals.append(row.qual + args.rp_lr_multiplier * lr)
                    out_chroms1.append(row.chrom1)
                    out_starts1.append(range1[0])
                    out_stops1.append(range1[1])
                    out_chroms2.append(row.chrom2)
                    out_starts2.append(range2[0])
                    out_stops2.append(range2[1])
                    if svt != in_svt and in_svt != 'TRANS':
                        in_svt = 'UNK'
                else:
                    out_quals.append(row.qual)
                    out_chroms1.append(row.chrom1)
                    out_starts1.append(row.start1)
                    out_stops1.append(row.stop1)
                    out_chroms2.append(row.chrom2)
                    out_starts2.append(row.start2)
                    out_stops2.append(row.stop2)

                out_infos.append(
                    tk_sv_io.update_info(
                        row['info'],
                        ['NPAIRS', 'NSPLIT', 'RP_LR', 'RP_TYPE', 'TYPE'],
                        [num_pairs, num_split, lr, svt, in_svt]))

    in_bam.close()

    if args.best_only:
        out_names = [n for n in bedpe_df['name']]
    else:
        out_names = np.arange(len(bedpe_df))

    out_df = tk_sv_io.create_sv_df(out_chroms1, out_starts1, out_stops1,
                                   out_chroms2, out_starts2, out_stops2,
                                   out_names, out_quals, out_infos)
    tk_sv_io.write_sv_df_to_bedpe(out_df, outs.sv_variants)
Code Example #22
def main(args, outs):
    pred_df = tk_sv_io.read_sv_bedpe_to_df(args.renamed_variants)
    pred_df = tk_sv_utils.get_dataframe_loc(
        pred_df, list(range(args.start_idx, args.stop_idx)))

    if not args.gt_variants is None:
        true_df = prepare_gt(args)
        true_df.to_csv(outs.feasible_gt,
                       index=False,
                       header=True,
                       sep='\t',
                       na_rep='NaN')
    else:
        true_df = None

    #### Get matches between this chunk of the calls and the ground truth
    max_detect_dist = np.max(np.array(args.detect_dists))
    res = get_matches(pred_df, true_df, max_detect_dist, args.min_rel_overlap)
    pred_to_match, true_to_match, _ = res

    #### Apply filters
    if len(pred_df) > 0:
        # Loading all these files can take a while. Don't do it if there are no SVs to analyze.

        # blacklist and segdups files can come from 3 places, in this order of precedence:
        # 1. mro argument sv_blacklist_regions
        # 2. <reference_path>/regions/sv_blacklist.bed (or segdups.bed)
        # 3. <tenkit install>/sv_data/<genome>/default_sv_blacklist.bed (accessed by tenkit.constants.find_sv_blacklist)

        if os.path.exists(tk_reference.get_sv_blacklist(args.reference_path)):
            blacklist_file = tk_reference.get_sv_blacklist(args.reference_path)
        else:
            blacklist_file = lr_gt.get_genomic_track(args.sv_blacklist_regions,
                                                     args.blacklist_mode,
                                                     args.reference_path,
                                                     "default_blacklist.bed")

        # This will merge overlapping blacklist regions
        black_regions = tk_sv_utils.bed_to_region_map(blacklist_file,
                                                      merge=True)
        # Match each region in black_regions to a set of entries from the bed
        # file that overlap it. This is done so we can output the names of
        # entries that were used to blacklist each sv.
        black_region_names = get_region_names(blacklist_file, black_regions)
        # compute the distance between the breakpoints and the blacklist
        # elements. Get the distance together with the names of the closest
        # blacklist elements.
        res = get_df_region_dist(pred_df, black_regions, black_region_names)
        black_dists1, black_dists2, _, _, black_names1, black_names2 = res

        if os.path.exists(tk_reference.get_segdups(args.reference_path)):
            seg_dups_file = tk_reference.get_segdups(args.reference_path)
        else:
            seg_dups_file = lr_gt.get_genomic_track(args.seg_dups,
                                                    args.segdup_mode,
                                                    args.reference_path,
                                                    "default_segdups.bedpe")

        # from call to matching seg dups
        seg_dup_calls, _, _ = tk_sv_utils.compare_breaks(
            pred_df, seg_dups_file, max_dist=args.seg_dup_min_dist)
        seg_dup_regions = tk_sv_utils.bedpe_to_region_map(seg_dups_file,
                                                          merge=True)
        all_bad_regions = tk_sv_utils.merge_region_maps(
            black_regions, seg_dup_regions)
    else:
        black_dists1 = None
        black_dists2 = None
        black_names1 = None
        black_names2 = None
        seg_dup_calls = {}
        all_bad_regions = None

    pred_df, min_qv = add_filters(pred_df, pred_to_match, black_dists1,
                                  black_dists2, black_names1, black_names2,
                                  seg_dup_calls, all_bad_regions, args)

    with open(re.sub('.json', '.pickle', outs.summary), 'wb') as f:
        cPickle.dump(pred_to_match, f)
        cPickle.dump(true_to_match, f)
        cPickle.dump((pred_df, min_qv), f)
Code Example #23
File: utils.py  Project: umccr/longranger
def compare_multiple_breaks(in_bedpes,
                            sample_names,
                            out_bedpe,
                            merge_win=0,
                            max_range=np.inf):
    """Compares multiple BEDPE files.
    Args:
    - in_bedpes: A list of BEDPE files to compare.
    - sample_names: A list of the same size with unique names for the input samples.
    - out_bedpe: Where union BEDPE will be written.

    Return value:
    A DataFrame with the union of calls and information about which calls are
    present in which input files. This DataFrame has one entry per call in the
    union and includes (among other things) columns <sample>_matches, <sample>_qual,
    <sample>_filtered, <sample>_correct, and <sample>_dist for each of the input BEDPEs.
    """

    assert (len(sample_names) == len(in_bedpes))

    # Merge all the input files. This will get rid of redundant entries.
    # The quality in the output will be the maximum quality across all files.
    merged_df = merge_multiple_breaks(in_bedpes,
                                      out_bedpe,
                                      merge_win=merge_win,
                                      max_range=max_range)
    num_merged = len(merged_df)

    # Map the name of each entry in the union to its index in the DataFrame.
    name_to_ind = {}
    for i, n in enumerate(merged_df['name']):
        name_to_ind[n] = i

    new_filters = [set([]) for i in range(num_merged)]
    new_matches = [set([]) for i in range(num_merged)]

    # For each of the input BEDPEs find which of the entries in the union it
    # overlaps. This is somewhat duplicated work, but it's simpler this way.
    for sample, bedpe in zip(sample_names, in_bedpes):
        in_df = tk_sv_io.read_sv_bedpe_to_df(bedpe)
        name_to_ind2 = {}
        for i, n in enumerate(in_df['name']):
            name_to_ind2[n] = i

        matched_qual = np.zeros((num_merged, ), dtype=np.int)
        is_correct = np.zeros((num_merged, ), dtype=np.bool)
        is_filtered = np.zeros((num_merged, ), dtype=np.bool)
        tmp_dist = np.zeros((num_merged, ), dtype=np.int)
        matched_names = ['' for i in range(num_merged)]

        # merged_to_this will be a dictionary from a name in the union to a set
        # of names in the input bedpe
        merged_to_this, _, _ = compare_breaks(merged_df,
                                              bedpe,
                                              max_dist=merge_win)
        for name1, name2_set in merged_to_this.iteritems():
            ind1 = name_to_ind[name1]
            matched_names[ind1] = ';'.join([str(s) for s in name2_set])
            for name2 in name2_set:
                ind2 = name_to_ind2[name2]
                matched_qual[ind1] = max(matched_qual[ind1],
                                         in_df.iloc[ind2]['qual'])
                match = tk_sv_io.extract_sv_info(in_df.iloc[ind2]['info'],
                                                 ['MATCHES'])[0]
                is_match_correct = (match != '.' and match != ''
                                    and not match is None)
                if is_match_correct:
                    new_matches[ind1].add(match)
                    # Never set back to False if it was set to true.
                    is_correct[ind1] = True
                is_filtered[ind1] = in_df.iloc[ind2]['filters'] != '.'
                if in_df.iloc[ind2]['filters'] != '.':
                    new_filters[ind1] = new_filters[ind1].union(
                        set(in_df.iloc[ind2]['filters'].split(';')))
                if in_df.iloc[ind2]['chrom1'] != in_df.iloc[ind2]['chrom2']:
                    tmp_dist[ind1] = -1
                else:
                    tmp_dist[ind1] = in_df.iloc[ind2]['start2'] - in_df.iloc[
                        ind2]['stop1']

        merged_df[str(sample) + '_matches'] = matched_names
        merged_df[str(sample) + '_qual'] = matched_qual
        merged_df[str(sample) + '_correct'] = is_correct
        merged_df[str(sample) + '_filtered'] = is_filtered
        merged_df[str(sample) + '_dist'] = tmp_dist

    info_strs = ['.' for i in range(num_merged)]
    filter_strs = ['.' for i in range(num_merged)]
    for i in range(num_merged):
        match_str = ','.join(
            new_matches[i]) if len(new_matches[i]) > 0 else '.'
        info_strs[i] = tk_sv_io.update_info('.', ['MATCHES'], [match_str])
        filter_strs[i] = ';'.join(
            new_filters[i]) if len(new_filters[i]) > 0 else '.'

    merged_df['qual'] = np.array(np.max(
        merged_df[[str(s) + '_qual' for s in sample_names]], axis=1),
                                 dtype=np.int)
    merged_df['filters'] = filter_strs
    merged_df['info'] = info_strs
    merged_df.sort(
        ['qual', 'chrom1', 'start1', 'stop1', 'chrom2', 'start2', 'stop2'],
        ascending=[0, 1, 1, 1, 1, 1, 1],
        inplace=True)

    return merged_df
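A hypothetical usage sketch for compare_multiple_breaks; the sample names and paths are placeholders:

# Build the union of two call sets and count, per sample, how many union calls
# that sample matched (a non-zero <sample>_qual means the sample had a match).
union_df = compare_multiple_breaks(['run_a.bedpe', 'run_b.bedpe'],
                                   ['run_a', 'run_b'],
                                   'union.bedpe', merge_win=100)
for sample in ['run_a', 'run_b']:
    n_called = int((union_df[sample + '_qual'] > 0).sum())
    print('%s: %d of %d union calls' % (sample, n_called, len(union_df)))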
Code Example #24
File: utils.py  Project: umccr/longranger
def merge_breaks(bedpe_df,
                 out_bedpe,
                 merge_win=10000,
                 max_range=np.inf,
                 max_nmates=np.inf,
                 cluster_qual_factor=0.2):
    """Merges a set of SVs into a non-redundant set.
    Args:
    - bedpe_df: Either a bedpe file or a DataFrame like the one returned by
    tk_sv_io.read_sv_bedpe_to_df.
    - out_bedpe: Path to file where output will be written.
    - merge_win: Breakpoints will be merged if they are within this distance from
    each other. Two SVs will be merged if both their breakpoints can be merged.
    - max_range: See max_range field of cluster_loci.
    - max_nmates: Two extra info fields are added to the output BEDPE, NMATES1 and
    NMATES2: the number of mate breakpoint clusters (after merging) of the first and
    second breakpoint of an SV, respectively. SVs for which both breakpoints exceed
    the max_nmates cutoff are not included in the output.

    Return value:
    The output BEDPE.
    """
    if not isinstance(bedpe_df, pd.DataFrame):
        bedpe_df = tk_sv_io.read_sv_bedpe_to_df(bedpe_df)
    else:
        bedpe_df = pd.DataFrame(bedpe_df)
    breaks = []
    for i in range(bedpe_df.shape[0]):
        breaks.append((bedpe_df.iloc[i, 0], bedpe_df.iloc[i, 1],
                       bedpe_df.iloc[i, 2], (bedpe_df.iloc[i, 6], 1)))
        breaks.append((bedpe_df.iloc[i, 3], bedpe_df.iloc[i, 4],
                       bedpe_df.iloc[i, 5], (bedpe_df.iloc[i, 6], 2)))
    _, mem_to_cluster, _ = cluster_loci(breaks, merge_win, max_range=max_range)

    cluster_pairs = {}
    for i in range(bedpe_df.shape[0]):
        name = bedpe_df.iloc[i]['name']
        cluster_idx1 = mem_to_cluster[(name, 1)]
        cluster_idx2 = mem_to_cluster[(name, 2)]
        if not (cluster_idx1, cluster_idx2) in cluster_pairs:
            cluster_pairs[(cluster_idx1, cluster_idx2)] = [i]
        else:
            old_pair = cluster_pairs[(cluster_idx1, cluster_idx2)][0]
            # Make sure the old and the new pair have breaks on the same chromosomes
            assert (bedpe_df.iloc[old_pair, 0] == bedpe_df.iloc[i, 0])
            assert (bedpe_df.iloc[old_pair, 3] == bedpe_df.iloc[i, 3])
            cluster_pairs[(cluster_idx1, cluster_idx2)].append(i)

    new_cluster_pairs = {}
    cluster_dist_ratio = {}
    for p, pos_list in cluster_pairs.iteritems():
        pos_arr = np.array(pos_list)
        tmp_df = get_dataframe_loc(bedpe_df, pos_arr)
        quals = np.array(tmp_df.qual)
        best_call = pos_arr[np.argmax(quals)]
        close_calls = np.where(quals >= cluster_qual_factor * np.max(quals))[0]
        close_df = get_dataframe_loc(tmp_df, close_calls)

        same_chrom = bedpe_df.iloc[best_call]['chrom2'] == bedpe_df.iloc[
            best_call]['chrom1']
        min_break_dist = np.min(close_df.start2) - np.max(close_df.stop1)
        max_break_dist = bedpe_df.iloc[best_call]['start2'] - bedpe_df.iloc[
            best_call]['stop1']

        new_cluster_pairs[p] = best_call
        if not same_chrom or max_break_dist > MAX_FRAG_SIZE:
            cluster_dist_ratio[p] = '.'
        elif min_break_dist <= 0:
            cluster_dist_ratio[p] = float('NaN')
        else:
            cluster_dist_ratio[p] = float(max_break_dist) / min_break_dist

    cluster_pairs = new_cluster_pairs

    def clusters_close(i, j):
        chrom1, start1, stop1 = bedpe_df.iloc[i, 0], bedpe_df.iloc[
            i, 1], bedpe_df.iloc[i, 2]
        chrom2, start2, stop2 = bedpe_df.iloc[i, 3], bedpe_df.iloc[
            i, 4], bedpe_df.iloc[i, 5]
        next_chrom1, next_start1, next_stop1 = bedpe_df.iloc[
            j, 0], bedpe_df.iloc[j, 1], bedpe_df.iloc[j, 2]
        next_chrom2, next_start2, next_stop2 = bedpe_df.iloc[
            j, 3], bedpe_df.iloc[j, 4], bedpe_df.iloc[j, 5]
        dist1 = max(next_start1 - stop1, start1 - next_stop1)
        dist2 = max(next_start2 - stop2, start2 - next_stop2)
        return (chrom1 == next_chrom1 and chrom2 == next_chrom2
                and dist1 <= merge_win and dist2 <= merge_win)

    # The "chain-breaking" in cluster_loci might still leave some redundancy.
    # In particular, we might leave some almost touching clusters that were
    # separated only because of chain-breaking. Do a second round of clustering
    # where you go through consecutive pairs of cluster and merge them if they're merge-able.
    new_cluster_pairs = {}
    for (cluster1, cluster2) in sorted(cluster_pairs.keys()):
        if cluster_pairs[(cluster1, cluster2)] == -1:
            continue
        # Consider all neighboring clusters after this cluster.
        # Notice that the cluster indices are sorted by genomic coordinates.
        neigh_clusters = [
            (cluster1, cluster2 + 1), (cluster1 + 1, cluster2 - 1),
            (cluster1 + 1, cluster2), (cluster1 + 1, cluster2 + 1)
        ]
        idx = cluster_pairs[(cluster1, cluster2)]
        # Best cluster among neighboring clusters
        max_cluster = ((cluster1, cluster2), idx)
        for next_cluster1, next_cluster2 in neigh_clusters:
            if not (next_cluster1, next_cluster2) in cluster_pairs:
                continue
            if cluster_pairs[(next_cluster1, next_cluster2)] == -1:
                continue
            next_idx = cluster_pairs[(next_cluster1, next_cluster2)]
            if clusters_close(idx, next_idx):
                cluster_pairs[(next_cluster1, next_cluster2)] = -1
                if bedpe_df.iloc[idx]['qual'] < bedpe_df.iloc[next_idx]['qual']:
                    max_cluster = ((next_cluster1, next_cluster2), next_idx)
        new_cluster_pairs[max_cluster[0]] = max_cluster[1]

    cluster_pairs = new_cluster_pairs

    # Now compute the number of mate breakpoints for each cluster
    num_mates = {}
    for (cluster1, cluster2) in cluster_pairs.keys():
        if not cluster1 in num_mates:
            num_mates[cluster1] = 0
        if not cluster2 in num_mates:
            num_mates[cluster2] = 0
        num_mates[cluster1] += 1
        if cluster2 != cluster1:
            num_mates[cluster2] += 1

    sel_loc = []
    new_info_strs = []
    for (cluster1, cluster2) in sorted(cluster_pairs.keys()):
        sv_loc = cluster_pairs[(cluster1, cluster2)]
        if num_mates[cluster1] > max_nmates and num_mates[
                cluster2] > max_nmates:
            continue
        sel_loc.append(sv_loc)
        new_info_strs.append(
            tk_sv_io.update_info(bedpe_df.iloc[sv_loc]['info'],
                                 ['NMATES1', 'NMATES2', 'RESOLUTION'], [
                                     num_mates[cluster1], num_mates[cluster2],
                                     cluster_dist_ratio[(cluster1, cluster2)]
                                 ]))
    if len(sel_loc) > 0:
        bedpe_df = bedpe_df.iloc[sel_loc]
        bedpe_df['info'] = new_info_strs
    else:
        bedpe_df = pd.DataFrame(columns=bedpe_df.columns)
    if not out_bedpe is None:
        tk_sv_io.write_sv_df_to_bedpe(bedpe_df, out_bedpe)

    return bedpe_df
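A hypothetical usage sketch for merge_breaks; the paths are placeholders:

# Collapse a redundant call set, keeping the highest-quality call per pair of
# breakpoint clusters and annotating NMATES1/NMATES2 in the info field.
merged_df = merge_breaks('raw_calls.bedpe', 'merged_calls.bedpe',
                         merge_win=10000, max_nmates=10)
print('%d calls kept after merging' % len(merged_df))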
Code Example #25
File: utils.py  Project: umccr/longranger
def compare_breaks(pred_loci, true_loci=None, max_dist=100, window_loci=None):
    """
    pred_loci: BEDPE file with SV calls or a pandas DataFrame as returned by tk_sv_io.read_sv_bedpe_to_df
    true_loci: BEDPE file with ground truth variants (or another set of variants against
        which pred_loci will be compared)
    max_dist: maximum distance between a true and a predicted breakpoint for them
        to be considered overlapping
    window_loci: list of tuples (chrom, starts, stops), where chrom is a chromosome name and
        starts/stops are lists/arrays of start and ending positions. If this is provided,
        true svs that completely fall within such a locus will be marked as "filtered" (i.e.
        not detectable). For example, these can be the windows used for detecting svs.
        An SV that lies completely within a single window cannot be detected.
    """

    if true_loci is None or pred_loci is None:
        return ({}, {}, set([]))

    ###### Read predicted breakpoints and extend them by max_dist
    pred_breaks1 = []
    pred_breaks2 = []
    if not isinstance(pred_loci, pd.DataFrame):
        bedpe_df = tk_sv_io.read_sv_bedpe_to_df(pred_loci)
    else:
        bedpe_df = pred_loci
    for n, row in bedpe_df.iterrows():
        break1 = (row.chrom1, max(0, row.start1 - max_dist),
                  row.stop1 + max_dist, row['name'])
        break2 = (row.chrom2, max(0, row.start2 - max_dist),
                  row.stop2 + max_dist, row['name'])
        if break1 > break2:
            break1, break2 = break2, break1
        pred_breaks1.append(break1)
        pred_breaks2.append(break2)

    pred_regions1 = loci_to_named_region_map(pred_breaks1, singletons=True)
    pred_regions2 = loci_to_named_region_map(pred_breaks2, singletons=True)

    ###### Read provided loci
    regions = loci_to_region_map(window_loci)

    ###### Read true svs
    filtered_svs = set([])  # set of true svs that are non-detectable
    if not isinstance(true_loci, pd.DataFrame):
        bedpe_df = tk_sv_io.read_sv_bedpe_to_df(true_loci)
    else:
        bedpe_df = true_loci
    true_breaks1 = []
    true_breaks2 = []
    for n, row in bedpe_df.iterrows():
        name = row['name']
        chrom1, start1, stop1 = row.chrom1, row.start1, row.stop1
        chrom2, start2, stop2 = row.chrom2, row.start2, row.stop2
        break1 = (chrom1, start1, stop1, name)
        break2 = (chrom2, start2, stop2, name)
        if break1 > break2:
            break1, break2 = break2, break1

        is_filtered = False
        if not regions is None and chrom1 == chrom2 and chrom1 in regions:
            # SV is filtered if both its breakpoints are on the same window
            ovs1 = regions[chrom1].overlapping_regions(start1, stop1)
            ovs2 = regions[chrom2].overlapping_regions(start2, stop2)
            if len(set(ovs1).intersection(set(ovs2))) > 0:
                is_filtered = True
                filtered_svs.add(name)
        if not is_filtered:
            true_breaks1.append(break1)
            true_breaks2.append(break2)

    true_regions1 = loci_to_named_region_map(true_breaks1, singletons=True)
    true_regions2 = loci_to_named_region_map(true_breaks2, singletons=True)

    ###### Get overlaps between predicted and true breakpoints
    mapping_break1 = get_region_overlaps(pred_regions1, true_regions1)
    mapping_break2 = get_region_overlaps(pred_regions2, true_regions2)

    pred_to_matching_true = {}
    true_to_matching_pred = {}
    for pred_name, matched in mapping_break1.iteritems():
        if not pred_name in mapping_break2:
            # There was a match only for one of the breakpoints of this predicted sv.
            continue
        for true_name in matched:
            if true_name in mapping_break2[pred_name]:
                s1 = pred_to_matching_true.setdefault(pred_name, set([]))
                s1.add(true_name)
                s1 = true_to_matching_pred.setdefault(true_name, set([]))
                s1.add(pred_name)

    return (pred_to_matching_true, true_to_matching_pred, filtered_svs)
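A hypothetical usage sketch for compare_breaks, deriving a simple recall figure; the paths are placeholders:

# Match calls to ground truth within 100 bp per breakpoint and report how many
# true events were recovered (filtered events are treated as non-detectable).
pred_to_true, true_to_pred, filtered = compare_breaks('calls.bedpe', 'truth.bedpe',
                                                      max_dist=100)
true_df = tk_sv_io.read_sv_bedpe_to_df('truth.bedpe')
n_detectable = len(true_df) - len(filtered)
print('recovered %d of %d detectable true events' % (len(true_to_pred), n_detectable))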
Code Example #26
File: __init__.py  Project: umccr/longranger
def main(args, outs):
    pred_df = tk_sv_io.read_sv_bedpe_to_df(args.sv_calls)
    pred_df = tk_sv_utils.get_dataframe_loc(
        pred_df, list(range(args.start_idx, args.stop_idx)))

    in_bam = tk_bam.create_bam_infile(args.possorted_bam)

    cov_reader = tk_hdf5.DataFrameReader(args.hap_coverage)
    sel_cols = ['cov_q30_hap0', 'cov_q30_hap1', 'cov_q30_hap2']

    has_pileups = np.zeros((len(pred_df), ), dtype=np.bool)

    for i, (_, row) in enumerate(pred_df.iterrows()):
        has_clipped1 = has_too_many_clipped(
            in_bam,
            row.chrom1,
            max(0, row.start1 - BREAK_EXT),
            row.stop1 + BREAK_EXT,
            max_clipped_frac=args.max_clipped_frac)
        has_clipped2 = has_too_many_clipped(
            in_bam,
            row.chrom2,
            max(0, row.start2 - BREAK_EXT),
            row.stop2 + BREAK_EXT,
            max_clipped_frac=args.max_clipped_frac)
        has_clipped = has_clipped1 and has_clipped2

        if row.chrom1 != row.chrom2 or row.start2 - row.stop1 > MAX_FRAG_SIZE:
            has_pileups[i] = has_clipped
            continue

        cov = cov_reader.query(
            (row.chrom1, max(0,
                             row.start1 - BREAK_EXT), row.stop2 + BREAK_EXT))
        cov['bin'] = np.array(cov['pos'] / BIN_WIN, dtype=np.int)
        if not 'coverage_deduped' in cov.columns:
            cov['coverage_deduped'] = cov[sel_cols].sum(axis=1)
        cov_arr = np.array(cov.groupby('bin').mean()['coverage_deduped'])
        median_cov = np.median(cov_arr)

        # Rescue for deletions or duplications with breakpoints on the pileups
        sv_len = row.stop2 - row.start1
        side_cov = cov_reader.query(
            (row.chrom1, max(0, row.start1 - BREAK_EXT - sv_len / 2),
             row.start1 - BREAK_EXT))
        side_cov = pd.concat([
            side_cov,
            cov_reader.query((row.chrom2, row.stop2 + BREAK_EXT,
                              row.stop2 + BREAK_EXT + sv_len / 2))
        ],
                             ignore_index=True)
        if not 'coverage_deduped' in side_cov.columns:
            side_cov['coverage_deduped'] = side_cov[sel_cols].sum(axis=1)

        # Ignore pileups, enough evidence for a large-scale copy number variant
        if np.median(cov.coverage_deduped) < DEL_REL_COV * np.median(
                side_cov.coverage_deduped):
            continue
        if np.median(cov.coverage_deduped) > DUP_REL_COV * np.median(
                side_cov.coverage_deduped):
            continue

        # Filter out the call if there are pileups very close to the breakpoints
        has_pileups[i] = len(cov_arr) > 4 and np.any(
            cov_arr[[0, 1, -2, -1]] > args.min_rel_depth * median_cov)
        has_pileups[i] = has_pileups[i] or has_clipped

    pileups = pred_df[has_pileups]
    pred_df = pred_df[np.logical_not(has_pileups)]

    tk_sv_io.write_sv_df_to_bedpe(pred_df, outs.sv_calls)
    tk_sv_io.write_sv_df_to_bedpe(pileups, outs.pileups)