Code example #1
def get_df_region_dist(sv_df, regions, region_names=None, use_orient=False):
    """Computes the distance between the breakpoints of the BEDPE
    (read as dataframe) and a set of regions.
    region_names is a dict (start, stop) -> name."""
    dists1 = np.inf * np.ones((len(sv_df), ))
    dists2 = np.inf * np.ones((len(sv_df), ))
    regions1 = [(None, None) for i in range(len(sv_df))]
    regions2 = [(None, None) for i in range(len(sv_df))]
    matched_names1 = ['.' for i in range(len(sv_df))]
    matched_names2 = ['.' for i in range(len(sv_df))]

    for i, (_, row) in enumerate(sv_df.iterrows()):
        if use_orient:
            sv_type = tk_sv_io.get_sv_type(row.info)
            if sv_type == 'DEL':
                orient = '-+'
            elif sv_type == 'DUP':
                orient = '+-'
            elif sv_type == 'INV':
                orient = '..'
            else:
                orient = tk_sv_io.get_break_orientation(row.info)
            if (orient == '..' and
                    sv_type != 'INV') or sv_type == 'UNK' or sv_type == 'INS':
                continue
            orient1 = tk_regions.Dirs.from_str(orient[0])
            orient2 = tk_regions.Dirs.from_str(orient[1])
        else:
            orient1, orient2 = (None, None)
        chrom1, chrom2 = row.chrom1, row.chrom2
        if chrom1 in regions:
            s1, e1, d1 = regions[chrom1].get_closest_region_to_region(
                row.start1, row.stop1, direction=orient1)
            if not s1 is None:
                d1 = int(d1)
                regions1[i] = (s1, e1)
                if not region_names is None and (s1, e1) in region_names:
                    matched_names1[i] = ','.join(list(region_names[(s1, e1)]))
        else:
            d1 = np.inf
        if chrom2 in regions:
            s2, e2, d2 = regions[chrom2].get_closest_region_to_region(
                row.start2, row.stop2, direction=orient2)
            if not s2 is None:
                d2 = int(d2)
                regions2[i] = (s2, e2)
                if not region_names is None and (s2, e2) in region_names:
                    matched_names2[i] = ','.join(list(region_names[(s2, e2)]))
        else:
            d2 = np.inf
        dists1[i] = d1
        dists2[i] = d2
    return (dists1, dists2, regions1, regions2, matched_names1, matched_names2)
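A minimal usage sketch (file paths are placeholders; the calls to tk_sv_io.read_sv_bedpe_to_df and tk_sv_utils.bed_to_region_map mirror how the other examples in this listing use them):

# Hypothetical usage; assumes the tk_sv_io / tk_sv_utils aliases used
# throughout this listing are already imported.
sv_df = tk_sv_io.read_sv_bedpe_to_df('candidates.bedpe')
target_regions = tk_sv_utils.bed_to_region_map('targets.bed', merge=True)
res = get_df_region_dist(sv_df, target_regions, use_orient=True)
dists1, dists2, regions1, regions2, names1, names2 = res
# dists1/dists2 hold np.inf wherever no region exists on that chromosome.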
Code example #2
File: __init__.py Project: umccr/longranger
def prepare_loci(args):
    """Merge and sort input lists of candidate loci."""

    overlap_loci = []

    # Loci based on barcode overlaps. Type of SV is unknown.
    if not args.overlap_loci is None:
        with open(args.overlap_loci, 'rb') as f:
            loci = cPickle.load(f)
        overlap_loci.extend([(x[0], x[1], x[2], x[3], x[4], x[5], None) for x in loci])

    # Low depth loci. These will only be evaluated for deletions.
    if not args.low_depth_loci is None:
        del_calls = tk_sv_io.read_sv_bedpe_to_df(args.low_depth_loci)
        for _, row in del_calls.iterrows():
            overlap_loci.append((row.chrom1, row.start1, row.stop1,
                                 row.chrom2, row.start2, row.stop2, 'DEL'))

    # Loci based on read-pair support. These will only be evaluated for the
    # type of SV supported by the readpairs.
    if not args.rp_calls is None:
        rp_calls = tk_sv_io.read_sv_bedpe_to_df(args.rp_calls)
        for _, row in rp_calls.iterrows():
            sv_type = tk_sv_io.get_sv_type(row.info)
            if not sv_type in ['DEL', 'INV', 'DUP']:
                sv_type = None
            else:
                sv_type = [sv_type]
            overlap_loci.append((row.chrom1, row.start1, row.stop1,
                                 row.chrom2, row.start2, row.stop2, sv_type))

    # Sort by position and also get the sorted indices.
    sorted_overlap_loci = sorted(overlap_loci,
                                 key=lambda x: (x[0], x[1], x[2], x[3], x[4], x[5]))
    sorted_overlap_loci_idx = sorted(range(len(overlap_loci)),
                                     key=lambda x: (overlap_loci[x][0], overlap_loci[x][1],
                                                    overlap_loci[x][2], overlap_loci[x][3],
                                                    overlap_loci[x][4], overlap_loci[x][5]))

    # If there is a single source of candidate loci, coming from a BEDPE,
    # keep track of the names in that BEDPE so that SV calls can be
    # annotated with the BEDPE line from which they came.
    if args.overlap_loci is None and args.low_depth_loci is None and not args.rp_calls is None:
        input_calls = tk_sv_io.read_sv_bedpe_to_df(args.rp_calls)
        input_names = list(input_calls['name'])
        input_names = [input_names[n] for n in sorted_overlap_loci_idx]
    else:
        input_names = None

    return sorted_overlap_loci, input_names
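A sketch of the inputs prepare_loci expects; the attribute names come from the function body, the paths are placeholders, and any of the three sources may be None:

# Hypothetical args object; in the pipeline this comes from the stage args.
from argparse import Namespace
args = Namespace(overlap_loci='overlap_loci.pickle',
                 low_depth_loci='low_depth.bedpe',
                 rp_calls='rp_calls.bedpe')
loci, names = prepare_loci(args)
# names is only populated when rp_calls is the sole source of candidate loci.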
Code example #3
def merge_calls_and_gt(call_df, gt_df, call_to_gt):

    if not gt_df is None:
        gt_df.index = gt_df['name']
    else:
        call_to_gt = {}

    out_call_df = None
    for _, row in call_df.iterrows():
        sv_type = tk_sv_io.get_sv_type(row.info)
        orient = tk_sv_io.get_break_orientation(row.info)
        row['orient'] = orient

        # Revert the SV type name from DISTAL to TRANS to match
        # ground-truth conventions.
        if sv_type == 'DISTAL':
            sv_type = 'TRANS'
        row['sv_type'] = sv_type

        matches = list(call_to_gt.get(row['name'], [None]))
        # One output row per match
        for m in matches:
            row['match'] = m
            if not m is None and not gt_df is None:
                x = gt_df.loc[m]
                row['match_dist'] = max(
                    dist_to_breaks(int((row.start1 + row.stop1) / 2), x.start1,
                                   x.stop1),
                    dist_to_breaks(int((row.start2 + row.stop2) / 2), x.start2,
                                   x.stop2))
            else:
                row['match_dist'] = float('NaN')

            out_call_df = pd.concat(
                [out_call_df, pd.DataFrame([row])], ignore_index=True)

    if not gt_df is None:
        out_call_df = pd.merge(out_call_df,
                               gt_df,
                               left_on='match',
                               right_on='name',
                               how='outer',
                               suffixes=['', '_gt'])
        out_call_df.drop(['filters_gt', 'dist'], axis=1, inplace=True)
    out_call_df.sort('name', inplace=True)

    return out_call_df
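A hedged usage sketch; the shape of call_to_gt (call name -> collection of matching ground-truth names) is inferred from the .get(row['name'], [None]) lookup above:

# Hypothetical inputs; paths and match names are placeholders.
call_df = tk_sv_io.read_sv_bedpe_to_df('sv_calls.bedpe')
gt_df = tk_sv_io.read_sv_bedpe_to_df('ground_truth.bedpe')
call_to_gt = {'call0': ['gt12'], 'call1': ['gt3', 'gt7']}
merged = merge_calls_and_gt(call_df, gt_df, call_to_gt)
# merged has one row per (call, matching ground-truth SV) pair.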
Code example #4
def prepare_gt(args):
    if args.gt_variants is None:
        return None

    true_df = tk_sv_io.read_sv_bedpe_to_df(args.gt_variants)

    # Length of ground truth sv
    true_df['dist'] = tk_sv_io.get_sv_df_dists(true_df)

    sv_types = []
    orients = []
    tiers = []

    # Mark genic SVs
    is_genic1 = np.zeros((len(true_df), ), dtype=np.int)
    is_genic2 = np.zeros((len(true_df), ), dtype=np.int)
    gene_regions = tk_reference.load_gene_boundaries(args.reference_path,
                                                     protein_coding=False)

    for row_idx, (_, row) in enumerate(true_df.iterrows()):
        if not 'info' in true_df.columns:
            sv_types.append('UNK')
            orients.append('..')
            tiers.append(0)
        else:
            sv_type = tk_sv_io.get_sv_type(row.info)
            if sv_type == 'DISTAL':
                sv_type = 'TRANS'
            sv_types.append(sv_type)
            orients.append(tk_sv_io.get_break_orientation(row.info))
            tiers.append(tk_sv_io.get_tier(row.info))

        is_genic1[row_idx] = int(
            row.chrom1 in gene_regions
            and bool(gene_regions[row.chrom1].overlapping_regions(
                row.start1, row.stop1)))
        is_genic2[row_idx] = int(
            row.chrom2 in gene_regions
            and bool(gene_regions[row.chrom2].overlapping_regions(
                row.start2, row.stop2)))

    true_df['break1_genic'] = is_genic1
    true_df['break2_genic'] = is_genic2
    # number of breakpoints overlapping genes
    true_df['genic_breaks'] = is_genic1 + is_genic2

    # put all the un-tiered entries into the last tier
    tiers = np.array(tiers)
    if len(tiers) == 0:
        total_tiers = 0
    else:
        total_tiers = np.max(tiers)

    tiers[tiers == 0] = total_tiers + 1

    true_df['tier'] = tiers
    true_df['sv_type'] = sv_types
    true_df['orient'] = orients

    if not args.min_sv_len is None:
        # Keep only SVs with at least the minimum distance between breakpoints.
        is_feasible = np.array(true_df['dist'] >= args.min_sv_len,
                               dtype=np.bool)
    else:
        # No length cutoff: mark all ground-truth SVs as feasible.
        # (Without this branch, is_feasible would be unbound below.)
        is_feasible = np.ones((len(true_df), ), dtype=np.bool)

    if not args.targets is None and not args.target_dists is None:
        target_regions = tk_sv_utils.bed_to_region_map(args.targets,
                                                       merge=True)
        res = get_df_region_dist(true_df, target_regions, use_orient=True)
        targ_dists1, targ_dists2, targs1, targs2, _, _ = res

        new_starts1 = np.array(true_df.start1)
        new_stops1 = np.array(true_df.stop1)
        new_starts2 = np.array(true_df.start2)
        new_stops2 = np.array(true_df.stop2)

        for i, (t1, t2) in enumerate(zip(targs1, targs2)):
            if not t1[0] is None and not t2[0] is None:
                new_starts1[i], new_stops1[i] = t1
                new_starts2[i], new_stops2[i] = t2

        true_df['start1'] = new_starts1
        true_df['stop1'] = new_stops1
        true_df['start2'] = new_starts2
        true_df['stop2'] = new_stops2

        true_df['targ_dist'] = np.maximum(np.array(targ_dists1),
                                          np.array(targ_dists2))
    else:
        true_df['targ_dist'] = np.zeros((len(true_df), ), dtype=np.int)

    true_df['feasible'] = is_feasible

    return true_df
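A sketch of the args attributes prepare_gt reads (names taken from the function body; values are placeholders):

from argparse import Namespace
args = Namespace(gt_variants='gt.bedpe',            # hypothetical paths
                 reference_path='/path/to/refdata',
                 min_sv_len=1000,
                 targets='targets.bed',
                 target_dists=[100, 1000])
true_df = prepare_gt(args)   # annotated ground-truth DataFrame, or None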
Code example #5
def add_filters(pred_df, pred_to_match, black_dists1, black_dists2,
                black_names1, black_names2, seg_dup_calls, all_bad_regions,
                args):

    if not args.targets is None:
        min_call_qv = args.min_call_qv_target
    else:
        min_call_qv = args.min_call_qv_wgs

    if args.coverage is None:
        # used for WGS
        max_bc_cov = SV_DEFAULT_MAX_BC_COV
        bc_mean_depth = 200
    else:
        # used for exome
        with open(args.coverage, 'r') as f:
            cov_res = json.load(f)
        bc_summary_depth_info = cov_res['summary_bc_depth_info']
        bc_mean_depth, _, _ = get_depth_info_json(bc_summary_depth_info)
        max_bc_cov = args.max_bc_cov_factor * bc_mean_depth

    if args.keep_filters:
        filter_strs = [s for s in pred_df.filters]
    else:
        filter_strs = ['.' for i in range(len(pred_df))]

    info_strs = [s for s in pred_df['info']]
    rps = np.zeros((len(pred_df), ), dtype=np.int)

    def get_cov_frac(black_regions, chrom, start, stop):
        regions = tk_sv_utils.strictly_overlapping_regions(
            black_regions, chrom, start, stop)
        tot_black = np.sum([r[1] - r[0] for r in regions])
        tot_len = float(stop - start)
        black_frac = tk_stats.robust_divide(tot_black, tot_len)
        return black_frac

    for i, (_, row) in enumerate(pred_df.iterrows()):
        npairs = tk_sv_io.get_npairs(row['info'])
        nsplit = tk_sv_io.get_nsplit(row['info'])
        rps[i] = npairs + nsplit
        sv_type = tk_sv_io.get_sv_type(row['info'])
        name = row['name']
        qual = row.qual

        ####### Filtering for read-pair calls #######
        frac_on_hap = tk_sv_io.extract_sv_info(row.info,
                                               ['FRAC_HAP_SUPPORT'])[0]
        allelic_frac = tk_sv_io.extract_sv_info(row.info,
                                                ['HAP_ALLELIC_FRAC'])[0]
        if allelic_frac != '':
            allelic_frac = float(allelic_frac)

        if args.is_germline is None:
            if qual < min_call_qv:
                filter_strs[i] = tk_sv_io.update_filters(
                    filter_strs[i], 'LOWQ', None)
            if not args.min_allelic_frac is None and not frac_on_hap is None and \
               frac_on_hap != '' and float(frac_on_hap) < args.min_allelic_frac:
                filter_strs[i] = tk_sv_io.update_filters(
                    filter_strs[i], 'LOWQ', None)
            if not args.min_allelic_frac is None and allelic_frac != '' and \
               float(allelic_frac) < args.min_allelic_frac:
                filter_strs[i] = tk_sv_io.update_filters(
                    filter_strs[i], 'LOWQ', None)
        elif args.targets is None:
            if args.is_germline:
                martian.log_info('Mean barcode depth {}'.format(bc_mean_depth))

                min_call_qv = max(min_call_qv, bc_mean_depth / 10.0)

                martian.log_info(
                    'Support cutoff: {} barcodes'.format(min_call_qv))

                enough_bcs = qual >= min_call_qv
                is_good = allelic_frac > 0.8 or (sv_type == 'INV'
                                                 and allelic_frac > 0.6)
                is_good = is_good and enough_bcs
                if not is_good:
                    filter_strs[i] = tk_sv_io.update_filters(
                        filter_strs[i], 'LOWQ', None)
            else:
                min_call_qv = max(min_call_qv, 4)
                is_good = allelic_frac > 0.6 and qual >= min_call_qv
                if not is_good:
                    filter_strs[i] = tk_sv_io.update_filters(
                        filter_strs[i], 'LOWQ', None)
        else:
            if args.is_germline:
                # Harder to get confident support in Exome
                min_call_qv = max(min_call_qv, bc_mean_depth / 10.0)
                martian.log_info(
                    'Support cutoff: {} barcodes'.format(min_call_qv))
                # Apply a very lenient filter on allelic fraction because lots of barcodes can be unphased
                is_good = qual >= min_call_qv and allelic_frac > 0.05
                af = tk_sv_io.extract_sv_info(row.info, ['ALLELIC_FRAC'])[0]
                if af != '':
                    af = float(af)
                is_good = is_good and af > 0.04
            else:
                min_call_qv = max(min_call_qv, 4)
                is_good = qual >= min_call_qv

            if not is_good:
                filter_strs[i] = tk_sv_io.update_filters(
                    filter_strs[i], 'LOWQ', None)

        if not black_dists1 is None:
            chrom1, chrom2 = row.chrom1, row.chrom2
            black_dist1, black_dist2 = black_dists1[i], black_dists2[i]

            if chrom1 == chrom2:
                if chrom1 in all_bad_regions:
                    black_frac = get_cov_frac(all_bad_regions, chrom1,
                                              row.stop1, row.start2)
                else:
                    black_frac = 0.0
            else:
                black_frac = float('NaN')
        else:
            black_dist1 = np.inf
            black_dist2 = np.inf
            black_frac = float('NaN')
        filter_strs[i] = tk_sv_io.update_filters(filter_strs[i], 'BLACK_DIST',
                                                 min(black_dist1, black_dist2),
                                                 args.min_dist_from_black)
        filter_strs[i] = tk_sv_io.update_filters(filter_strs[i], 'BLACK_FRAC',
                                                 black_frac, 0,
                                                 args.max_frac_black)

        bname1 = '.'
        bname2 = '.'
        if black_dist1 < args.min_dist_from_black or re.search(
                'BLACK_FRAC', filter_strs[i]):
            bname1 = black_names1[i]
        if black_dist2 < args.min_dist_from_black or re.search(
                'BLACK_FRAC', filter_strs[i]):
            bname2 = black_names2[i]

        if name in seg_dup_calls:
            filter_strs[i] = tk_sv_io.update_filters(filter_strs[i], 'SEG_DUP',
                                                     None)
            seg_dup_match = ','.join(list(seg_dup_calls[name]))
        else:
            seg_dup_match = '.'

        nbcs1 = tk_sv_io.get_nbcs1(row.info)
        nbcs2 = tk_sv_io.get_nbcs2(row.info)
        if not nbcs1 is None and not nbcs2 is None and (nbcs1 > max_bc_cov
                                                        or nbcs2 > max_bc_cov):
            filter_strs[i] = tk_sv_io.update_filters(filter_strs[i],
                                                     'HIGH_BC_COV', None)

        filter_strs[i] = tk_sv_io.update_filters(filter_strs[i],
                                                 'READ_SUPPORT',
                                                 npairs + nsplit,
                                                 min_val=args.min_read_support)

        match_str = ','.join([str(s) for s in pred_to_match.get(name, '.')])

        if not args.targets is None:
            # Disable orientation reporting in exome
            info_strs[i] = tk_sv_io.update_info(info_strs[i], ['ORIENT'],
                                                [None])

        info_strs[i] = tk_sv_io.update_info(info_strs[i], [
            'BLACK_DIST1', 'BLACK_DIST2', 'BLACK_FRAC', 'BLACK1', 'BLACK2',
            'MATCHES', 'SEG_DUP'
        ], [
            black_dist1, black_dist2, black_frac, bname1, bname2, match_str,
            seg_dup_match
        ])

    pred_df['filters'] = filter_strs
    pred_df['info'] = info_strs
    pred_df['read_support'] = rps

    return pred_df, min_call_qv
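The calls above suggest, but do not confirm, the semantics of tk_sv_io.update_filters: a filter name is appended to the filter string when the value is None or falls outside the given bounds. A hedged sketch of that assumed contract:

# Assumed behavior, inferred only from the call shapes in add_filters.
fs = '.'
fs = tk_sv_io.update_filters(fs, 'LOWQ', None)                  # always flags
fs = tk_sv_io.update_filters(fs, 'BLACK_DIST', 500, 10000)      # flags: 500 < 10000
fs = tk_sv_io.update_filters(fs, 'BLACK_FRAC', 0.9, 0, 0.5)     # flags: 0.9 > 0.5
fs = tk_sv_io.update_filters(fs, 'READ_SUPPORT', 8, min_val=2)  # passes: 8 >= 2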
Code example #6
def join(args, outs, chunk_defs, chunk_outs):
    pred_to_match, _, pred_df, true_df, min_qv = merge_predictions(chunk_outs)

    # Change the TRANS type to DISTAL. This affects only the reported
    # type, not the names of the metrics.
    new_info = []
    for _, row in pred_df.iterrows():
        sv_type = tk_sv_io.get_sv_type(row.info)
        if sv_type == 'TRANS':
            sv_type = 'DISTAL'
        new_info.append(tk_sv_io.update_info(row.info, ['TYPE'], [sv_type]))
    pred_df['info'] = new_info

    if not true_df is None:
        true_df.to_csv(outs.feasible_gt,
                       index=False,
                       header=True,
                       sep='\t',
                       na_rep='NaN')

    ##### Write BEDPE/VCF outputs
    tk_sv_io.write_sv_df_to_bedpe(pred_df, outs.sv_candidates)
    source_str = '10X/pipelines/stages/analyze_sv_calls {}'.format(
        martian.get_pipelines_version())
    sample_id = 'sample' if args.sample_id is None else args.sample_id
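    # Note: str.strip('.gz') removes a character set, not a suffix; it
    # behaves as intended for these '.vcf.gz' paths but is a common
    # str.strip pitfall.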
    tk_sv_io.bedpe_to_vcf(outs.sv_candidates, outs.svs.strip('.gz'), sample_id,
                          source_str, args.reference_path)
    # this will sort and gzip
    tk_sv_io.index_sv_vcf(outs.svs.strip(".gz"))
    outs.svs_index = outs.svs + '.tbi'
    # delete the non-gzipped file
    os.remove(outs.svs.strip('.gz'))

    if not pred_df.empty:
        call_df = pred_df[np.logical_or(pred_df['filters'] == '.',
                                        pred_df['filters'] == "PASS")]
    else:
        call_df = None
    tk_sv_io.write_sv_df_to_bedpe(call_df, outs.sv_calls)

    # Annotate each call with the matching ground truth svs. The resulting
    # dataframe might have multiple rows for the same call if there were multiple
    # matching ground truth svs.
    martian.log_info("merging calls and gt")
    if not pred_df.empty:
        pred_df = merge_calls_and_gt(pred_df, true_df, pred_to_match)

    martian.log_info("writing call_tsv")
    pred_df.to_csv(outs.call_tsv,
                   index=False,
                   header=True,
                   sep='\t',
                   na_rep='NaN')

    pred_df = pred_df[np.logical_not(pd.isnull(pred_df['name']))]

    max_dists = sorted(np.array(args.detect_dists))

    gt_sv_types = get_all_sv_types(true_df)
    call_sv_types = get_all_sv_types(pred_df)

    if not true_df is None:
        # Use the default MAX_PPV_TIER unless this is greater than the maximum tier
        # present in the data.
        max_ppv_tier = min(MAX_PPV_TIER, np.max(true_df.tier))
        # Use the default unless this is smaller than the minimum tier present in
        # the data.
        max_sens_tier = max(MAX_SENS_TIER, np.min(true_df.tier))
    else:
        max_ppv_tier = 1
        max_sens_tier = 1

    tiers = [max_ppv_tier, max_sens_tier]

    # All combinations of filters in ground truth and call set
    if not args.targets is None and not args.target_dists is None:
        target_dists = list(sorted(np.array(args.target_dists,
                                            dtype=np.float)))
        target_dists.append(float('NaN'))
    else:
        target_dists = [float('NaN')]

    combs = product([0, 1, 2, None], target_dists, gt_sv_types, tiers,
                    [True, False], call_sv_types, max_dists)

    metrics = defaultdict(list)

    gt_filters = ['genic_breaks', 'target_dist', 'gt_sv_type', 'tier']
    call_filters = ['call_filtered', 'call_sv_type', 'match_dist']

    for (genic_breaks, tdist, gt_sv_type, tier, is_filtered, call_sv_type,
         dist) in combs:
        if gt_sv_type != 'NA' and call_sv_type != 'NA' and gt_sv_type != call_sv_type:
            continue

        metrics['genic_breaks'].append(genic_breaks)
        metrics['target_dist'].append(tdist)
        metrics['gt_sv_type'].append(gt_sv_type)
        metrics['tier'].append(tier)
        metrics['call_filtered'].append(is_filtered)
        metrics['call_sv_type'].append(call_sv_type)
        metrics['match_dist'].append(dist)

        if true_df is None:
            sel_true_df = None
        else:
            sel_true_df = true_df
            if gt_sv_type != 'NA':
                sel_true_df = sel_true_df[sel_true_df.sv_type == gt_sv_type]
            if not np.isnan(tdist):
                sel_true_df = sel_true_df[sel_true_df.targ_dist <= tdist]
            sel_true_df = sel_true_df[sel_true_df.tier <= tier]
            # Restrict to genic or non-genic breaks, or take everything if this is None.
            if not genic_breaks is None:
                sel_true_df = sel_true_df[sel_true_df.genic_breaks ==
                                          genic_breaks]

            if len(sel_true_df) == 0:
                sel_true_df = None

        sel_pred_df = pred_df

        if is_filtered and not pred_df.empty:
            sel_pred_df = sel_pred_df[(sel_pred_df.filters == '.') |
                                      (sel_pred_df.filters == 'PASS')]
        if call_sv_type != 'NA' and not pred_df.empty:
            sel_pred_df = sel_pred_df[sel_pred_df.sv_type == call_sv_type]
        if not pred_df.empty and (args.min_rel_overlap is None
                                  or args.min_rel_overlap == 0):
            # Do not apply this filter if the matching is done based on overlap.
            sel_pred_df = sel_pred_df[np.logical_or(
                np.isnan(sel_pred_df.match_dist),
                sel_pred_df.match_dist <= dist)]

        add_metrics(sel_pred_df, sel_true_df, metrics)

    column_names = gt_filters
    column_names.extend(call_filters)
    other_names = set(metrics.keys()).difference(set(column_names))
    column_names.extend(other_names)

    metric_df = pd.DataFrame(metrics)
    metric_df = metric_df[column_names]

    martian.log_info("writing summary tsv")
    metric_df.to_csv(outs.summary_tsv,
                     index=False,
                     header=True,
                     sep='\t',
                     na_rep='NaN')

    short_metrics = get_short_metrics(metric_df, other_names, max_ppv_tier,
                                      max_sens_tier, args)

    if not args.call_summary is None:
        with open(args.call_summary, 'r') as in_summary_fn:
            in_summary = json.load(in_summary_fn)
            for key, val in in_summary.iteritems():
                short_metrics[key] = val

    short_metrics['min_qv'] = min_qv

    with open(outs.summary, 'w') as out_file:
        out_file.write(
            tenkit.safe_json.safe_jsonify(short_metrics, pretty=True))
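The core of the join is the metric grid: one metrics row per combination of ground-truth and call-set selectors. A self-contained sketch of the same pattern, with toy selectors only:

# Standalone illustration of the filter-grid pattern used above.
from collections import defaultdict
from itertools import product
import pandas as pd

metrics = defaultdict(list)
for tier, is_filtered, sv_type in product([1, 2], [True, False],
                                          ['DEL', 'INV', 'NA']):
    metrics['tier'].append(tier)
    metrics['call_filtered'].append(is_filtered)
    metrics['call_sv_type'].append(sv_type)
    # ...select the matching call/ground-truth subsets and append their
    # summary statistics here, as add_metrics does above.
metric_df = pd.DataFrame(metrics)   # 12 rows, one per combination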
Code example #7
def main(args, outs):
    sv_df = read_bedpes(args)
    sv_df = tk_sv_utils.get_dataframe_loc(
        sv_df, list(range(int(args.start_idx), int(args.stop_idx))))

    max_insert, ins_logsf_fun = tk_sv_utils.get_insert_size_info(
        args.insert_sizes)
    print >> sys.stderr, 'max insert', max_insert

    if max_insert is None:
        tk_sv_io.write_sv_df_to_bedpe(None, outs.sv_calls)
        tk_sv_io.write_sv_df_to_bedpe(sv_df, outs.non_pass_sv_calls)
        return

    with open(args.basic_summary, 'r') as f:
        summary = json.load(f)

    chimera_rate_del = summary['far_chimera_rate']
    chimera_rate_inv = summary['far_chimera_rate'] + summary[
        'same_dir_chimera_rate']
    chimera_rate_dup = summary['far_chimera_rate'] + summary[
        'outward_dir_chimera_rate']
    chimera_rates = {
        tk_readpairs.DEL_STR: chimera_rate_del,
        tk_readpairs.INV_STR: chimera_rate_inv,
        tk_readpairs.TDUP_STR: chimera_rate_dup,
        tk_readpairs.TRANS_STR: summary['far_chimera_rate']
    }

    in_bam = tk_bam.create_bam_infile(args.possorted_bam)
    frag_phasing = tk_tabix.create_tabix_infile(args.fragment_phasing)

    pass_calls = []
    non_pass_calls = []

    for i, (_, row) in enumerate(sv_df.iterrows()):
        sv_type = tk_sv_io.get_sv_type(row.info)

        middle = int(0.5 * (row.stop1 + row.start2))

        # Bail out on all non-deletions
        if sv_type != tk_readpairs.DEL_STR:
            continue

        if row.chrom1 == row.chrom2:
            r1 = (max(0, row.start1 - args.break_pad),
                  min(middle, row.stop1 + args.break_pad))
            r2 = (max(middle,
                      row.start2 - args.break_pad), row.stop2 + args.break_pad)

            if row.start2 - row.stop1 > 4 * args.break_pad:
                starts = [r1[0], r2[0]]
                stops = [r1[1], r2[1]]
                chroms = [row.chrom1, row.chrom2]
            else:
                starts = [r1[0]]
                stops = [r2[1]]
                chroms = [row.chrom1]
        else:
            r1 = (max(0,
                      row.start1 - args.break_pad), row.stop1 + args.break_pad)
            r2 = (max(0,
                      row.start2 - args.break_pad), row.stop2 + args.break_pad)
            starts = [r1[0], r2[0]]
            stops = [r1[1], r2[1]]
            chroms = [row.chrom1, row.chrom2]

        bc_cov1 = len(get_frag_coverage(frag_phasing, row.chrom1, r1[0],
                                        r1[1]))
        bc_cov2 = len(get_frag_coverage(frag_phasing, row.chrom2, r2[0],
                                        r2[1]))
        if sv_type == tk_readpairs.DEL_STR and max(bc_cov1,
                                                   bc_cov2) > MAX_DEL_BC_DEPTH:
            print >> sys.stderr, 'Too many barcodes in DEL candidate', row.chrom1, row.start1, row.stop2
            continue

        readpairs = tk_readpairs.get_readpairs(in_bam,
                                               chroms,
                                               starts,
                                               stops,
                                               max_insert=max_insert,
                                               min_mapq=args.min_mapq)

        normal_readpairs = [
            rp for rp in readpairs if rp.sv_type == tk_readpairs.NORMAL_STR
        ]
        if len(normal_readpairs) > MAX_DEL_READPAIRS:
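            # Downsample; note that np.random.choice samples with
            # replacement by default, so duplicates are possible.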
            sel = np.random.choice(len(normal_readpairs), MAX_DEL_READPAIRS)
        else:
            sel = np.arange(len(normal_readpairs))
        normal_readpairs = [normal_readpairs[ridx] for ridx in sel]

        # Distal readpairs across the breakpoints
        dist_readpairs = [
            rp for rp in readpairs if rp.sv_type == sv_type and (
                (tk_readpairs.pos_overlaps(rp.read1.pos, r1)
                 and tk_readpairs.pos_overlaps(rp.read2.pos, r2)) or
                (tk_readpairs.pos_overlaps(rp.read1.pos, r2)
                 and tk_readpairs.pos_overlaps(rp.read2.pos, r1)))
        ]
        if len(dist_readpairs) > MAX_DEL_READPAIRS:
            sel = np.random.choice(len(dist_readpairs), MAX_DEL_READPAIRS)
        else:
            sel = np.arange(len(dist_readpairs))
        dist_readpairs = [dist_readpairs[ridx] for ridx in sel]

        dist_readpairs.extend(normal_readpairs)
        if sv_type == tk_readpairs.DEL_STR and len(starts) == 2:
            more_readpairs = tk_readpairs.get_readpairs(in_bam, [row.chrom1],
                                                        [r1[1] + 1],
                                                        [r2[0] - 1],
                                                        max_insert=max_insert,
                                                        min_mapq=args.min_mapq,
                                                        normal_only=True)
            if len(more_readpairs) > MAX_DEL_READPAIRS:
                sel = np.random.choice(len(more_readpairs), MAX_DEL_READPAIRS)
            else:
                sel = np.arange(len(more_readpairs))
            dist_readpairs.extend([
                more_readpairs[ridx] for ridx in sel
                if more_readpairs[ridx].sv_type == tk_readpairs.NORMAL_STR
            ])

        readpairs = sorted(dist_readpairs, key=lambda x: x.barcode)
        read_groups = {}
        # groupby only merges adjacent equal keys, so iterate over the
        # barcode-sorted list rather than the unsorted dist_readpairs.
        for bc, read_group_iter in groupby(readpairs,
                                           lambda x: x.barcode):
            read_groups[bc] = list(read_group_iter)

        bc_set = set(read_groups.keys())
        bc_list = sorted(read_groups.keys())
        phase_set1 = tk_sv_utils.get_phase_set(frag_phasing, row.chrom1, r1[0],
                                               r1[1])
        phase_set2 = tk_sv_utils.get_phase_set(frag_phasing, row.chrom2, r2[0],
                                               r2[1])

        if len(bc_list) < 1:
            print >> sys.stderr, 'Not enough barcodes. Skipping'
            continue

        bc_phase_sets1 = tk_sv_utils.get_barcode_phase_probs(frag_phasing,
                                                             row.chrom1,
                                                             r1[0],
                                                             r1[1],
                                                             bc_set,
                                                             in_ps=phase_set1)
        bc_phase_sets2 = tk_sv_utils.get_barcode_phase_probs(frag_phasing,
                                                             row.chrom2,
                                                             r2[0],
                                                             r2[1],
                                                             bc_set,
                                                             in_ps=phase_set2)

        cand_breaks1 = np.arange(r1[0], r1[1] + 1, 5)
        cand_breaks2 = np.arange(r2[0], r2[1] + 1, 5)

        res = tk_readpairs.eval_sv_em(read_groups,
                                      cand_breaks1,
                                      cand_breaks2,
                                      sv_type,
                                      chimera_rates,
                                      phase_set1,
                                      phase_set2,
                                      bc_phase_sets1,
                                      bc_phase_sets2,
                                      max_insert,
                                      ins_logsf_fun,
                                      em_iters=args.em_iters)

        ((no_sv_max, sv_max, het_sv_max), max_locus, zygosity, max_hap,
         prior_hap_probs, hap_probs, support) = res

        lr = sv_max - no_sv_max if max_hap is None else het_sv_max - no_sv_max

        hap_probs1 = hap_probs[:, 0:2]
        hap_probs2 = hap_probs[:, 2:]

        new_call = sv_call.SvCall.from_em_results(
            row.chrom1, row.chrom2, phase_set1, phase_set2,
            (no_sv_max, sv_max, het_sv_max), max_locus,
            sv_call._SvType(sv_type, ('.', '.')), zygosity, max_hap, support,
            (hap_probs1, hap_probs2, None))

        # the break interval is inclusive
        if lr >= args.min_lr and new_call.qual >= args.min_qv and new_call.break2[
                0] - new_call.break1[1] + 1 >= args.min_sv_len:
            pass_calls.append(new_call)
        else:
            # Leave breakpoints unchanged
            new_call.break1 = (row.start1, row.stop1)
            new_call.break2 = (row.start2, row.stop2)
            non_pass_calls.append(new_call)

    out_df = sv_call.SvCall.svs_to_dataframe(pass_calls)
    tk_sv_io.write_sv_df_to_bedpe(out_df, outs.sv_calls)

    out_df = sv_call.SvCall.svs_to_dataframe(non_pass_calls)
    tk_sv_io.write_sv_df_to_bedpe(out_df, outs.non_pass_sv_calls)
    in_bam.close()
    frag_phasing.close()
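Because read_groups is built with itertools.groupby, the barcode sort above it is essential: groupby only merges adjacent items with equal keys. A standalone illustration:

# Minimal sort-then-groupby example with a stand-in readpair record.
from collections import namedtuple
from itertools import groupby

RP = namedtuple('RP', ['barcode', 'pos'])
pairs = [RP('ACG', 7), RP('AAT', 3), RP('ACG', 5)]
pairs = sorted(pairs, key=lambda rp: rp.barcode)
read_groups = dict((bc, list(grp))
                   for bc, grp in groupby(pairs, key=lambda rp: rp.barcode))
# read_groups == {'AAT': [RP(barcode='AAT', pos=3)],
#                 'ACG': [RP(barcode='ACG', pos=7), RP(barcode='ACG', pos=5)]}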
Code example #8
File: __init__.py Project: umccr/longranger
def main(args, outs):

    bedpe_df = tk_sv_io.read_sv_bedpe_to_df(args.sv_variants)
    bedpe_df = tk_sv_utils.get_dataframe_loc(
        bedpe_df, list(range(int(args.start_idx), int(args.stop_idx))))

    max_insert, ins_logsf_fun = tk_sv_utils.get_insert_size_info(
        args.insert_sizes)
    if max_insert is None:
        martian.throw('No Q60 reads')

    with open(args.basic_summary, 'r') as f:
        summary = json.load(f)

    chimera_rate_del = summary['far_chimera_rate']
    chimera_rate_inv = summary['far_chimera_rate'] + summary[
        'same_dir_chimera_rate']
    chimera_rate_trans = summary['far_chimera_rate']
    chimera_rate_dup = summary['far_chimera_rate'] + summary[
        'outward_dir_chimera_rate']

    chimera_rates = {
        tk_readpairs.DEL_STR: chimera_rate_del,
        tk_readpairs.INV_STR: chimera_rate_inv,
        tk_readpairs.TDUP_STR: chimera_rate_dup,
        tk_readpairs.TRANS_FF_STR: chimera_rate_trans,
        tk_readpairs.TRANS_FR_STR: chimera_rate_trans,
        tk_readpairs.TRANS_RR_STR: chimera_rate_trans,
        tk_readpairs.TRANS_RF_STR: chimera_rate_trans
    }

    in_bam = tk_bam.create_bam_infile(args.input)

    out_quals = []
    out_infos = []
    out_chroms1 = []
    out_starts1 = []
    out_stops1 = []
    out_chroms2 = []
    out_starts2 = []
    out_stops2 = []

    for i, (_, row) in enumerate(bedpe_df.iterrows()):
        in_svt = tk_sv_io.get_sv_type(row.info)

        if row.chrom1 == row.chrom2:
            max_ext = min(args.break_extend, int(
                (row.start2 - row.stop1) / 3.0))
            r1 = (max(0, row.start1 - args.break_extend), row.stop1 + max_ext)
            r2 = (max(0, row.start2 - max_ext), row.stop2 + args.break_extend)
            if r1[1] > r2[0]:
                starts = [r1[0]]
                stops = [r2[1]]
                chroms = [row.chrom1]
            else:
                starts = [r1[0], r2[0]]
                stops = [r1[1], r2[1]]
                chroms = [row.chrom1, row.chrom2]
        else:
            r1 = (max(0, row.start1 - args.break_extend),
                  row.stop1 + args.break_extend)
            r2 = (max(0, row.start2 - args.break_extend),
                  row.stop2 + args.break_extend)
            starts = [r1[0], r2[0]]
            stops = [r1[1], r2[1]]
            chroms = [row.chrom1, row.chrom2]

        readpairs = tk_readpairs.get_readpairs2(in_bam,
                                                chroms,
                                                starts,
                                                stops,
                                                max_insert=max_insert,
                                                min_mapq=args.min_mapq)

        # Distal readpairs across the breakpoints
        dist_readpairs = filter(
            filter_fun(in_bam, row.chrom1, row.chrom2, r1, r2), readpairs)

        if len(dist_readpairs) > MAX_READPAIRS:
            sel = np.random.choice(len(dist_readpairs), MAX_READPAIRS)
        else:
            sel = np.arange(len(dist_readpairs))
        dist_readpairs = [dist_readpairs[ridx] for ridx in sel]

        res_arr = tk_readpairs.get_sv_lr(dist_readpairs, ins_logsf_fun,
                                         max_insert, chimera_rates)

        if len(res_arr) == 0:
            out_quals.append(row.qual)
            out_chroms1.append(row.chrom1)
            out_starts1.append(row.start1)
            out_stops1.append(row.stop1)
            out_chroms2.append(row.chrom2)
            out_starts2.append(row.start2)
            out_stops2.append(row.stop2)
            out_infos.append(row['info'])
        else:
            if args.best_only:
                res_arr = sorted(res_arr, key=lambda x: x[0], reverse=True)
                res_arr = [res_arr[0]]

            for (lr, num_split, num_pairs, sv_len, support_range, svt,
                 support_readpairs) in res_arr:
                range1, range2 = support_range
                if num_split + num_pairs >= args.min_reads_to_call and lr >= args.min_lr_to_call and not range1 is None and not range2 is None:
                    out_quals.append(row.qual + args.rp_lr_multiplier * lr)
                    out_chroms1.append(row.chrom1)
                    out_starts1.append(range1[0])
                    out_stops1.append(range1[1])
                    out_chroms2.append(row.chrom2)
                    out_starts2.append(range2[0])
                    out_stops2.append(range2[1])
                    if svt != in_svt and in_svt != 'TRANS':
                        in_svt = 'UNK'
                else:
                    out_quals.append(row.qual)
                    out_chroms1.append(row.chrom1)
                    out_starts1.append(row.start1)
                    out_stops1.append(row.stop1)
                    out_chroms2.append(row.chrom2)
                    out_starts2.append(row.start2)
                    out_stops2.append(row.stop2)

                out_infos.append(
                    tk_sv_io.update_info(
                        row['info'],
                        ['NPAIRS', 'NSPLIT', 'RP_LR', 'RP_TYPE', 'TYPE'],
                        [num_pairs, num_split, lr, svt, in_svt]))

    in_bam.close()

    if args.best_only:
        out_names = [n for n in bedpe_df['name']]
    else:
        out_names = np.arange(len(bedpe_df))

    out_df = tk_sv_io.create_sv_df(out_chroms1, out_starts1, out_stops1,
                                   out_chroms2, out_starts2, out_stops2,
                                   out_names, out_quals, out_infos)
    tk_sv_io.write_sv_df_to_bedpe(out_df, outs.sv_variants)
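A hedged round trip through the INFO helpers; the KEY=VALUE layout is an assumption consistent with the extract_sv_info and update_info call shapes in this listing, and extract_sv_info appears to return strings ('' when a key is missing, given the allelic_frac != '' checks in add_filters):

# Assumed INFO-string behavior; not confirmed beyond the call shapes above.
info = 'TYPE=DEL;NPAIRS=0;NSPLIT=0'
info = tk_sv_io.update_info(info, ['NPAIRS', 'NSPLIT', 'RP_LR'], [12, 3, 45.2])
npairs = tk_sv_io.extract_sv_info(info, ['NPAIRS'])[0]   # -> '12' (a string)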