Example 1
def add_filters(pred_df, pred_to_match, black_dists1, black_dists2,
                black_names1, black_names2, seg_dup_calls, all_bad_regions,
                args):

    if args.targets is not None:
        min_call_qv = args.min_call_qv_target
    else:
        min_call_qv = args.min_call_qv_wgs

    if args.coverage is None:
        # used for WGS
        max_bc_cov = SV_DEFAULT_MAX_BC_COV
        bc_mean_depth = 200
    else:
        # used for exome
        with open(args.coverage, 'r') as f:
            cov_res = json.load(f)
        bc_summary_depth_info = cov_res['summary_bc_depth_info']
        bc_mean_depth, _, _ = get_depth_info_json(bc_summary_depth_info)
        max_bc_cov = args.max_bc_cov_factor * bc_mean_depth

    if args.keep_filters:
        filter_strs = [s for s in pred_df.filters]
    else:
        filter_strs = ['.' for i in range(len(pred_df))]

    info_strs = [s for s in pred_df['info']]
    rps = np.zeros((len(pred_df), ), dtype=int)

    def get_cov_frac(black_regions, chrom, start, stop):
        regions = tk_sv_utils.strictly_overlapping_regions(
            black_regions, chrom, start, stop)
        tot_black = np.sum([r[1] - r[0] for r in regions])
        tot_len = float(stop - start)
        black_frac = tk_stats.robust_divide(tot_black, tot_len)
        return black_frac

    for i, (_, row) in enumerate(pred_df.iterrows()):
        npairs = tk_sv_io.get_npairs(row['info'])
        nsplit = tk_sv_io.get_nsplit(row['info'])
        rps[i] = npairs + nsplit
        sv_type = tk_sv_io.get_sv_type(row['info'])
        name = row['name']
        qual = row.qual

        ####### Filtering for read-pair calls #######
        frac_on_hap = tk_sv_io.extract_sv_info(row.info,
                                               ['FRAC_HAP_SUPPORT'])[0]
        allelic_frac = tk_sv_io.extract_sv_info(row.info,
                                                ['HAP_ALLELIC_FRAC'])[0]
        if allelic_frac != '':
            allelic_frac = float(allelic_frac)

        if args.is_germline is None:
            if qual < min_call_qv:
                filter_strs[i] = tk_sv_io.update_filters(
                    filter_strs[i], 'LOWQ', None)
            if args.min_allelic_frac is not None and frac_on_hap is not None \
               and frac_on_hap != '' and float(frac_on_hap) < args.min_allelic_frac:
                filter_strs[i] = tk_sv_io.update_filters(
                    filter_strs[i], 'LOWQ', None)
            if args.min_allelic_frac is not None and allelic_frac != '' and \
               float(allelic_frac) < args.min_allelic_frac:
                filter_strs[i] = tk_sv_io.update_filters(
                    filter_strs[i], 'LOWQ', None)
        elif args.targets is None:
            if args.is_germline:
                martian.log_info('Mean barcode depth {}'.format(bc_mean_depth))

                min_call_qv = max(min_call_qv, bc_mean_depth / 10.0)

                martian.log_info(
                    'Support cutoff: {} barcodes'.format(min_call_qv))

                enough_bcs = qual >= min_call_qv
                # Guard against an empty allelic fraction before comparing.
                is_good = allelic_frac != '' and (
                    allelic_frac > 0.8 or
                    (sv_type == 'INV' and allelic_frac > 0.6))
                is_good = is_good and enough_bcs
                if not is_good:
                    filter_strs[i] = tk_sv_io.update_filters(
                        filter_strs[i], 'LOWQ', None)
            else:
                min_call_qv = max(min_call_qv, 4)
                is_good = (allelic_frac != '' and allelic_frac > 0.6
                           and qual >= min_call_qv)
                if not is_good:
                    filter_strs[i] = tk_sv_io.update_filters(
                        filter_strs[i], 'LOWQ', None)
        else:
            if args.is_germline:
                # Harder to get confident support in Exome
                min_call_qv = max(min_call_qv, bc_mean_depth / 10.0)
                martian.log_info(
                    'Support cutoff: {} barcodes'.format(min_call_qv))
                # Apply a very lenient filter on allelic fraction because lots of barcodes can be unphased
                is_good = (qual >= min_call_qv and allelic_frac != ''
                           and allelic_frac > 0.05)
                af = tk_sv_io.extract_sv_info(row.info, ['ALLELIC_FRAC'])[0]
                if af != '':
                    af = float(af)
                is_good = is_good and af != '' and af > 0.04
            else:
                min_call_qv = max(min_call_qv, 4)
                is_good = qual >= min_call_qv

            if not is_good:
                filter_strs[i] = tk_sv_io.update_filters(
                    filter_strs[i], 'LOWQ', None)

        if black_dists1 is not None:
            chrom1, chrom2 = row.chrom1, row.chrom2
            black_dist1, black_dist2 = black_dists1[i], black_dists2[i]

            if chrom1 == chrom2:
                if chrom1 in all_bad_regions:
                    black_frac = get_cov_frac(all_bad_regions, chrom1,
                                              row.stop1, row.start2)
                else:
                    black_frac = 0.0
            else:
                black_frac = float('NaN')
        else:
            black_dist1 = np.inf
            black_dist2 = np.inf
            black_frac = float('NaN')
        filter_strs[i] = tk_sv_io.update_filters(filter_strs[i], 'BLACK_DIST',
                                                 min(black_dist1, black_dist2),
                                                 args.min_dist_from_black)
        filter_strs[i] = tk_sv_io.update_filters(filter_strs[i], 'BLACK_FRAC',
                                                 black_frac, 0,
                                                 args.max_frac_black)

        bname1 = '.'
        bname2 = '.'
        if black_dist1 < args.min_dist_from_black or re.search(
                'BLACK_FRAC', filter_strs[i]):
            bname1 = black_names1[i]
        if black_dist2 < args.min_dist_from_black or re.search(
                'BLACK_FRAC', filter_strs[i]):
            bname2 = black_names2[i]

        if name in seg_dup_calls:
            filter_strs[i] = tk_sv_io.update_filters(filter_strs[i], 'SEG_DUP',
                                                     None)
            seg_dup_match = ','.join(list(seg_dup_calls[name]))
        else:
            seg_dup_match = '.'

        nbcs1 = tk_sv_io.get_nbcs1(row.info)
        nbcs2 = tk_sv_io.get_nbcs2(row.info)
        if nbcs1 is not None and nbcs2 is not None and (nbcs1 > max_bc_cov
                                                        or nbcs2 > max_bc_cov):
            filter_strs[i] = tk_sv_io.update_filters(filter_strs[i],
                                                     'HIGH_BC_COV', None)

        filter_strs[i] = tk_sv_io.update_filters(filter_strs[i],
                                                 'READ_SUPPORT',
                                                 npairs + nsplit,
                                                 min_val=args.min_read_support)

        match_str = ','.join([str(s) for s in pred_to_match.get(name, '.')])

        if args.targets is not None:
            # Disable orientation reporting in exome
            info_strs[i] = tk_sv_io.update_info(info_strs[i], ['ORIENT'],
                                                [None])

        info_strs[i] = tk_sv_io.update_info(info_strs[i], [
            'BLACK_DIST1', 'BLACK_DIST2', 'BLACK_FRAC', 'BLACK1', 'BLACK2',
            'MATCHES', 'SEG_DUP'
        ], [
            black_dist1, black_dist2, black_frac, bname1, bname2, match_str,
            seg_dup_match
        ])

    pred_df['filters'] = filter_strs
    pred_df['info'] = info_strs
    pred_df['read_support'] = rps

    return pred_df, min_call_qv
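
The filters above accumulate in a semicolon-joined FILTER string, with '.' meaning "no filters". The helper below is a minimal, self-contained sketch of the convention that tk_sv_io.update_filters appears to follow (an assumption for illustration, not the actual implementation): a value of None applies the filter unconditionally, and otherwise the filter fires when the value falls outside the given cutoffs.

# Hypothetical stand-in for tk_sv_io.update_filters, shown only to make the
# filter-string convention used by add_filters explicit.
def update_filters_sketch(filter_str, name, value, min_val=None, max_val=None):
    failed = (value is None  # unconditional filters (e.g. 'LOWQ') pass None
              or (min_val is not None and value < min_val)
              or (max_val is not None and value > max_val))
    if not failed:
        return filter_str
    return name if filter_str == '.' else filter_str + ';' + name

assert update_filters_sketch('.', 'LOWQ', None) == 'LOWQ'
assert update_filters_sketch('LOWQ', 'READ_SUPPORT', 1, min_val=2) == \
    'LOWQ;READ_SUPPORT'
assert update_filters_sketch('.', 'BLACK_FRAC', 0.1, min_val=0, max_val=0.5) == '.'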
Example 2
def join(args, outs, chunk_defs, chunk_outs):
    pred_to_match, _, pred_df, true_df, min_qv = merge_predictions(chunk_outs)

    # Change TRANS type to DISTAL. This change only affects the reported
    # type, not the names of the metrics.
    new_info = []
    for _, row in pred_df.iterrows():
        sv_type = tk_sv_io.get_sv_type(row.info)
        if sv_type == 'TRANS':
            sv_type = 'DISTAL'
        new_info.append(tk_sv_io.update_info(row.info, ['TYPE'], [sv_type]))
    pred_df['info'] = new_info

    if true_df is not None:
        true_df.to_csv(outs.feasible_gt,
                       index=False,
                       header=True,
                       sep='\t',
                       na_rep='NaN')

    ##### Write BEDPE/VCF outputs
    tk_sv_io.write_sv_df_to_bedpe(pred_df, outs.sv_candidates)
    source_str = '10X/pipelines/stages/analyze_sv_calls {}'.format(
        martian.get_pipelines_version())
    sample_id = 'sample' if args.sample_id is None else args.sample_id
    # str.strip('.gz') strips any leading/trailing '.', 'g', or 'z' characters
    # rather than the suffix, so remove the extension explicitly.
    vcf_path = outs.svs[:-len('.gz')] if outs.svs.endswith('.gz') else outs.svs
    tk_sv_io.bedpe_to_vcf(outs.sv_candidates, vcf_path, sample_id,
                          source_str, args.reference_path)
    # this will sort and gzip
    tk_sv_io.index_sv_vcf(vcf_path)
    outs.svs_index = outs.svs + '.tbi'
    # delete the non-gzipped file
    os.remove(vcf_path)

    if not pred_df.empty:
        call_df = pred_df[np.logical_or(pred_df['filters'] == '.',
                                        pred_df['filters'] == 'PASS')]
    else:
        call_df = None
    tk_sv_io.write_sv_df_to_bedpe(call_df, outs.sv_calls)

    # Annotate each call with the matching ground truth svs. The resulting
    # dataframe might have multiple rows for the same call if there were multiple
    # matching ground truth svs.
    martian.log_info("merging calls and gt")
    if not pred_df.empty:
        pred_df = merge_calls_and_gt(pred_df, true_df, pred_to_match)

    martian.log_info("writing call_tsv")
    pred_df.to_csv(outs.call_tsv,
                   index=False,
                   header=True,
                   sep='\t',
                   na_rep='NaN')

    pred_df = pred_df[np.logical_not(pd.isnull(pred_df['name']))]

    max_dists = sorted(np.array(args.detect_dists))

    gt_sv_types = get_all_sv_types(true_df)
    call_sv_types = get_all_sv_types(pred_df)

    if true_df is not None:
        # Use the default MAX_PPV_TIER unless this is greater than the maximum tier
        # present in the data.
        max_ppv_tier = min(MAX_PPV_TIER, np.max(true_df.tier))
        # Use the default unless this is smaller than the minimum tier present in
        # the data.
        max_sens_tier = max(MAX_SENS_TIER, np.min(true_df.tier))
    else:
        max_ppv_tier = 1
        max_sens_tier = 1

    tiers = [max_ppv_tier, max_sens_tier]

    # All combinations of filters in ground truth and call set
    if args.targets is not None and args.target_dists is not None:
        target_dists = list(sorted(np.array(args.target_dists,
                                            dtype=float)))
        target_dists.append(float('NaN'))
    else:
        target_dists = [float('NaN')]

    combs = product([0, 1, 2, None], target_dists, gt_sv_types, tiers,
                    [True, False], call_sv_types, max_dists)

    metrics = defaultdict(list)

    gt_filters = ['genic_breaks', 'target_dist', 'gt_sv_type', 'tier']
    call_filters = ['call_filtered', 'call_sv_type', 'match_dist']

    for (genic_breaks, tdist, gt_sv_type, tier, is_filtered, call_sv_type,
         dist) in combs:
        if gt_sv_type != 'NA' and call_sv_type != 'NA' and gt_sv_type != call_sv_type:
            continue

        metrics['genic_breaks'].append(genic_breaks)
        metrics['target_dist'].append(tdist)
        metrics['gt_sv_type'].append(gt_sv_type)
        metrics['tier'].append(tier)
        metrics['call_filtered'].append(is_filtered)
        metrics['call_sv_type'].append(call_sv_type)
        metrics['match_dist'].append(dist)

        if true_df is None:
            sel_true_df = None
        else:
            sel_true_df = true_df
            if gt_sv_type != 'NA':
                sel_true_df = sel_true_df[sel_true_df.sv_type == gt_sv_type]
            if not np.isnan(tdist):
                sel_true_df = sel_true_df[sel_true_df.targ_dist <= tdist]
            sel_true_df = sel_true_df[sel_true_df.tier <= tier]
            # Restrict to genic or non-genic or take everything if this is None.
            if genic_breaks is not None:
                sel_true_df = sel_true_df[sel_true_df.genic_breaks ==
                                          genic_breaks]

            if len(sel_true_df) == 0:
                sel_true_df = None

        sel_pred_df = pred_df

        if is_filtered and not pred_df.empty:
            sel_pred_df = sel_pred_df[(sel_pred_df.filters == '.') |
                                      (sel_pred_df.filters == 'PASS')]
        if call_sv_type != 'NA' and not pred_df.empty:
            sel_pred_df = sel_pred_df[sel_pred_df.sv_type == call_sv_type]
        if not pred_df.empty and (args.min_rel_overlap is None
                                  or args.min_rel_overlap == 0):
            # Do not apply this filter if the matching is done based on overlap.
            sel_pred_df = sel_pred_df[np.logical_or(
                np.isnan(sel_pred_df.match_dist),
                sel_pred_df.match_dist <= dist)]

        add_metrics(sel_pred_df, sel_true_df, metrics)

    column_names = list(gt_filters)
    column_names.extend(call_filters)
    other_names = set(metrics.keys()).difference(set(column_names))
    column_names.extend(other_names)

    metric_df = pd.DataFrame(metrics)
    metric_df = metric_df[column_names]

    martian.log_info("writing summary tsv")
    metric_df.to_csv(outs.summary_tsv,
                     index=False,
                     header=True,
                     sep='\t',
                     na_rep='NaN')

    short_metrics = get_short_metrics(metric_df, other_names, max_ppv_tier,
                                      max_sens_tier, args)

    if args.call_summary is not None:
        with open(args.call_summary, 'r') as in_summary_file:
            in_summary = json.load(in_summary_file)
            for key, val in in_summary.items():
                short_metrics[key] = val

    short_metrics['min_qv'] = min_qv

    with open(outs.summary, 'w') as out_file:
        out_file.write(
            tenkit.safe_json.safe_jsonify(short_metrics, pretty=True))
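
The metric grid above is the cross product of all ground-truth and call-set filter settings, pruned so that two concrete but different SV types never pair up ('NA' acts as a wildcard). A tiny self-contained sketch of that pruning rule, with made-up type lists:

from itertools import product

gt_sv_types = ['NA', 'DEL', 'INV']   # toy stand-ins for get_all_sv_types output
call_sv_types = ['NA', 'DEL']

kept = [(g, c) for g, c in product(gt_sv_types, call_sv_types)
        if g == 'NA' or c == 'NA' or g == c]
# ('INV', 'DEL') is dropped, mirroring the `continue` in the loop above.
print(kept)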
Example 3
def merge_breaks(bedpe_df,
                 out_bedpe,
                 merge_win=10000,
                 max_range=np.inf,
                 max_nmates=np.inf,
                 cluster_qual_factor=0.2):
    """Merges a set of SVs into a non-redundant set.
    Args:
    - bedpe_df: Either a bedpe file or a DataFrame like the one returned by
    tk_sv_io.read_sv_bedpe_to_df.
    - out_bedpe: Path to file where output will be written.
    - merge_win: Breakpoints will be merged if they are within this distance from
    each other. Two SVs will be merged if both their breakpoints can be merged.
    - max_range: See max_range field of cluster_loci.
    - max_nmates: Two extra info fields will be added to the output BEDPE, NMATES1,
    and NMATES2. NMATES1 is the number of mate breakpoints (after merging, so
    breakpoint clusters), of the first breakpoint of an SV.
    SVs whose breakpoints both exceed the max_nmates cutoff will not be included in the
    output.

    Return value:
    The output BEDPE.
    """
    if not isinstance(bedpe_df, pd.DataFrame):
        bedpe_df = tk_sv_io.read_sv_bedpe_to_df(bedpe_df)
    else:
        bedpe_df = pd.DataFrame(bedpe_df)
    breaks = []
    for i in range(bedpe_df.shape[0]):
        breaks.append((bedpe_df.iloc[i, 0], bedpe_df.iloc[i, 1],
                       bedpe_df.iloc[i, 2], (bedpe_df.iloc[i, 6], 1)))
        breaks.append((bedpe_df.iloc[i, 3], bedpe_df.iloc[i, 4],
                       bedpe_df.iloc[i, 5], (bedpe_df.iloc[i, 6], 2)))
    _, mem_to_cluster, _ = cluster_loci(breaks, merge_win, max_range=max_range)

    cluster_pairs = {}
    for i in range(bedpe_df.shape[0]):
        name = bedpe_df.iloc[i]['name']
        cluster_idx1 = mem_to_cluster[(name, 1)]
        cluster_idx2 = mem_to_cluster[(name, 2)]
        if (cluster_idx1, cluster_idx2) not in cluster_pairs:
            cluster_pairs[(cluster_idx1, cluster_idx2)] = [i]
        else:
            old_pair = cluster_pairs[(cluster_idx1, cluster_idx2)][0]
            # Make sure the old and the new pair have breaks on the same chromosomes
            assert (bedpe_df.iloc[old_pair, 0] == bedpe_df.iloc[i, 0])
            assert (bedpe_df.iloc[old_pair, 3] == bedpe_df.iloc[i, 3])
            cluster_pairs[(cluster_idx1, cluster_idx2)].append(i)

    new_cluster_pairs = {}
    cluster_dist_ratio = {}
    for p, pos_list in cluster_pairs.items():
        pos_arr = np.array(pos_list)
        tmp_df = get_dataframe_loc(bedpe_df, pos_arr)
        quals = np.array(tmp_df.qual)
        best_call = pos_arr[np.argmax(quals)]
        close_calls = np.where(quals >= cluster_qual_factor * np.max(quals))[0]
        close_df = get_dataframe_loc(tmp_df, close_calls)

        best_row = bedpe_df.iloc[best_call]
        same_chrom = best_row['chrom1'] == best_row['chrom2']
        min_break_dist = np.min(close_df.start2) - np.max(close_df.stop1)
        max_break_dist = best_row['start2'] - best_row['stop1']

        new_cluster_pairs[p] = best_call
        if not same_chrom or max_break_dist > MAX_FRAG_SIZE:
            cluster_dist_ratio[p] = '.'
        elif min_break_dist <= 0:
            cluster_dist_ratio[p] = float('NaN')
        else:
            cluster_dist_ratio[p] = float(max_break_dist) / min_break_dist

    cluster_pairs = new_cluster_pairs

    def clusters_close(i, j):
        row_i = bedpe_df.iloc[i]
        row_j = bedpe_df.iloc[j]
        dist1 = max(row_j['start1'] - row_i['stop1'],
                    row_i['start1'] - row_j['stop1'])
        dist2 = max(row_j['start2'] - row_i['stop2'],
                    row_i['start2'] - row_j['stop2'])
        return (row_i['chrom1'] == row_j['chrom1']
                and row_i['chrom2'] == row_j['chrom2']
                and dist1 <= merge_win and dist2 <= merge_win)

    # The "chain-breaking" in cluster_loci might still leave some redundancy.
    # In particular, we might leave some almost touching clusters that were
    # separated only because of chain-breaking. Do a second round of clustering
    # where you go through consecutive pairs of cluster and merge them if they're merge-able.
    new_cluster_pairs = {}
    for (cluster1, cluster2) in sorted(cluster_pairs.keys()):
        if cluster_pairs[(cluster1, cluster2)] == -1:
            continue
        # Consider all neighboring clusters after this cluster.
        # Notice that the cluster indices are sorted by genomic coordinates.
        neigh_clusters = [
            (cluster1, cluster2 + 1), (cluster1 + 1, cluster2 - 1),
            (cluster1 + 1, cluster2), (cluster1 + 1, cluster2 + 1)
        ]
        idx = cluster_pairs[(cluster1, cluster2)]
        # Best cluster among neighboring clusters
        max_cluster = ((cluster1, cluster2), idx)
        for next_cluster1, next_cluster2 in neigh_clusters:
            if (next_cluster1, next_cluster2) not in cluster_pairs:
                continue
            if cluster_pairs[(next_cluster1, next_cluster2)] == -1:
                continue
            next_idx = cluster_pairs[(next_cluster1, next_cluster2)]
            if clusters_close(idx, next_idx):
                cluster_pairs[(next_cluster1, next_cluster2)] = -1
                if bedpe_df.iloc[idx]['qual'] < bedpe_df.iloc[next_idx]['qual']:
                    max_cluster = ((next_cluster1, next_cluster2), next_idx)
        new_cluster_pairs[max_cluster[0]] = max_cluster[1]

    cluster_pairs = new_cluster_pairs

    # Now compute the number of mate breakpoints for each cluster
    num_mates = {}
    for (cluster1, cluster2) in cluster_pairs.keys():
        if cluster1 not in num_mates:
            num_mates[cluster1] = 0
        if cluster2 not in num_mates:
            num_mates[cluster2] = 0
        num_mates[cluster1] += 1
        if cluster2 != cluster1:
            num_mates[cluster2] += 1

    sel_loc = []
    new_info_strs = []
    for (cluster1, cluster2) in sorted(cluster_pairs.keys()):
        sv_loc = cluster_pairs[(cluster1, cluster2)]
        if (num_mates[cluster1] > max_nmates
                and num_mates[cluster2] > max_nmates):
            continue
        sel_loc.append(sv_loc)
        new_info_strs.append(
            tk_sv_io.update_info(bedpe_df.iloc[sv_loc]['info'],
                                 ['NMATES1', 'NMATES2', 'RESOLUTION'], [
                                     num_mates[cluster1], num_mates[cluster2],
                                     cluster_dist_ratio[(cluster1, cluster2)]
                                 ]))
    if len(sel_loc) > 0:
        bedpe_df = bedpe_df.iloc[sel_loc]
        bedpe_df['info'] = new_info_strs
    else:
        bedpe_df = pd.DataFrame(columns=bedpe_df.columns)
    if out_bedpe is not None:
        tk_sv_io.write_sv_df_to_bedpe(bedpe_df, out_bedpe)

    return bedpe_df
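
As a worked illustration of the merge rule from the docstring (two SVs merge only when both of their breakpoints are within merge_win of each other), here is a self-contained pairwise check; sv_mergeable is a hypothetical helper, since merge_breaks itself does this via cluster_loci rather than pairwise tests.

def sv_mergeable(sv_a, sv_b, merge_win=10000):
    # Each SV is (chrom1, pos1, chrom2, pos2).
    chrom1a, pos1a, chrom2a, pos2a = sv_a
    chrom1b, pos1b, chrom2b, pos2b = sv_b
    return (chrom1a == chrom1b and chrom2a == chrom2b
            and abs(pos1a - pos1b) <= merge_win
            and abs(pos2a - pos2b) <= merge_win)

assert sv_mergeable(('chr1', 1000, 'chr1', 50000),
                    ('chr1', 3000, 'chr1', 52000))
# Second breakpoints are 40kb apart, so these two calls stay separate.
assert not sv_mergeable(('chr1', 1000, 'chr1', 50000),
                        ('chr1', 3000, 'chr1', 90000))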
Example 4
def compare_multiple_breaks(in_bedpes,
                            sample_names,
                            out_bedpe,
                            merge_win=0,
                            max_range=np.inf):
    """Compares multiple BEDPE files.
    Args:
    - in_bedpes: A list of BEDPE files to compare.
    - sample_names: A list of the same size with unique names for the input samples.
    - out_bedpe: Where union BEDPE will be written.

    Return value:
    A DataFrame with the union of calls and information about which
    calls are present in which input files. This DataFrame will have one entry per call in the
    union and will include (among other things) columns <sample>_qual, <sample>_filtered,
    <sample>_correct, and <sample>_dist for each of the input BEDPEs.
    """

    assert (len(sample_names) == len(in_bedpes))

    # Merge all the input files. This will get rid of redundant entries.
    # The quality in the output will be the maximum quality across all files.
    merged_df = merge_multiple_breaks(in_bedpes,
                                      out_bedpe,
                                      merge_win=merge_win,
                                      max_range=max_range)
    num_merged = len(merged_df)

    # Map the name of each entry in the union to its index in the DataFrame.
    name_to_ind = {}
    for i, n in enumerate(merged_df['name']):
        name_to_ind[n] = i

    new_filters = [set() for _ in range(num_merged)]
    new_matches = [set() for _ in range(num_merged)]

    # For each of the input BEDPEs find which of the entries in the union it
    # overlaps. This is somewhat duplicated work, but it's simpler this way.
    for sample, bedpe in zip(sample_names, in_bedpes):
        in_df = tk_sv_io.read_sv_bedpe_to_df(bedpe)
        name_to_ind2 = {}
        for i, n in enumerate(in_df['name']):
            name_to_ind2[n] = i

        matched_qual = np.zeros((num_merged, ), dtype=int)
        is_correct = np.zeros((num_merged, ), dtype=bool)
        is_filtered = np.zeros((num_merged, ), dtype=bool)
        tmp_dist = np.zeros((num_merged, ), dtype=int)
        matched_names = ['' for _ in range(num_merged)]

        # merged_to_this will be a dictionary from a name in the union to a set
        # of names in the input bedpe
        merged_to_this, _, _ = compare_breaks(merged_df,
                                              bedpe,
                                              max_dist=merge_win)
        for name1, name2_set in merged_to_this.items():
            ind1 = name_to_ind[name1]
            matched_names[ind1] = ';'.join([str(s) for s in name2_set])
            for name2 in name2_set:
                ind2 = name_to_ind2[name2]
                rec = in_df.iloc[ind2]
                matched_qual[ind1] = max(matched_qual[ind1], rec['qual'])
                match = tk_sv_io.extract_sv_info(rec['info'], ['MATCHES'])[0]
                is_match_correct = (match is not None and match != '.'
                                    and match != '')
                if is_match_correct:
                    new_matches[ind1].add(match)
                    # Never set back to False if it was set to True.
                    is_correct[ind1] = True
                is_filtered[ind1] = rec['filters'] != '.'
                if rec['filters'] != '.':
                    new_filters[ind1] = new_filters[ind1].union(
                        set(rec['filters'].split(';')))
                if rec['chrom1'] != rec['chrom2']:
                    tmp_dist[ind1] = -1
                else:
                    tmp_dist[ind1] = rec['start2'] - rec['stop1']

        merged_df[str(sample) + '_matches'] = matched_names
        merged_df[str(sample) + '_qual'] = matched_qual
        merged_df[str(sample) + '_correct'] = is_correct
        merged_df[str(sample) + '_filtered'] = is_filtered
        merged_df[str(sample) + '_dist'] = tmp_dist

    info_strs = ['.' for i in range(num_merged)]
    filter_strs = ['.' for i in range(num_merged)]
    for i in range(num_merged):
        match_str = ','.join(
            new_matches[i]) if len(new_matches[i]) > 0 else '.'
        info_strs[i] = tk_sv_io.update_info('.', ['MATCHES'], [match_str])
        filter_strs[i] = ';'.join(
            new_filters[i]) if len(new_filters[i]) > 0 else '.'

    qual_cols = [str(s) + '_qual' for s in sample_names]
    merged_df['qual'] = merged_df[qual_cols].max(axis=1).astype(int)
    merged_df['filters'] = filter_strs
    merged_df['info'] = info_strs
    merged_df.sort_values(
        ['qual', 'chrom1', 'start1', 'stop1', 'chrom2', 'start2', 'stop2'],
        ascending=[False, True, True, True, True, True, True],
        inplace=True)

    return merged_df
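
A small self-contained pandas sketch of the per-sample columns described in the docstring, and of the final max-quality aggregation (sample names and quality values are made up):

import pandas as pd

merged = pd.DataFrame({'name': ['call1', 'call2']})
for sample, quals in [('s1', [30, 0]), ('s2', [25, 40])]:
    merged[sample + '_qual'] = quals

# The union quality is the per-call maximum across samples, as above.
qual_cols = [s + '_qual' for s in ('s1', 's2')]
merged['qual'] = merged[qual_cols].max(axis=1).astype(int)
print(merged)  # call1 -> qual 30, call2 -> qual 40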
Example 5
def main(args, outs):

    bedpe_df = tk_sv_io.read_sv_bedpe_to_df(args.sv_variants)
    bedpe_df = tk_sv_utils.get_dataframe_loc(
        bedpe_df, list(range(int(args.start_idx), int(args.stop_idx))))

    max_insert, ins_logsf_fun = tk_sv_utils.get_insert_size_info(
        args.insert_sizes)
    if max_insert is None:
        martian.throw('No Q60 reads')

    with open(args.basic_summary, 'r') as f:
        summary = json.load(f)

    chimera_rate_del = summary['far_chimera_rate']
    chimera_rate_inv = summary['far_chimera_rate'] + summary[
        'same_dir_chimera_rate']
    chimera_rate_trans = summary['far_chimera_rate']
    chimera_rate_dup = summary['far_chimera_rate'] + summary[
        'outward_dir_chimera_rate']

    chimera_rates = {
        tk_readpairs.DEL_STR: chimera_rate_del,
        tk_readpairs.INV_STR: chimera_rate_inv,
        tk_readpairs.TDUP_STR: chimera_rate_dup,
        tk_readpairs.TRANS_FF_STR: chimera_rate_trans,
        tk_readpairs.TRANS_FR_STR: chimera_rate_trans,
        tk_readpairs.TRANS_RR_STR: chimera_rate_trans,
        tk_readpairs.TRANS_RF_STR: chimera_rate_trans
    }

    in_bam = tk_bam.create_bam_infile(args.input)

    out_quals = []
    out_infos = []
    out_chroms1 = []
    out_starts1 = []
    out_stops1 = []
    out_chroms2 = []
    out_starts2 = []
    out_stops2 = []

    for i, (_, row) in enumerate(bedpe_df.iterrows()):
        in_svt = tk_sv_io.get_sv_type(row.info)

        if row.chrom1 == row.chrom2:
            max_ext = min(args.break_extend, int(
                (row.start2 - row.stop1) / 3.0))
            r1 = (max(0, row.start1 - args.break_extend), row.stop1 + max_ext)
            r2 = (max(0, row.start2 - max_ext), row.stop2 + args.break_extend)
            if r1[1] > r2[0]:
                starts = [r1[0]]
                stops = [r2[1]]
                chroms = [row.chrom1]
            else:
                starts = [r1[0], r2[0]]
                stops = [r1[1], r2[1]]
                chroms = [row.chrom1, row.chrom2]
        else:
            r1 = (max(0, row.start1 - args.break_extend),
                  row.stop1 + args.break_extend)
            r2 = (max(0, row.start2 - args.break_extend),
                  row.stop2 + args.break_extend)
            starts = [r1[0], r2[0]]
            stops = [r1[1], r2[1]]
            chroms = [row.chrom1, row.chrom2]

        readpairs = tk_readpairs.get_readpairs2(in_bam,
                                                chroms,
                                                starts,
                                                stops,
                                                max_insert=max_insert,
                                                min_mapq=args.min_mapq)

        # Distal readpairs across the breakpoints
        dist_readpairs = list(
            filter(filter_fun(in_bam, row.chrom1, row.chrom2, r1, r2),
                   readpairs))

        if len(dist_readpairs) > MAX_READPAIRS:
            # Downsample without replacement to cap the computation.
            sel = np.random.choice(len(dist_readpairs), MAX_READPAIRS,
                                   replace=False)
        else:
            sel = np.arange(len(dist_readpairs))
        dist_readpairs = [dist_readpairs[ridx] for ridx in sel]

        res_arr = tk_readpairs.get_sv_lr(dist_readpairs, ins_logsf_fun,
                                         max_insert, chimera_rates)

        if len(res_arr) == 0:
            out_quals.append(row.qual)
            out_chroms1.append(row.chrom1)
            out_starts1.append(row.start1)
            out_stops1.append(row.stop1)
            out_chroms2.append(row.chrom2)
            out_starts2.append(row.start2)
            out_stops2.append(row.stop2)
            out_infos.append(row['info'])
        else:
            if args.best_only:
                res_arr = sorted(res_arr, key=lambda x: x[0], reverse=True)
                res_arr = [res_arr[0]]

            for (lr, num_split, num_pairs, sv_len, support_range, svt,
                 support_readpairs) in res_arr:
                range1, range2 = support_range
                if (num_split + num_pairs >= args.min_reads_to_call
                        and lr >= args.min_lr_to_call
                        and range1 is not None and range2 is not None):
                    out_quals.append(row.qual + args.rp_lr_multiplier * lr)
                    out_chroms1.append(row.chrom1)
                    out_starts1.append(range1[0])
                    out_stops1.append(range1[1])
                    out_chroms2.append(row.chrom2)
                    out_starts2.append(range2[0])
                    out_stops2.append(range2[1])
                    if svt != in_svt and in_svt != 'TRANS':
                        in_svt = 'UNK'
                else:
                    out_quals.append(row.qual)
                    out_chroms1.append(row.chrom1)
                    out_starts1.append(row.start1)
                    out_stops1.append(row.stop1)
                    out_chroms2.append(row.chrom2)
                    out_starts2.append(row.start2)
                    out_stops2.append(row.stop2)

                out_infos.append(
                    tk_sv_io.update_info(
                        row['info'],
                        ['NPAIRS', 'NSPLIT', 'RP_LR', 'RP_TYPE', 'TYPE'],
                        [num_pairs, num_split, lr, svt, in_svt]))

    in_bam.close()

    if args.best_only:
        out_names = [n for n in bedpe_df['name']]
    else:
        # When emitting all candidate rearrangements there can be several
        # output rows per input SV, so number the calls instead of reusing
        # the input names.
        out_names = np.arange(len(out_quals))

    out_df = tk_sv_io.create_sv_df(out_chroms1, out_starts1, out_stops1,
                                   out_chroms2, out_starts2, out_stops2,
                                   out_names, out_quals, out_infos)
    tk_sv_io.write_sv_df_to_bedpe(out_df, outs.sv_variants)
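
In the same-chromosome branch above, each breakpoint window is extended outward by break_extend but inward by at most a third of the gap between the breaks, so the two windows cannot swallow each other; if they still touch (possible when the breakpoint intervals themselves overlap), a single merged region is scanned. A self-contained sketch of just that arithmetic (break_windows is a hypothetical name):

def break_windows(start1, stop1, start2, stop2, break_extend):
    # Inward extension is capped at a third of the gap between the breaks.
    max_ext = min(break_extend, int((start2 - stop1) / 3.0))
    r1 = (max(0, start1 - break_extend), stop1 + max_ext)
    r2 = (max(0, start2 - max_ext), stop2 + break_extend)
    if r1[1] > r2[0]:
        return [(r1[0], r2[1])]  # overlapping windows collapse into one
    return [r1, r2]

print(break_windows(10000, 10500, 10300, 10800, 1000))  # one merged window
print(break_windows(10000, 10100, 99000, 99100, 1000))  # two separate windows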