Example 1
def get_hap_coverage(in_bam, ps_h5, chrom, start, stop, cov_quals):
    """Return a dataframe with coverage per haplotype.

    Args:
    - in_bam: reader for a position sorted bam
    - ps_h5: HDF5 with phase set coordinates
    - chrom, start, stop: region to get coverage
    - cov_quals: Array of MAPQ cutoffs.

    Return value:
    A dataframe with columns:
    - chrom
    - pos
    - cov_q<M>_hap<H> for all M in cov_quals and for H in [0, 1, 2]: This is the
    coverage on haplotype H using reads of MAPQ >= M. Haplotype 2 corresponds to
    unphased.
    - phase_set: null if ps_h5 is missing.
    """
    coverages = [np.zeros((stop - start, 3)) for _ in cov_quals]

    for _, read in enumerate(in_bam.fetch(str(chrom), int(start), int(stop))):
        if not read.is_unmapped and not read.aend is None and not read.is_secondary and not read.is_duplicate:
            hap = tk_io.get_read_haplotype(read)
            hap_idx = 2 if hap is None else hap - 1
            range_start = max(0, read.pos - start)
            range_stop = min(stop, read.aend) - start
            for qi, q in enumerate(cov_quals):
                if read.mapq >= q:
                    coverages[qi][range_start:range_stop + 1, hap_idx] += 1

    base_df = pd.DataFrame({'chrom': chrom, 'pos': np.arange(start, stop)})
    dfs = map(
        lambda x: pd.DataFrame(
            x[0],
            columns=['cov_q' + str(x[1]) + '_hap' + str(i) for i in range(3)]),
        zip(coverages, cov_quals))
    df = pd.concat([base_df, pd.concat(dfs, axis=1)], axis=1)

    phase_sets = -np.ones((stop - start, ), dtype=np.int)

    # This can be None if for example the input is unbarcoded.
    if not ps_h5 is None:
        ps_df = tk_hdf5.read_data_frame(ps_h5)
        ps_df = ps_df[np.logical_and(
            ps_df.chrom == chrom,
            np.logical_and(ps_df.end >= start, ps_df.start < stop))]

        for _, row in ps_df.iterrows():
            range_start = max(0, row.start - start)
            range_stop = min(stop, row.end) - start
            phase_sets[range_start:range_stop + 1] = row.phase_set

    df['phase_set'] = phase_sets
    return df
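For reference, a minimal sketch of the cov_q<M>_hap<H> column-naming scheme described in the docstring above; the cov_quals values here are assumptions chosen only for illustration:

# Hypothetical cutoffs; the real caller passes its own cov_quals array.
cov_quals = [0, 30]
names = ['cov_q' + str(q) + '_hap' + str(h) for q in cov_quals for h in range(3)]
# -> ['cov_q0_hap0', 'cov_q0_hap1', 'cov_q0_hap2',
#     'cov_q30_hap0', 'cov_q30_hap1', 'cov_q30_hap2']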
Example 2
def _get_sliced_df(h5file, column_names, row_slices, id_column=None):
    columns = [(name, h5file[name], get_levels(h5file[name]))
               for name in column_names]

    result_cols = {}

    for (name, ds, translate) in columns:
        if len(row_slices) > 0:
            row_slices.sort()
            rows = np.concatenate(
                [ds[start:end] for (start, end) in row_slices])
        else:
            # We'll return an empty data frame if there are no slices;
            # np.concatenate fails on 0-length input.
            rows = np.array([], dtype=ds.dtype)
        if translate is not None:
            rows = translate[rows]

        result_cols[name] = rows

    if len(row_slices) > 0:
        id_column_values = np.concatenate(
            [np.arange(start, end) for (start, end) in row_slices])
    else:
        id_column_values = np.array([], dtype=np.int32)

    if id_column is not None:
        result_cols[id_column] = id_column_values

    df = p.DataFrame(result_cols)
    df.index = id_column_values

    return df
Example 3
def read_data_frame_limited(fn, query_cols=[], max_rows=None):
    ''' Load a pandas DataFrame from an HDF5 file. If a column list is specified, only load the matching columns '''

    with h5py.File(fn, 'r') as f:

        column_names = f.attrs.get("column_names")
        column_names = get_column_intersection(column_names, query_cols)

        sz = f[column_names[0]].shape[0]
        if max_rows:
            sz = min(sz, max_rows)

        df = p.DataFrame()

        # Add the columns progressively to save memory
        for name in column_names:
            ds = f[name]
            if has_levels(ds):
                indices = ds[:sz]
                uniques = get_levels(ds)
                # This method of constructing a Categorical avoids copying the indices array
                # which saves memory for big datasets
                df[name] = p.Categorical(indices,
                                         categories=uniques,
                                         ordered=False,
                                         fastpath=True)
            else:
                df[name] = p.Series(ds[:sz])

        return df
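The has_levels/get_levels branch above rebuilds string columns from integer codes plus a small table of unique values. A standalone sketch of that encoding, using the public Categorical.from_codes constructor rather than the fastpath argument above (the column values are made up):

import numpy as np
import pandas as p

uniques = np.array(['chr1', 'chr2', 'chrX'])          # the stored "levels"
indices = np.array([0, 0, 1, 2, 1], dtype=np.int32)   # per-row integer codes
col = p.Categorical.from_codes(indices, categories=uniques, ordered=False)
df = p.DataFrame({'chrom': col})                      # decodes to chr1, chr1, chr2, chrX, chr2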
Example 4
def make_df_chunk(fragments, bcs):
    # No BC results -- will write an empty file
    if len(bcs) == 0:
        return (None, None)
    else:
        fragment_df = p.DataFrame(fragments)
        bc_df = p.DataFrame(bcs)

        # Set good types for the fragment data frame to reduce size
        fragment_df.start_pos = fragment_df.start_pos.astype(np.int32)
        fragment_df.end_pos = fragment_df.end_pos.astype(np.int32)
        fragment_df.obs_len = fragment_df.obs_len.astype(np.int32)
        fragment_df.est_len = fragment_df.est_len.astype(np.int32)
        fragment_df.num_reads = fragment_df.num_reads.astype(np.int32)
        fragment_df.num_reads_se = fragment_df.num_reads_se.astype(np.int32)
        fragment_df.bc_num_reads = fragment_df.bc_num_reads.astype(np.int32)
        fragment_df.bc_est_len = fragment_df.bc_est_len.astype(np.int32)
        fragment_df.bc_mean_reads_per_fragment = fragment_df.bc_mean_reads_per_fragment.astype(
            np.float32)

        return (fragment_df, bc_df)
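A standalone sketch of why the astype(np.int32) casts above shrink the table: a default int64 column uses twice the memory of an int32 one (the column name and length here are made up):

import numpy as np
import pandas as p

df = p.DataFrame({'start_pos': np.arange(1000, dtype=np.int64)})
bytes_before = df.start_pos.nbytes            # 8000 bytes as int64
df.start_pos = df.start_pos.astype(np.int32)
bytes_after = df.start_pos.nbytes             # 4000 bytes as int32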
Example 5
def main(args, outs):
    in_bam = tk_bam.create_bam_infile(args.possorted_bam)
    chrom = args.chrom

    poses = []
    mol_qs = []
    bcs = []

    for read in in_bam.fetch(str(chrom), int(args.start_pos),
                             int(args.end_pos)):
        if not read.is_secondary and not read.is_duplicate and read.is_read1 and \
            not read.is_unmapped and read.mapq >= args.mapq:
            poses.append(read.pos)
            mol_qs.append(tk_io.get_read_molecule_conf(read))
            bcs.append(tk_io.get_read_barcode(read))
    ret_df = pd.DataFrame({
        'chrom': chrom,
        'pos': poses,
        'bc': bcs,
        'mol_qual': mol_qs
    })

    if len(ret_df) > 0:
        start_pos = poses[0]
        end_pos = poses[-1]
        cov_df = tk_hdf5.read_data_frame_indexed(
            args.coverage, [(chrom, start_pos, end_pos + 1)])

        # Boolean array with length equal to the range of positions in ret_df
        on_target = np.zeros((end_pos - start_pos + 1, ), dtype=np.bool)
        on_target[cov_df.pos - start_pos] = True
        cum_on_target = np.cumsum(on_target)

        ret_df['on_target'] = on_target[ret_df.pos - start_pos]
        # With no previous read (or no bases in between), the fraction defaults
        # to the on-target status at the read position.
        frac_on_target = np.ones((len(ret_df), )) * on_target[0]
        for i, p in enumerate(poses):
            if i > 0:
                nbp = float(p - poses[i - 1] - 1)
                if nbp > 0:
                    frac_on_target[i] = (
                        cum_on_target[p - start_pos] -
                        cum_on_target[poses[i - 1] - start_pos] -
                        int(on_target[p - start_pos])) / nbp
                else:
                    frac_on_target[i] = float(on_target[p - start_pos])
        ret_df['frac_on_target'] = frac_on_target
    else:
        ret_df['on_target'] = False
        ret_df['frac_on_target'] = 0.0
    tk_hdf5.write_data_frame(outs.reads, ret_df)
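The cumulative-sum bookkeeping above computes, for each read, the fraction of on-target bases strictly between it and the previous read. A minimal standalone check with made-up positions:

import numpy as np

on_target = np.array([1, 1, 0, 0, 1, 1, 1, 0, 0, 1], dtype=bool)  # toy mask, start_pos = 0
cum_on_target = np.cumsum(on_target)
prev_pos, pos = 2, 7                       # two hypothetical consecutive read positions
nbp = float(pos - prev_pos - 1)            # bases strictly between the two reads
frac = (cum_on_target[pos] - cum_on_target[prev_pos] - int(on_target[pos])) / nbp
# positions 3..6 -> 3 of 4 on target -> frac == 0.75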
Example 6
def get_reads(in_bam, chrom, start, stop, in_read_df=None,
              min_mapq=30, max_reads=500000, blacklist_barcodes=None):
    poses = []
    ends = []
    bcs = []

    if not in_read_df is None and len(in_read_df) > 0:
        ret_df = in_read_df.sort('pos')
        old_poses = np.array(ret_df['pos'])
        # Subtracting the read length is roughly right; ideally we should sort
        # by aend.
        # Loci are considered in an ordered fashion, so we should never fetch
        # reads "earlier" in the bam.
        start = max(old_poses[0], max(0, start - MAX_READ_LEN))
        if start >= old_poses[0] and start <= old_poses[-1]:
            start_idx = bisect.bisect_left(old_poses, start)
            if stop >= old_poses[0] and stop <= old_poses[-1]:
                stop_idx = min(len(ret_df), bisect.bisect(old_poses, stop))
            else:
                stop_idx = len(ret_df)
            # Remove all positions that are smaller than the input start
            ret_df = ret_df.iloc[start_idx:stop_idx]
            # Set the new start to the end of the input data frame.
            # Add an overlap of READ_LEN to capture reads that were right on
            # the boundary between the old and new data frame.
            start = max(0, old_poses[stop_idx - 1] - MAX_READ_LEN)
            stop = max(start, stop)
    else:
        ret_df = None

    for i, read in enumerate(in_bam.fetch(str(chrom), int(start), int(stop))):
        if i > max_reads:
            break
        bc = tk_io.get_read_barcode(read)
        if read.pos < start:
            continue
        if not blacklist_barcodes is None and bc in blacklist_barcodes:
            continue
        if not read.is_secondary and not read.is_duplicate and read.is_read1 and \
           not read.is_unmapped and read.mapq >= min_mapq and read.is_proper_pair and \
           not bc is None:
            poses.append(read.pos)
            ends.append(read.aend)
            bcs.append(tk_io.get_read_barcode(read))

    tmp_ret_df = pd.DataFrame({'chrom':chrom, 'pos':poses, 'aend':ends, 'bc':bcs})

    ret_df = pd.concat([ret_df, tmp_ret_df], ignore_index=True)
    ret_df.sort(['bc', 'pos'], inplace=True)
    return ret_df
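The bisect bookkeeping above trims a previously fetched, position-sorted frame so the new locus does not re-fetch reads it already holds. A standalone sketch of the index arithmetic with made-up positions:

import bisect
import numpy as np

old_poses = np.array([10, 20, 30, 40, 50])         # positions already in the frame
start, stop = 25, 45                               # new locus to cover
start_idx = bisect.bisect_left(old_poses, start)   # 2 -> first kept row
stop_idx = bisect.bisect(old_poses, stop)          # 4 -> one past the last kept row
kept = old_poses[start_idx:stop_idx]               # [30, 40]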
Example 7
def merge_calls_and_gt(call_df, gt_df, call_to_gt):

    if not gt_df is None:
        gt_df.index = gt_df['name']
    else:
        call_to_gt = {}

    out_call_df = None
    for _, row in call_df.iterrows():
        sv_type = tk_sv_io.get_sv_type(row.info)
        orient = tk_sv_io.get_break_orientation(row.info)
        row['orient'] = orient

        # revert sv type name from DISTAL to TRANS to match ground truth
        # conventions
        if sv_type == 'DISTAL':
            sv_type = 'TRANS'
        row['sv_type'] = sv_type

        matches = list(call_to_gt.get(row['name'], [None]))
        # One output row per match
        for m in matches:
            row['match'] = m
            if not m is None and not gt_df is None:
                x = gt_df.loc[m]
                row['match_dist'] = max(
                    dist_to_breaks(int((row.start1 + row.stop1) / 2), x.start1,
                                   x.stop1),
                    dist_to_breaks(int((row.start2 + row.stop2) / 2), x.start2,
                                   x.stop2))
            else:
                row['match_dist'] = float('NaN')

            out_call_df = pd.concat(
                [out_call_df, pd.DataFrame([row])], ignore_index=True)

    if not gt_df is None:
        out_call_df = pd.merge(out_call_df,
                               gt_df,
                               left_on='match',
                               right_on='name',
                               how='outer',
                               suffixes=['', '_gt'])
        out_call_df.drop(['filters_gt', 'dist'], axis=1, inplace=True)
    out_call_df.sort('name', inplace=True)

    return out_call_df
Example 8
def join(args, outs, chunk_defs, chunk_outs):
    # Combine the per-chunk bait coverage CSV files
    frame = p.DataFrame()
    list_ = []
    if args.baits_file_map and outs.bait_csv:
        in_files = [
            out.bait_csv for (cdef, out) in zip(chunk_defs, chunk_outs)
        ]
        for file_ in in_files:
            df = p.read_csv(file_, index_col=None, header=0)
            list_.append(df)
        frame = p.concat(list_)

        # write csv
        frame.to_csv(outs.bait_csv)
    else:
        outs.target_coverage = None
Example 9
def get_reads(in_bam, chrom, start, stop, min_mapq=60):
    bcs = []
    poses = []

    for read in in_bam.fetch(str(chrom), start, stop):
        mapq = read.mapq
        if mapq < min_mapq or read.is_secondary or read.is_duplicate:
            continue
        bc = tk_io.get_read_barcode(read)
        if bc is None:
            continue

        bcs.append(bc)
        poses.append(read.pos)
    df = pd.DataFrame({'pos': poses, 'bc': bcs})
    df.sort('bc', inplace=True)
    return df
Example 10
def join(args, outs, chunk_defs, chunk_outs):
    out_bedpe = None
    for c in chunk_outs:
        if not os.path.isfile(c.sv_variants):
            continue
        in_bedpe = tk_sv_io.read_sv_bedpe_to_df(c.sv_variants)
        if not in_bedpe is None:
            out_bedpe = pd.concat([out_bedpe, in_bedpe], ignore_index=True)

    if out_bedpe is None:
        col_names = ['chrom1', 'start1', 'stop1',
                     'chrom2', 'start2', 'stop2', 'name', 'qual',
                     'strand1', 'strand2', 'filters', 'info']
        out_bedpe = pd.DataFrame(columns=col_names)
    out_bedpe.names = np.arange(len(out_bedpe))

    out_bedpe = out_bedpe[out_bedpe.qual >= args.sv_min_qv]
    tk_sv_io.write_sv_df_to_bedpe(out_bedpe, outs.sv_variants)
Example 11
def read_data_frame_filtered(fn,
                             filter_func,
                             query_cols=[],
                             chunk_size=5000000):
    ''' Load a pandas DataFrame from an HDF5 file. If a column list is specified, only load the matching columns.
        filter_func should take a DataFrame and return a boolean vector of the rows to keep.
        Rows are loaded from the file and filtered in chunks to keep peak memory usage small. '''

    f = h5py.File(fn, 'r')

    column_names = f.attrs.get("column_names")
    column_names = get_column_intersection(column_names, query_cols)
    column_index = p.Index(column_names)

    sz = f[column_names[0]].shape[0]
    starts = np.arange(0, sz, chunk_size)
    ends = np.minimum(sz, starts + chunk_size)

    chunks = []

    for (start, end) in zip(starts, ends):
        cols = {}
        for name in column_names:
            ds = f[name]
            if has_levels(ds):
                indices = ds[start:end]
                uniques = get_levels(ds)
                col = uniques[indices]
            else:
                col = ds[start:end]

            cols[name] = col
        df = p.DataFrame(cols, columns=column_index)
        df = df[filter_func(df)]

        if len(df) > 0 or len(chunks) == 0:
            chunks.append(df)

    f.close()

    result = p.concat(chunks, ignore_index=True)
    return result
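A standalone sketch of the filter_func contract described in the docstring: it receives each chunk as a DataFrame and must return a boolean vector of the rows to keep (the column names and values here are made up):

import pandas as p

filter_func = lambda chunk: (chunk['chrom'] == 'chr1') & (chunk['mapq'] >= 30)
chunk = p.DataFrame({'chrom': ['chr1', 'chr2', 'chr1'], 'mapq': [60, 60, 10]})
kept = chunk[filter_func(chunk)]   # keeps only the first row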
Example 12
def get_break_groups(bedpe_df, merge_win=10000, max_range=np.inf):
    """A simplified version of merge_breaks"""

    if not isinstance(bedpe_df, pd.DataFrame):
        bedpe_df = tk_sv_io.read_sv_bedpe_to_df(bedpe_df)
    else:
        bedpe_df = pd.DataFrame(bedpe_df)

    breaks = []
    for i, (n, row) in enumerate(bedpe_df.iterrows()):
        breaks.append((row.chrom1, row.start1, row.stop1, (n, 1)))
        breaks.append((row.chrom2, row.start2, row.stop2, (n, 2)))
    _, mem_to_cluster, _ = cluster_loci(breaks, merge_win, max_range=max_range)

    cluster_pairs = defaultdict(list)
    for i, (n, row) in enumerate(bedpe_df.iterrows()):
        cluster_idx1 = mem_to_cluster[(n, 1)]
        cluster_idx2 = mem_to_cluster[(n, 2)]
        cluster_pairs[(cluster_idx1, cluster_idx2)].append(n)
    return cluster_pairs.values()
Example 13
def get_depth_info(read_iter, chrom, cstart, cend):

    depths = np.zeros(cend - cstart, np.int32)

    for read in read_iter:
        pos = read.pos
        rstart = max(pos, cstart)

        # Increment to the end of the window or the end of the
        # alignment, whichever comes first
        rend = min(read.aend, cend)
        depths[(rstart - cstart):(rend - cstart)] += 1

    positions = np.arange(cstart, cend, dtype=np.int32)

    depth_df = pd.DataFrame({
        "chrom": chrom,
        "pos": positions,
        "coverage": depths
    })
    return depth_df
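The depth update above is a plain per-base increment over the clipped read interval; a toy standalone version with one assumed read spanning [3, 7) inside a window [0, 10):

import numpy as np

cstart, cend = 0, 10
depths = np.zeros(cend - cstart, np.int32)
rstart, rend = max(3, cstart), min(7, cend)   # clip the read to the window
depths[(rstart - cstart):(rend - cstart)] += 1
# depths -> [0 0 0 1 1 1 1 0 0 0]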
Example 14
def main(args, outs):

    sv_df = tk_sv_io.read_sv_bedpe_to_df(args.sv_variants)
    sv_df["info2"] = "SV"

    cnv_df = tk_sv_io.read_sv_bedpe_to_df(args.cnv_variants)
    cnv_df["info2"] = "CNV"

    sv_df = pd.concat([sv_df, cnv_df], ignore_index=True)
    sv_df['name'] = np.arange(len(sv_df))
    sv_df.sort(['chrom1', 'chrom2'], inplace=True)

    res_df = None
    for _, tmp_df in sv_df.groupby(['chrom1', 'chrom2']):
        tmp_df.sort(['chrom1', 'start1', 'stop1', 'chrom2', 'start2', 'stop2'],
                    inplace=True)
        # cluster the loci in the group based on proximity
        groups = tk_sv_utils.get_break_groups(tmp_df, args.max_dist)

        # for each cluster, get the row with max qual
        # tmp_df.loc[g] gets the subset of tmp_df in the cluster.
        # then idxmax gets the max index

        out_df = pd.DataFrame(columns=sv_df.columns)
        idx = 0
        for g in groups:
            row = tmp_df.loc[tmp_df.loc[g]['qual'].idxmax()]
            if (tmp_df.loc[g]['info2'] == 'SV').any():
                row = tmp_df.loc[(tmp_df.loc[g]['info2'] == 'SV').idxmax()]

            source = list(set(tmp_df.loc[g]['info2']))
            row['info'] += (";SOURCE=" + ",".join(source))
            out_df.loc[idx] = row
            idx += 1

        out_df.sort(['start1', 'stop1', 'start2', 'stop2'], inplace=True)
        res_df = pd.concat([res_df, out_df], ignore_index=True)

    tk_sv_io.write_sv_df_to_bedpe(res_df, outs.sv_variants)
Example 15
    def test_basic(self):
        self.clear_directory()
        self.run_stage(self.args, "chr1:0..10000000")

        # Load all the output files
        variants = tenkit.hdf5.read_data_frame(os.path.join(job_dir, "variants.h5"))
        genes = p.read_csv(os.path.join(job_dir, "gene_stats.csv"))

        with open(os.path.join(job_dir, "summary.json")) as summary_file:
            summary = json.load(summary_file)

        print variants.head()

        # Check basic phasing metrics (verified with independent code)
        self.assertEqual(summary['N50_phase_block'], 362897)
        self.assertEqual(summary['longest_phase_block'], 1588263)
        np.testing.assert_almost_equal(summary['mean_phase_block'], 7067.99817851)

        # Hand calculated snp-weighted probabilities
        np.testing.assert_almost_equal(summary['fract_genes_phased'], 0.44155844155844154, decimal = 10)
        np.testing.assert_almost_equal(summary['fract_genes_completely_phased'], 0.1038961038961039, decimal = 10)
        self.assertEqual(summary['prob_snp_phased_in_gene'], 0.36608192963189135)
        self.assertEqual(summary['prob_snp_correct_in_gene'], 0.950884655460024)

        # Hand test some variant metrics
        gene_name = "KIAA1751"
        gene = genes[genes.gene == gene_name]

        self.assertEqual(gene.pair_phased_rate.values[0], float(79*78)/(143*142))
        np.testing.assert_almost_equal(gene.pair_correct_rate.values[0], float(77*76/2)/(78*77/2))

        self.assertEqual(gene.start.values[0], 1884751)
        self.assertEqual(gene.end.values[0], 1935276)


        # Hand test some variant metrics
        gene_name = "AJAP1"
        gene = genes[genes.gene == gene_name]

        np.testing.assert_almost_equal(gene.pair_phased_rate.values[0], float(149*148)/(184*183))
        np.testing.assert_almost_equal(gene.pair_correct_rate.values[0], float((143*142)/2 + 1) / (145*144/2))

        self.assertEqual(gene.start.values[0], 4715104)
        self.assertEqual(gene.end.values[0], 4843851)

        sample_variant_df = p.DataFrame({'in_obs':[ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ],
                                        'in_gt':  [ 0 , 0 , 0 , 0 , 1 , 1 , 1 , 1 , 0 , 0 , 0 , 0 , 1 , 1 , 1 , 1 ],
                                   'variant_type':['S','S','D','I','S','C','D','I','D','S','I','I','D','S','C','D'],
                                    'FILTER':     [ 0 , 1 , 0 , 1 , 0 , 1 , 0 , 1 , 0 , 1 , 0 , 1 , 0 , 1 , 0 , 1 ],
                                'variant_length': [ 0 , 0 , -1, 1 , 0 , 1 , -1, 1 , -6, 0 , 6 , 7 , 5 , 0 , 6 , 1 ]})

        stats = {}

        filter_states = ["unfiltered", "filtered"]
        var_types = ["any", "snp", "insertion", "deletion", "complex", "insertion_lt5bp", "deletion_lt5bp"]

        for filter_condition in filter_states:
            for var_type in var_types:
                compute_snpindel_stats(stats, filter_condition, var_type, None, sample_variant_df, True)

        print stats
        # Not covering all, but covering each dimension at least once
        self.assertEqual(stats['tps_filtered'], 2)
        self.assertEqual(stats['tps_unfiltered'], 4)
        self.assertEqual(stats['tps_filtered_snp'], 1)
        self.assertEqual(stats['tps_filtered_deletion'], 1)
        self.assertEqual(stats['tps_filtered_complex'], 0)
        self.assertEqual(stats['sensitivity_unfiltered'], 0.5)
        self.assertEqual(stats['ppv_unfiltered'], 0.5)
        self.assertEqual(stats['sensitivity_filtered'], 0.25)
        self.assertEqual(stats['ppv_filtered'], 0.5)
        self.assertEqual(stats['fps_filtered'], 2)
        self.assertEqual(stats['fns_unfiltered_insertion_lt5bp'], 1)
        self.assertEqual(stats['tps_filtered_deletion_lt5bp'], 1)
        self.assertEqual(stats['ppv_unfiltered_snp'], 0.5)
        self.assertEqual(stats['sensitivity_unfiltered_complex'], 0.5)
        self.assertEqual(stats['fns_unfiltered'], 4)
        self.assertEqual(stats['fns_filtered_deletion_lt5bp'], 1)

        # Test that some blocks of short switches are caught correctly
        ss = variants[variants.pos.isin([1529457,1529511])]
        self.assertTrue(ss.short_switch.all())

        # Used to be a long switch - now should be nothing
        non_long = variants[variants.pos == 1529950]
        self.assertTrue((non_long.short_switch == False).all())
        self.assertTrue((non_long.long_switch == False).all())

        switch_fixes = variants[np.logical_and(variants.pos >= 3173228, variants.pos <= 3181098)]
        self.assertTrue((switch_fixes.long_switch == False).all())
        self.assertEqual(switch_fixes.short_switch.sum(), 5)
Example 16
    def coalescence_analysis(self, min_cluster_size=2, fpr=1.0):
        ''' Compute the BC-BC overlap matrix, threshold it and convert to a graph, and report large clusters '''
        # Sizes
        (num_bcs, total_bins) = self.mat.shape

        # number of fragments on each bc
        self.num_frags = np.array((self.mat > 0.0).sum(axis=1)).flatten()
        martian.log_info("mean num frags: %f,  median: %f" %
                         (self.num_frags.mean(), np.median(self.num_frags)))

        # compute an initial threshold for coalescence
        # first, compute the expected number of overlaps between two BCs having the mean number of fragments
        expected_overlaps = (self.num_frags.mean()**
                             2) / self.effective_genome_bins
        # now use a ~5 sigma threshold to represent an initial cutoff for significance
        # note: this is more informative for GemCode than Chromium, because there are many more fragments per BC
        # and thus the expected number of overlaps due to chance is much higher. for Chromium, the threshold will usually be 2.
        overlap_threshold = np.float32(
            max(2, round(expected_overlaps + 5 * np.sqrt(expected_overlaps))))
        martian.log_info(
            "expected overlaps: %f -- using overlap threshold: %f" %
            (expected_overlaps, overlap_threshold))

        # Chunk out matrix in x and y and find significant overlaps in each pair of chunks
        bc_bin_size = 1000
        bc_bins = np.arange(0, num_bcs, bc_bin_size)

        # Choose a p-value that accounts for many comparisons
        n_comparisons = num_bcs**2
        pvalue_cut = fpr / n_comparisons
        martian.log_info("using pvalue cutoff: %e" % pvalue_cut)

        for x in bc_bins:

            martian.log_info("BC number: %d on x axis" % x)
            for y in bc_bins:
                # calculation is symmetric -- don't do below the diagonal
                if y < x:
                    continue

                self.window_intersection_slices(x, y, bc_bin_size, pvalue_cut,
                                                overlap_threshold)

        # Delete the fragment matrix to save memory
        del self.mat
        self.mat = None

        martian.log_info("Finding connected components")
        clusters = self.clusters()
        bad_bcs = []
        cluster_idxs = []
        cluster_size = []

        martian.log_info("Making bad BC DataFrame")
        for (cluster_idx, (_, nodes)) in enumerate(clusters.iteritems()):
            if len(nodes) < min_cluster_size:
                continue
            bad_bcs.extend(nodes)
            cluster_idxs.extend([cluster_idx] * len(nodes))
            cluster_size.extend([len(nodes)] * len(nodes))

        bad_bc_tbl = p.DataFrame({
            'bc': bad_bcs,
            'cluster_id': cluster_idxs,
            'cluster_size': cluster_size
        })
        return bad_bc_tbl
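The overlap cutoff above can be sanity-checked numerically. A minimal sketch with assumed values (200 fragments per barcode on average and 3e6 effective genome bins; both numbers are made up for illustration):

import numpy as np

mean_frags = 200.0
effective_genome_bins = 3.0e6
expected_overlaps = mean_frags ** 2 / effective_genome_bins          # ~0.013
overlap_threshold = max(2, round(expected_overlaps + 5 * np.sqrt(expected_overlaps)))
# expected_overlaps is tiny here, so the floor of 2 wins, as the comment notes for Chromium.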
Example 17
def merge_breaks(bedpe_df,
                 out_bedpe,
                 merge_win=10000,
                 max_range=np.inf,
                 max_nmates=np.inf,
                 cluster_qual_factor=0.2):
    """Merges a set of SVs into a non-redundant set.
    Args:
    - bedpe_df: Either a bedpe file or a DataFrame like the one returned by
    tk_sv_io.read_sv_bedpe_to_df.
    - out_bedpe: Path to file where output will be written.
    - merge_win: Breakpoints will be merged if they are within this distance from
    each other. Two SVs will be merged if both their breakpoints can be merged.
    - max_range: See max_range field of cluster_loci.
    - max_nmates: Two extra info fields, NMATES1 and NMATES2, will be added to the
    output BEDPE. NMATES1 is the number of mate breakpoints (after merging, i.e.
    breakpoint clusters) of the first breakpoint of an SV. SVs whose breakpoints
    both exceed the max_nmates cutoff will not be included in the output.

    Return value:
    The output BEDPE.
    """
    if not isinstance(bedpe_df, pd.DataFrame):
        bedpe_df = tk_sv_io.read_sv_bedpe_to_df(bedpe_df)
    else:
        bedpe_df = pd.DataFrame(bedpe_df)
    breaks = []
    for i in range(bedpe_df.shape[0]):
        breaks.append((bedpe_df.iloc[i, 0], bedpe_df.iloc[i, 1],
                       bedpe_df.iloc[i, 2], (bedpe_df.iloc[i, 6], 1)))
        breaks.append((bedpe_df.iloc[i, 3], bedpe_df.iloc[i, 4],
                       bedpe_df.iloc[i, 5], (bedpe_df.iloc[i, 6], 2)))
    _, mem_to_cluster, _ = cluster_loci(breaks, merge_win, max_range=max_range)

    cluster_pairs = {}
    for i in range(bedpe_df.shape[0]):
        name = bedpe_df.iloc[i]['name']
        cluster_idx1 = mem_to_cluster[(name, 1)]
        cluster_idx2 = mem_to_cluster[(name, 2)]
        if not (cluster_idx1, cluster_idx2) in cluster_pairs:
            cluster_pairs[(cluster_idx1, cluster_idx2)] = [i]
        else:
            old_pair = cluster_pairs[(cluster_idx1, cluster_idx2)][0]
            # Make sure the old and the new pair have breaks on the same chromosomes
            assert (bedpe_df.iloc[old_pair, 0] == bedpe_df.iloc[i, 0])
            assert (bedpe_df.iloc[old_pair, 3] == bedpe_df.iloc[i, 3])
            cluster_pairs[(cluster_idx1, cluster_idx2)].append(i)

    new_cluster_pairs = {}
    cluster_dist_ratio = {}
    for p, pos_list in cluster_pairs.iteritems():
        pos_arr = np.array(pos_list)
        tmp_df = get_dataframe_loc(bedpe_df, pos_arr)
        quals = np.array(tmp_df.qual)
        best_call = pos_arr[np.argmax(quals)]
        close_calls = np.where(quals >= cluster_qual_factor * np.max(quals))[0]
        close_df = get_dataframe_loc(tmp_df, close_calls)

        same_chrom = bedpe_df.iloc[best_call]['chrom2'] == bedpe_df.iloc[
            best_call]['chrom1']
        min_break_dist = np.min(close_df.start2) - np.max(close_df.stop1)
        max_break_dist = bedpe_df.iloc[best_call]['start2'] - bedpe_df.iloc[
            best_call]['stop1']

        new_cluster_pairs[p] = best_call
        if not same_chrom or max_break_dist > MAX_FRAG_SIZE:
            cluster_dist_ratio[p] = '.'
        elif min_break_dist <= 0:
            cluster_dist_ratio[p] = float('NaN')
        else:
            cluster_dist_ratio[p] = float(max_break_dist) / min_break_dist

    cluster_pairs = new_cluster_pairs

    def clusters_close(i, j):
        chrom1, start1, stop1 = bedpe_df.iloc[i, 0], bedpe_df.iloc[
            i, 1], bedpe_df.iloc[i, 2]
        chrom2, start2, stop2 = bedpe_df.iloc[i, 3], bedpe_df.iloc[
            i, 4], bedpe_df.iloc[i, 5]
        next_chrom1, next_start1, next_stop1 = bedpe_df.iloc[
            j, 0], bedpe_df.iloc[j, 1], bedpe_df.iloc[j, 2]
        next_chrom2, next_start2, next_stop2 = bedpe_df.iloc[
            j, 3], bedpe_df.iloc[j, 4], bedpe_df.iloc[j, 5]
        dist1 = max(next_start1 - stop1, start1 - next_stop1)
        dist2 = max(next_start2 - stop2, start2 - next_stop2)
        return (chrom1 == next_chrom1 and chrom2 == next_chrom2
                and dist1 <= merge_win and dist2 <= merge_win)

    # The "chain-breaking" in cluster_loci might still leave some redundancy.
    # In particular, we might leave some almost touching clusters that were
    # separated only because of chain-breaking. Do a second round of clustering
    # where we go through consecutive pairs of clusters and merge them if they're mergeable.
    new_cluster_pairs = {}
    for (cluster1, cluster2) in sorted(cluster_pairs.keys()):
        if cluster_pairs[(cluster1, cluster2)] == -1:
            continue
        # Consider all neighboring clusters after this cluster.
        # Notice that the cluster indices are sorted by genomic coordinates.
        neigh_clusters = [
            (cluster1, cluster2 + 1), (cluster1 + 1, cluster2 - 1),
            (cluster1 + 1, cluster2), (cluster1 + 1, cluster2 + 1)
        ]
        idx = cluster_pairs[(cluster1, cluster2)]
        # Best cluster among neighboring clusters
        max_cluster = ((cluster1, cluster2), idx)
        for next_cluster1, next_cluster2 in neigh_clusters:
            if not (next_cluster1, next_cluster2) in cluster_pairs:
                continue
            if cluster_pairs[(next_cluster1, next_cluster2)] == -1:
                continue
            next_idx = cluster_pairs[(next_cluster1, next_cluster2)]
            if clusters_close(idx, next_idx):
                cluster_pairs[(next_cluster1, next_cluster2)] = -1
                if bedpe_df.iloc[idx]['qual'] < bedpe_df.iloc[next_idx]['qual']:
                    max_cluster = ((next_cluster1, next_cluster2), next_idx)
        new_cluster_pairs[max_cluster[0]] = max_cluster[1]

    cluster_pairs = new_cluster_pairs

    # Now compute the number of mate breakpoints for each cluster
    num_mates = {}
    for (cluster1, cluster2) in cluster_pairs.keys():
        if not cluster1 in num_mates:
            num_mates[cluster1] = 0
        if not cluster2 in num_mates:
            num_mates[cluster2] = 0
        num_mates[cluster1] += 1
        if cluster2 != cluster1:
            num_mates[cluster2] += 1

    sel_loc = []
    new_info_strs = []
    for (cluster1, cluster2) in sorted(cluster_pairs.keys()):
        sv_loc = cluster_pairs[(cluster1, cluster2)]
        if num_mates[cluster1] > max_nmates and num_mates[
                cluster2] > max_nmates:
            continue
        sel_loc.append(sv_loc)
        new_info_strs.append(
            tk_sv_io.update_info(bedpe_df.iloc[sv_loc]['info'],
                                 ['NMATES1', 'NMATES2', 'RESOLUTION'], [
                                     num_mates[cluster1], num_mates[cluster2],
                                     cluster_dist_ratio[(cluster1, cluster2)]
                                 ]))
    if len(sel_loc) > 0:
        bedpe_df = bedpe_df.iloc[sel_loc]
        bedpe_df['info'] = new_info_strs
    else:
        bedpe_df = pd.DataFrame(columns=bedpe_df.columns)
    if not out_bedpe is None:
        tk_sv_io.write_sv_df_to_bedpe(bedpe_df, out_bedpe)

    return bedpe_df
Example 18
def summarize_barcode_data(misc_sm, qual_sms, barcode_whitelist):
    processed_bc_counts = misc_sm.get_summarizer('processed_bc_counts').dict
    metrics = misc_sm.get_summarizer('metrics')
    raw_bc_counts = misc_sm.get_summarizer('raw_bc_counts').dict

    # Compute high-level stats on the barcodes. Only emit bc metrics if we have a whitelist and have barcodes
    # attached
    if barcode_whitelist and len(raw_bc_counts) > 0:

        # What fraction of the whitelist did we see?
        # Don't trust that the whitelist used during attach_bcs is the same as the one used here
        # Must intersect with the raw_bc sequences.  The processed_bc keys have the GEM group prepended.
        observed_good = set(
            raw_bc_counts.keys()).intersection(barcode_whitelist)

        metrics['fraction_bcs_observed'] = \
                min(1.0, tk_stats.robust_divide(float(len(observed_good)), len(barcode_whitelist)))

        # What fraction of clusters had a correct barcode
        total_good_bc_observations = float(sum(processed_bc_counts.values()))
        metrics['correct_bc_rate'] = \
                tk_stats.robust_divide(total_good_bc_observations,
                        (total_good_bc_observations + metrics['bad_bc_count']))

        # 'Effective diversity' of barcodes
        sum_sq = sum((v**2 for v in processed_bc_counts.values()))
        effective_diversity = tk_stats.robust_divide(
            (total_good_bc_observations**2.0), float(sum_sq))
        metrics['effective_diversity'] = effective_diversity

        def fraction_within_f(counts, f):
            med_counts = np.median(counts)
            counts_in_range = np.logical_and(counts >= med_counts / f,
                                             counts <= med_counts * f)
            return np.mean(counts_in_range)

        # fraction of barcodes with abundance within 2x of median
        count_array = np.array(processed_bc_counts.values())
        metrics['fraction_bc_within_2x_median'] = fraction_within_f(
            count_array, 2.0)
        metrics['fraction_bc_within_1.15x_median'] = fraction_within_f(
            count_array, 1.15)

        metrics['barcode_count_cv'] = np.std(count_array) / np.mean(
            count_array)

        raw_count_vect = list(raw_bc_counts.items())
        raw_bc_list = [x[0] for x in raw_count_vect]
        raw_bc_count = np.array([x[1] for x in raw_count_vect], dtype=np.int32)
        is_valid_bc = [bc in barcode_whitelist for bc in raw_bc_list]

        bc_table_cols = {
            'bc_sequence': raw_bc_list,
            'count': raw_bc_count,
            'valid_bc': is_valid_bc
        }
        bc_table = p.DataFrame(bc_table_cols)

    else:
        metrics['fraction_bcs_observed'] = None
        metrics['correct_bc_rate'] = None
        metrics['effective_diversity'] = None
        metrics['fraction_bc_within_2x_median'] = None

        dummy_bc_table = {
            'bc_sequence': np.array([], dtype=np.object),
            'count': np.array([], dtype=np.int32),
            'valid_bc': np.array([], dtype=np.bool)
        }
        for minqv in BC_QUAL_CUTOFFS:
            name = "mapped_minqv_%d_count" % minqv
            dummy_bc_table[name] = np.array([], dtype=np.int32)

            name = "unmapped_minqv_%d_count" % minqv
            dummy_bc_table[name] = np.array([], dtype=np.int32)

        bc_table = p.DataFrame(dummy_bc_table)

    return bc_table
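The 'effective diversity' metric above is the inverse Simpson index of the per-barcode counts; a standalone sketch with made-up counts:

import numpy as np

counts = np.array([100.0, 100.0, 100.0, 100.0])      # hypothetical reads per barcode
effective_diversity = counts.sum() ** 2 / (counts ** 2).sum()
# equals 4.0 for four equally abundant barcodes; skewed counts give a smaller value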
Example 19
def main(args, outs):
    reader = tk_hdf5.DataFrameReader(args.hap_coverage)
    sel_cols = ['cov_q30_hap0', 'cov_q30_hap1', 'cov_q30_hap2']
    ext_cols = list(sel_cols)
    ext_cols.append('total_cov')

    out_loci = []
    summary_df = None
    for (chrom, start, stop) in (tk_io.get_locus_info(l) for l in args.loci):
        cov = reader.query((chrom, start, stop))
        cov['bin'] = np.array(cov['pos'] / args.bin_size, dtype=np.int)
        cov['total_cov'] = cov[sel_cols].sum(axis=1)
        mean_cov = np.mean(cov['total_cov'])
        summary_df = pd.concat([
            summary_df,
            pd.DataFrame(
                {
                    'chrom': chrom,
                    'start': start,
                    'stop': stop,
                    'mean_cov': mean_cov
                },
                index=[0])
        ],
                               ignore_index=True)
        # Remove very small phase sets. These tend to be single-SNP phase sets
        # and can result from erroneous SNPs.
        cov = cov.groupby('phase_set').filter(lambda x: len(x) > 1000)
        sum_df = cov.groupby(['bin',
                              'phase_set'])[ext_cols].mean().reset_index()
        sum_df['low'] = sum_df.total_cov < 0.8 * mean_cov
        sum_df['low_hap0'] = np.logical_and(
            sum_df.total_cov < mean_cov,
            sum_df.cov_q30_hap0 < 0.8 * sum_df.cov_q30_hap1)
        sum_df['low_hap1'] = np.logical_and(
            sum_df.total_cov < mean_cov,
            sum_df.cov_q30_hap1 < 0.8 * sum_df.cov_q30_hap0)

        if not sum_df.empty:
            any_low = np.logical_or(
                sum_df.low, np.logical_or(sum_df.low_hap1, sum_df.low_hap0))

            bins = np.array(sum_df['bin'])
            bins = np.concatenate([bins, [np.max(bins) + 1]])
            pos = 0
            # Get runs of 0s and 1s in any_low
            for bit, group in groupby(any_low):
                group_size = len(list(group))
                group_start = bins[pos] * args.bin_size
                group_stop = bins[pos + group_size] * args.bin_size
                region_len = group_stop - group_start
                if bit and region_len >= args.min_len:
                    out_loci.append((chrom, max(0,
                                                group_start - args.bin_size),
                                     group_start + args.bin_size, chrom,
                                     max(0, group_stop - args.bin_size),
                                     group_stop + args.bin_size))
                pos += group_size

    with open(outs.loci, 'w') as f:
        cPickle.dump(out_loci, f)

    summary_df.to_csv(outs.cov_summary, sep='\t', header=True, index=False)
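The itertools.groupby loop above is a run-length scan over the any_low vector; a minimal standalone version of the same pattern with a made-up input:

from itertools import groupby

any_low = [False, True, True, True, False, True]
pos = 0
for bit, group in groupby(any_low):
    group_size = len(list(group))
    if bit:
        print("low run of %d bins starting at bin %d" % (group_size, pos))
    pos += group_size
# -> low run of 3 bins starting at bin 1
# -> low run of 1 bins starting at bin 5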
Example 20
def join(args, outs, chunk_defs, chunk_outs):
    pred_to_match, _, pred_df, true_df, min_qv = merge_predictions(chunk_outs)

    # Change TRANS type to DISTAL. This change will only
    # affect the type reported, not the names of the metrics.
    new_info = []
    for _, row in pred_df.iterrows():
        sv_type = tk_sv_io.get_sv_type(row.info)
        if sv_type == 'TRANS':
            sv_type = 'DISTAL'
        new_info.append(tk_sv_io.update_info(row.info, ['TYPE'], [sv_type]))
    pred_df['info'] = new_info

    if not true_df is None:
        true_df.to_csv(outs.feasible_gt,
                       index=False,
                       header=True,
                       sep='\t',
                       na_rep='NaN')

    ##### Write BEDPE/VCF outputs
    tk_sv_io.write_sv_df_to_bedpe(pred_df, outs.sv_candidates)
    source_str = '10X/pipelines/stages/analyze_sv_calls {}'.format(
        martian.get_pipelines_version())
    sample_id = 'sample' if args.sample_id is None else args.sample_id
    tk_sv_io.bedpe_to_vcf(outs.sv_candidates, outs.svs.strip('.gz'), sample_id,
                          source_str, args.reference_path)
    # this will sort and gzip
    tk_sv_io.index_sv_vcf(outs.svs.strip(".gz"))
    outs.svs_index = outs.svs + '.tbi'
    # delete the non-gzipped file
    os.remove(outs.svs.strip('.gz'))

    if not pred_df.empty:
        call_df = pred_df[np.logical_or(pred_df['filters'] == '.',
                                        pred_df['filters'] == "PASS")]
    else:
        call_df = None
    tk_sv_io.write_sv_df_to_bedpe(call_df, outs.sv_calls)

    # Annotate each call with the matching ground truth svs. The resulting
    # dataframe might have multiple rows for the same call if there were multiple
    # matching ground truth svs.
    martian.log_info("merging calls and gt")
    if not pred_df.empty:
        pred_df = merge_calls_and_gt(pred_df, true_df, pred_to_match)

    martian.log_info("writing call_tsv")
    pred_df.to_csv(outs.call_tsv,
                   index=False,
                   header=True,
                   sep='\t',
                   na_rep='NaN')

    pred_df = pred_df[np.logical_not(pd.isnull(pred_df['name']))]

    max_dists = sorted(np.array(args.detect_dists))

    gt_sv_types = get_all_sv_types(true_df)
    call_sv_types = get_all_sv_types(pred_df)

    if not true_df is None:
        # Use the default MAX_PPV_TIER unless this is greater than the maximum tier
        # present in the data.
        max_ppv_tier = min(MAX_PPV_TIER, np.max(true_df.tier))
        # Use the default unless this is smaller than the minimum tier present in
        # the data.
        max_sens_tier = max(MAX_SENS_TIER, np.min(true_df.tier))
    else:
        max_ppv_tier = 1
        max_sens_tier = 1

    tiers = [max_ppv_tier, max_sens_tier]

    # All combinations of filters in ground truth and call set
    if not args.targets is None and not args.target_dists is None:
        target_dists = list(sorted(np.array(args.target_dists,
                                            dtype=np.float)))
        target_dists.append(float('NaN'))
    else:
        target_dists = [float('NaN')]

    combs = product([0, 1, 2, None], target_dists, gt_sv_types, tiers,
                    [True, False], call_sv_types, max_dists)

    metrics = defaultdict(list)

    gt_filters = ['genic_breaks', 'target_dist', 'gt_sv_type', 'tier']
    call_filters = ['call_filtered', 'call_sv_type', 'match_dist']

    for (genic_breaks, tdist, gt_sv_type, tier, is_filtered, call_sv_type,
         dist) in combs:
        if gt_sv_type != 'NA' and call_sv_type != 'NA' and gt_sv_type != call_sv_type:
            continue

        metrics['genic_breaks'].append(genic_breaks)
        metrics['target_dist'].append(tdist)
        metrics['gt_sv_type'].append(gt_sv_type)
        metrics['tier'].append(tier)
        metrics['call_filtered'].append(is_filtered)
        metrics['call_sv_type'].append(call_sv_type)
        metrics['match_dist'].append(dist)

        if true_df is None:
            sel_true_df = None
        else:
            sel_true_df = true_df
            if gt_sv_type != 'NA':
                sel_true_df = sel_true_df[sel_true_df.sv_type == gt_sv_type]
            if not np.isnan(tdist):
                sel_true_df = sel_true_df[sel_true_df.targ_dist <= tdist]
            sel_true_df = sel_true_df[sel_true_df.tier <= tier]
            # Restrict to genic or non-genic or take everything if this is None.
            if not genic_breaks is None:
                sel_true_df = sel_true_df[sel_true_df.genic_breaks ==
                                          genic_breaks]

            if len(sel_true_df) == 0:
                sel_true_df = None

        sel_pred_df = pred_df

        if is_filtered and not pred_df.empty:
            sel_pred_df = sel_pred_df[(sel_pred_df.filters == '.') |
                                      (sel_pred_df.filters == 'PASS')]
        if call_sv_type != 'NA' and not pred_df.empty:
            sel_pred_df = sel_pred_df[sel_pred_df.sv_type == call_sv_type]
        if not pred_df.empty and (args.min_rel_overlap is None
                                  or args.min_rel_overlap == 0):
            # Do not apply this filter if the matching is done based on overlap.
            sel_pred_df = sel_pred_df[np.logical_or(
                np.isnan(sel_pred_df.match_dist),
                sel_pred_df.match_dist <= dist)]

        add_metrics(sel_pred_df, sel_true_df, metrics)

    column_names = gt_filters
    column_names.extend(call_filters)
    other_names = set(metrics.keys()).difference(set(column_names))
    column_names.extend(other_names)

    metric_df = pd.DataFrame(metrics)
    metric_df = metric_df[column_names]

    martian.log_info("writing summary tsv")
    metric_df.to_csv(outs.summary_tsv,
                     index=False,
                     header=True,
                     sep='\t',
                     na_rep='NaN')

    short_metrics = get_short_metrics(metric_df, other_names, max_ppv_tier,
                                      max_sens_tier, args)

    if not args.call_summary is None:
        with open(args.call_summary, 'r') as in_summary_fn:
            in_summary = json.load(in_summary_fn)
            for key, val in in_summary.iteritems():
                short_metrics[key] = val

    short_metrics['min_qv'] = min_qv

    with open(outs.summary, 'w') as out_file:
        out_file.write(
            tenkit.safe_json.safe_jsonify(short_metrics, pretty=True))
Example 21
def get_dataframe_loc(df, loc):
    if len(loc) > 0:
        return df.iloc[loc]
    else:
        return pd.DataFrame(columns=df.columns)
Example 22
def main(args, outs):
    args.coerce_strings()
    outs.coerce_strings()

    if args.coverage is None or args.bait_file is None:
        outs.bait_csv = None
        return

    f = h5py.File(args.coverage)
    has_basic = ('coverage_deduped' in f) and ('mapq30_coverage_deduped' in f)
    has_subsampling = ('coverage_subsampled'
                       in f) and ('coverage_deduped_subsampled' in f)
    f.close()
    if not has_basic: return

    fasta = tenkit.reference.open_reference(args.reference_path)

    df = p.DataFrame()
    coverage_reader = tenkit.hdf5.DataFrameReader(args.coverage)

    #regs = tenkit.bio_io.get_target_regions_dict(args.bait_file)
    #for chrom in regs:
    #    for start, end in regs[chrom]:

    bedIte = tk_io.get_bed_iterator(args.bait_file)
    for chrom, start, end in bedIte:
        if has_subsampling:
            coverage = coverage_reader.query([chrom, start, end],
                                             query_cols=[
                                                 'coverage_deduped',
                                                 'coverage_deduped_subsampled',
                                                 'coverage_subsampled',
                                                 'mapq30_coverage_deduped'
                                             ],
                                             coords=False)
            mean_cov = coverage.mean()
            gc = get_gc(chrom, (start, end), fasta)
            #df = df.append({'name': 'Zed', 'age': 9, 'height': 2}, ignore_index=True)
            df = df.append(
                {
                    'chrom':
                    chrom,
                    'start':
                    start,
                    'end':
                    end,
                    'tag':
                    args.tag,
                    'coverage_deduped':
                    mean_cov['coverage_deduped'],
                    'coverage_deduped_subsampled':
                    mean_cov['coverage_deduped_subsampled'],
                    'coverage_subsampled':
                    mean_cov['coverage_subsampled'],
                    'mapq30_coverage_deduped':
                    mean_cov['mapq30_coverage_deduped'],
                    'gc':
                    gc
                },
                ignore_index=True)
        else:
            coverage = coverage_reader.query(
                [chrom, start, end],
                query_cols=['coverage_deduped', 'mapq30_coverage_deduped'],
                coords=False)
            mean_cov = coverage.mean()
            gc = get_gc(chrom, (start, end), fasta)
            #df = df.append({'name': 'Zed', 'age': 9, 'height': 2}, ignore_index=True)
            df = df.append(
                {
                    'chrom': chrom,
                    'start': start,
                    'end': end,
                    'tag': args.tag,
                    'coverage_deduped': mean_cov['coverage_deduped'],
                    'mapq30_coverage_deduped':
                    mean_cov['mapq30_coverage_deduped'],
                    'gc': gc
                },
                ignore_index=True)

    df.to_csv(outs.bait_csv)
Example 23
def get_frags_from_reads(in_bam,
                         chrom1,
                         start1,
                         stop1,
                         chrom2,
                         start2,
                         stop2,
                         min_mapq=60,
                         min_sv_len=45000,
                         min_reads_per_frag=MIN_READS_PER_FRAG_TARGET,
                         min_frag_size=MIN_FRAG_SIZE_TARGET):
    """Reconstruct molecules around two loci."""

    if chrom1 == chrom2 and start2 - stop1 < MAX_FRAG_SIZE:
        # Hard case: The two loci are close enough that we could have molecules
        # spanning them.
        frag_starts = []
        frag_stops = []
        frag_reads = []
        frag_bcs = []

        reads = get_reads(in_bam,
                          chrom1,
                          max(0, start1 - FRAG_EXTEND),
                          stop2 + FRAG_EXTEND,
                          min_mapq=min_mapq).groupby('bc')

        for bc, group in reads:
            poses = np.array(group.pos)
            # Split positions into groups separated by a gap > min_sv_len
            pos_diff = np.where(np.diff(poses) > min_sv_len)[0]
            new_starts = np.concatenate([np.array([0]), pos_diff + 1])
            new_stops = np.concatenate([pos_diff, np.array([len(poses) - 1])])
            frag_starts.extend(poses[new_starts])
            frag_stops.extend(poses[new_stops])
            frag_reads.extend(new_stops - new_starts + 1)
            frag_bcs.extend(repeat(bc, len(pos_diff) + 1))

        frags = pd.DataFrame({
            'bc': frag_bcs,
            'start_pos': frag_starts,
            'end_pos': frag_stops,
            'num_reads': frag_reads
        })
        # Remove spanning fragments
        frags = frags[(frags.start_pos > stop1) | (frags.end_pos < start2)]
        frags1 = frags.copy()
        frags2 = frags.copy()
    else:
        reads1 = get_reads(in_bam,
                           chrom1,
                           max(0, start1 - FRAG_EXTEND),
                           stop1 + FRAG_EXTEND,
                           min_mapq=min_mapq)
        reads2 = get_reads(in_bam,
                           chrom2,
                           max(0, start2 - FRAG_EXTEND),
                           stop2 + FRAG_EXTEND,
                           min_mapq=min_mapq)

        frags1 = reads1.groupby('bc').agg(['min', 'max',
                                           'count'])['pos'].reset_index()
        frags2 = reads2.groupby('bc').agg(['min', 'max',
                                           'count'])['pos'].reset_index()

        frags1.columns = ['bc', 'start_pos', 'end_pos', 'num_reads']
        frags2.columns = ['bc', 'start_pos', 'end_pos', 'num_reads']

    frags1 = frags1[(frags1.end_pos > start1) & (frags1.start_pos < stop1) & \
                    (frags1.num_reads > min_reads_per_frag) & \
                    (frags1.end_pos - frags1.start_pos > min_frag_size)]
    frags2 = frags2[(frags2.end_pos > start2) & (frags2.start_pos < stop2) & \
                    (frags2.num_reads > min_reads_per_frag) & \
                    (frags2.end_pos - frags2.start_pos > min_frag_size)]
    frags1['chrom'] = chrom1
    frags2['chrom'] = chrom2

    return frags1, frags2
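The gap-splitting step above turns one barcode's sorted read positions into per-molecule fragments wherever consecutive reads are more than min_sv_len apart. A minimal standalone check with made-up positions:

import numpy as np

poses = np.array([100, 200, 300, 60000, 60100])   # toy sorted positions for one barcode
min_sv_len = 45000
pos_diff = np.where(np.diff(poses) > min_sv_len)[0]
new_starts = np.concatenate([np.array([0]), pos_diff + 1])
new_stops = np.concatenate([pos_diff, np.array([len(poses) - 1])])
frag_starts = poses[new_starts]            # [100, 60000]
frag_stops = poses[new_stops]              # [300, 60100]
num_reads = new_stops - new_starts + 1     # [3, 2]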
Example 24
def join(args, outs, chunk_defs, chunk_outs):

    summary = {}
    # Compute high-level BC summary metrics
    # Load BC data
    if args.barcodes:
        bc_df = tenkit.hdf5.read_data_frame(args.barcodes)
        fragment_df = tenkit.hdf5.read_data_frame(args.fragments, query_cols=['bc', 'chrom', 'start_pos'])

        bc_df.sort('bc_num_reads', inplace=True)
        bc_df['cum_reads'] = np.cumsum(bc_df.bc_num_reads)

        # Measure coalescence rate on all BCs that could conceivably be used
        # to call SVs - i.e. ignore BCs that contribute the cumulative bottom 1% of reads
        n99_read_thresh = sum(bc_df.bc_num_reads) * 0.01
        n99_bcs = bc_df[bc_df.cum_reads > n99_read_thresh]
        martian.log_info("number of bcs to screen for coalescence: %d" % len(n99_bcs))
        martian.log_info("subsetting fragments to use")

        if len(n99_bcs) > 1:
            selected_frags = fragment_df[fragment_df.bc.isin(n99_bcs.bc)]
            del fragment_df
            martian.log_info("Doing coalescence calculation")
            coa_calc = coalescence.BcSimilarity(selected_frags, set(n99_bcs.bc), args.input)
            coa_bc_tbl = coa_calc.coalescence_analysis()

            # Also add barcodes that are extreme outliers in the number of fragments observed
            med_frags_per_bc = n99_bcs.bc_num_fragments.median()
            high_quantile = n99_bcs.bc_num_fragments.quantile(0.98)
            bc_num_fragments_threshold = max(med_frags_per_bc*5.0, high_quantile)

            med_reads_per_bc = n99_bcs.bc_num_reads.median()
            high_quantile = n99_bcs.bc_num_reads.quantile(0.98)
            bc_num_reads_threshold = max(med_reads_per_bc*5.0, high_quantile)

            overloaded_bcs = n99_bcs[(n99_bcs.bc_num_fragments > bc_num_fragments_threshold) | (n99_bcs.bc_num_reads > bc_num_reads_threshold)]
            summary['fract_bcs_overloaded'] = float(len(overloaded_bcs)) / len(n99_bcs)

            # Remove bcs that are already in the blacklist
            nr_overloaded_bcs = overloaded_bcs[~overloaded_bcs.bc.isin(coa_bc_tbl.bc)]

            # Add overloaded bcs to blacklist
            overloaded_bc_tbl = p.DataFrame({'bc': nr_overloaded_bcs.bc, 'cluster_id': -1, 'cluster_size': -1})

            # Write barcode blacklist
            bad_bc_tbl = p.concat([coa_bc_tbl, overloaded_bc_tbl])
            bad_bc_tbl.to_csv(outs.barcode_blacklist, sep="\t", index=False)

            # Compute coalescence stats
            summary['fract_bcs_in_clusters_all'] = float(len(coa_bc_tbl)) / len(n99_bcs)
            summary['fract_bcs_in_clusters_eq_2'] = float((coa_bc_tbl.cluster_size == 2).sum()) / len(n99_bcs)
            summary['fract_bcs_in_clusters_gt_2'] = float((coa_bc_tbl.cluster_size > 2).sum()) / len(n99_bcs)
            summary['num_clusters_gt_8'] = (coa_bc_tbl.cluster_size > 8).sum()

            # Compute stats ignoring clusters of Hamming distance 2
            hd2_clusters = []
            for cluster in coa_bc_tbl.groupby('cluster_id'):
                if all_within_hamming_distance(cluster[1].bc.values, 2):
                    hd2_clusters.append(cluster[0])

            coa_tbl_no_hd2 = coa_bc_tbl[~coa_bc_tbl.cluster_id.isin(hd2_clusters)]
            summary['fract_bcs_in_clusters_all_no_hd2'] = float(len(coa_tbl_no_hd2)) / len(n99_bcs)
            summary['fract_bcs_in_clusters_eq_2_no_hd2'] = float((coa_tbl_no_hd2.cluster_size == 2).sum()) / len(n99_bcs)
            summary['fract_bcs_in_clusters_gt_2_no_hd2'] = float((coa_tbl_no_hd2.cluster_size > 2).sum()) / len(n99_bcs)

        else:
            empty_df = p.DataFrame({'bc':[], 'cluster_id':[], 'cluster_size':[]})
            empty_df.to_csv(outs.barcode_blacklist, sep="\t", index=False)

            # null coalescence stats
            summary['fract_bcs_overloaded'] = None
            summary['fract_bcs_in_clusters_all'] = None
            summary['fract_bcs_in_clusters_eq_2'] = None
            summary['fract_bcs_in_clusters_gt_2'] = None
            summary['num_clusters_gt_8'] = None
            summary['fract_bcs_in_clusters_all_no_hd2'] = None
            summary['fract_bcs_in_clusters_eq_2_no_hd2'] = None
            summary['fract_bcs_in_clusters_gt_2_no_hd2'] = None

    else:
        outs.barcode_blacklist = None
        summary['fract_bcs_overloaded'] = None
        summary['fract_bcs_in_clusters_all'] = None
        summary['fract_bcs_in_clusters_eq_2'] = None
        summary['fract_bcs_in_clusters_gt_2'] = None
        summary['num_clusters_gt_8'] = None
        summary['fract_bcs_in_clusters_all_no_hd2'] = None
        summary['fract_bcs_in_clusters_eq_2_no_hd2'] = None
        summary['fract_bcs_in_clusters_gt_2_no_hd2'] = None


    # Write summary to json
    with open(outs.filter_barcodes_results, 'w') as results_file:
        tenkit.safe_json.dump_numpy(summary, results_file, pretty=True)
Example 25
    def get_stats_at_breaks(self, breakpoints, win_left, win_right,
        min_reads = 100, min_mapq = 60, method = BINOM_EMP_BC_COUNT_BC_FREQ,
        outward_only = False):

        columns = ['chrom1', 'start1', 'stop1', 'extStart1', 'extStop1',
            'chrom2', 'start2', 'stop2', 'extStart2', 'extStop2',
            'bcOv', 'nbcs1', 'nbcs2', 'readOv', 'nreads1', 'nreads2', 'binomQual', 'qual',
            'bcs', 'bcFreqs']

        stat_df = pd.DataFrame(columns = columns, index = np.arange(len(breakpoints)))

        in_bam = tk_bam.create_bam_infile(self.bam)

        nbcs = len(self.bc_map)

        for bidx, breakpoint in enumerate(breakpoints):
            chrom1, start1, stop1, chrom2, start2, stop2 = breakpoint[0:6]
            if (chrom1, start1, stop1) > (chrom2, start2, stop2):
                chrom1, start1, stop1, chrom2, start2, stop2 = chrom2, start2, stop2, chrom1, start1, stop1

            bc1_counts, ext_start1, ext_stop1 = self.get_bcs_around_break(in_bam, chrom1, start1, stop1,
                win_left, (0 if outward_only else win_right), min_reads = min_reads, min_mapq = min_mapq)
            bc2_counts, ext_start2, ext_stop2  = self.get_bcs_around_break(in_bam, chrom2, start2, stop2,
                (0 if outward_only else win_left), win_right, min_reads = min_reads, min_mapq = min_mapq)

            bc_ov = set(bc1_counts.keys()).intersection(set(bc2_counts.keys()))

            nbcs1, nbcs2 = len(bc1_counts), len(bc2_counts)
            read_list1 = []
            for b, reads in bc1_counts.iteritems():
                read_list1.extend(reads)
            read_list1 = set(read_list1)
            read_list2 = []
            for b, reads in bc2_counts.iteritems():
                read_list2.extend(reads)
            read_list2 = set(read_list2)
            nreads1 = len(read_list1)
            nreads2 = len(read_list2)
            read_ov = len(read_list1.intersection(read_list2))

            if len(bc_ov) > 0:
                bc_idx = np.array([self.bc_map[b] for b in bc_ov]).flatten()
                win_idx = np.zeros(bc_idx.shape, dtype = np.int)
                if method == BINOM_EMP_BC_COUNT_BC_FREQ or method == 5:
                    pov = log10_emp_pval(win_idx, bc_idx, max(nbcs1, nbcs2), self.bc_freq)
                elif method == BINOM:
                    pov = log10_binom_pval(len(bc_ov), nbcs1, nbcs2, nbcs)
                else:
                    martian.throw('Unsupported method for quality computation.')
                qual = pval_to_qual(pov)
                binom_qual = pval_to_qual(log10_binom_pval(len(bc_ov), nbcs1, nbcs2, nbcs))
            else:
                qual = 0
                binom_qual = 0
            bcs = ','.join(list(bc_ov))
            bc_freqs = ','.join(['{:.2f}'.format(-np.log10(self.bc_freq[self.bc_map[b]])) for b in bc_ov])

            stat_df.loc[bidx] = [chrom1, int(start1), int(stop1), ext_start1, ext_stop1,
                chrom2, int(start2), int(stop2), ext_start2, ext_stop2,
                len(bc_ov), nbcs1, nbcs2, read_ov, nreads1, nreads2, binom_qual, qual, bcs, bc_freqs]

        in_bam.close()

        return stat_df