Example #1
def main(args, outs):
    chunk_start = args.chunk_start
    chunk_end = args.chunk_end
    chunk_index = args.chunk_index

    prefixes = get_seqs(args.nbases)
    bam_in = tk_bam.create_bam_infile(args.input)
    template = BamTemplateShim(bam_in, keep_comments=(chunk_index==0))

    bams_out = {}
    for prefix in prefixes:
        filename = martian.make_path("bc_{}.bam".format(prefix))
        bams_out[prefix], _ = tk_bam.create_bam_outfile(filename, None, None, template=template)

    non_bc_bam = martian.make_path("bc_{}.bam".format(None))
    non_bc_bam_out, _ = tk_bam.create_bam_outfile(non_bc_bam, None, None, template=template)
    for read in tk_bam.read_bam_chunk(bam_in, (chunk_start, chunk_end)):
        barcode = crdna_io.get_read_barcode(read)
        if barcode is None:
            non_bc_bam_out.write(read)
        else:
            prefix = barcode[:args.nbases]
            bams_out[prefix].write(read)
    bam_in.close()

    non_bc_bam_out.close()
    sort_bam(non_bc_bam)
    outs.non_bc_bams = [non_bc_bam]

    outs.buckets = {}
    for prefix in prefixes:
        filename = bams_out[prefix].filename
        bams_out[prefix].close()
        sort_bam(filename)
        outs.buckets[prefix] = filename
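
The helpers get_seqs and sort_bam are defined elsewhere in the stage. A minimal sketch of what get_seqs(nbases) plausibly does, assuming it simply enumerates every barcode prefix of length nbases over the four bases (which matches how the prefixes are used as bucket keys above):

import itertools

def get_seqs(nbases):
    # Enumerate all DNA sequences of length `nbases`, e.g. nbases=2 -> AA, AC, ..., TT.
    return [''.join(p) for p in itertools.product('ACGT', repeat=nbases)]

With nbases=2 this yields 16 prefixes, so each chunk writes at most 16 barcode-bucketed BAMs plus one file for reads with no barcode.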
Example #2
def main(args, outs):

    if args.flowcell_geometry is None:
        return

    lane_coord_sys = tk_lane.LaneCoordinateSystem.from_dict(args.lane_map)

    args.coerce_strings()
    outs.coerce_strings()

    bam_in = tk_bam.create_bam_infile(args.input)

    null_distribution = compute_null_distribution(args.flowcell_geometry,
                                                  seed=args.seed)

    estimator = DupSummary.diffusion_estimator(lane_coord_sys,
                                               args.flowcell_geometry)

    consumers = [estimator.read_consumer()]

    source = tk_bam.read_bam_chunk(bam_in, (args.chunk_start, args.chunk_end))
    broadcast(source, consumers)

    # Package up the summaries:
    dup_results = {
        'null_distribution': null_distribution,
        'observed_distribution': estimator.result
    }

    if outs.summary:
        with open(outs.summary, 'w') as f:
            json.dump(dup_results, f, indent=4)
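
broadcast and DupSummary.read_consumer come from the surrounding pipeline code and are not shown. A plausible sketch of the fan-out pattern they imply, assuming each consumer is a generator-based coroutine that receives reads via send() (hypothetical, not the actual tenkit implementation):

def broadcast(source, consumers):
    # Feed every read from `source` to each consumer coroutine.
    for consumer in consumers:
        next(consumer)      # prime the coroutine
    for read in source:
        for consumer in consumers:
            consumer.send(read)
    for consumer in consumers:
        consumer.close()    # let each consumer finalize its summary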
Example #3
def main(args, outs):
    chunk_start = args.chunk_start
    chunk_end = args.chunk_end

    bam_in = tk_bam.create_bam_infile(args.input)
    reads = list(tk_bam.read_bam_chunk(bam_in, (chunk_start, chunk_end)))

    tmp_dir = os.path.dirname(outs.default)
    bams_out = {}
    outs.buckets = {}
    buckets = {}
    for qname in args.qnames:
        filename = os.path.join(tmp_dir, "qname_%s.bam" % qname)
        bam_out, _ = tk_bam.create_bam_outfile(filename, None, None, template=bam_in)
        bams_out[qname] = bam_out
        outs.buckets[qname] = filename
        buckets[qname] = []

    qname_ranges = zip(args.qnames, args.qnames[1:])
    for r in reads:
        qname = None
        for lo, hi in qname_ranges:
            if lo <= r.qname <= hi:
                qname = lo
                break
        if qname is None:
            qname = args.qnames[-1]
        buckets[qname].append(r)

    for qname, bucket in buckets.iteritems():
        bucket.sort(key=bc_sort_key)
        bam_out = bams_out[qname]
        for r in bucket:
            bam_out.write(r)
        bam_out.close()
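
bc_sort_key is defined elsewhere in the stage. A hedged stand-in consistent with how it is used here (ordering the reads inside each bucket); the real key may rely on a pipeline-specific barcode helper rather than the raw BX tag assumed below:

def bc_sort_key(read):
    # Hypothetical: order reads by corrected barcode (BX tag), then by query name.
    barcode = dict(read.get_tags()).get('BX', '')
    return (barcode, read.qname)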
Example #4
def main(args, outs):
    outs.coerce_strings()

    in_bam = tk_bam.create_bam_infile(args.input)
    in_bam_chunk = tk_bam.read_bam_chunk(in_bam,
                                         (args.chunk_start, args.chunk_end))
    out_bam, _ = tk_bam.create_bam_outfile(outs.output,
                                           None,
                                           None,
                                           template=in_bam)

    chroms = in_bam.references
    reporter = cr_report.Reporter(reference_path=args.reference_path,
                                  high_conf_mapq=cr_utils.get_high_conf_mapq(
                                      args.align),
                                  chroms=chroms)

    for (gg, bc, gene_ids), reads_iter in itertools.groupby(
            in_bam_chunk, key=cr_utils.barcode_sort_key):
        # Ignore reads w/o a valid barcode, unmapped reads and reads that map to more than 1 gene
        if bc is None or gg is None or gene_ids is None or len(gene_ids) != 1:
            for read in reads_iter:
                reporter.mark_dupes_corrected_cb(read)
                out_bam.write(read)
            continue

        reads = list(reads_iter)
        gene_id = gene_ids[0]

        # Count cDNA PCR duplicates with uncorrected UMIs
        dupe_key_umi_counts = mark_dupes(
            bc, gene_id, reads, args,
            cr_constants.CDNA_PCR_UNCORRECTED_DUPE_TYPE,
            cr_utils.cdna_pcr_dupe_func, reporter)

        # Record UMI corrections
        umi_corrections = correct_umis(dupe_key_umi_counts)

        # Mark duplicates for cDNA PCR duplicates with corrected UMIs
        mark_dupes(bc,
                   gene_id,
                   reads,
                   args,
                   cr_constants.CDNA_PCR_DUPE_TYPE,
                   cr_utils.cdna_pcr_dupe_func,
                   reporter,
                   corrected_dupe_keys=umi_corrections,
                   out_bam=out_bam)

        # Count duplicates for SI PCR duplicates with uncorrected UMIs
        mark_dupes(bc, gene_id, reads, args, cr_constants.SI_PCR_DUPE_TYPE,
                   cr_utils.si_pcr_dupe_func, reporter)

    in_bam.close()
    out_bam.close()
    reporter.save(outs.chunked_reporter)
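
correct_umis is not part of the snippet. A compact sketch of the kind of one-mismatch UMI correction it presumably performs, assuming dupe_key_umi_counts maps (dupe_key, umi) pairs to read counts and the result maps each correctable pair to its replacement UMI (both are assumptions about the data layout, not the actual cellranger code):

def correct_umis(dupe_key_umi_counts):
    corrections = {}
    for (dupe_key, umi), count in dupe_key_umi_counts.items():
        best_umi, best_count = None, count
        # Look for a UMI at Hamming distance 1 (same dupe key) with a strictly higher count.
        for i in range(len(umi)):
            for base in 'ACGT':
                if base == umi[i]:
                    continue
                neighbor = umi[:i] + base + umi[i + 1:]
                neighbor_count = dupe_key_umi_counts.get((dupe_key, neighbor), 0)
                if neighbor_count > best_count:
                    best_umi, best_count = neighbor, neighbor_count
        if best_umi is not None:
            corrections[(dupe_key, umi)] = best_umi
    return corrections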
Example #5
def main_bucket_reads_by_bc(args, outs):
    chunk_start = args.chunk_start
    chunk_end = args.chunk_end

    prefixes = get_seqs(args.nbases)

    bam_in = tk_bam.create_bam_infile(args.input)
    reads = list(tk_bam.read_bam_chunk(bam_in, (chunk_start, chunk_end)))

    tmp_dir = os.path.dirname(outs.default)
    bams_out = {}
    outs.buckets = {}
    buckets = {}
    for prefix in prefixes:
        filename = os.path.join(tmp_dir, "bc_%s.bam" % prefix)
        bam_out, _ = tk_bam.create_bam_outfile(filename,
                                               None,
                                               None,
                                               template=bam_in)
        bams_out[prefix] = bam_out
        outs.buckets[prefix] = filename
        buckets[prefix] = []

    non_bc_bam_out, _ = tk_bam.create_bam_outfile(outs.default,
                                                  None,
                                                  None,
                                                  template=bam_in)
    non_bc_reads = []
    for r in reads:
        barcode = tk_io.get_read_barcode(r)
        if barcode is None:
            non_bc_bam_out.write(r)
            non_bc_reads.append(r)
        else:
            prefix = barcode[:args.nbases]
            buckets[prefix].append(r)
    non_bc_bam_out.close()

    # Set random seed to get deterministic qname subsampling
    random.seed(0)
    sampled_non_bc_reads = random.sample(non_bc_reads,
                                         min(len(non_bc_reads), len(prefixes)))
    outs.qnames = [read.qname for read in sampled_non_bc_reads]

    for prefix, bucket in buckets.iteritems():
        bucket.sort(key=bc_sort_key)
        bam_out = bams_out[prefix]
        for r in bucket:
            bam_out.write(r)
        bam_out.close()
Example #6
def main(args, outs):
    outs.coerce_strings()

    in_bam = tk_bam.create_bam_infile(args.input)
    in_bam_chunk = tk_bam.read_bam_chunk(in_bam, (args.chunk_start, args.chunk_end))
    out_bam, _ = tk_bam.create_bam_outfile(outs.output, None, None, template=in_bam)
    cell_bcs = set(cr_utils.load_barcode_tsv(args.cell_barcodes))

    for (tid, pos), reads_iter in itertools.groupby(in_bam_chunk, key=cr_utils.pos_sort_key):
        dupe_keys = set()
        for read in reads_iter:
            if cr_utils.get_read_barcode(read) not in cell_bcs:
                continue

            if cr_utils.is_read_dupe_candidate(read, cr_utils.get_high_conf_mapq(args.align)):
                dupe_key = (cr_utils.si_pcr_dupe_func(read), cr_utils.get_read_umi(read))
                if dupe_key in dupe_keys:
                    continue

                dupe_keys.add(dupe_key)
                out_bam.write(read)
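
The groupby key cr_utils.pos_sort_key is unpacked as (tid, pos) above; a one-line stand-in consistent with that usage (an assumption, the real helper lives in cellranger's utils):

def pos_sort_key(read):
    # Group position-sorted reads by (reference id, alignment start).
    return (read.tid, read.pos)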
Example #7
def main_report_basic(args, outs):
    bam_in = pysam.Samfile(args.input, check_sq=False)
    targets_filename = args.targets_file
    references = bam_in.references

    if args.input_pos is not None:
        bam_in_pos = tk_bam.create_bam_infile(args.input_pos)
        n_mapped = bam_in_pos.mapped
        n_chunk = math.ceil(float(n_mapped) / args.n_chunks)
        bam_in_pos.close()
    else:
        n_mapped = 0
        n_chunk = 0

    if targets_filename is None or targets_filename == '':
        target_regions = None
    else:
        targets_file = open(targets_filename, 'r')
        target_regions = tk_io.get_target_regions(targets_file)

    if args.barcode_whitelist:
        barcode_whitelist = bc_utils.load_barcode_whitelist(
            args.barcode_whitelist)
    else:
        barcode_whitelist = None

    bam_slice = tk_bam.read_bam_chunk(bam_in,
                                      (args.chunk_start, args.chunk_end))

    # do basic counting
    misc_sm, qual_sms = compute_basic_stats(bam_slice,
                                            target_regions,
                                            n_chunk,
                                            references,
                                            barcode_whitelist)

    misc_sm.save(outs.misc_sm)
    with open(outs.qual_sms, 'wb') as out_handle:
        pickle.dump(qual_sms, out_handle)
Example #8
def main(args, outs):
    outs.coerce_strings()

    in_bam = tk_bam.create_bam_infile(args.possorted_bam)
    in_bam_chunk = tk_bam.read_bam_chunk(in_bam, (args.chunk_start, args.chunk_end))
    out_bam, _ = tk_bam.create_bam_outfile(outs.filtered_bam, None, None, template=in_bam)
    cluster_bcs = set(args.cluster_bcs)

    for (tid, pos), reads_iter in itertools.groupby(in_bam_chunk, key=cr_utils.pos_sort_key):
        dupe_keys = set()
        for read in reads_iter:
            if cr_utils.get_read_barcode(read) not in cluster_bcs:
                continue

            if cr_utils.is_read_dupe_candidate(read, cr_utils.get_high_conf_mapq({"high_conf_mapq":60})):
                dupe_key = (cr_utils.si_pcr_dupe_func(read), cr_utils.get_read_umi(read))
                if dupe_key in dupe_keys:
                    continue

                dupe_keys.add(dupe_key)
                read.is_duplicate = False
                out_bam.write(read)
Example #9
def main_report_single_partition(args, outs):
    # Bail out if there are no valid barcodes
    if args.barcode_whitelist is None or args.input is None:
        outs.fragments = None
        return

    bam_in = tk_bam.create_bam_infile(args.input)

    if args.targets_file is None:
        target_regions = None
    else:
        target_regions = tk_io.get_target_regions(open(args.targets_file))

    # Bail out if we're on a small genome
    if sum(bam_in.lengths) < 3e6:
        outs.fragments = None
        return

    bam_chunk = tk_bam.read_bam_chunk(bam_in,
                                      (args.chunk_start, args.chunk_end))
    # Skip reads without a barcode
    bam_chunk_filt = itertools.ifilter(read_has_barcode, bam_chunk)
    bc_read_iter = itertools.groupby(bam_chunk_filt,
                                     lambda x: tk_io.get_read_barcode(x))

    bc_data = (summarize_barcode(bc, list(reads), args.read_link_distance,
                                 bam_in.references, target_regions)
               for (bc, reads) in bc_read_iter)
    bc_data_filt = (x for x in bc_data if x is not None)

    frag_tbl, bc_tbl = make_output_dataframes(bc_data_filt)
    if frag_tbl is not None:
        # Sort and index fragment table, so that we can combine the fragments files per-chromosome to reduce memory consumption
        frag_tbl.sort(['chrom', 'start_pos'], inplace=True)
        tenkit.hdf5.write_data_frame(outs.fragments, frag_tbl)
        tenkit.hdf5.create_tabix_index(outs.fragments, 'chrom', 'start_pos',
                                       'end_pos')
    if bc_tbl is not None:
        tenkit.hdf5.write_data_frame(outs.barcodes, bc_tbl)
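
read_has_barcode, used as the ifilter predicate here and in the next example, is not shown. A plausible sketch that reuses the barcode accessor these snippets already call (assuming tk_io is imported as in the rest of the stage):

def read_has_barcode(read):
    # Keep only reads that carry a (corrected) barcode.
    return tk_io.get_read_barcode(read) is not None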
Example #10
def main(args, outs):
    ref = contig_manager.contig_manager(args.reference_path)
    args.coerce_strings()
    outs.coerce_strings()

    # Bail out if there are no valid barcodes
    if args.barcode_whitelist is None or args.input is None:
        outs.summary = None
        return

    bam_in = tk_bam.create_bam_infile(args.input)
    bam_chunk = tk_bam.read_bam_chunk(bam_in,
                                      (args.chunk_start, args.chunk_end))

    # Skip reads without a barcode
    bam_chunk_filt = itertools.ifilter(read_has_barcode, bam_chunk)
    bc_read_iter = itertools.groupby(bam_chunk_filt,
                                     lambda x: crdna_io.get_read_barcode(x))

    counts = {}

    for bc, reads in bc_read_iter:
        for r in reads:
            contig = bam_in.references[r.tid]
            species = ref.species_from_contig(contig)
            if species not in counts:
                counts[species] = {}
            if bc not in counts[species]:
                counts[species][bc] = 0
            if r.is_secondary or r.is_supplementary:
                ## we are ignoring alternate alignments
                continue
            if (r.is_unmapped or r.mapping_quality < CELL_DETECT_MAPQ_THRESHOLD
                    or r.is_duplicate):
                ## if read is unmapped, poor mapping quality or dup
                continue
            counts[species][bc] += 1
    outs.counts = counts
Example #11
def main(args, outs):
    """
    Mark exact duplicate reads in the BAM file. Duplicates have the same read1 start site and read2 start site
    """

    lane_coord_sys = tk_lane.LaneCoordinateSystem.from_dict(args.lane_map)

    args.coerce_strings()
    outs.coerce_strings()

    bam_in = tk_bam.create_bam_infile(args.input)
    template = BamTemplateShim(bam_in, keep_comments=(args.chunk_index==0))
    
    if args.write_bam:
        bam_prefix, ext = os.path.splitext(outs.output)
        out_bam_name = bam_prefix + '_five_prime_pos_sorted' + ext
        bam_out, _ = tk_bam.create_bam_outfile(out_bam_name, None, None, template=template,
                                               pgs=[tk_bam.make_pg_header(martian.get_pipelines_version(),
                                                                          "mark_duplicates")])
        outs.index = None # chunk bams don't get indexed
    else:
        bam_out = None
        outs.output = None
        outs.index = None

    # Determine whether the BAM has 10x barcodes
    bam_in.reset()
    has_barcodes = [crdna_io.read_has_barcode(x) for x in itertools.islice(bam_in, 1000)]
    have_barcodes = (float(sum(has_barcodes)) / len(has_barcodes)) > 0.1

    # All read duplicate marking - these dup decisions are written to bam_out
    # the output bam has BC aware dup marking if available.
    # Ensure the summary key indicates what kind of dup marking was actually performed.
    if have_barcodes:
        no_filter_dups_bcs =    DupSummary(False, 1.0, True,  "no_filter_full_use_bcs", lane_coord_sys, output_bam=bam_out, threshold=args.diffusion_threshold)
        no_filter_dups_no_bcs = DupSummary(False, 1.0, False, "no_filter_full_ignore_bcs", lane_coord_sys, threshold=args.diffusion_threshold)
    else:
        no_filter_dups_bcs =    DupSummary(False, 1.0, True,  "no_filter_full_use_bcs", lane_coord_sys, threshold=args.diffusion_threshold)
        no_filter_dups_no_bcs = DupSummary(False, 1.0, False, "no_filter_full_ignore_bcs", lane_coord_sys, output_bam=bam_out, threshold=args.diffusion_threshold)


    # Dup marking on all perfect reads
    full_dups_bcs = DupSummary(True, 1.0, True, "full_use_bcs", lane_coord_sys, threshold=args.diffusion_threshold, tag_counts=True)
    full_dups_no_bcs = DupSummary(True, 1.0, False, "full_ignore_bcs", lane_coord_sys, threshold=args.diffusion_threshold)

    dup_sums = [full_dups_bcs, full_dups_no_bcs, no_filter_dups_bcs, no_filter_dups_no_bcs]

    # Now broadcast the selected reads to the summarizers.
    # We can't handle the summaries that require a sample_rate > 1.0, so skip those.
    # If we don't have barcodes, don't run the summaries that split by barcode.
    consumers = [x.read_consumer() for x in dup_sums if x.sample_rate <= 1.0 and ((not x.split_bcs) or have_barcodes)]

    source = tk_bam.read_bam_chunk(bam_in, (args.chunk_start, args.chunk_end))
    broadcast(source, consumers)

    # We close the BAM
    if bam_out:
        bam_out.close()
        # Note - the indexing happens in join
        bam_prefix, _ = os.path.splitext(outs.output)
        tk_bam.sort(out_bam_name, bam_prefix)

    # Package up the summaries:
    dup_results = {}
    for x in dup_sums:
        (dups, optical_dups, diff_dups, custom_diff_dups) = x.result
        desc = x.description
        dup_results[desc] = dups
        optical_desc = "optical_" + desc
        dup_results[optical_desc] = optical_dups
        diff_desc = "diffusion_old_" + desc
        dup_results[diff_desc] = diff_dups
        custom_diff_desc = "diffusion_" + desc
        dup_results[custom_diff_desc] = custom_diff_dups

    if outs.duplicate_summary:
        with open(outs.duplicate_summary, 'w') as f:
            json.dump(dup_results, f, indent=4)
Example #12
def main(args, outs):
    """Mark exact duplicate reads in the output BAM file while also writing out some summary statistics.
    PCR duplicates have the same read1 start site and read2 start site.
    """
    args.coerce_strings()
    outs.coerce_strings()

    # Chunk output doesn't get indexed
    outs.fragments_index = None
    outs.index = None

    # Pull in prior likelihoods for barcodes
    raw_barcode_abundance = None
    barcode_whitelist = load_barcode_whitelist(args.barcode_whitelist)
    if args.raw_barcode_counts is not None and barcode_whitelist is not None:
        with open(args.raw_barcode_counts, 'r') as infile:
            raw_counts = json.load(infile)
        raw_barcode_abundance = {
            '{}-{}'.format(barcode, gem_group): count
            for gem_group, subdict in raw_counts.iteritems()
            for barcode, count in zip(barcode_whitelist, subdict['bc_counts'])
        }

    bam_in = create_bam_infile(args.input)
    bam_refs = bam_in.references

    bam_prefix, ext = os.path.splitext(outs.output)
    raw_bam_file = martian.make_path(bam_prefix + '_five_prime_pos_sorted' +
                                     ext)

    frag_prefix, ext = os.path.splitext(outs.fragments)
    raw_frag_file = martian.make_path(frag_prefix + '_raw' + ext)

    # only write CO line for one chunk, so we don't have duplicates after samtools merge
    if args.chunk_num == 0:
        COs = [
            '10x_bam_to_fastq:R1(SEQ:QUAL,TR:TQ)',
            '10x_bam_to_fastq:R2(SEQ:QUAL,TR:TQ)',
            '10x_bam_to_fastq:I1(BC:QT)', '10x_bam_to_fastq:I2(CR:CY)',
            '10x_bam_to_fastq_seqnames:R1,R3,I1,R2'
        ]
    else:
        COs = None

    bam_out, _ = tk_bam.create_bam_outfile(
        raw_bam_file,
        None,
        None,
        template=bam_in,
        pgs=[
            tk_bam.make_pg_header(martian.get_pipelines_version(),
                                  "mark_duplicates", TENX_PRODUCT_NAME)
        ],
        cos=COs)
    fragments_out = open(raw_frag_file, 'w')
    bam_in.reset()

    # Ensure the summary key indicates what kind of dup marking was actually performed.
    lane_coord_sys = tk_lane.LaneCoordinateSystem.from_dict(args.lane_map)
    reference_manager = ReferenceManager(args.reference_path)
    summarizer = DupSummary(split_bcs=False,
                            lane_coordinate_system=lane_coord_sys,
                            output_bam=bam_out,
                            output_tsv=fragments_out,
                            ref=reference_manager,
                            bam_refs=bam_refs,
                            priors=raw_barcode_abundance)

    # Now broadcast the selected reads to the summarizers
    consumers = [summarizer.read_consumer()]
    source = tk_bam.read_bam_chunk(bam_in, (args.chunk_start, args.chunk_end))
    broadcast(source, consumers)

    # Close outfiles
    bam_out.close()
    fragments_out.close()

    # Feed the chunk barcode_counts data back to join()
    with open(outs.singlecell_mapping, 'w') as outfile:
        pickle.dump(summarizer.bc_counts, outfile)

    # Sort the output bam & tsv files
    sort_bam(raw_bam_file,
             outs.output,
             threads=martian.get_threads_allocation())
    sort_bed(raw_frag_file,
             outs.fragments,
             genome=reference_manager.fasta_index,
             threads=martian.get_threads_allocation(),
             leave_key=True)
Example #13
def main_mark_duplicates(args, outs):
    """
    Mark exact duplicate reads in the BAM file. Duplicates have the same read1 start site and read2 start site
    """

    lane_coord_sys = tk_lane.LaneCoordinateSystem.from_dict(args.lane_map)

    args.coerce_strings()
    outs.coerce_strings()

    bam_in = tk_bam.create_bam_infile(args.input)
    bam_out, tids = tk_bam.create_bam_outfile(
        outs.output,
        None,
        None,
        template=bam_in,
        pg=tk_bam.make_pg_header(martian.get_pipelines_version(),
                                 "mark_duplicates"))

    # Determine whether the BAM has 10x barcodes
    bam_in.reset()
    has_barcodes = [
        tk_io.read_has_barcode(x) for x in itertools.islice(bam_in, 1000)
    ]
    have_barcodes = (float(sum(has_barcodes)) / len(has_barcodes)) > 0.1

    # We do the subsampling to achieve the desired coverage on _perfect reads_, as
    # defined by tenkit.read_filter.stringent_read_filter.  This is tallied in ATTACH_BCS,
    # and passed into the perfect_read_count argument.  We will fail if it's not supplied.
    total_coverage = args.estimated_coverage

    # Set a fixed random seed to eliminate noise in metrics
    random.seed(0)

    sampling_rates = []
    for sample_cov in DUPLICATE_SUBSAMPLE_COVERAGES:
        rate = tk_stats.robust_divide(float(sample_cov), total_coverage)
        sampling_rates.append((rate, sample_cov))

    # All read duplicate marking - these dup decisions are written to bam_out
    # the output bam has BC aware dup marking if available.
    # Ensure the summary key indicates what kind of dup marking was actually performed.
    if have_barcodes:
        no_filter_dups_bcs = DupSummary(False, 1.0, True,
                                        "no_filter_full_use_bcs",
                                        lane_coord_sys, bam_out)
        no_filter_dups_no_bcs = DupSummary(False,
                                           1.0,
                                           False,
                                           "no_filter_full_ignore_bcs",
                                           lane_coord_sys,
                                           write_to_stdout=False)
    else:
        no_filter_dups_bcs = DupSummary(False, 1.0, True,
                                        "no_filter_full_use_bcs",
                                        lane_coord_sys)
        no_filter_dups_no_bcs = DupSummary(False,
                                           1.0,
                                           False,
                                           "no_filter_full_ignore_bcs",
                                           lane_coord_sys,
                                           bam_out,
                                           write_to_stdout=False)

    # Dup marking on all perfect reads
    full_dups_bcs = DupSummary(True, 1.0, True, "full_use_bcs", lane_coord_sys)
    full_dups_no_bcs = DupSummary(True, 1.0, False, "full_ignore_bcs",
                                  lane_coord_sys)

    # Make a battery of duplicate summaries at different coverages, with and w/o
    # barcode splitting
    split_options = [True, False]

    dup_sums = [
        full_dups_bcs, full_dups_no_bcs, no_filter_dups_bcs,
        no_filter_dups_no_bcs
    ]

    # Duplicate marking on perfect reads subsampled to the requested coverage
    for (sr, cov) in sampling_rates:
        for split_bc in split_options:
            description = "cov_" + str(cov) + ('_use_bcs'
                                               if split_bc else '_ignore_bcs')
            dup_sums.append(
                DupSummary(True, sr, split_bc, description, lane_coord_sys))

    # Now broadcast the selected reads to the summarizers.
    # We can't handle the summaries that require a sample_rate > 1.0, so skip those.
    # If we don't have barcodes, don't run the summaries that split by barcode.
    consumers = [
        x.read_consumer() for x in dup_sums
        if x.sample_rate <= 1.0 and ((not x.split_bcs) or have_barcodes)
    ]

    source = tk_bam.read_bam_chunk(bam_in, (args.chunk_start, args.chunk_end))
    broadcast(source, consumers)

    # We close the BAM
    bam_out.close()
    # Note - the indexing happens in join

    # Package up the summaries:
    dup_results = {}
    for x in dup_sums:
        (dups, optical_dups, diff_dups) = x.result
        desc = x.description
        dup_results[desc] = dups
        optical_desc = "optical_" + desc
        dup_results[optical_desc] = optical_dups
        diff_desc = "diffusion_" + desc
        dup_results[diff_desc] = diff_dups

    if outs.duplicate_summary:
        with open(outs.duplicate_summary, 'w') as f:
            json.dump(dup_results, f)
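
tk_stats.robust_divide appears throughout these stages; a sketch consistent with its name and use, assuming it simply avoids raising on a zero denominator (the real helper is part of tenkit):

def robust_divide(numerator, denominator):
    # Return NaN instead of raising when the denominator is zero.
    if denominator == 0:
        return float('nan')
    return float(numerator) / float(denominator)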
Example #14
def main(args, outs):
    #min_insert_size = 0
    #max_insert_size = 1e4
    
    ## sc purity threshold: minimum fraction of reads from the majority
    ## species for a barcode to be counted as a single cell
    SC_PURITY_THRESHOLD = 0.95

    args.coerce_strings()
    outs.coerce_strings()

    # Bail out if there are no valid barcodes
    if args.barcode_whitelist is None or args.input is None:
        outs.summary = None
        return

    ## group bam records by barcode NO_BARCODE/raw barcode tag/processed barcode tag
    bam_in = tk_bam.create_bam_infile(args.input)
    bam_chunk = tk_bam.read_bam_chunk(bam_in, (args.chunk_start, args.chunk_end))
    bc_read_iter = itertools.groupby(bam_chunk, groupbybarcode)

    ## compute species_list
    refs = bam_in.references
    ref = contig_manager.contig_manager(args.reference_path)
    species_list = ref.list_species()
    has_species_info = (species_list != [""])
    species_list.sort()
    genome_size = sum(ref.get_contig_lengths().values())

    ## index cells of each species
    cell_index = {}
    for sp in species_list:
        bc_list = args.cell_barcodes.get(sp, {}).keys()
        bc_list.sort()
        for i, b in enumerate(bc_list):
            y = cell_index.get(b, "")
            if len(y) == 0:
                cell_index[b] = "%s_cell_%d"%(sp, i)
            else:
                cell_index[b] = y + "_" + "%s_cell_%d"%(sp, i)

    ## construct and write header for barnyard file
    barnyard_file = open(outs.barnyard, 'w')
    barnyard_header = (['BC'] + ["cell_id"] +
        [s+("_" if has_species_info else "")+"reads_mapq_60" for s in species_list] +
        [s+("_" if has_species_info else "")+"contigs" for s in species_list] +
        ['mapped',
        'num_mapped_bases',
        'soft_clip_frac',
        'insert_p50',
        'num_mapped_pos',
        'mapped_frac',
        'amp_rate',
        'library_complexity',
        'dup_ratio',
        'num_pairs'] +
        ["is_%s_cell_barcode"%s for s in species_list])
    waste_keys = ["no_barcode", "non_cell_barcode", "unmapped",
                  "low_mapq_lt_%d"%PROFILE_MAPQ_THRESHOLD,
                  "dups", "denominator", "unusable_read"]
    fractional_waste_keys = [
                  "no_barcode_frac", "non_cell_barcode_frac", "unmapped_frac",
                  "low_mapq_lt_%d_frac"%PROFILE_MAPQ_THRESHOLD, "dups_frac"]

    barnyard_header.extend(waste_keys)
    barnyard_header.extend(fractional_waste_keys)
    barnyard_file.write( ",".join(barnyard_header) + "\n" )

    ## wasted data categories

    ## construct and write header for barnyard_hits file
    barnyard_hits_file = open( outs.barnyard_hits, "w" )
    bh_header = ["barcode", "is_whitelisted"]
    bh_header.extend(["is_%s_cell_barcode"%s for s in species_list])
    bh_header.extend([refname for refname in bam_in.references])
    barnyard_hits_file.write( ",".join(bh_header) + "\n" )

    # For each barcode, count reads per contig, per window (for each window size),
    # and per species (when species info is available from the contig names)
    # TODO: Add detailed matrix by contigs, windows output
    num_sc_bcs = 0
    num_qual_reads = 0
    num_sc_reads = 0

    ploidy = 2
    bc_hist = {}

    ## count number of raw barcodes that exactly match whitelist
    ## without any error correction
    raw_bc_on_whitelist = 0
    # dup_summary = json.load(open(args.duplicate_summary))
    # pcr_dup_fraction = dup_summary['dup_fraction']['pcr']
    #barcode_whitelist = bc_utils.load_barcode_whitelist(args.barcode_whitelist)
    for bc, reads in bc_read_iter:
        ## collect various forms of wasted data here per barcode
        wastebin = defaultdict(int)

        bh_hits = [0 for _ in bam_in.references]
        dup_count = 1
        non_dup = 1
        bc_count = 0
        num_per_species = defaultdict(int)
        contigs_per_species = defaultdict(set)

        total_reads_by_clip = np.zeros(2, dtype=float)

        insert_length = []
        num_pairs = 0
        num_mapped = 0
        num_mapped_bases = 0
        pos_set = set([])
        for r in reads:
            ## secondary/supplementary are never counted towards anything
            if r.is_secondary or r.is_supplementary:
                continue

            ## include everything in the denominator
            wastebin["denominator"] += 1

            ## how many reads have >= 10 soft clipped bases
            if r.cigartuples is not None:
                cigar_dict = dict(r.cigartuples)
                soft_clip_index = int(cigar_dict.get(4, 0) >= 10)
                total_reads_by_clip[soft_clip_index] += 1

            if barnyard_hits_include(r):
                bh_hits[r.tid] += 1
            ## non-whitelisted barcodes count as wasted data
            if not "-" in bc:
                wastebin["no_barcode"] += 1
                continue

            if bc[:-2] == r.get_tag(RAW_BARCODE_TAG):
                raw_bc_on_whitelist += 1

            is_cell_bc_read = True

            ## waste hierarchy
            ## if not a cell or if read doesn't belong to species, then waste
            ## else if not mapped, then waste
            ## else if mapq< 30, then waste
            ## else if dup, then waste

            ## if this is a contaminant read from a different species,
            ## it is wasted
            contig = refs[r.tid]
            read_species = ref.species_from_contig(contig)
            if (read_species not in args.cell_barcodes or
                    bc not in args.cell_barcodes[read_species]):
                wastebin["non_cell_barcode"] += 1
                is_cell_bc_read = False
            elif r.is_unmapped:
                wastebin["unmapped"] += 1
            elif r.mapq < PROFILE_MAPQ_THRESHOLD:
                wastebin["low_mapq_lt_%d"%PROFILE_MAPQ_THRESHOLD] += 1
            elif r.is_duplicate:
                wastebin["dups"] += 1
            bad_map_or_dup = (r.is_unmapped or
                              (r.mapq < PROFILE_MAPQ_THRESHOLD) or
                              r.is_duplicate)

            if is_cell_bc_read:
                bc_count += 1
                # if (stringent_read_filter(r, True) and
                #         not(r.is_unmapped) and not(r.mate_is_unmapped)):
                #     if r.is_duplicate:
                #         dup_count += 1
                #     else:
                #         non_dup += 1
                if r.has_tag(DUPLICATE_COUNT_TAG):
                    dup_count += r.get_tag(DUPLICATE_COUNT_TAG)
                    non_dup += 1
            elif bad_map_or_dup:
                # unusable reads are those from non-cell barcodes that are
                # also unmapped, low-mapq, or duplicates
                wastebin['unusable_read'] += 1

            ## whether we have a cell barcode or not, count these stats
            if not bad_map_or_dup:
                num_mapped += 1
                num_mapped_bases += r.reference_length

                pos_set.add((r.reference_name, r.reference_start/1000))

                ## if the read is part of a proper pair, count the pair once (via read1)
                if r.is_proper_pair:
                    if r.is_read1:
                        insert_length.append( r.template_length )
                        num_pairs += 1
                    else:
                        continue

                ## Use MAPQ >= 60 to get accurate mappings only for barnyard stuff
                if r.mapq < 60:
                    continue
                num_qual_reads += 1
                if has_species_info:
                    num_per_species[read_species] += 1
                    contigs_per_species[read_species].add(contig)
            ## invariant checked for each read in this barcode
            assert wastebin['denominator'] - wastebin['no_barcode'] - wastebin['unusable_read'] == num_mapped + \
                   wastebin["low_mapq_lt_%d" % PROFILE_MAPQ_THRESHOLD] + wastebin['unmapped'] + wastebin['dups']

        ## compute the library complexity and amp rate
        ## NOTE: the amplicon length is a fixed constant (DEFAULT_AMPLICON_LENGTH), so the
        ## amp rate is really the library complexity expressed in different units
        num_amplicons = num_mapped - num_pairs
        dup_ratio = tk_stats.robust_divide(float(dup_count + non_dup), float(non_dup))
        
        library_complexity = tk_stats.robust_divide(num_amplicons, (dup_ratio-1.0)*2)

        amp_rate = tk_stats.robust_divide(float(library_complexity * DEFAULT_AMPLICON_LENGTH) ,
            float(ploidy * genome_size))

        bc_hist[bc] = bc_count
        map_rate = tk_stats.robust_divide(float(num_mapped), wastebin["denominator"])
        
        ## write row to barnyard_hits file
        bh_row = [ bc, int("-" in bc)]
        for s in species_list:
            bh_row.append( int(s in args.cell_barcodes and bc in args.cell_barcodes[s]) )
        bh_row.extend( bh_hits )
        barnyard_hits_file.write(",".join(map(str, bh_row)) + "\n" )

        ## write row to barnyard file
        barnyard_row = ([bc, cell_index.get(bc, "None")] +
            [num_per_species[s] for s in species_list] +
            [len(contigs_per_species[s]) for s in species_list] +
            [num_mapped, num_mapped_bases] +
            [tk_stats.robust_divide(total_reads_by_clip[1], sum(total_reads_by_clip)),
            np.median(insert_length) if len(insert_length) else np.nan,
            len(pos_set),
            map_rate,
            amp_rate,
            library_complexity,
            dup_ratio,
            num_pairs])
        for sp in species_list:
            barnyard_row.append( int((sp in args.cell_barcodes) and
                (bc in args.cell_barcodes[sp])) )

        for key in waste_keys:
            fkey = key + "_frac"
            if (fkey in fractional_waste_keys):
                wastebin[fkey] = tk_stats.robust_divide(float(wastebin[key]), float(wastebin["denominator"]))
        barnyard_row.extend( [ wastebin[x] for x in waste_keys ] )
        barnyard_row.extend( [ wastebin[x] for x in fractional_waste_keys ] )

        barnyard_file.write( ",".join(map(str, barnyard_row)) + "\n")
        
        ## metrics relating to purity - only for multi species
        if has_species_info and len(species_list) >= 2:
            counts_by_species = [float(num_per_species[s]) for s in species_list]

            major_species_index = np.argmax( counts_by_species )
            major_species = species_list[major_species_index]
            species_purity = tk_stats.robust_divide( counts_by_species[major_species_index],
                np.sum(counts_by_species) )

            if species_purity >= SC_PURITY_THRESHOLD:
                num_sc_bcs += 1
                num_sc_reads += num_per_species[major_species]
        ## END of loop over barcodes

    summary_info = {}
    summary_info['num_sc_bcs'] = num_sc_bcs
    summary_info['num_sc_qual_reads'] = num_qual_reads
    summary_info['num_sc_reads'] = num_sc_reads
    summary_info['raw_bc_on_whitelist'] = raw_bc_on_whitelist

    barnyard_file.close()
    barnyard_hits_file.close()
    
    with open(outs.summary, 'w') as summary_file:
        summary_file.write(tenkit.safe_json.safe_jsonify(summary_info))

    with open(outs.barcode_histogram, 'w') as bc_hist_file:
        bc_hist_file.write(tenkit.safe_json.safe_jsonify(bc_hist))
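
The complexity arithmetic in this example follows a simple duplicate-ratio model; a tiny worked check of the formulas as written, with illustrative numbers only:

num_mapped, num_pairs = 3000, 1000          # num_amplicons = 3000 - 1000 = 2000
dup_count, non_dup = 300, 200               # dup_ratio = (300 + 200) / 200 = 2.5
num_amplicons = num_mapped - num_pairs
dup_ratio = float(dup_count + non_dup) / float(non_dup)
library_complexity = num_amplicons / ((dup_ratio - 1.0) * 2)    # 2000 / 3.0 ~= 666.7
# amp_rate then scales library_complexity by DEFAULT_AMPLICON_LENGTH / (ploidy * genome_size)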
Example #15
def main(args, outs):
    chunk_start = args.chunk_start
    chunk_end = args.chunk_end

    bam_in = tk_bam.create_bam_infile(args.input)
    reads = tk_bam.read_bam_chunk(bam_in, (chunk_start, chunk_end))
    pgs = [
        tk_bam.make_pg_header(martian.get_pipelines_version(), "attach_phasing"),
        tk_bam.make_terminal_pg_header(martian.get_pipelines_version())
    ]
    # don't duplicate the header if it is already there; this is for developer testing purposes
    PG = bam_in.header['PG']
    for item in PG:
        if item['ID'] == "attach_phasing":
            pgs = []
    bam_out, _ = tk_bam.create_bam_outfile(outs.phased_possorted_bam, None, None, template=bam_in, pgs=pgs)

    # File with contig phasing information
    if args.fragment_phasing is not None:
        frag_phasing = tk_tabix.create_tabix_infile(args.fragment_phasing)
    else:
        frag_phasing = None

    if args.fragments is not None:
        # Fragments file for global molecule id
        frag_id_reader = tk_hdf5.DataFrameReader(args.fragments)
    else:
        frag_id_reader = None

    # Phasing data
    ph_db = None
    ph_db_chrom = None
    ph_db_start = None
    ph_db_end = None

    # Fragment data - for global molecule id
    fr_db = None
    fr_db_chrom = None
    fr_db_start = None
    fr_db_end = None

    total_reads = 0
    phased_reads = 0
    molecule_tagged_reads = 0

    for r in reads:
        chrom = bam_in.references[r.tid]
        pos = r.pos
        bc = tk_io.get_read_barcode(r)

        total_reads += 1
        tags = r.tags

        # Strip out RX and QX tags
        #strip_tags = [RAW_BARCODE_TAG, RAW_BARCODE_QUAL_TAG]
        # Actually don't strip
        strip_tags = []
        tags = [(tag, value) for (tag, value) in tags if (tag not in strip_tags)]

        # Fetch from the fragments file to get records that should cover many future reads
        # fragment phasing file may not exist in ALIGNER only pipeline - may need to skip
        if frag_phasing is not None:
            if ph_db is None or chrom != ph_db_chrom or pos < ph_db_start or pos > ph_db_end:
                ph_db, (ph_db_chrom, ph_db_start, ph_db_end) = get_frag_phasing_db(frag_phasing, chrom, pos, window=WINDOW_SIZE)

            if bc is not None and ph_db.has_key(bc):
                frags = ph_db[bc]
                # See if we have phasing for this fragment
                valid_phasing = [x for x in frags if x['start'] <= r.pos and x['end'] > r.pos]
                assert(len(valid_phasing) < 2)
                if len(valid_phasing) == 1:
                    phased_reads += 1
                    read_phasing = valid_phasing[0]
                    tags.append((PHASE_SET_BAM_TAG, read_phasing['ps']))
                    tags.append((HAPLOTYPE_BAM_TAG, read_phasing['hap']))
                    tags.append((PHASING_CONF_BAM_TAG, read_phasing['pc']))

        if frag_id_reader is not None:
            # Fetch from the fragments file to get records that should cover many future reads
            if fr_db is None or chrom != fr_db_chrom or pos < fr_db_start or pos > fr_db_end:
                fr_db, (fr_db_chrom, fr_db_start, fr_db_end) = get_molecule_id_db(frag_id_reader, chrom, pos, window=WINDOW_SIZE)

            if bc is not None and fr_db.has_key(bc):
                frags = fr_db[bc]
                # See if a fragment record (molecule id) covers this read
                molecule_ids = [x for x in frags if x['start'] <= r.pos and x['end'] > r.pos]
                assert(len(molecule_ids) < 2)
                if len(molecule_ids) == 1:
                    molecule_tagged_reads += 1
                    molecule_id = molecule_ids[0]
                    tags.append((MOLECULE_ID_BAM_TAG, molecule_id['molecule_id']))


        r.tags = tags
        bam_out.write(r)

    bam_out.close()
    outs.total_reads = total_reads
    outs.phased_reads = phased_reads
    outs.molecule_tagged_reads = molecule_tagged_reads