Example #1
def main(args, outs):
    chunk_start = args.chunk_start
    chunk_end = args.chunk_end
    chunk_index = args.chunk_index

    prefixes = get_seqs(args.nbases)
    bam_in = tk_bam.create_bam_infile(args.input)
    template = BamTemplateShim(bam_in, keep_comments=(chunk_index==0))

    bams_out = {}
    for prefix in prefixes:
        filename = martian.make_path("bc_{}.bam".format(prefix))
        bams_out[prefix], _ = tk_bam.create_bam_outfile(filename, None, None, template=template)

    non_bc_bam = martian.make_path("bc_{}.bam".format(None))
    non_bc_bam_out, _ = tk_bam.create_bam_outfile(non_bc_bam, None, None, template=template)
    for read in tk_bam.read_bam_chunk(bam_in, (chunk_start, chunk_end)):
        barcode = crdna_io.get_read_barcode(read)
        if barcode is None:
            non_bc_bam_out.write(read)
        else:
            prefix = barcode[:args.nbases]
            bams_out[prefix].write(read)
    bam_in.close()

    non_bc_bam_out.close()
    sort_bam(non_bc_bam)
    outs.non_bc_bams = [non_bc_bam]

    outs.buckets = {}
    for prefix in prefixes:
        filename = bams_out[prefix].filename
        bams_out[prefix].close()
        sort_bam(filename)
        outs.buckets[prefix] = filename
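Note: every example on this page follows the same basic pattern: open the input BAM with tk_bam.create_bam_infile, then create each output BAM with tk_bam.create_bam_outfile(filename, None, None, template=...), so the output inherits the template's header (references, read groups, comments), and the call returns an (outfile, tids) pair as the snippets suggest. A minimal sketch of that pattern, assuming tk_bam is tenkit.bam (the imports are not shown in these snippets):

import tenkit.bam as tk_bam  # assumption: the snippets' tk_bam alias

def copy_with_template(in_path, out_path):
    # Open the input BAM and reuse its header as the template for the output.
    bam_in = tk_bam.create_bam_infile(in_path)
    bam_out, _ = tk_bam.create_bam_outfile(out_path, None, None, template=bam_in)
    # Stream records straight through; the real stages filter, retag or bucket here.
    for read in bam_in:
        bam_out.write(read)
    bam_in.close()
    bam_out.close()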
Example #2
def main_bucket_reads_by_bc(args, outs):
    chunk_start = args.chunk_start
    chunk_end = args.chunk_end

    prefixes = get_seqs(args.nbases)

    bam_in = tk_bam.create_bam_infile(args.input)
    reads = list(tk_bam.read_bam_chunk(bam_in, (chunk_start, chunk_end)))

    tmp_dir = os.path.dirname(outs.default)
    bams_out = {}
    outs.buckets = {}
    buckets = {}
    for prefix in prefixes:
        filename = os.path.join(tmp_dir, "bc_%s.bam" % prefix)
        bam_out, _ = tk_bam.create_bam_outfile(filename,
                                               None,
                                               None,
                                               template=bam_in)
        bams_out[prefix] = bam_out
        outs.buckets[prefix] = filename
        buckets[prefix] = []

    non_bc_bam_out, _ = tk_bam.create_bam_outfile(outs.default,
                                                  None,
                                                  None,
                                                  template=bam_in)
    non_bc_reads = []
    for r in reads:
        barcode = tk_io.get_read_barcode(r)
        if barcode is None:
            non_bc_bam_out.write(r)
            non_bc_reads.append(r)
        else:
            prefix = barcode[:args.nbases]
            buckets[prefix].append(r)
    non_bc_bam_out.close()

    # Set random seed to get deterministic qname subsampling
    random.seed(0)
    sampled_non_bc_reads = random.sample(non_bc_reads,
                                         min(len(non_bc_reads), len(prefixes)))
    outs.qnames = [read.qname for read in sampled_non_bc_reads]

    for prefix, bucket in buckets.iteritems():
        bucket.sort(key=bc_sort_key)
        bam_out = bams_out[prefix]
        for r in bucket:
            bam_out.write(r)
        bam_out.close()
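The random.seed(0) call above is what makes the qname subsampling reproducible across runs: seeding before random.sample fixes the sampled subset for a given input. A tiny standard-library illustration of that behaviour:

import random

names = ['q%d' % i for i in range(100)]

random.seed(0)
first = random.sample(names, 5)
random.seed(0)
second = random.sample(names, 5)
assert first == second  # same seed + same input -> same subsample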
Example #3
def process_alignments(genome_bam_file, trimmed_bam_file, out_bam_file, bam_comments, reporter, gene_index, star_index, args):
    in_genome_bam = tk_bam.create_bam_infile(genome_bam_file)
    bam_open_time = time.time()
    in_trimmed_bam = tk_bam.create_bam_infile(trimmed_bam_file) if trimmed_bam_file else None
    out_bam, _ = tk_bam.create_bam_outfile(out_bam_file, None, None, template=in_genome_bam, cos=bam_comments)
    bam_iter = cr_utils.iter_by_qname(in_genome_bam, in_trimmed_bam)
    num_alignments = 0
    read_consume_time = None
    for qname, reads_iter, trimmed_iter in bam_iter:
        reads = list(reads_iter)
        if read_consume_time is None:
            read_consume_time = time.time()
            # if streaming, verify we're actually streaming
            print "Time to first read: %f seconds" % (read_consume_time - bam_open_time)
        num_alignments += len(reads)
        trimmed = list(trimmed_iter)
        trimmed_read = trimmed[0] if len(trimmed) > 0 else None
        for read in process_qname(qname, reads, trimmed_read, reporter, gene_index, star_index, args):
            out_bam.write(read)

    in_genome_bam.close()
    if in_trimmed_bam: in_trimmed_bam.close()
    out_bam.close()

    return num_alignments
Example #4
def update_mapqs(bamfilename, outfile, reference_path):
    bam = tk_bam.create_bam_infile(bamfilename)
    bam_out, _ = tk_bam.create_bam_outfile(outfile, None, None, template=bam)
    variant_heap = []
    variant_map = {}
    read_heap = []
    primary_contigs = tk_reference.load_primary_contigs(reference_path)
    for read in bam:
        tags = [(key, value) for (key, value) in dict(read.tags).iteritems()]
        tags.append(('OM', int(read.mapq)))
        read.tags = tags
        if bam.references[read.tid] not in primary_contigs:
            read.tags = [(key, value) for (key, value) in read.tags
                         if key != 'AC' and key != 'XC']
            bam_out.write(read)
            continue
        add_variant_counts(read, variant_heap, variant_map)
        heapq.heappush(read_heap, (read.pos, read))
        update_updatable(read_heap, read.pos, variant_heap, variant_map,
                         bam_out)
    update_updatable(read_heap,
                     500000000,
                     variant_heap,
                     variant_map,
                     bam_out,
                     empty_me=True)
    bam_out.close()
Example #5
def main(args, outs):
    chunk_start = args.chunk_start
    chunk_end = args.chunk_end

    bam_in = tk_bam.create_bam_infile(args.input)
    reads = list(tk_bam.read_bam_chunk(bam_in, (chunk_start, chunk_end)))

    tmp_dir = os.path.dirname(outs.default)
    bams_out = {}
    outs.buckets = {}
    buckets = {}
    for qname in args.qnames:
        filename = os.path.join(tmp_dir, "qname_%s.bam" % qname)
        bam_out, _ = tk_bam.create_bam_outfile(filename, None, None, template=bam_in)
        bams_out[qname] = bam_out
        outs.buckets[qname] = filename
        buckets[qname] = []

    qname_ranges = zip(args.qnames, args.qnames[1:])
    for r in reads:
        qname = None
        for qnames in qname_ranges:
            if qnames[0] <= r.qname and r.qname <= qnames[1]:
                qname = qnames[0]
                break
        if qname is None:
            qname = args.qnames[-1]
        buckets[qname].append(r)

    for qname, bucket in buckets.iteritems():
        bucket.sort(key=bc_sort_key)
        bam_out = bams_out[qname]
        for r in bucket:
            bam_out.write(r)
        bam_out.close()
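For a large number of qname buckets, the inner linear scan over qname_ranges can be replaced by a binary search. A sketch using bisect, under the assumption that args.qnames is sorted and that each qname acts as the lower bound of its bucket (it approximates, rather than exactly reproduces, the inclusive-range scan above):

import bisect

def find_bucket(qnames, read_qname):
    # qnames must be sorted; each entry is the lower bound of a bucket.
    # Reads sorting before the first entry go to the first bucket; reads
    # sorting after the last entry go to the last bucket.
    i = bisect.bisect_right(qnames, read_qname) - 1
    return qnames[max(0, i)]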
Example #6
def main(args, outs):
    outs.coerce_strings()

    in_bam = tk_bam.create_bam_infile(args.input_bam)
    out_bam, _ = tk_bam.create_bam_outfile(outs.output,
                                           None,
                                           None,
                                           template=in_bam)
    cell_bcs = set(cr_utils.load_barcode_tsv(args.cell_barcodes))

    for (tid, pos), reads_iter in itertools.groupby(in_bam,
                                                    key=cr_utils.pos_sort_key):
        dupe_keys = set()
        for read in reads_iter:
            if cr_utils.get_read_barcode(read) not in cell_bcs:
                continue

            if cr_utils.is_read_dupe_candidate(
                    read, cr_utils.get_high_conf_mapq(args.align)):
                dupe_key = (cr_utils.si_pcr_dupe_func(read),
                            cr_utils.get_read_umi(read))
                if dupe_key in dupe_keys:
                    continue

                dupe_keys.add(dupe_key)
                out_bam.write(read)
Example #7
def main(args, outs):
    outs.coerce_strings()
    bam_in = tk_bam.create_bam_infile(args.bucket[0])
    bam_out, _ = tk_bam.create_bam_outfile(outs.default, None, None, template=bam_in, pgs=tk_bam.make_pg_header(martian.get_pipelines_version(), "sort_reads_by_bc"))
    bam_in.close()

    outs.total_reads = merge_by_key(args.bucket, bc_sort_key, bam_out)
    bam_out.close()
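merge_by_key is not defined in these snippets. A plausible sketch of what it does, assuming each bucket BAM is already sorted by the same key (written here with heapq.merge over key-decorated streams; the name, signature, and return value are inferred from the call above, not taken from tenkit):

import heapq
import tenkit.bam as tk_bam  # assumption: the snippets' tk_bam alias

def merge_by_key(bam_filenames, key_func, bam_out):
    # Plausible sketch: merge several BAMs, each already sorted by key_func,
    # into one output stream sorted by the same key; return the read count.
    bams_in = [tk_bam.create_bam_infile(fn) for fn in bam_filenames]

    def decorated(bam, idx):
        # Decorate each record with (key, idx) so heapq.merge compares keys,
        # never the pysam records themselves.
        for read in bam:
            yield key_func(read), idx, read

    total_reads = 0
    merged = heapq.merge(*[decorated(bam, i) for i, bam in enumerate(bams_in)])
    for _, _, read in merged:
        bam_out.write(read)
        total_reads += 1
    for bam in bams_in:
        bam.close()
    return total_reads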
Example #8
def main(args, outs):
    outs.coerce_strings()

    in_bam = tk_bam.create_bam_infile(args.input)
    in_bam_chunk = tk_bam.read_bam_chunk(in_bam,
                                         (args.chunk_start, args.chunk_end))
    out_bam, _ = tk_bam.create_bam_outfile(outs.output,
                                           None,
                                           None,
                                           template=in_bam)

    chroms = in_bam.references
    reporter = cr_report.Reporter(reference_path=args.reference_path,
                                  high_conf_mapq=cr_utils.get_high_conf_mapq(
                                      args.align),
                                  chroms=chroms)

    for (gg, bc, gene_ids), reads_iter in itertools.groupby(
            in_bam_chunk, key=cr_utils.barcode_sort_key):
        # Ignore reads w/o a valid barcode, unmapped reads and reads that map to more than 1 gene
        if bc is None or gg is None or gene_ids is None or len(gene_ids) != 1:
            for read in reads_iter:
                reporter.mark_dupes_corrected_cb(read)
                out_bam.write(read)
            continue

        reads = list(reads_iter)
        gene_id = gene_ids[0]

        # Count cDNA PCR duplicates with uncorrected UMIs
        dupe_key_umi_counts = mark_dupes(
            bc, gene_id, reads, args,
            cr_constants.CDNA_PCR_UNCORRECTED_DUPE_TYPE,
            cr_utils.cdna_pcr_dupe_func, reporter)

        # Record UMI corrections
        umi_corrections = correct_umis(dupe_key_umi_counts)

        # Mark duplicates for cDNA PCR duplicates with corrected UMIs
        mark_dupes(bc,
                   gene_id,
                   reads,
                   args,
                   cr_constants.CDNA_PCR_DUPE_TYPE,
                   cr_utils.cdna_pcr_dupe_func,
                   reporter,
                   corrected_dupe_keys=umi_corrections,
                   out_bam=out_bam)

        # Count duplicates for SI PCR duplicates with uncorrected UMIs
        mark_dupes(bc, gene_id, reads, args, cr_constants.SI_PCR_DUPE_TYPE,
                   cr_utils.si_pcr_dupe_func, reporter)

    in_bam.close()
    out_bam.close()
    reporter.save(outs.chunked_reporter)
Example #9
def main(args, outs):
    outs.coerce_strings()
    bam_in = tk_bam.create_bam_infile(args.bucket[0])
    bam_out, _ = tk_bam.create_bam_outfile(outs.default,
                                           None,
                                           None,
                                           template=bam_in)
    bam_in.close()

    outs.total_reads = merge_by_key(args.bucket, bc_and_qname_sort_key,
                                    bam_out)
    bam_out.close()
Example #10
def main(args, outs):
    bam_in = tk_bam.create_bam_infile(args.chunk_input)

    # Get gem groups
    library_info = rna_library.get_bam_library_info(bam_in)
    gem_groups = sorted(list(set(lib['gem_group'] for lib in library_info)))

    # Define buckets
    bucket_names = []
    prefixes = cr_utils.get_seqs(args.nbases)
    for gg in gem_groups:
        for prefix in prefixes:
            bucket_names.append('%s-%d' % (prefix, gg))
    bucket_names.append('')

    # Read all records
    reads = [read for read in bam_in]

    # Bucket the records
    bams_out = {}
    outs.buckets = {}
    buckets = {}
    for bucket_name in bucket_names:
        filename = martian.make_path("bc-%s.bam" % bucket_name)
        bam_out, _ = tk_bam.create_bam_outfile(filename,
                                               None,
                                               None,
                                               template=bam_in,
                                               rgs=args.read_groups,
                                               replace_rg=True)

        bams_out[bucket_name] = bam_out
        outs.buckets[bucket_name] = filename
        buckets[bucket_name] = []

    for r in reads:
        barcode = cr_utils.get_read_barcode(r)
        if barcode is None:
            bucket_name = ''
        else:
            barcode_seq, gem_group = cr_utils.split_barcode_seq(barcode)
            prefix = barcode_seq[:args.nbases]
            bucket_name = '%s-%d' % (prefix, gem_group)
        buckets[bucket_name].append(r)

    for bucket_name, bucket in buckets.iteritems():
        bucket.sort(key=cr_utils.barcode_sort_key)
        bam_out = bams_out[bucket_name]
        for r in bucket:
            bam_out.write(r)
        bam_out.close()
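Here barcodes carry a gem-group suffix (e.g. 'ACGT...-1'), and cr_utils.split_barcode_seq separates the two parts before the prefix is taken. The helper itself is not shown; a hypothetical version consistent with how it is used above, assuming the '-<gem_group>' suffix is present:

def split_barcode_seq(barcode):
    # Hypothetical sketch: '<sequence>-<gem_group>' -> ('<sequence>', gem_group as int).
    seq, gem_group = barcode.rsplit('-', 1)
    return seq, int(gem_group)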
Example #11
def main(args, outs):
    outs.coerce_strings()

    in_bam = tk_bam.create_bam_infile(args.possorted_bam)
    in_bam_chunk = tk_bam.read_bam_chunk(in_bam, (args.chunk_start, args.chunk_end))
    out_bam, _ = tk_bam.create_bam_outfile(outs.filtered_bam, None, None, template=in_bam)
    cluster_bcs = set(args.cluster_bcs)

    for (tid, pos), reads_iter in itertools.groupby(in_bam_chunk, key=cr_utils.pos_sort_key):
        dupe_keys = set()
        for read in reads_iter:
            if cr_utils.get_read_barcode(read) not in cluster_bcs:
                continue

            if cr_utils.is_read_dupe_candidate(read, cr_utils.get_high_conf_mapq({"high_conf_mapq":60})):
                dupe_key = (cr_utils.si_pcr_dupe_func(read), cr_utils.get_read_umi(read))
                if dupe_key in dupe_keys:
                    continue

                dupe_keys.add(dupe_key)
                read.is_duplicate = False
                out_bam.write(read)
Example #12
def main(args, outs):
    prefixes = cr_utils.get_seqs(args.nbases)
    prefixes.append('')

    bam_in = tk_bam.create_bam_infile(args.chunk_input)
    reads = [read for read in bam_in]

    bams_out = {}
    outs.buckets = {}
    buckets = {}
    for prefix in prefixes:
        filename = martian.make_path("bc_%s.bam" % prefix)
        bam_out, _ = tk_bam.create_bam_outfile(filename,
                                               None,
                                               None,
                                               template=bam_in,
                                               rgs=args.read_groups,
                                               replace_rg=True)

        bams_out[prefix] = bam_out
        outs.buckets[prefix] = filename
        buckets[prefix] = []

    for r in reads:
        barcode = cr_utils.get_read_barcode(r)
        if barcode is None:
            prefix = ''
        else:
            prefix = barcode[:args.nbases]
        buckets[prefix].append(r)

    for prefix, bucket in buckets.iteritems():
        bucket.sort(key=cr_utils.barcode_sort_key)
        bam_out = bams_out[prefix]
        for r in bucket:
            bam_out.write(r)
        bam_out.close()
Example #13
def main(args, outs):
    """
    Mark exact duplicate reads in the BAM file. Duplicates have the same read1 start site and read2 start site
    """

    lane_coord_sys = tk_lane.LaneCoordinateSystem.from_dict(args.lane_map)

    args.coerce_strings()
    outs.coerce_strings()

    bam_in = tk_bam.create_bam_infile(args.input)
    template = BamTemplateShim(bam_in, keep_comments=(args.chunk_index==0))
    
    if args.write_bam:
        bam_prefix, ext = os.path.splitext(outs.output)
        out_bam_name = bam_prefix + '_five_prime_pos_sorted' + ext
        bam_out, _ = tk_bam.create_bam_outfile(out_bam_name, None, None, template=template,
                                               pgs=[tk_bam.make_pg_header(martian.get_pipelines_version(),
                                                                          "mark_duplicates")])
        outs.index = None # chunk bams don't get indexed
    else:
        bam_out = None
        outs.output = None
        outs.index = None

    # Determine whether the BAM has 10x barcodes
    bam_in.reset()
    has_barcodes = [crdna_io.read_has_barcode(x) for x in itertools.islice(bam_in, 1000)]
    have_barcodes = (float(sum(has_barcodes)) / len(has_barcodes)) > 0.1

    # All read duplicate marking - these dup decisions are written to bam_out
    # the output bam has BC aware dup marking if available.
    # Ensure the summary key indicates what kind of dup marking was actually performed.
    if have_barcodes:
        no_filter_dups_bcs =    DupSummary(False, 1.0, True,  "no_filter_full_use_bcs", lane_coord_sys, output_bam=bam_out, threshold=args.diffusion_threshold)
        no_filter_dups_no_bcs = DupSummary(False, 1.0, False, "no_filter_full_ignore_bcs", lane_coord_sys, threshold=args.diffusion_threshold)
    else:
        no_filter_dups_bcs =    DupSummary(False, 1.0, True,  "no_filter_full_use_bcs", lane_coord_sys, threshold=args.diffusion_threshold)
        no_filter_dups_no_bcs = DupSummary(False, 1.0, False, "no_filter_full_ignore_bcs", lane_coord_sys, output_bam=bam_out, threshold=args.diffusion_threshold)


    # Dup marking on all perfect reads
    full_dups_bcs = DupSummary(True, 1.0, True, "full_use_bcs", lane_coord_sys, threshold=args.diffusion_threshold, tag_counts=True)
    full_dups_no_bcs = DupSummary(True, 1.0, False, "full_ignore_bcs", lane_coord_sys, threshold=args.diffusion_threshold)

    dup_sums = [full_dups_bcs, full_dups_no_bcs, no_filter_dups_bcs, no_filter_dups_no_bcs]

    # Now broadcast the selected reads to the summarizers
    # We can't do the ones that require a sample_rate > 1.0, so skip those.
    # If we don't have barcodes, don't run the set that are split by barcode.
    consumers = [x.read_consumer() for x in dup_sums if x.sample_rate <= 1.0 and ((not x.split_bcs) or have_barcodes)]

    source = tk_bam.read_bam_chunk(bam_in, (args.chunk_start, args.chunk_end))
    broadcast(source, consumers)

    # We close the BAM
    if bam_out:
        bam_out.close()
        # Note - the indexing happens in join
        bam_prefix, _ = os.path.splitext(outs.output)
        tk_bam.sort(out_bam_name, bam_prefix)

    # Package up the summaries:
    dup_results = {}
    for x in dup_sums:
        (dups, optical_dups, diff_dups, custom_diff_dups) = x.result
        desc = x.description
        dup_results[desc] = dups
        optical_desc = "optical_" + desc
        dup_results[optical_desc] = optical_dups
        diff_desc = "diffusion_old_" + desc
        dup_results[diff_desc] = diff_dups
        custom_diff_desc = "diffusion_" + desc
        dup_results[custom_diff_desc] = custom_diff_dups

    if outs.duplicate_summary:
        with open(outs.duplicate_summary, 'w') as f:
            json.dump(dup_results, f, indent=4)
Example #14
def main(args, outs):
    reporter = vdj_report.VdjReporter(
        vdj_reference_path=args.vdj_reference_path)
    gene_umi_counts_per_bc = {}

    strand = cr_chem.get_strandedness(args.chemistry_def)

    paired_end = cr_chem.is_paired_end(args.chemistry_def)
    assert paired_end != (args.read2_chunk is None)

    # For the entire chunk, match reads against the V(D)J reference
    ref_fasta = vdj_reference.get_vdj_reference_fasta(args.vdj_reference_path)

    # The filtering code will write this bam. Then we'll read it, correct the UMIs
    # and write outs.chunked_bams.
    filter_bam = martian.make_path('tmp.bam')

    vdj_filt.run_read_match(args.read1_chunk, args.read2_chunk, ref_fasta,
                            filter_bam, strand, args.sw_params)

    # Make two passes over the BAM file, processing one barcode at a time
    bam1 = pysam.AlignmentFile(filter_bam, check_sq=False)
    bam2 = pysam.AlignmentFile(filter_bam, check_sq=False)
    bc_iter1 = get_bc_grouped_pair_iter(bam1, paired_end)
    bc_iter2 = get_bc_grouped_pair_iter(bam2, paired_end)

    reads_per_bc = open(outs.reads_per_bc, 'w')
    out_bam, _ = tk_bam.create_bam_outfile(outs.barcode_chunked_bams,
                                           None,
                                           None,
                                           template=bam1)

    for (bc, pair_iter1), (_,
                           pair_iter2) in itertools.izip(bc_iter1, bc_iter2):
        nreads = 0

        # Pass 1: UMI correction
        umi_counts = defaultdict(int)
        for header, (read1, read2) in pair_iter1:
            nreads += 2
            umi_counts[header.get_tag(cr_constants.RAW_UMI_TAG)] += 1

        corrected_umis = correct_umis(umi_counts)

        # Pass 2: Write the UMI-corrected records
        process_bam_barcode(bam1, pair_iter2, bc, corrected_umis, reporter,
                            gene_umi_counts_per_bc, strand, out_bam,
                            paired_end)

        reads_per_bc.write('{}\t{}\n'.format(bc, nreads))

    bam1.close()
    bam2.close()
    out_bam.close()

    # Write bc-gene-umi counts
    cPickle.dump(gene_umi_counts_per_bc, open(outs.chunked_gene_umi_counts,
                                              'w'))

    # Copy the input barcodes
    if args.barcodes_chunk is not None:
        cr_utils.copy(args.barcodes_chunk, outs.barcodes_in_chunks)
    else:
        outs.barcodes_in_chunks = None

    reporter.save(outs.chunked_reporter)
Example #15
def main(args, outs):
    """ Attaches barcodes. Attaches raw barcode to RAW_BC tag and filters those to form set of PROCESSES_BARCODES """

    chunk = args.chunk

    bam_in = create_bam_infile(args.align_chunk)

    bam_out, _ = tk_bam.create_bam_outfile(
        outs.output,
        None,
        None,
        template=bam_in,
        pgs=[
            tk_bam.make_pg_header(martian.get_pipelines_version(),
                                  "attach_bcs", TENX_PRODUCT_NAME)
        ])

    gp_tagger = GlobalFivePrimePosTagger(bam_in)

    if args.barcode_whitelist is None or args.bc_counts is None:
        # If there's no whitelist or counts then all high quality BC reads get allowed.
        barcode_whitelist = None
        wl_idxs = None
        bc_dist = None
    else:
        barcode_whitelist = load_barcode_whitelist(args.barcode_whitelist)

        # Load the bc counts for this GEM group
        counts = json.load(open(args.bc_counts, 'r'))
        counts = counts[str(chunk['gem_group'])]['bc_counts']

        # Prior distribution over barcodes, with pseudo-count
        bc_dist = np.array(counts, dtype=np.float) + 1.0
        bc_dist = bc_dist / bc_dist.sum()
        wl_idxs = {
            bc: idx
            for (idx, bc) in enumerate(sorted(list(barcode_whitelist)))
        }

    # set random seed to get deterministic subsampling
    random.seed(0)

    if chunk['barcode'] is not None:
        processed_barcode_iter = get_raw_processed_barcodes(
            open_maybe_gzip(chunk['barcode']), barcode_whitelist,
            args.bc_confidence_threshold, chunk['gem_group'],
            chunk['barcode_reverse_complement'], wl_idxs, bc_dist)
        require_barcode_for_stringent = True
    else:
        processed_barcode_iter = itertools.repeat(None)
        require_barcode_for_stringent = False

    if chunk['sample_index'] is not None:
        sample_index_iter = tk_fasta.read_generator_fastq(
            open_maybe_gzip(chunk['sample_index']))
    else:
        sample_index_iter = itertools.repeat(None)

    if chunk['trim'] is not None:
        trim_iter = tk_fasta.read_generator_fastq(open_maybe_gzip(
            chunk['trim']),
                                                  paired_end=True)
    else:
        trim_iter = itertools.repeat(None)

    iters = itertools.izip(processed_barcode_iter, sample_index_iter,
                           trim_iter)

    # First read
    try:
        read = bam_in.next()
    except StopIteration:
        read = None

    # Number of perfect reads -- used to compute down-sampling rates in mark_duplicates
    perfect_read_count = 0

    # Due to secondary alignments, we must apply the tags to all
    # reads with the same cluster name.
    for (barcode_info, sample_index_info, trim_info) in iters:
        tags = []
        read_name = None

        if read is None:
            break

        if barcode_info is not None:
            (bc_read_name, raw_bc_seq, processed_bc_seq,
             raw_bc_qual) = barcode_info
            tags.append((RAW_BARCODE_TAG, raw_bc_seq))
            tags.append((RAW_BARCODE_QUAL_TAG, raw_bc_qual))
            if processed_bc_seq is not None:
                tags.append((PROCESSED_BARCODE_TAG, processed_bc_seq))
            read_name = bc_read_name.split()[0]

        if sample_index_info is not None:
            (si_read_name, seq, qual) = sample_index_info
            tags.append((SAMPLE_INDEX_TAG, seq))
            tags.append((SAMPLE_INDEX_QUAL_TAG, qual))

            if read_name is not None:
                if si_read_name.split()[0] != read_name:
                    martian.log_info(
                        "mismatch: si_read_name: %s, bam_read_name: %s" %
                        (si_read_name, read_name))
                assert (si_read_name.split()[0] == read_name)
            else:
                read_name = si_read_name.split()[0]

        r1_tags = tags
        r2_tags = list(r1_tags)

        if trim_info is not None:
            (trim1_read_name, trim1_seq, trim1_qual, trim2_read_name,
             trim2_seq, trim2_qual) = trim_info
            if len(trim1_seq) > 0:
                r1_tags.append((TRIM_TAG, trim1_seq))
                r1_tags.append((TRIM_QUAL_TAG, trim1_qual))
            if len(trim2_seq) > 0:
                r2_tags.append((TRIM_TAG, trim2_seq))
                r2_tags.append((TRIM_QUAL_TAG, trim2_qual))

        reads_attached = 0
        reads_to_attach = []

        while read.query_name == read_name or read_name is None:
            tags = r1_tags if read.is_read1 else r2_tags
            if len(tags) > 0:
                existing_tags = read.tags
                existing_tags.extend(tags)
                read.tags = existing_tags

            if reads_to_attach and (
                    read.query_name != reads_to_attach[0].query_name
                    or reads_to_attach[0].query_name is None):
                gp_tagger.tag_reads(reads_to_attach)
                reads_attached += len(reads_to_attach)
                for r in reads_to_attach:
                    if stringent_read_filter(r, require_barcode_for_stringent):
                        perfect_read_count += 1

                    if args.exclude_non_bc_reads:
                        if not (get_read_barcode(r) is None):
                            bam_out.write(r)
                    else:
                        bam_out.write(r)
                reads_to_attach = []

            reads_to_attach.append(read)

            try:
                read = bam_in.next()

            except StopIteration:
                read = None
                break

        gp_tagger.tag_reads(reads_to_attach)
        reads_attached += len(reads_to_attach)
        for r in reads_to_attach:
            if stringent_read_filter(r, require_barcode_for_stringent):
                perfect_read_count += 1

            if args.exclude_non_bc_reads:
                if not (get_read_barcode(r) is None):
                    bam_out.write(r)
            else:
                bam_out.write(r)
        # We may have more than 2 reads if there was a
        # secondary alignment, but less than 2 means
        # something went wrong
        assert (reads_attached >= 2)

    outs.perfect_read_count = perfect_read_count
    bam_out.close()
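The attach-barcodes loop above walks the BAM manually, carrying tags forward while consecutive records share a read name (to cover secondary alignments). When the input is grouped by query name, the same grouping can be sketched with itertools.groupby; this is an illustration of the idea, not the cr_utils.iter_by_qname used in other examples:

import itertools

def iter_reads_by_qname(bam_in):
    # Yield (qname, [records]) for consecutive records sharing a query name.
    # Assumes the BAM is grouped (e.g. name-sorted) so duplicates are adjacent.
    for qname, group in itertools.groupby(bam_in, key=lambda r: r.query_name):
        yield qname, list(group)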
Example #16
def main(args, outs):
    chunk_start = args.chunk_start
    chunk_end = args.chunk_end

    bam_in = tk_bam.create_bam_infile(args.input)
    reads = tk_bam.read_bam_chunk(bam_in, (chunk_start, chunk_end))
    pgs = [
        tk_bam.make_pg_header(martian.get_pipelines_version(), "attach_phasing"),
        tk_bam.make_terminal_pg_header(martian.get_pipelines_version())
    ]
    # don't duplicate the header if it's already there; this is for developer testing purposes
    PG = bam_in.header['PG']
    for item in PG:
        if item['ID'] == "attach_phasing":
            pgs = []
    bam_out, _ = tk_bam.create_bam_outfile(outs.phased_possorted_bam, None, None, template=bam_in, pgs=pgs)

    # File with contig phasing information
    if args.fragment_phasing is not None:
        frag_phasing = tk_tabix.create_tabix_infile(args.fragment_phasing)
    else:
        frag_phasing = None

    if args.fragments is not None:
        # Fragments file for global molecule id
        frag_id_reader = tk_hdf5.DataFrameReader(args.fragments)
    else:
        frag_id_reader = None

    # Phasing data
    ph_db = None
    ph_db_chrom = None
    ph_db_start = None
    ph_db_end = None

    # Fragment data - for global molecule id
    fr_db = None
    fr_db_chrom = None
    fr_db_start = None
    fr_db_end = None

    total_reads = 0
    phased_reads = 0
    molecule_tagged_reads = 0

    for r in reads:
        chrom = bam_in.references[r.tid]
        pos = r.pos
        bc = tk_io.get_read_barcode(r)

        total_reads += 1
        tags = r.tags

        # Strip out RX and QX tags
        #strip_tags = [RAW_BARCODE_TAG, RAW_BARCODE_QUAL_TAG]
        # Actually don't strip
        strip_tags = []
        tags = [(tag, value) for (tag, value) in tags if (tag not in strip_tags)]

        # Fetch from the fragments file to get records that should cover many future reads
        # fragment phasing file may not exist in ALIGNER only pipeline - may need to skip
        if frag_phasing is not None:
            if ph_db is None or chrom != ph_db_chrom or pos < ph_db_start or pos > ph_db_end:
                ph_db, (ph_db_chrom, ph_db_start, ph_db_end) = get_frag_phasing_db(frag_phasing, chrom, pos, window=WINDOW_SIZE)

            if bc is not None and ph_db.has_key(bc):
                frags = ph_db[bc]
                # See if we have phasing for this fragment
                valid_phasing = [x for x in frags if x['start'] <= r.pos and x['end'] > r.pos]
                assert(len(valid_phasing) < 2)
                if len(valid_phasing) == 1:
                    phased_reads += 1
                    read_phasing = valid_phasing[0]
                    tags.append((PHASE_SET_BAM_TAG, read_phasing['ps']))
                    tags.append((HAPLOTYPE_BAM_TAG, read_phasing['hap']))
                    tags.append((PHASING_CONF_BAM_TAG, read_phasing['pc']))

        if frag_id_reader is not None:
            # Fetch from the fragments file to get records that should cover many future reads
            if fr_db is None or chrom != fr_db_chrom or pos < fr_db_start or pos > fr_db_end:
                fr_db, (fr_db_chrom, fr_db_start, fr_db_end) = get_molecule_id_db(frag_id_reader, chrom, pos, window=WINDOW_SIZE)

            if bc is not None and fr_db.has_key(bc):
                frags = fr_db[bc]
                # See if we have a molecule id for this fragment
                molecule_ids = [x for x in frags if x['start'] <= r.pos and x['end'] > r.pos]
                assert(len(molecule_ids) < 2)
                if len(molecule_ids) == 1:
                    molecule_tagged_reads += 1
                    molecule_id = molecule_ids[0]
                    tags.append((MOLECULE_ID_BAM_TAG, molecule_id['molecule_id']))


        r.tags = tags
        bam_out.write(r)

    bam_out.close()
    outs.total_reads = total_reads
    outs.phased_reads = phased_reads
    outs.molecule_tagged_reads = molecule_tagged_reads
Example #17
def main(args, outs):
    """ Attaches barcodes. Attaches raw barcode to RAW_BC tag and filters those to form set of PROCESSES_BARCODES """

    chunk = args.chunk

    #subsample_rate = 1.0
    #if args.subsample_rate is not None:
    #    subsample_rate = args.subsample_rate

    bam_in = tk_bam.create_bam_infile(args.align_chunk)
    bam_out, tids = tk_bam.create_bam_outfile(outs.output, None, None, template=bam_in, pgs=tk_bam.make_pg_header(martian.get_pipelines_version(), "attach_bcs"))

    if args.barcode_whitelist is None or args.bc_counts is None:
        # If there's no whitelist or counts then all high quality BC reads get allowed.
        barcode_whitelist = None
        wl_idxs = None
        bc_dist = None
    else:
        barcode_whitelist = tk_seq.load_barcode_whitelist(args.barcode_whitelist)

        # Load the bc counts for this GEM group
        counts = json.load(open(args.bc_counts, 'r'))
        counts = counts[str(chunk['gem_group'])]['bc_counts']

        # Prior distribution over barcodes, with pseudo-count
        bc_dist = np.array(counts, dtype=np.float) + 1.0
        bc_dist = bc_dist / bc_dist.sum()
        wl_idxs = { bc:idx for (idx,bc) in enumerate(sorted(list(barcode_whitelist))) }

    # set random seed to get deterministic subsampling
    random.seed(0)

    def open_maybe_gzip(fn):
        if fn[-2:] == "gz":
            return gzip.open(fn)
        else:
            return open(fn)

    if chunk['barcode']:
        processed_barcode_iter = get_raw_processed_barcodes(open_maybe_gzip(chunk['barcode']), barcode_whitelist, args.bc_confidence_threshold, chunk['gem_group'], chunk['barcode_reverse_complement'], wl_idxs, bc_dist)
        require_barcode_for_stringent = True
    else:
        processed_barcode_iter = itertools.repeat(None)
        require_barcode_for_stringent = False

    if chunk['sample_index']:
        sample_index_iter = tk_fasta.read_generator_fastq(open_maybe_gzip(chunk['sample_index']))
    else:
        sample_index_iter = itertools.repeat(None)

    iters = itertools.izip(processed_barcode_iter, sample_index_iter)

    # First read
    read = bam_in.next()

    # Number of perfect reads -- used to compute down-sampling rates in mark_duplicates
    perfect_read_count = 0

    # Due to secondary alignments, we must apply the tags to all
    # reads with the same cluster name.
    for (barcode_info, sample_index_info) in iters:
        tags = []
        read_name = None

        if read is None:
            break

        if barcode_info:
            (bc_read_name, raw_bc_seq, processed_bc_seq, raw_bc_qual) = barcode_info
            tags.append((RAW_BARCODE_TAG, raw_bc_seq))
            tags.append((RAW_BARCODE_QUAL_TAG, raw_bc_qual))
            if processed_bc_seq is not None:
                tags.append((PROCESSED_BARCODE_TAG, processed_bc_seq))
            read_name = bc_read_name.split()[0]


        if sample_index_info:
            (si_read_name, seq, qual) = sample_index_info
            tags.append((SAMPLE_INDEX_TAG, seq))
            tags.append((SAMPLE_INDEX_QUAL_TAG, qual))

            if read_name != None:
                if si_read_name.split()[0] != read_name:
                    martian.log_info("mismatch: si_read_name: %s, bam_read_name: %s" % (si_read_name, read_name))
                assert(si_read_name.split()[0] == read_name)
            else:
                read_name = si_read_name.split()[0]

        reads_attached = 0
        #emit_read_pair = random.random() < subsample_rate
        emit_read_pair = True

        while read.qname == read_name or read_name == None:
            if len(tags) > 0:
                existing_tags = read.tags
                existing_tags.extend(tags)
                read.tags = existing_tags

            reads_attached += 1
            if not (read_name is None):
                assert(read.qname == read_name)

            if emit_read_pair:
                # Count the perfect reads -- will be used when subsampling in dedup
                if tenkit.read_filter.stringent_read_filter(read, require_barcode_for_stringent):
                    perfect_read_count += 1

                if args.exclude_non_bc_reads:
                    if not(tk_io.get_read_barcode(read) is None):
                        bam_out.write(read)
                else:
                    bam_out.write(read)

            try:
                read = bam_in.next()

            except StopIteration:
                read = None
                break

        # We may have more than 2 reads if there was a
        # secondary alignment, but fewer than 2 means
        # something went wrong
        assert(reads_attached >= 2)


    outs.perfect_read_count = perfect_read_count
    bam_out.close()
Example #18
def main(args, outs):
    """Mark exact duplicate reads in the output BAM file while also writing out some summary statistics.
    PCR duplicates have the same read1 start site and read2 start site.
    """
    args.coerce_strings()
    outs.coerce_strings()

    # Chunk output doesn't get indexed
    outs.fragments_index = None
    outs.index = None

    # Pull in prior likelihoods for barcodes
    raw_barcode_abundance = None
    barcode_whitelist = load_barcode_whitelist(args.barcode_whitelist)
    if args.raw_barcode_counts is not None and barcode_whitelist is not None:
        with open(args.raw_barcode_counts, 'r') as infile:
            raw_counts = json.load(infile)
        raw_barcode_abundance = {
            '{}-{}'.format(barcode, gem_group): count
            for gem_group, subdict in raw_counts.iteritems()
            for barcode, count in zip(barcode_whitelist, subdict['bc_counts'])
        }

    bam_in = create_bam_infile(args.input)
    bam_refs = bam_in.references

    bam_prefix, ext = os.path.splitext(outs.output)
    raw_bam_file = martian.make_path(bam_prefix + '_five_prime_pos_sorted' +
                                     ext)

    frag_prefix, ext = os.path.splitext(outs.fragments)
    raw_frag_file = martian.make_path(frag_prefix + '_raw' + ext)

    # only write CO line for one chunk, so we don't have duplicates after samtools merge
    if args.chunk_num == 0:
        COs = [
            '10x_bam_to_fastq:R1(SEQ:QUAL,TR:TQ)',
            '10x_bam_to_fastq:R2(SEQ:QUAL,TR:TQ)',
            '10x_bam_to_fastq:I1(BC:QT)', '10x_bam_to_fastq:I2(CR:CY)',
            '10x_bam_to_fastq_seqnames:R1,R3,I1,R2'
        ]
    else:
        COs = None

    bam_out, _ = tk_bam.create_bam_outfile(
        raw_bam_file,
        None,
        None,
        template=bam_in,
        pgs=[
            tk_bam.make_pg_header(martian.get_pipelines_version(),
                                  "mark_duplicates", TENX_PRODUCT_NAME)
        ],
        cos=COs)
    fragments_out = open(raw_frag_file, 'w')
    bam_in.reset()

    # Ensure the summary key indicates what kind of dup marking was actually performed.
    lane_coord_sys = tk_lane.LaneCoordinateSystem.from_dict(args.lane_map)
    reference_manager = ReferenceManager(args.reference_path)
    summarizer = DupSummary(split_bcs=False,
                            lane_coordinate_system=lane_coord_sys,
                            output_bam=bam_out,
                            output_tsv=fragments_out,
                            ref=reference_manager,
                            bam_refs=bam_refs,
                            priors=raw_barcode_abundance)

    # Now broadcast the selected reads to the summarizers
    consumers = [summarizer.read_consumer()]
    source = tk_bam.read_bam_chunk(bam_in, (args.chunk_start, args.chunk_end))
    broadcast(source, consumers)

    # Close outfiles
    bam_out.close()
    fragments_out.close()

    # Feed the chunk barcode_counts data back to join()
    with open(outs.singlecell_mapping, 'w') as outfile:
        pickle.dump(summarizer.bc_counts, outfile)

    # Sort the output bam & tsv files
    sort_bam(raw_bam_file,
             outs.output,
             threads=martian.get_threads_allocation())
    sort_bed(raw_frag_file,
             outs.fragments,
             genome=reference_manager.fasta_index,
             threads=martian.get_threads_allocation(),
             leave_key=True)
Example #19
def main(args, outs):
    """ Attaches barcodes. Attaches raw barcode to RAW_BC tag and filters those to form set of PROCESSES_BARCODES """
    # this silences a weird non-failure in --strict=error mode
    # TODO(lhepler): remove this when martian upstream handles this itself
    outs.outputs = []

    chunk = args.chunk

    bam_in = tk_bam.create_bam_infile(args.align_chunk)
    bc_spec = "{}:{}".format(RAW_BARCODE_TAG, RAW_BARCODE_QUAL_TAG)

    # only comment the first chunk, otherwise later merge will duplicate the comments and could lead to:
    # samtools merge ... : '[finish_merged_header] Output header text too long'
    if args.chunk_index > 0:
        COs = None
    elif chunk['trim']:
        COs = ['10x_bam_to_fastq:R1({},TR:TQ,SEQ:QUAL)'.format(bc_spec), '10x_bam_to_fastq:R2(SEQ:QUAL)', '10x_bam_to_fastq:I1(BC:QT)']
    else:
        COs = ['10x_bam_to_fastq:R1({},SEQ:QUAL)'.format(bc_spec), '10x_bam_to_fastq:R2(SEQ:QUAL)', '10x_bam_to_fastq:I1(BC:QT)']

    bam_out, tids = tk_bam.create_bam_outfile(outs.output, None, None, template=bam_in, pgs=[tk_bam.make_pg_header(martian.get_pipelines_version(), "attach_bcs")], cos = COs)

    gp_tagger = GlobalFivePrimePosTagger(bam_in)

    if args.barcode_whitelist is None or args.bc_counts is None:
        # If there's no whitelist or counts then all high quality BC reads get allowed.
        barcode_whitelist = None
        wl_idxs = None
        bc_dist = None
    else:
        barcode_whitelist = bc_utils.load_barcode_whitelist(args.barcode_whitelist)

        # Load the bc counts for this GEM group
        counts = json.load(open(args.bc_counts, 'r'))
        counts = counts[str(chunk['gem_group'])]['bc_counts']

        # Prior distribution over barcodes, with pseudo-count
        bc_dist = np.array(counts, dtype=np.float) + 1.0
        bc_dist = bc_dist / bc_dist.sum()
        wl_idxs = { bc:idx for (idx,bc) in enumerate(sorted(list(barcode_whitelist))) }

    # set random seed to get deterministic subsampling
    random.seed(0)

    def open_maybe_gzip(fn):
        if fn[-2:] == "gz":
            return gzip.open(fn)
        else:
            return open(fn)

    if chunk['barcode']:
        processed_barcode_iter = get_raw_processed_barcodes(open_maybe_gzip(chunk['barcode']), barcode_whitelist, args.bc_confidence_threshold, chunk['gem_group'], chunk['barcode_reverse_complement'], wl_idxs, bc_dist)
        require_barcode_for_stringent = True
    else:
        processed_barcode_iter = itertools.repeat(None)
        require_barcode_for_stringent = False

    if chunk['trim']:
        trim_iter = tk_fasta.read_generator_fastq(open_maybe_gzip(chunk['trim']), paired_end=True)
    else:
        trim_iter = itertools.repeat(None)

    if chunk['sample_index']:
        sample_index_iter = tk_fasta.read_generator_fastq(open_maybe_gzip(chunk['sample_index']))
    else:
        sample_index_iter = itertools.repeat(None)

    iters = itertools.izip(processed_barcode_iter, sample_index_iter, trim_iter)

    # First read
    read = bam_in.next()

    # Number of perfect reads -- used to compute down-sampling rates in mark_duplicates
    perfect_read_count = 0

    # Due to secondary alignments, we must apply the tags to all
    # reads with the same cluster name.
    for (barcode_info, sample_index_info, trim_info) in iters:
        tags = []
        read_name = None

        if read is None:
            break

        if barcode_info:
            (bc_read_name, raw_bc_seq, processed_bc_seq, raw_bc_qual) = barcode_info
            tags.append((RAW_BARCODE_TAG, raw_bc_seq))
            tags.append((RAW_BARCODE_QUAL_TAG, raw_bc_qual))
            if processed_bc_seq is not None:
                tags.append((PROCESSED_BARCODE_TAG, processed_bc_seq))
            read_name = bc_read_name.split()[0]


        if sample_index_info:
            (si_read_name, seq, qual) = sample_index_info
            tags.append((SAMPLE_INDEX_TAG, seq))
            tags.append((SAMPLE_INDEX_QUAL_TAG, qual))

            if read_name != None:
                if si_read_name.split()[0] != read_name:
                    martian.log_info("mismatch: si_read_name: %s, bam_read_name: %s" % (si_read_name, read_name))
                assert(si_read_name.split()[0] == read_name)
            else:
                read_name = si_read_name.split()[0]

        r1_tags = tags
        r2_tags = list(tags)

        if trim_info:
            (trim1_read_name, trim1_seq, trim1_qual, trim2_read_name, trim2_seq, trim2_qual) = trim_info
            if len(trim1_seq) > 0:
                r1_tags.append((TRIM_TAG, trim1_seq))
                r1_tags.append((TRIM_QUAL_TAG, trim1_qual))
            if len(trim2_seq) > 0:
                r2_tags.append((TRIM_TAG, trim2_seq))
                r2_tags.append((TRIM_QUAL_TAG, trim2_qual))

        reads_attached = 0
        reads_to_attach = []

        while read.qname == read_name or read_name == None:
            tags = r1_tags if read.is_read1 else r2_tags
            if len(tags) > 0:
                existing_tags = read.tags
                existing_tags.extend(tags)
                read.tags = existing_tags

            if not (read_name is None):
                assert(read.qname == read_name)

            if reads_to_attach and (read.query_name != reads_to_attach[0].query_name or reads_to_attach[0].query_name is None):
                gp_tagger.tag_reads(reads_to_attach)
                reads_attached += len(reads_to_attach)
                for r in reads_to_attach:
                    if stringent_read_filter(r, require_barcode_for_stringent):
                        perfect_read_count += 1

                    if args.exclude_non_bc_reads:
                        if not(crdna_io.get_read_barcode(r) is None):
                            bam_out.write(r)
                    else:
                        bam_out.write(r)
                reads_to_attach = []

            reads_to_attach.append(read)

            try:
                read = bam_in.next()

            except StopIteration:
                read = None
                break

        gp_tagger.tag_reads(reads_to_attach)
        reads_attached += len(reads_to_attach)
        for r in reads_to_attach:
            if stringent_read_filter(r, require_barcode_for_stringent):
                perfect_read_count += 1

            if args.exclude_non_bc_reads:
                if not(crdna_io.get_read_barcode(r) is None):
                    bam_out.write(r)
            else:
                bam_out.write(r)

        # We may have more than 2 reads if there was a
        # secondary alignment, but fewer than 2 means
        # something went wrong
        assert(reads_attached >= 2)


    outs.perfect_read_count = perfect_read_count
    bam_out.close()
Example #20
def open_file(self, filename):
    # Create a dummy header to prevent samtools / pysam crashing
    return tk_bam.create_bam_outfile(filename, ['dummy'], [8])[0]
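When no template BAM is available, the second and third positional arguments to create_bam_outfile appear to supply the reference names and lengths used to build a minimal header, as in the snippet above. A small sketch writing one placeholder record against such a dummy header, mirroring the pysam usage in the next example (the helper name is illustrative, not part of tenkit):

import pysam
import tenkit.bam as tk_bam  # assumption: the snippets' tk_bam alias

def write_placeholder_record(filename, name, seq, quals):
    # Build an output BAM whose header has a single reference 'dummy' of length 8,
    # then write one record placed at position 0 of that placeholder reference.
    out_bam, _ = tk_bam.create_bam_outfile(filename, ['dummy'], [8])
    record = pysam.AlignedRead()
    record.qname = str(name)  # wrap with str() so pysam isn't given unicode
    record.seq = seq
    record.qual = quals
    record.reference_id = 0
    record.reference_start = 0
    out_bam.write(record)
    out_bam.close()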
Example #21
def get_consensus_seq(clonotype_name, sel_contigs, best_contig, out_dir, args):
    """Build a consensus sequence from a set of contigs.

    Args:
    - clonotype_name: Used to prefix output files.
    - sel_contigs: Names of contigs to use for consensus building.
    - best_contig: Name of "best" contig. Will search for this contig's sequence
        and base qualities.
    - out_dir: dir used for temporary results
    - args: stage args.

    Return value:
    A tuple (best_contig_seq, best_contig_quals, consensus_seq, out_bam_name, out_fastq_name, out_fasta_name).
    - best_contig_seq/best_contig_quals: the sequence and quals of the best contig.
    - consensus_seq: the consensus sequence, or None if no consensus could be built
        (e.g., not enough reads for consensus).
    - out_bam_name: Path of BAM with alignments of contigs to the consensus seq.
    - out_fastq_name: FASTQ with contig sequences.
    - out_fasta_name: FASTA with consensus sequence.
    """

    best_contig_seq = None
    best_contig_quals = None

    # Input to the base quality computation - we don't really need the
    # base qualities because we will replace them with read-based qualities,
    # but we need to do this to get proper alignments of contigs against
    # the consensus.
    out_fastq_name = martian.make_path(clonotype_name + '_contigs.fastq')

    # Input to assembly
    out_bam_name = martian.make_path(clonotype_name + '_contigs.bam')

    # The reference in the output bam doesn't really matter.
    out_bam, _ = tk_bam.create_bam_outfile(out_bam_name, ['chr1'], [1])

    # Read the entire fastq (all contigs) and write the selected contigs to
    # a bam for the assembler and a fastq for the aligner.
    with open(args.contigs_fastq, 'r') as f, open(out_fastq_name,
                                                  'w') as out_fq:
        fq_iter = tk_fasta.read_generator_fastq(f)
        for (name, seq, quals) in fq_iter:
            if name in sel_contigs:
                if name == best_contig:
                    best_contig_seq = seq
                    best_contig_quals = quals

                header = cr_fastq.AugmentedFastqHeader(name)
                # Create a pseudo-UMI for each input contig
                header.set_tag(PROCESSED_UMI_TAG, name)
                # Put all reads on the same "barcode". This is important, so
                # the assembler assembles all of them together.
                header.set_tag(PROCESSED_BARCODE_TAG, clonotype_name)

                record = pysam.AlignedRead()

                record.reference_start = 0
                record.reference_id = 0
                # Wrap with str() or pysam will crash when given unicode
                record.qname = str(header.to_string())
                record.seq = seq
                record.qual = quals
                record.flag = MAPPED_UNPAIRED_FLAG

                out_bam.write(record)

                # Now change the tags. The final bam concatenation code will pull
                # the tags out of the header, so we want these to be meaningful.
                # Put the real barcode in the barcode tag. The alignment-base-qual
                # code will ignore it anyway.
                header.set_tag(PROCESSED_BARCODE_TAG, name.split('_')[0])
                tk_fasta.write_read_fastq(out_fq, header.to_string(), seq,
                                          quals)

    out_bam.close()
    assert (not best_contig_seq is None)

    out_fasta_name = martian.make_path(clonotype_name + '_contigs.fasta')

    # Run the assembler to produce a consensus sequence. Read contig-reads from out_bam_name.
    # The resulting sequences will be in out_dir/<clonotype_name>_contigs.fasta. This is the
    # only output of the assembler we care about.
    if len(sel_contigs) >= MIN_CONTIGS_FOR_CONSENSUS:
        cmd = [
            'vdj_asm',
            'asm',
            out_bam_name,
            out_dir,
            '--single-end',
            '--cons',  # required so we produce a single output sequence
            '--kmers=0',
            '--min-qual=0',
            '--score-factor=0.0'
        ]
        sys.stderr.write('Running ' + ' '.join(cmd) + '\n')

        tk_subproc.check_call(cmd, cwd=os.getcwd())

        with open(os.path.join(out_dir, clonotype_name + '_contigs.fasta'),
                  'r') as contig_f:
            lines = contig_f.readlines()
            if lines:
                out_seq = lines[1].strip()
            else:
                # In some rare cases (eg. input contigs have 0 quality), assembly might fail.
                out_seq = None
    else:
        out_seq = None

    # Write the best contig sequence on a new fasta. We need to make sure this has the
    # right contig name because this will be the name written in the bam alignments
    # of the contigs against the consensus
    with open(out_fasta_name, 'w') as f:
        tk_fasta.write_read_fasta(f, clonotype_name,
                                  out_seq if out_seq else best_contig_seq)

    # Now align the same reads that were used in vdj_asm against the consensus that you just got.
    # The output will be in out_dir/<clonotype_name> + '_contigs.bam'
    cmd = [
        'vdj_asm', 'base-quals',
        martian.make_path(clonotype_name + '_contigs'), out_dir, '--single-end'
    ]
    sys.stderr.write('Running ' + ' '.join(cmd) + '\n')

    tk_subproc.check_call(cmd, cwd=os.getcwd())

    # Move the BAM of the contigs aligned against the consensus out of the outs
    # (Will overwrite this bam which was already used as input to assembly).
    cr_io.move(os.path.join(out_dir, clonotype_name + '_contigs.bam'),
               out_bam_name)

    return (best_contig_seq, best_contig_quals, out_seq, out_bam_name,
            out_fastq_name, out_fasta_name)
Example #22
def main(args, outs):
    reporter = vdj_report.VdjReporter(
        vdj_reference_path=args.vdj_reference_path)
    gene_umi_counts_per_bc = {}

    strand = cr_chem.get_strandedness(args.chemistry_def)

    # For the entire chunk, match reads against the V(D)J reference
    ref_fasta = vdj_reference.get_vdj_reference_fasta(args.vdj_reference_path)
    fq_prefix = re.sub('_1.fastq', '', args.read1_chunk)
    # The filtering code will write this bam. Then we'll read it, correct the UMIs
    # and write outs.chunked_bams.
    filter_bam = martian.make_path('tmp.bam')

    run_read_match(fq_prefix, ref_fasta, filter_bam, args.chemistry_def,
                   args.sw_params)

    # Make two passes over the BAM file, processing one barcode at a time
    bam1 = tk_bam.create_bam_infile(filter_bam)
    bam2 = tk_bam.create_bam_infile(filter_bam)
    bc_iter1 = get_bc_grouped_pair_iter(bam1)
    bc_iter2 = get_bc_grouped_pair_iter(bam2)

    reads_per_bc = open(outs.reads_per_bc, 'w')
    if args.output_fastqs:
        out_fastq1 = open(outs.barcode_chunked_read1, 'w')
        out_fastq2 = open(outs.barcode_chunked_read2, 'w')
        out_bam = None
    else:
        out_bam, _ = tk_bam.create_bam_outfile(outs.barcode_chunked_bams,
                                               None,
                                               None,
                                               template=bam1)
        out_fastq1 = None
        out_fastq2 = None

    for (bc, pair_iter1), (_,
                           pair_iter2) in itertools.izip(bc_iter1, bc_iter2):
        nreads = 0

        # Pass 1: UMI correction
        umi_counts = defaultdict(int)
        for header, (read1, read2) in pair_iter1:
            nreads += 2
            if is_mapped(read1, read2):
                umi_counts[header.get_tag(cr_constants.RAW_UMI_TAG)] += 1

        corrected_umis = correct_umis(umi_counts)

        # Pass 2: Write the UMI-corrected records
        write_barcode_fastq(bam1, pair_iter2, bc, corrected_umis, reporter,
                            gene_umi_counts_per_bc, strand, out_bam,
                            out_fastq1, out_fastq2)

        reads_per_bc.write('{}\t{}\n'.format(bc, nreads))

    bam1.close()
    bam2.close()
    if args.output_fastqs:
        out_fastq1.close()
        out_fastq2.close()
    else:
        out_bam.close()

    # Write bc-gene-umi counts
    cPickle.dump(gene_umi_counts_per_bc, open(outs.chunked_gene_umi_counts,
                                              'w'))

    reporter.save(outs.chunked_reporter)
Example #23
def main_mark_duplicates(args, outs):
    """
    Mark exact duplicate reads in the BAM file. Duplicates have the same read1 start site and read2 start site
    """

    lane_coord_sys = tk_lane.LaneCoordinateSystem.from_dict(args.lane_map)

    args.coerce_strings()
    outs.coerce_strings()

    bam_in = tk_bam.create_bam_infile(args.input)
    bam_out, tids = tk_bam.create_bam_outfile(
        outs.output,
        None,
        None,
        template=bam_in,
        pg=tk_bam.make_pg_header(martian.get_pipelines_version(),
                                 "mark_duplicates"))

    # Determine whether the BAM has 10x barcodes
    bam_in.reset()
    has_barcodes = [
        tk_io.read_has_barcode(x) for x in itertools.islice(bam_in, 1000)
    ]
    have_barcodes = (float(sum(has_barcodes)) / len(has_barcodes)) > 0.1

    # We do the subsampling to achieve the desired coverage on _perfect reads_, as
    # defined by tenkit.read_filter.stringent_read_filter.  This is tallied in ATTACH_BCS,
    # and passed into the perfect_read_count argument.  We will fail if it's not supplied.
    total_coverage = args.estimated_coverage

    # Set a fixed random seed to eliminate noise in metrics
    random.seed(0)

    sampling_rates = []
    for sample_cov in DUPLICATE_SUBSAMPLE_COVERAGES:
        rate = tk_stats.robust_divide(float(sample_cov), total_coverage)
        sampling_rates.append((rate, sample_cov))

    # All read duplicate marking - these dup decisions are written to bam_out
    # the output bam has BC aware dup marking if available.
    # Ensure the summary key indicates what kind of dup marking was actually performed.
    if have_barcodes:
        no_filter_dups_bcs = DupSummary(False, 1.0, True,
                                        "no_filter_full_use_bcs",
                                        lane_coord_sys, bam_out)
        no_filter_dups_no_bcs = DupSummary(False,
                                           1.0,
                                           False,
                                           "no_filter_full_ignore_bcs",
                                           lane_coord_sys,
                                           write_to_stdout=False)
    else:
        no_filter_dups_bcs = DupSummary(False, 1.0, True,
                                        "no_filter_full_use_bcs",
                                        lane_coord_sys)
        no_filter_dups_no_bcs = DupSummary(False,
                                           1.0,
                                           False,
                                           "no_filter_full_ignore_bcs",
                                           lane_coord_sys,
                                           bam_out,
                                           write_to_stdout=False)

    # Dup marking on all perfect reads
    full_dups_bcs = DupSummary(True, 1.0, True, "full_use_bcs", lane_coord_sys)
    full_dups_no_bcs = DupSummary(True, 1.0, False, "full_ignore_bcs",
                                  lane_coord_sys)

    # Make a battery of duplicate summaries at different coverages, with and w/o
    # barcode splitting
    split_options = [True, False]

    dup_sums = [
        full_dups_bcs, full_dups_no_bcs, no_filter_dups_bcs,
        no_filter_dups_no_bcs
    ]

    # Duplicate marking on perfect reads subsampled to the requested coverage
    for (sr, cov) in sampling_rates:
        for split_bc in split_options:
            description = "cov_" + str(cov) + ('_use_bcs'
                                               if split_bc else '_ignore_bcs')
            dup_sums.append(
                DupSummary(True, sr, split_bc, description, lane_coord_sys))

    # Now broadcast the selected reads to the summarizers
    # We can't do the ones that require a sample_rate > 1.0, so skip those.
    # If we don't have barcodes, don't run the set that are split by barcode.
    consumers = [
        x.read_consumer() for x in dup_sums
        if x.sample_rate <= 1.0 and ((not x.split_bcs) or have_barcodes)
    ]

    source = tk_bam.read_bam_chunk(bam_in, (args.chunk_start, args.chunk_end))
    broadcast(source, consumers)

    # We close the BAM
    bam_out.close()
    # Note - the indexing happens in join

    # Package up the summaries:
    dup_results = {}
    for x in dup_sums:
        (dups, optical_dups, diff_dups) = x.result
        desc = x.description
        dup_results[desc] = dups
        optical_desc = "optical_" + desc
        dup_results[optical_desc] = optical_dups
        diff_desc = "diffusion_" + desc
        dup_results[diff_desc] = diff_dups

    if outs.duplicate_summary:
        f = open(outs.duplicate_summary, 'w')
        json.dump(dup_results, f)
        f.close()