def main(args, outs):
    in_bam = tk_bam.create_bam_infile(args.possorted_bam)
    chrom = args.chrom

    poses = []
    mol_qs = []
    bcs = []
    for read in in_bam.fetch(str(chrom), int(args.start_pos), int(args.end_pos)):
        if not read.is_secondary and not read.is_duplicate and read.is_read1 and \
           not read.is_unmapped and read.mapq >= args.mapq:
            poses.append(read.pos)
            mol_qs.append(tk_io.get_read_molecule_conf(read))
            bcs.append(tk_io.get_read_barcode(read))

    ret_df = pd.DataFrame({'chrom': chrom, 'pos': poses, 'bc': bcs, 'mol_qual': mol_qs})

    if len(ret_df) > 0:
        start_pos = poses[0]
        end_pos = poses[-1]
        cov_df = tk_hdf5.read_data_frame_indexed(args.coverage, [(chrom, start_pos, end_pos + 1)])

        # Boolean array with length equal to the range of positions in ret_df
        on_target = np.zeros((end_pos - start_pos + 1,), dtype=np.bool)
        on_target[cov_df.pos - start_pos] = True
        cum_on_target = np.cumsum(on_target)

        ret_df['on_target'] = on_target[ret_df.pos - start_pos]

        # Note that the fraction is set to 1 if there are no bases between reads
        frac_on_target = np.ones((len(ret_df),)) * on_target[0]
        for i, p in enumerate(poses):
            if i > 0:
                nbp = float(p - poses[i - 1] - 1)
                if nbp > 0:
                    frac_on_target[i] = (cum_on_target[p - start_pos] -
                                         cum_on_target[poses[i - 1] - start_pos] -
                                         int(on_target[p - start_pos])) / nbp
                else:
                    frac_on_target[i] = float(on_target[p - start_pos])
        ret_df['frac_on_target'] = frac_on_target
    else:
        ret_df['on_target'] = False
        ret_df['frac_on_target'] = 0.0

    tk_hdf5.write_data_frame(outs.reads, ret_df)
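# A minimal, self-contained sketch (synthetic data, not part of the pipeline)
# of the cumulative-sum trick used above: computing the fraction of on-target
# bases strictly between two read positions in O(1) per read pair.
def _example_frac_on_target():
    import numpy as np
    on_target = np.array([0, 1, 1, 0, 1, 0, 0, 1], dtype=bool)  # mask over positions 0..7
    cum = np.cumsum(on_target)
    prev_pos, pos = 1, 5
    nbp = float(pos - prev_pos - 1)  # number of bases strictly between the two reads
    # cum[pos] - cum[prev_pos] counts on-target bases in (prev_pos, pos];
    # subtracting the mask at pos itself makes the interval exclusive.
    frac = (cum[pos] - cum[prev_pos] - int(on_target[pos])) / nbp
    assert frac == 2.0 / 3.0  # of positions 2, 3, 4, two are on target
    return frac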
def main_bucket_reads_by_bc(args, outs):
    chunk_start = args.chunk_start
    chunk_end = args.chunk_end

    prefixes = get_seqs(args.nbases)

    bam_in = tk_bam.create_bam_infile(args.input)
    reads = list(tk_bam.read_bam_chunk(bam_in, (chunk_start, chunk_end)))

    tmp_dir = os.path.dirname(outs.default)
    bams_out = {}
    outs.buckets = {}
    buckets = {}
    for prefix in prefixes:
        filename = os.path.join(tmp_dir, "bc_%s.bam" % prefix)
        bam_out, _ = tk_bam.create_bam_outfile(filename, None, None, template=bam_in)
        bams_out[prefix] = bam_out
        outs.buckets[prefix] = filename
        buckets[prefix] = []

    non_bc_bam_out, _ = tk_bam.create_bam_outfile(outs.default, None, None, template=bam_in)
    non_bc_reads = []
    for r in reads:
        barcode = tk_io.get_read_barcode(r)
        if barcode is None:
            non_bc_bam_out.write(r)
            non_bc_reads.append(r)
        else:
            prefix = barcode[:args.nbases]
            buckets[prefix].append(r)
    non_bc_bam_out.close()

    # Set random seed to get deterministic qname subsampling
    random.seed(0)
    sampled_non_bc_reads = random.sample(non_bc_reads, min(len(non_bc_reads), len(prefixes)))
    outs.qnames = [read.qname for read in sampled_non_bc_reads]

    for prefix, bucket in buckets.iteritems():
        bucket.sort(key=bc_sort_key)
        bam_out = bams_out[prefix]
        for r in bucket:
            bam_out.write(r)
        bam_out.close()
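# Illustrative sketch of the prefix bucketing above. get_seqs(n) (defined
# elsewhere) is assumed to enumerate all 4**n ACGT strings of length n, so
# every barcoded read lands in exactly one bucket keyed by its first n bases.
def _example_get_seqs(nbases=2):
    import itertools
    return [''.join(t) for t in itertools.product('ACGT', repeat=nbases)]
# e.g. _example_get_seqs(1) -> ['A', 'C', 'G', 'T']; a read with barcode
# 'ACGTACGT-1' and nbases=1 would be appended to buckets['A'].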
def sort_by_bc(file_name, sorted_name, store_in_memory=True):
    """ Sorts a bam file by the 10X barcode (specified in the tags BC field).
    If store_in_memory is True, avoids file seeks by keeping reads in memory.
    """
    in_file = create_bam_infile(file_name)
    out_file, tids = create_bam_outfile(sorted_name, None, None, template=in_file)

    if store_in_memory:
        bc_reads = {}
        for read in in_file:
            bc = tk_io.get_read_barcode(read)
            this_bc_reads = bc_reads.setdefault(bc, [])
            this_bc_reads.append(read)

        sorted_bcs = sorted(bc_reads.keys())
        for bc in sorted_bcs:
            for read in bc_reads[bc]:
                out_file.write(read)
    else:
        # Store the file offset locations (in bytes) by bc
        bc_locs = {}
        file_offset = in_file.tell()
        for read in in_file:
            bc = tk_io.get_read_barcode(read)
            this_bc_locs = bc_locs.setdefault(bc, [])
            this_bc_locs.append(file_offset)
            file_offset = in_file.tell()  # has to be captured before the next read

        sorted_bcs = sorted(bc_locs.keys())
        for bc in sorted_bcs:
            for offset in bc_locs[bc]:
                in_file.seek(offset)
                out_file.write(in_file.next())
    out_file.close()
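# A minimal usage sketch with hypothetical file names. After sorting, reads
# sharing a barcode are adjacent, so downstream code can stream them with
# itertools.groupby instead of holding the whole BAM in memory.
def _example_sort_and_group(in_name='possorted.bam', out_name='bcsorted.bam'):
    import itertools
    sort_by_bc(in_name, out_name, store_in_memory=False)  # low-memory path
    bam = create_bam_infile(out_name)
    for bc, reads in itertools.groupby(bam, lambda r: tk_io.get_read_barcode(r)):
        yield bc, sum(1 for _ in reads)  # e.g. per-barcode read counts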
def get_reads(in_bam, chrom, start, stop, min_mapq=60):
    bcs = []
    poses = []
    for read in in_bam.fetch(str(chrom), start, stop):
        mapq = read.mapq
        if mapq < min_mapq or read.is_secondary or read.is_duplicate:
            continue
        bc = tk_io.get_read_barcode(read)
        if bc is None:
            continue
        bcs.append(bc)
        poses.append(read.pos)

    df = pd.DataFrame({'pos': poses, 'bc': bcs})
    df.sort('bc', inplace=True)
    return df
def get_bcs_at_region(in_bam, chrom, start, end, min_mapq=60, bc_map=None,
                      other_chrom=None, other_start=None, other_end=None,
                      read_to_bc=None):
    bc_list = {}
    for read in in_bam.fetch(str(chrom), int(start), int(end)):
        if read.mapq < min_mapq or read.is_duplicate or read.is_secondary:
            continue
        if read.pos < start or read.pos > end:
            continue
        if other_start is not None and other_end is not None:
            # Skip reads whose alternative (XA) alignment falls inside the other region
            tag_names = [t[0] for t in read.tags]
            tag_vals = [t[1] for t in read.tags]
            if 'XA' in tag_names:
                idx = tag_names.index('XA')
                alt_chrom = tag_vals[idx].split(',')[0]
                alt_pos = int(tag_vals[idx].split(',')[1].strip('+-'))
                if alt_chrom == other_chrom and alt_pos >= other_start and alt_pos <= other_end:
                    continue
        if read_to_bc is None:
            bc = tk_io.get_read_barcode(read)
        else:
            bc = read_to_bc[read.qname]
        if not (bc is None or bc == '') and (bc_map is None or bc in bc_map):
            if bc not in bc_list:
                bc_list[bc] = []
            bc_list[bc].append(read.qname)

    # De-duplicate qnames per barcode
    for bc in bc_list.keys():
        bc_list[bc] = list(set(bc_list[bc]))
    return bc_list
def main_report_single_partition(args, outs):
    # Bail out if there are no valid barcodes
    if args.barcode_whitelist is None or args.input is None:
        outs.fragments = None
        return

    bam_in = tk_bam.create_bam_infile(args.input)

    if args.targets_file is None:
        target_regions = None
    else:
        target_regions = tk_io.get_target_regions(open(args.targets_file))

    # Bail out if we're on a small genome
    if sum(bam_in.lengths) < 3e6:
        outs.fragments = None
        return

    bam_chunk = tk_bam.read_bam_chunk(bam_in, (args.chunk_start, args.chunk_end))
    # Skip reads without a barcode
    bam_chunk_filt = itertools.ifilter(read_has_barcode, bam_chunk)
    # Group reads by barcode (the chunk is assumed to arrive barcode-sorted)
    bc_read_iter = itertools.groupby(bam_chunk_filt, lambda x: tk_io.get_read_barcode(x))

    bc_data = (summarize_barcode(bc, list(reads), args.read_link_distance,
                                 bam_in.references, target_regions)
               for (bc, reads) in bc_read_iter)
    bc_data_filt = (x for x in bc_data if x is not None)

    frag_tbl, bc_tbl = make_output_dataframes(bc_data_filt)
    if frag_tbl is not None:
        # Sort and index the fragment table, so that the per-chromosome fragment
        # files can be combined with bounded memory consumption
        frag_tbl.sort(['chrom', 'start_pos'], inplace=True)
        tenkit.hdf5.write_data_frame(outs.fragments, frag_tbl)
        tenkit.hdf5.create_tabix_index(outs.fragments, 'chrom', 'start_pos', 'end_pos')
    if bc_tbl is not None:
        tenkit.hdf5.write_data_frame(outs.barcodes, bc_tbl)
def main(args, outs):
    """ Attaches barcodes. Attaches the raw barcode to the RAW_BC tag and
    filters those to form the set of PROCESSED_BARCODES. """
    chunk = args.chunk

    #subsample_rate = 1.0
    #if args.subsample_rate is not None:
    #    subsample_rate = args.subsample_rate

    bam_in = tk_bam.create_bam_infile(args.align_chunk)
    bam_out, tids = tk_bam.create_bam_outfile(
        outs.output, None, None, template=bam_in,
        pgs=tk_bam.make_pg_header(martian.get_pipelines_version(), "attach_bcs"))

    if args.barcode_whitelist is None or args.bc_counts is None:
        # If there's no whitelist or counts, then all high-quality BC reads get allowed.
        barcode_whitelist = None
        wl_idxs = None
        bc_dist = None
    else:
        barcode_whitelist = tk_seq.load_barcode_whitelist(args.barcode_whitelist)

        # Load the bc counts for this GEM group
        counts = json.load(open(args.bc_counts, 'r'))
        counts = counts[str(chunk['gem_group'])]['bc_counts']

        # Prior distribution over barcodes, with pseudo-count
        bc_dist = np.array(counts, dtype=np.float) + 1.0
        bc_dist = bc_dist / bc_dist.sum()
        wl_idxs = {bc: idx for (idx, bc) in enumerate(sorted(list(barcode_whitelist)))}

    # Set random seed to get deterministic subsampling
    random.seed(0)

    def open_maybe_gzip(fn):
        if fn[-2:] == "gz":
            return gzip.open(fn)
        else:
            return open(fn)

    if chunk['barcode']:
        processed_barcode_iter = get_raw_processed_barcodes(
            open_maybe_gzip(chunk['barcode']), barcode_whitelist,
            args.bc_confidence_threshold, chunk['gem_group'],
            chunk['barcode_reverse_complement'], wl_idxs, bc_dist)
        require_barcode_for_stringent = True
    else:
        processed_barcode_iter = itertools.repeat(None)
        require_barcode_for_stringent = False

    if chunk['sample_index']:
        sample_index_iter = tk_fasta.read_generator_fastq(open_maybe_gzip(chunk['sample_index']))
    else:
        sample_index_iter = itertools.repeat(None)

    iters = itertools.izip(processed_barcode_iter, sample_index_iter)

    # First read
    read = bam_in.next()

    # Number of perfect reads -- used to compute down-sampling rates in mark_duplicates
    perfect_read_count = 0

    # Due to secondary alignments, we must apply the tags to all
    # reads with the same cluster name.
    for (barcode_info, sample_index_info) in iters:
        tags = []
        read_name = None

        if read is None:
            break

        if barcode_info:
            (bc_read_name, raw_bc_seq, processed_bc_seq, raw_bc_qual) = barcode_info
            tags.append((RAW_BARCODE_TAG, raw_bc_seq))
            tags.append((RAW_BARCODE_QUAL_TAG, raw_bc_qual))
            if processed_bc_seq is not None:
                tags.append((PROCESSED_BARCODE_TAG, processed_bc_seq))
            read_name = bc_read_name.split()[0]

        if sample_index_info:
            (si_read_name, seq, qual) = sample_index_info
            tags.append((SAMPLE_INDEX_TAG, seq))
            tags.append((SAMPLE_INDEX_QUAL_TAG, qual))

            if read_name is not None:
                if si_read_name.split()[0] != read_name:
                    martian.log_info("mismatch: si_read_name: %s, bam_read_name: %s" %
                                     (si_read_name, read_name))
                assert si_read_name.split()[0] == read_name
            else:
                read_name = si_read_name.split()[0]

        reads_attached = 0
        #emit_read_pair = random.random() < subsample_rate
        emit_read_pair = True

        while read.qname == read_name or read_name is None:
            if len(tags) > 0:
                existing_tags = read.tags
                existing_tags.extend(tags)
                read.tags = existing_tags

            reads_attached += 1
            if read_name is not None:
                assert read.qname == read_name

            if emit_read_pair:
                # Count the perfect reads -- will be used when subsampling in dedup
                if tenkit.read_filter.stringent_read_filter(read, require_barcode_for_stringent):
                    perfect_read_count += 1

                if args.exclude_non_bc_reads:
                    if tk_io.get_read_barcode(read) is not None:
                        bam_out.write(read)
                else:
                    bam_out.write(read)

            try:
                read = bam_in.next()
            except StopIteration:
                read = None
                break

        # We may have more than 2 reads if there was a secondary alignment,
        # but fewer than 2 means something went wrong
        assert reads_attached >= 2

    outs.perfect_read_count = perfect_read_count
    bam_out.close()
def read_tuple(r):
    bc = tk_io.get_read_barcode(r)
    return (bc, r.tid, r.pos, r.mrnm, r.mpos, r.is_reverse, r.is_read1)
def test_big_dedup(self):
    tenkit.constants.DUPLICATE_SUBSAMPLE_COVERAGES = [0.000003, 0.000015]
    args = martian.Record({
        'input': IN_BAM_BIG,
        'estimated_coverage': 100.0,
        'perfect_read_count': 100000,
        'chunk_start': None,
        'chunk_end': None
    })
    outs = martian.Record({'output': OUT_BAM, 'duplicate_summary': OUT_JSON})
    main_mark_duplicates(args, outs)

    out_bam = pysam.Samfile(OUT_BAM)
    out_reads = list(out_bam)
    in_bam = pysam.Samfile(IN_BAM_BIG)
    in_reads = list(in_bam)

    # Check we haven't lost any reads
    self.assertEqual(len(out_reads), len(in_reads))

    def read_tuple(r):
        bc = tk_io.get_read_barcode(r)
        return (bc, r.tid, r.pos, r.mrnm, r.mpos, r.is_reverse, r.is_read1)
        #return (bc, r.is_read1, r.is_reverse, r.tid, r.pos, r.mrnm, r.mpos)

    def mark_duplicates(read_set):
        # Re-run the dup analysis manually
        read_tups = [(read_tuple(r), r) for r in read_set]
        read_tups.sort(key=lambda x: x[0])

        groups = itertools.groupby(read_tups, lambda x: x[0])
        for (k, reads) in groups:
            rl = list(reads)
            rl[0][1].is_duplicate = False
            for i in range(1, len(rl)):
                rl[i][1].is_duplicate = True

    mark_duplicates(in_reads)

    # Make sure our 'all-reads' analysis matches the code
    out_dup_marks = np.array([r.is_duplicate for r in out_reads
                              if (not r.is_unmapped) and (not r.mate_is_unmapped)])
    test_dup_marks = np.array([r.is_duplicate for r in in_reads
                               if (not r.is_unmapped) and (not r.mate_is_unmapped)])

    print "len(start_bam): %d -- len(out_bam): %d" % (len(out_dup_marks), len(test_dup_marks))

    eq = (out_dup_marks == test_dup_marks).all()
    print "mean dups code: %f" % out_dup_marks.mean()
    print "mean dups test: %f" % test_dup_marks.mean()
    self.assertTrue(eq)

    # Read the molecule count histogram and verify
    count_hist = json.load(file(OUT_JSON))['no_filter_full_use_bcs']
    dups = sum([(int(times_observed) - 1) * n for (times_observed, n) in count_hist.items()])
    total_reads = sum([int(times_observed) * n for (times_observed, n) in count_hist.items()])
    summary_dup_rate = float(dups) / total_reads

    mapped_in_reads = np.array([r.is_duplicate for r in in_reads
                                if not (r.is_unmapped or r.mate_is_unmapped)
                                and tk_io.get_read_barcode(r) is not None])
    self.assertEqual(summary_dup_rate, mapped_in_reads.mean())

    # Get the perfect reads, mark dups and compare stats
    perfect_reads = [x for x in in_reads if tenkit.read_filter.stringent_read_filter(x, True)]
    mark_duplicates(perfect_reads)

    # Read the molecule count histogram and verify -- perfect reads
    count_hist = json.load(file(OUT_JSON))['full_use_bcs']
    dups = sum([(int(times_observed) - 1) * n for (times_observed, n) in count_hist.items()])
    total_reads = sum([int(times_observed) * n for (times_observed, n) in count_hist.items()])
    summary_dup_rate = float(dups) / total_reads

    mapped_in_reads = np.array([r.is_duplicate for r in perfect_reads])
    self.assertEqual(summary_dup_rate, mapped_in_reads.mean())
def get_allele_read_info(chrom, pos, ref, alt_alleles, min_mapq_counts,
                         min_mapq_for_mean, min_mapq_for_bc, default_indel_qual,
                         bam, reference_pyfasta, max_reads=1000, match=1,
                         mismatch=-4, gap_open=-6, gap_extend=-1):
    all_alleles = [ref] + alt_alleles
    bc_qual_maps = [{} for j in xrange(len(all_alleles))]
    counts = [0 for x in all_alleles]
    diffs = [[] for x in all_alleles]
    mapq_sums = [0.0 for x in all_alleles]
    mapq_denoms = [0.0 for x in all_alleles]
    molecule_differences = [[] for x in all_alleles]
    rescued = [[] for x in all_alleles]
    num_reads = 0
    qnames = set()

    for read in bam.fetch(chrom, pos, pos + 1):
        num_reads += 1
        if read.qname in qnames:
            continue
        qnames.add(read.qname)
        if read.is_duplicate:
            continue
        if num_reads > max_reads:
            break

        is_indel_variant = False
        for allele in alt_alleles:
            if len(allele) != len(ref):
                is_indel_variant = True

        allele_index_in_read = read_contains_allele_sw(
            ref, all_alleles, pos, read, reference_pyfasta[chrom],
            match=match, mismatch=mismatch, gap_open=gap_open, gap_extend=gap_extend)

        for (allele_index, allele) in enumerate(all_alleles):
            if allele_index != allele_index_in_read:
                continue

            read_tags = dict(read.tags)
            if read_tags.get("AS") is not None and read_tags.get("XS") is not None:
                diffs[allele_index].append(float(read_tags.get("AS")) - float(read_tags.get("XS")))
            if read_tags.get('OM') is not None:
                if read.mapq >= 30 and read_tags.get('OM') < 30:
                    rescue = 1
                else:
                    rescue = 0
                rescued[allele_index].append(rescue)
            if read_tags.get("DM") is not None:
                molecule_differences[allele_index].append(float(read_tags.get("DM")))

            if read.mapq >= min_mapq_for_mean:
                mapq_sums[allele_index] += read.mapq
                mapq_denoms[allele_index] += 1
            if read.mapq >= min_mapq_counts:
                counts[allele_index] += 1
            if read.mapq >= min_mapq_for_bc:
                bc = tk_io.get_read_barcode(read)
                if bc is None:
                    continue
                cigar_map = tk_seq.get_cigar_map(read.cigar)
                try:
                    read_offset = cigar_map.index(pos - read.pos - 1)
                except ValueError:
                    continue

                if allele == ref:
                    if is_indel_variant:
                        qual = str(default_indel_qual)
                    else:
                        qual = str(ord(min(read.qual[read_offset:read_offset + len(allele)])))
                    bc_quals = bc_qual_maps[allele_index].setdefault(bc, [])
                    bc_quals.append(qual)
                # SNP
                elif len(allele) == 1 and len(ref) == 1:
                    if is_indel_variant:
                        qual = str(default_indel_qual)
                    else:
                        qual = str(ord(read.qual[read_offset]))
                    bc_quals = bc_qual_maps[allele_index].setdefault(bc, [])
                    bc_quals.append(qual)
                # Insertion
                elif len(allele) > len(ref) and allele.startswith(ref):
                    bc_quals = bc_qual_maps[allele_index].setdefault(bc, [])
                    bc_quals.append(str(default_indel_qual))
                # Deletion
                elif len(allele) < len(ref) and ref.startswith(allele):
                    bc_quals = bc_qual_maps[allele_index].setdefault(bc, [])
                    bc_quals.append(str(default_indel_qual))
                else:
                    bc_quals = bc_qual_maps[allele_index].setdefault(bc, [])
                    bc_quals.append(str(default_indel_qual))

    bc_qual_strings = []
    for bc_qual_map in bc_qual_maps:
        bc_qual_strings.append([])
        for bc, bc_quals in bc_qual_map.iteritems():
            bc_qual_strings[-1].append(bc + '_' + '_'.join(bc_quals))

    mapq_means = [mapq_sums[i] / mapq_denoms[i] if mapq_denoms[i] > 0 else 31
                  for i in range(len(all_alleles))]
    return (counts, mapq_means, bc_qual_strings, molecule_differences, diffs, rescued)
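# Small illustration (made-up values) of the bc_qual_strings encoding built
# above: for each allele, every barcode's quality list is flattened into a
# single underscore-delimited string.
def _example_bc_qual_string():
    bc_qual_map = {'ACGT-1': ['37', '41'], 'TTTT-1': ['20']}
    strings = [bc + '_' + '_'.join(quals) for bc, quals in sorted(bc_qual_map.items())]
    assert strings == ['ACGT-1_37_41', 'TTTT-1_20']
    return strings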
def bc_key_func(read):
    return tk_io.get_read_barcode(read)
def create_bc_matrix_step(bam_filename, chrom, starts, stops, win, step, bc_map,
                          min_mapq=30, read1_only=False, no_split=False):
    """ Creates a (BCs x Windows) sparse matrix of barcodes (rows) versus
    windowed locations (columns) """
    step = min(max(1, step), win)
    in_bam = tk_bam.create_bam_infile(bam_filename)
    nsteps_per_win = int(np.ceil(win / float(step)))
    nbcs = len(bc_map.values())

    bc_idx = []
    win_idx = []
    win_starts = []
    win_stops = []
    for start, stop in zip(starts, stops):
        nwin = int(np.ceil((stop - start) / float(step)))
        for read in in_bam.fetch(str(chrom), start, stop):
            mapq = read.mapq
            if mapq < min_mapq or (read1_only and read.is_read2) or \
               read.is_duplicate or read.is_secondary:
                continue
            if no_split and ('SA' in [t[0] for t in read.tags]):
                continue
            bc = tk_io.get_read_barcode(read)
            if bc is None or bc == '' or bc not in bc_map:
                continue
            pos = read.pos
            if pos < start or pos > stop:
                continue

            # A position falls into every (overlapping) window whose range covers it
            last_win_idx = int(np.floor((pos - start) / float(step)))
            first_win_idx = max(0, last_win_idx - nsteps_per_win + 1)
            last_win_idx = min(last_win_idx, nwin - 1)
            first_win_idx = min(first_win_idx, nwin - 1)
            bc_idx.extend([bc_map[bc] for i in range(first_win_idx, last_win_idx + 1)])
            win_idx.extend([i + len(win_starts) for i in range(first_win_idx, last_win_idx + 1)])

        win_starts.extend([start + i * step for i in range(nwin)])
        win_stops.extend([min(stop, start + i * step + win) for i in range(nwin)])

    bc_idx = np.reshape(np.array(bc_idx, ndmin=2), (1, len(bc_idx)))
    win_idx = np.reshape(np.array(win_idx, ndmin=2), (1, len(win_idx)))
    bc_mat = sp.csc_matrix((np.ones((bc_idx.shape[1],), dtype=np.float32),
                            np.concatenate((bc_idx, win_idx), axis=0)),
                           (nbcs, len(win_starts))).tolil()
    win_starts = np.array(win_starts).flatten()
    win_stops = np.array(win_stops).flatten()
    return (bc_mat, win_starts, win_stops)
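# A minimal usage sketch (hypothetical BAM path). bc_map assigns each barcode
# a dense row index; windows of length win are laid out every step bases, so
# consecutive windows overlap whenever step < win.
def _example_bc_matrix():
    bc_map = {'ACGT-1': 0, 'TTTT-1': 1}
    bc_mat, win_starts, win_stops = create_bc_matrix_step(
        'possorted.bam', 'chr1', starts=[0], stops=[100000],
        win=10000, step=5000, bc_map=bc_map)
    # bc_mat is len(bc_map) x len(win_starts); entry (i, j) counts qualifying
    # reads from barcode i whose start position falls inside window j.
    return bc_mat.sum(axis=1)  # per-barcode totals across all (overlapping) windows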
def chunk_split_func(r):
    return tk_io.get_read_barcode(r)
def read_has_barcode(r):
    return tk_io.get_read_barcode(r) is not None
def main(args, outs):
    """ Outputs barcode file """
    args.coerce_strings()
    bam_in = tk_bam.create_bam_infile(args.input)

    unsorted_temp_name = martian.make_path(outs.contig_output + '_TEMPUNSORTED')
    sorted_temp_name = martian.make_path(outs.contig_output + '_TEMPSORTED')
    base_dir = os.path.dirname(outs.contig_output)

    unsorted_temp_file = open(unsorted_temp_name, 'w')
    contig_output_file = open(outs.contig_output, 'w')

    window_size = args.window_size
    chroms = bam_in.references

    # Output the raw positions
    unsorted_temp_file.write('\t'.join(['#CHROM', 'START', 'END', 'BC_SEQ']) + '\n')

    if args.restrict_locus is None:
        bam_iter = bam_in.fetch(args.chrom)
    else:
        restrict_chrom, restrict_start, restrict_stop = tk_io.get_locus_info(args.restrict_locus)
        assert args.chrom == restrict_chrom
        bam_iter = bam_in.fetch(restrict_chrom, restrict_start, restrict_stop)

    for read in bam_iter:
        chrom = chroms[read.tid]
        start = read.pos
        end = read.aend
        if end is None:
            end = start + len(read.seq)
        bc = tk_io.get_read_barcode(read)
        if bc is not None:
            unsorted_temp_file.write('\t'.join([chrom, str(start), str(end), bc]) + '\n')

    # Sort the positions
    unsorted_temp_file.close()
    tk_tabix.sort_bc_loc_tabix(unsorted_temp_name, sorted_temp_name, temp_dir_name=base_dir)

    # Infer the contig locations
    # This header is written during join
    #contig_output_file.write('\t'.join(['#CHROM', 'START', 'END', 'BC_SEQ', 'NUM_READS']) + '\n')
    sorted_temp_file = open(sorted_temp_name, 'r')
    sorted_temp_file.readline()  # skip the header line

    old_bc_seq = None
    bc_poses = []
    for line in sorted_temp_file:
        (chrom, start, end, bc_seq) = line.strip('\n').split('\t')
        start = int(start)
        end = int(end)
        if bc_seq != old_bc_seq:
            if old_bc_seq is not None:
                frags = infer_fragments(bc_poses, window_size)
                for (frag_chrom, frag_start, frag_end, num_reads) in frags:
                    contig_output_file.write('\t'.join([
                        frag_chrom, str(frag_start - BUFFER), str(frag_end + BUFFER),
                        old_bc_seq, str(num_reads)]) + '\n')
            bc_poses = []
        old_bc_seq = bc_seq
        bc_poses.append((chrom, start, end))

    # Output for the last barcode
    if old_bc_seq is not None:
        frags = infer_fragments(bc_poses, window_size)
        for (frag_chrom, frag_start, frag_end, num_reads) in frags:
            contig_output_file.write('\t'.join([
                frag_chrom, str(frag_start - BUFFER), str(frag_end + BUFFER),
                old_bc_seq, str(num_reads)]) + '\n')

    sorted_temp_file.close()
    subprocess.check_call(['rm', sorted_temp_name])
    subprocess.check_call(['rm', unsorted_temp_name])
    contig_output_file.close()
def bc_sort_key(read):
    return (tk_io.get_read_barcode(read), read.qname)
def process_read_block(reads, count_hist, optical_dup_hist, diffusion_dup_hist):
    ''' Dedup a block of reads, then write them to the BAM in their original order.
    Defined inside a method, so `self` is captured from the enclosing scope. '''
    read_tuples = []
    for read in reads:
        # Don't consider unmapped reads
        if read.is_unmapped:
            continue

        bc_sequence = None
        if self.split_bcs:
            bc_sequence = tk_io.get_read_barcode(read)

        read_tup = (bc_sequence, read.is_read1, read.is_reverse,
                    read.tid, read.pos, read.mrnm, read.mpos)
        # Include the read and the original index so that we can get back to the original order
        read_tuples.append((read_tup, read, len(read_tuples)))

    # Sort by the read tuple -- need to do this for the groupby to get all the common items together
    read_tuples.sort(key=lambda x: x[0])

    # Both reads in a pair must be mapped, otherwise we drop the pair
    mapped_pair_tuples = itertools.ifilter(
        lambda x: (not x[1].is_unmapped) and (not x[1].mate_is_unmapped), read_tuples)

    # Group the reads by the read_tuple
    dup_groups = itertools.groupby(mapped_pair_tuples, lambda x: x[0])

    for (key, dup_group) in dup_groups:
        # Note how many dups we have
        dup_group = list(dup_group)
        n_dups = len(dup_group)

        if n_dups > 1:
            optical_dups, diffusion_dups = self.count_dups_by_distance(dup_group)
        else:
            optical_dups = 0
            diffusion_dups = 0

        # Diffusion dups encompass optical dups, so only subtract the larger count
        non_proximal_dups = n_dups - max(diffusion_dups, optical_dups)

        # If we are splitting on bcs, then only count stats for read groups with BCs
        group_bc = key[0]
        if not self.split_bcs or group_bc is not None:
            count_hist[non_proximal_dups] = count_hist.setdefault(non_proximal_dups, 0) + 1
            if optical_dups > 0:
                optical_dup_hist['count'] = optical_dup_hist.setdefault('count', 0) + optical_dups
                diffusion_dup_hist['count'] = diffusion_dup_hist.setdefault('count', 0) + diffusion_dups

        # Mark dups
        if self.output_bam:
            # Mark the first read in the set as not-a-dup
            dup_group[0][1].is_duplicate = False
            for i in range(1, n_dups):
                dup_group[i][1].is_duplicate = True

    if self.output_bam:
        for read in reads:
            self.output_bam.write(read)

    # Done processing this block of reads
    return
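# Tiny illustration (synthetic signatures) of the grouping above: reads are
# duplicates when their (bc, is_read1, is_reverse, tid, pos, mrnm, mpos)
# tuples match, and only the first read in each group stays non-duplicate.
def _example_dup_grouping():
    import itertools
    sigs = [('ACGT-1', True, False, 0, 100, 0, 250),
            ('ACGT-1', True, False, 0, 100, 0, 250),
            ('TTTT-1', True, False, 0, 100, 0, 250)]
    sigs.sort()
    marks = []
    for _, group in itertools.groupby(sigs):
        group = list(group)
        marks.extend([False] + [True] * (len(group) - 1))
    assert marks == [False, True, False]
    return marks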
def main(args, outs):
    chunk_start = args.chunk_start
    chunk_end = args.chunk_end

    bam_in = tk_bam.create_bam_infile(args.input)
    reads = tk_bam.read_bam_chunk(bam_in, (chunk_start, chunk_end))

    pgs = [tk_bam.make_pg_header(martian.get_pipelines_version(), "attach_phasing"),
           tk_bam.make_terminal_pg_header(martian.get_pipelines_version())]
    # Don't duplicate the header if it's already there; this is for developer testing purposes
    PG = bam_in.header['PG']
    for item in PG:
        if item['ID'] == "attach_phasing":
            pgs = []

    bam_out, _ = tk_bam.create_bam_outfile(outs.phased_possorted_bam, None, None,
                                           template=bam_in, pgs=pgs)

    # File with contig phasing information
    if args.fragment_phasing is not None:
        frag_phasing = tk_tabix.create_tabix_infile(args.fragment_phasing)
    else:
        frag_phasing = None

    if args.fragments is not None:
        # Fragments file for global molecule id
        frag_id_reader = tk_hdf5.DataFrameReader(args.fragments)
    else:
        frag_id_reader = None

    # Phasing data
    ph_db = None
    ph_db_chrom = None
    ph_db_start = None
    ph_db_end = None

    # Fragment data - for global molecule id
    fr_db = None
    fr_db_chrom = None
    fr_db_start = None
    fr_db_end = None

    total_reads = 0
    phased_reads = 0
    molecule_tagged_reads = 0

    for r in reads:
        chrom = bam_in.references[r.tid]
        pos = r.pos
        bc = tk_io.get_read_barcode(r)
        total_reads += 1

        tags = r.tags

        # Strip out RX and QX tags
        #strip_tags = [RAW_BARCODE_TAG, RAW_BARCODE_QUAL_TAG]
        # Actually don't strip
        strip_tags = []
        tags = [(tag, value) for (tag, value) in tags if (tag not in strip_tags)]

        # Fetch from the fragment phasing file to get records that should cover many future reads.
        # The fragment phasing file may not exist in the ALIGNER-only pipeline, so it may need to be skipped.
        if frag_phasing is not None:
            if ph_db is None or chrom != ph_db_chrom or pos < ph_db_start or pos > ph_db_end:
                ph_db, (ph_db_chrom, ph_db_start, ph_db_end) = \
                    get_frag_phasing_db(frag_phasing, chrom, pos, window=WINDOW_SIZE)

            if bc is not None and ph_db.has_key(bc):
                frags = ph_db[bc]

                # See if we have phasing for this fragment
                valid_phasing = [x for x in frags if x['start'] <= r.pos and x['end'] > r.pos]
                assert len(valid_phasing) < 2
                if len(valid_phasing) == 1:
                    phased_reads += 1
                    read_phasing = valid_phasing[0]
                    tags.append((PHASE_SET_BAM_TAG, read_phasing['ps']))
                    tags.append((HAPLOTYPE_BAM_TAG, read_phasing['hap']))
                    tags.append((PHASING_CONF_BAM_TAG, read_phasing['pc']))

        if frag_id_reader is not None:
            # Fetch from the fragments file to get records that should cover many future reads
            if fr_db is None or chrom != fr_db_chrom or pos < fr_db_start or pos > fr_db_end:
                fr_db, (fr_db_chrom, fr_db_start, fr_db_end) = \
                    get_molecule_id_db(frag_id_reader, chrom, pos, window=WINDOW_SIZE)

            if bc is not None and fr_db.has_key(bc):
                frags = fr_db[bc]

                # See if we have a molecule id for this fragment
                molecule_ids = [x for x in frags if x['start'] <= r.pos and x['end'] > r.pos]
                assert len(molecule_ids) < 2
                if len(molecule_ids) == 1:
                    molecule_tagged_reads += 1
                    molecule_id = molecule_ids[0]
                    tags.append((MOLECULE_ID_BAM_TAG, molecule_id['molecule_id']))

        r.tags = tags
        bam_out.write(r)

    bam_out.close()

    outs.total_reads = total_reads
    outs.phased_reads = phased_reads
    outs.molecule_tagged_reads = molecule_tagged_reads
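# Sketch (synthetic fetch function, not part of the pipeline) of the
# windowed-cache pattern used above: the per-barcode record db is re-fetched
# only when the current read falls outside the cached window, so
# position-sorted reads mostly hit the cache.
def _example_window_cache(positions, fetch, window=1000):
    db, db_start, db_end = None, None, None
    hits = []
    for pos in positions:
        if db is None or pos < db_start or pos > db_end:
            db_start, db_end = pos, pos + window
            db = fetch(db_start, db_end)  # e.g. a tabix region query
        hits.append(db.get(pos))
    return hits
# e.g. _example_window_cache([1, 2, 5000], lambda s, e: dict((p, p) for p in range(s, e)))
# performs only two fetches.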