def lowercase_below_qual_threshold(seq, qual, threshold):
    ''' Returns a copy of seq in which every character whose decoded quality
        is less than or equal to threshold has been made lowercase.

        qual is decoded with fastq.decode_sanger before comparison. Positions
        are paired off one-to-one, so any characters of seq that extend past
        the end of the decoded qualities are returned unchanged.
    '''
    bases = list(seq)
    scores = fastq.decode_sanger(qual)

    # zip stops at the shorter of the two, matching a pairwise walk over
    # (base, quality) tuples.
    for index, score in zip(range(len(bases)), scores):
        if score <= threshold:
            bases[index] = bases[index].lower()

    return ''.join(bases)
def align_reads(
    target_fasta_fn,
    reads,
    bam_fn,
    min_path_length=15,
    error_fn='/dev/null',
    alignment_type='overlap',
):
    ''' Aligns reads to targets in target_fasta_fn by Smith-Waterman, storing
        alignments in bam_fn and yielding unaligned reads.

        Every read is tried in both orientations (as given, and reverse
        complemented) against every target. The top alignment returned by
        generate_alignments is kept when its path has at least
        min_path_length entries and score / (2 * len(path)) exceeds 0.8.

        For a read with at least one kept alignment, only the alignments
        sharing the highest 'AS' score are written: mapping quality is 2 if
        there is exactly one of them and 1 otherwise, and all but the first
        are flagged secondary.

        This is a generator. A read with no kept alignment is yielded (the
        read as it was supplied, not its reverse complement). After the input
        is exhausted, the 'input', 'aligned' and 'unaligned' counts are
        written to error_fn; nothing is written there if the generator is
        abandoned early.
    '''
    targets = {r.name: r.seq for r in fasta.reads(target_fasta_fn)}
    target_names = sorted(targets)
    target_lengths = [len(targets[n]) for n in target_names]

    alignment_sorter = sam.AlignmentSorter(
        target_names,
        target_lengths,
        bam_fn,
    )

    statistics = Counter()

    with alignment_sorter:
        for original_read in reads:
            statistics['input'] += 1
            alignments = []

            rc_read = fastq.Read(
                original_read.name,
                utilities.reverse_complement(original_read.seq),
                original_read.qual[::-1],
            )

            for read, is_reverse in ([original_read, False], [rc_read, True]):
                qual = fastq.decode_sanger(read.qual)

                # items() and next() are used instead of iteritems() and
                # .next() so that this works on Python 2 and Python 3 alike.
                for target_name, target_seq in targets.items():
                    alignment = generate_alignments(
                        read.seq, target_seq, alignment_type)[0]
                    path = alignment['path']

                    if len(path) >= min_path_length and alignment['score'] / (
                            2. * len(path)) > 0.8:
                        aligned_segment = pysam.AlignedSegment()
                        aligned_segment.seq = read.seq
                        aligned_segment.query_qualities = qual
                        aligned_segment.is_reverse = is_reverse

                        char_pairs = make_char_pairs(
                            path, read.seq, target_seq)
                        cigar = sam.aligned_pairs_to_cigar(char_pairs)

                        # Query bases before the first / after the last
                        # aligned query index are represented as soft clips.
                        clip_from_start = first_query_index(path)
                        if clip_from_start > 0:
                            cigar = [(sam.BAM_CSOFT_CLIP, clip_from_start)
                                     ] + cigar
                        clip_from_end = len(
                            read.seq) - 1 - last_query_index(path)
                        if clip_from_end > 0:
                            cigar = cigar + [
                                (sam.BAM_CSOFT_CLIP, clip_from_end)
                            ]
                        aligned_segment.cigar = cigar

                        read_aligned, ref_aligned = zip(*char_pairs)
                        md = sam.alignment_to_MD_string(
                            ref_aligned, read_aligned)
                        aligned_segment.set_tag('MD', md)
                        aligned_segment.set_tag('AS', alignment['score'])

                        aligned_segment.tid = alignment_sorter.get_tid(
                            target_name)
                        aligned_segment.query_name = read.name
                        aligned_segment.next_reference_id = -1
                        aligned_segment.reference_start = first_target_index(
                            path)

                        alignments.append(aligned_segment)

            if alignments:
                statistics['aligned'] += 1

                sorted_alignments = sorted(
                    alignments, key=lambda m: m.get_tag('AS'), reverse=True)
                grouped = utilities.group_by(
                    sorted_alignments, key=lambda m: m.get_tag('AS'))
                _, highest_group = next(grouped)

                primary_already_assigned = False
                for alignment in highest_group:
                    if len(highest_group) == 1:
                        alignment.mapping_quality = 2
                    else:
                        alignment.mapping_quality = 1

                    if not primary_already_assigned:
                        primary_already_assigned = True
                    else:
                        alignment.is_secondary = True

                    alignment_sorter.write(alignment)
            else:
                statistics['unaligned'] += 1

                # Yield the read exactly as it came in. The inner loop
                # variable `read` is left bound to rc_read once the
                # orientation loop finishes, so yielding it would hand back
                # the reverse complement instead of the input read.
                yield original_read

    with open(error_fn, 'w') as error_fh:
        for key in ['input', 'aligned', 'unaligned']:
            error_fh.write('{0}: {1:,}\n'.format(key, statistics[key]))
def trim_mismatches_from_start(mapping, region_fetcher, type_counts):
    ''' Strips the run of consecutive Q30+ mismatches found at the start of
        the read from an alignment, on the assumption that they are
        untemplated additions made during reverse transcription.

        Every aligned position (not only the trimmed ones) is tallied into
        type_counts under the key
        (qlen, position in read, quality, ref base index, read base index).

        Mappings that are unmapped or contain an indel are passed to
        set_nongenomic_length(mapping, 0) and returned as they are. If
        nothing needs trimming the original mapping object is returned;
        otherwise a new pysam.AlignedRead is built. Raises ValueError if no
        position ends the mismatch run.

        NOTE(review): a function with this same name is defined a second
        time further down this file; the later definition is the one that
        is in effect after import.
    '''
    if sam.contains_indel_pysam(mapping) or mapping.is_unmapped:
        set_nongenomic_length(mapping, 0)
        return mapping

    reverse = mapping.is_reverse
    if reverse:
        # Walk a reverse-strand alignment from its far end so that the
        # start of the read always comes first.
        pairs = mapping.aligned_pairs[::-1]
        index_lookup = utilities.base_to_complement_index
    else:
        pairs = mapping.aligned_pairs
        index_lookup = utilities.base_to_index

    quals = fastq.decode_sanger(mapping.qual)

    num_to_trim = 0
    still_trimming = True
    anchor_ref_index = None

    for read_i, ref_i in pairs:
        if read_i is None:
            # Not expected to happen: alignments containing indels have
            # already been returned above.
            continue

        position_in_read = mapping.qlen - 1 - read_i if reverse else read_i

        ref_base = region_fetcher(mapping.tid, ref_i, ref_i + 1)
        read_base = mapping.seq[read_i]
        q = quals[read_i]

        key = (
            mapping.qlen,
            position_in_read,
            q,
            index_lookup[ref_base],
            index_lookup[read_base],
        )
        type_counts[key] += 1

        if still_trimming:
            if read_base != ref_base and q >= 30:
                num_to_trim += 1
            else:
                anchor_ref_index = ref_i
                still_trimming = False

    if anchor_ref_index is None:
        raise ValueError('first_ref_index not set')

    if num_to_trim == 0:
        return mapping

    trimmed = pysam.AlignedRead()
    trimmed.qname = mapping.qname
    trimmed.tid = mapping.tid
    # anchor_ref_index is the reference position aligned to the first
    # untrimmed read base. For a forward mapping that becomes the new pos;
    # for a reverse mapping the trimmed bases sit at the other end, so pos
    # is unchanged.
    trimmed.pos = mapping.pos if reverse else anchor_ref_index
    trimmed.is_reverse = mapping.is_reverse
    trimmed.is_secondary = mapping.is_secondary
    trimmed.mapq = mapping.mapq

    if reverse:
        # num_to_trim is nonzero here, so -num_to_trim is never "minus zero".
        keep = slice(None, -num_to_trim)
    else:
        keep = slice(num_to_trim, None)
    trimmed.seq = mapping.seq[keep]
    trimmed.qual = mapping.qual[keep]
    trimmed.rnext = -1
    trimmed.pnext = -1

    remaining = len(mapping.seq) - num_to_trim
    if reverse:
        # Drop cigar blocks from the end.
        trimmed.cigar = sam.truncate_cigar_blocks_up_to(
            mapping.cigar, remaining)
    else:
        # Drop cigar blocks from the beginning.
        trimmed.cigar = sam.truncate_cigar_blocks_from_beginning(
            mapping.cigar, remaining)

    return trimmed
def combine_paired_mappings(R1_mapping, R2_mapping, verbose=False):
    ''' Takes two pysam mappings representing opposite ends of a fragment and
        combines them into one mapping, (ab)using BAM_CREF_SKIP to bridge the
        gap (if any) between them.

        The mapping on the '+' side of R1's strand is treated as the left
        mapping and the other as the right mapping. Each mapping is split at
        the edge of the other into an overlapping part and a non-overlapping
        part. When there is an overlap, one of the two mappings is chosen to
        supply the overlap (see the comments in the body for how); when there
        is none, a BAM_CREF_SKIP block of the gap's length joins them.

        Returns a new pysam.AlignedRead carrying the combined seq, qual,
        cigar and an 'MD' tag. If the overlap taken from the mapping that was
        NOT used is non-empty, it is recorded in tags 'Xs' (seq) and 'Xq'
        (qual), and 'Xw' records which side ('left' or 'right') was used.

        Returns False if the two mappings disagree about the structure of
        their overlap and realigning each against the other's cigar gives
        the same number of mismatches.

        If verbose is true, details of disagreeing overlaps are logged even
        when the disagreement can be resolved.
    '''
    R1_strand = sam.get_strand(R1_mapping)
    # NOTE(review): if get_strand returns anything other than '+' or '-',
    # left_mapping and right_mapping are never bound and the next statement
    # fails with a NameError - confirm that cannot happen.
    if R1_strand == '+':
        left_mapping, right_mapping = R1_mapping, R2_mapping
    elif R1_strand == '-':
        left_mapping, right_mapping = R2_mapping, R1_mapping

    left_md = dict(left_mapping.tags)['MD']
    right_md = dict(right_mapping.tags)['MD']

    # --- Split the right mapping into [overlap | after overlap] ---
    # The split point is the first aligned pair whose reference position is
    # at or beyond left_mapping.aend; if there is none, the whole right
    # mapping counts as overlap.
    right_aligned_pairs = sam.cigar_to_aligned_pairs(
        right_mapping.cigar, right_mapping.reference_start)
    right_after_overlap_pair_index = len(right_aligned_pairs)
    for i, (read, ref) in enumerate(right_aligned_pairs):
        if ref != None and ref >= left_mapping.aend:
            right_after_overlap_pair_index = i
            break

    right_overlap_pairs = right_aligned_pairs[:right_after_overlap_pair_index]
    right_after_overlap_pairs = right_aligned_pairs[
        right_after_overlap_pair_index:]

    # Pairs that consume a read base (read entry neither None nor 'N') and
    # pairs that consume a reference base, respectively, after the overlap.
    right_reads_after = [
        read for read, ref in right_after_overlap_pairs
        if read != None and read != 'N'
    ]
    right_refs_after = [
        ref for read, ref in right_after_overlap_pairs if ref != None
    ]

    right_overlap_cigar = sam.aligned_pairs_to_cigar(right_overlap_pairs)
    right_after_overlap_cigar = sam.aligned_pairs_to_cigar(
        right_after_overlap_pairs)
    right_after_overlap_md = sam.truncate_md_string_from_beginning(
        right_md, len(right_refs_after))

    # Index into the right read at which the non-overlapping part begins.
    right_after_overlap_read_start = len(
        right_mapping.seq) - len(right_reads_after)
    right_overlap_seq = right_mapping.seq[:right_after_overlap_read_start]
    right_overlap_qual = right_mapping.qual[:right_after_overlap_read_start]
    right_after_overlap_seq = right_mapping.seq[
        right_after_overlap_read_start:]
    right_after_overlap_qual = right_mapping.qual[
        right_after_overlap_read_start:]

    # --- Split the left mapping into [before overlap | overlap] ---
    # Scanning from the end, the split point is the last aligned pair whose
    # reference position is before right_mapping.pos; if there is none
    # (index stays -1), the whole left mapping counts as overlap.
    left_aligned_pairs = sam.cigar_to_aligned_pairs(
        left_mapping.cigar, left_mapping.reference_start)
    left_before_overlap_pair_index = -1
    for i, (read, ref) in list(enumerate(left_aligned_pairs))[::-1]:
        if ref != None and ref < right_mapping.pos:
            left_before_overlap_pair_index = i
            break

    left_overlap_pairs = left_aligned_pairs[left_before_overlap_pair_index + 1:]
    left_before_overlap_pairs = left_aligned_pairs[:left_before_overlap_pair_index + 1]

    left_reads_before = [
        read for read, ref in left_before_overlap_pairs
        if read != None and read != 'N'
    ]
    left_refs_before = [
        ref for read, ref in left_before_overlap_pairs if ref != None
    ]

    left_overlap_cigar = sam.aligned_pairs_to_cigar(left_overlap_pairs)
    left_before_overlap_cigar = sam.aligned_pairs_to_cigar(
        left_before_overlap_pairs)
    left_before_overlap_md = sam.truncate_md_string_up_to(
        left_md, len(left_refs_before))

    # Index into the left read at which the overlapping part begins.
    left_overlap_read_start = len(left_reads_before)
    left_overlap_seq = left_mapping.seq[left_overlap_read_start:]
    left_overlap_qual = left_mapping.qual[left_overlap_read_start:]
    left_before_overlap_seq = left_mapping.seq[:left_overlap_read_start]
    left_before_overlap_qual = left_mapping.qual[:left_overlap_read_start]

    # --- Decide which mapping supplies the overlap ---
    if left_overlap_pairs or right_overlap_pairs:
        gap_length = 0

        left_has_splicing = sam.contains_splicing(left_mapping)
        right_has_splicing = sam.contains_splicing(right_mapping)

        if left_overlap_cigar == right_overlap_cigar:
            # If the two mappings agree about the location of indels in their
            # overlap, use the seq from the mapping with the higher average
            # quality in the overlap (ties go to the right mapping).
            left_mean_qual = np.mean(fastq.decode_sanger(left_overlap_qual))
            right_mean_qual = np.mean(fastq.decode_sanger(right_overlap_qual))
            if left_mean_qual > right_mean_qual:
                use_overlap_from = 'left'
            else:
                use_overlap_from = 'right'
        elif left_has_splicing != right_has_splicing:
            # A temporary(?) heuristic - if one read has splicing and the
            # other doesn't, use the overlap from the one with splicing under
            # the assumption that the other just has a few bases overhanging
            # the splice junction.
            if left_has_splicing:
                use_overlap_from = 'left'
            else:
                use_overlap_from = 'right'
        else:
            # If the two mappings disagree about the location of indels in
            # their overlap, we need a heuristic for picking which mapping we
            # believe reflects the true structure of the input fragment. The
            # most innocuous explanation is that a 'true' indel happened to
            # lie close to the edge of one of the mappings. A more
            # problematic situation is a 'false' indel (that is, produced
            # during cluster generation or sequencing-by-synthesis, NOT
            # template production). Our strategy is: realign the overlapping
            # part of the left mapping starting from the left edge of the
            # overlap according to the cigar of the right mapping, and
            # realign the overlapping part of the right mapping starting from
            # the right edge of the overlap according to the cigar of the
            # left mapping. Count the number of mismatches produced by each.
            # If the left overlap can accommodate the right cigar with fewer
            # mismatches, use the right cigar and seq. If the right overlap
            # can accommodate the left cigar with fewer mismatches, use the
            # left cigar and seq.

            # The leftmost aligned_pair from the right mapping is guaranteed
            # by the mapping process to not involve a gap.
            # NOTE(review): this indexing assumes right_overlap_pairs (and,
            # below, left_overlap_pairs) is non-empty; the enclosing `if`
            # only guarantees that at least one of the two is - confirm.
            _, overlap_ref_start = right_overlap_pairs[0]
            # Similarly, the rightmost aligned_pair from the left mapping
            # can't be a gap.
            _, overlap_ref_end = left_overlap_pairs[-1]

            realigned_left_cigar = sam.truncate_cigar_blocks_up_to(
                right_mapping.cigar, len(left_overlap_seq))
            realigned_right_cigar = sam.truncate_cigar_blocks_from_beginning(
                left_mapping.cigar, len(right_overlap_seq))

            ref_dict = sam.merge_ref_dicts(
                sam.ref_dict_from_mapping(left_mapping),
                sam.ref_dict_from_mapping(right_mapping),
            )

            try:
                left_using_right_mismatches = realigned_mismatches(
                    left_overlap_seq, overlap_ref_start, realigned_left_cigar,
                    ref_dict)
                right_using_left_mismatches = realigned_mismatches_backwards(
                    right_overlap_seq, overlap_ref_end, realigned_right_cigar,
                    ref_dict)
            except ValueError:
                # Dump both mappings for debugging, then propagate the error.
                print left_mapping
                print right_mapping
                raise

            if verbose:
                logging.info('disagreements in {0}'.format(left_mapping.qname))
                logging.info('left overlap cigar is {0}'.format(
                    str(left_overlap_cigar)))
                logging.info('right overlap cigar is {0}'.format(
                    str(right_overlap_cigar)))
                logging.info('left_using_right_mismatches - {0}'.format(
                    len(left_using_right_mismatches)))
                logging.info('right_using_left_mismatches - {0}'.format(
                    len(right_using_left_mismatches)))

            if len(left_using_right_mismatches) < len(
                    right_using_left_mismatches):
                use_overlap_from = 'right'
            elif len(right_using_left_mismatches) < len(
                    left_using_right_mismatches):
                use_overlap_from = 'left'
            else:
                # Neither realignment is better: log the details (regardless
                # of verbose) and give up on combining this pair.
                logging.info('disagreements in {0}'.format(left_mapping.qname))
                logging.info('left overlap cigar is {0}'.format(
                    str(left_overlap_cigar)))
                logging.info('right overlap cigar is {0}'.format(
                    str(right_overlap_cigar)))
                logging.info('left_using_right_mismatches - {0}'.format(
                    len(left_using_right_mismatches)))
                logging.info('right_using_left_mismatches - {0}'.format(
                    len(right_using_left_mismatches)))
                logging.info('ambiguous disagreement')
                return False
    else:
        gap_length = right_mapping.pos - left_mapping.aend
        # It doesn't matter what use_overlap_from is set to; there is no overlap
        use_overlap_from = 'left'

    # --- Build the combined mapping ---
    combined_mapping = pysam.AlignedRead()
    combined_mapping.qname = left_mapping.qname
    combined_mapping.tid = left_mapping.tid
    combined_mapping.mapq = min(left_mapping.mapq, right_mapping.mapq)
    combined_mapping.rnext = -1
    combined_mapping.pnext = -1

    combined_mapping.pos = left_mapping.pos
    # The combined mapping takes its strand from R1.
    if R1_strand == '-':
        combined_mapping.is_reverse = True

    # When the mappings overlap, gap_length is 0 and this is a zero-length
    # skip block.
    gap_cigar = [(sam.BAM_CREF_SKIP, gap_length)]

    if use_overlap_from == 'left':
        # Whole left mapping + the part of the right mapping past the overlap.
        combined_mapping.seq = left_mapping.seq + right_after_overlap_seq
        combined_mapping.qual = left_mapping.qual + right_after_overlap_qual
        combined_mapping.cigar = left_mapping.cigar + gap_cigar + right_after_overlap_cigar
        combined_md = sam.combine_md_strings(left_md, right_after_overlap_md)
        combined_mapping.setTag('MD', combined_md)

        overlap_seq_tag = right_overlap_seq
        overlap_qual_tag = right_overlap_qual
    elif use_overlap_from == 'right':
        # The part of the left mapping before the overlap + whole right mapping.
        combined_mapping.seq = left_before_overlap_seq + right_mapping.seq
        combined_mapping.qual = left_before_overlap_qual + right_mapping.qual
        combined_mapping.cigar = left_before_overlap_cigar + gap_cigar + right_mapping.cigar
        combined_md = sam.combine_md_strings(left_before_overlap_md, right_md)
        combined_mapping.setTag('MD', combined_md)

        overlap_seq_tag = left_overlap_seq
        overlap_qual_tag = left_overlap_qual

    if len(overlap_seq_tag) > 0:
        # Having empty tags causes problems, so don't create them.
        combined_mapping.setTag('Xs', overlap_seq_tag)
        combined_mapping.setTag('Xq', overlap_qual_tag)
        combined_mapping.setTag('Xw', use_overlap_from)

    return combined_mapping
def collapseUMIs(reads, readThres, outfile):
    ''' Collapse reads into one sequence per cellBC-UMI group, assuming the
        cellBC and UMI in each read name are correct.

        reads: iterable of records with .name, .seq and .qual. The name is
            split on '_'; fields 1 and 2 are taken as cellBC and UMI and
            field 3 as an integer read count.
        readThres: groups whose summed read count is below this value are
            skipped.
        outfile: path of the tab-separated table to write, with columns
            cellBC, UMI, readCount, consensusSeq.

        Reads whose mean decoded quality over the first 50 positions is
        below 20 are discarded. For each remaining group with more than one
        read, sequences are trimmed to a common length and the most common
        trimmed sequence is reported if it holds more than half of the
        group's reads; otherwise get_consensus decides. A histogram of reads
        per group is saved to "collapsedUMIs_reads_per_umi.init.png" in the
        current directory, and summary counts are printed.
    '''
    numReadsQualFilt = 0
    UMIGrps = {}
    for r in reads:
        avgQ50 = np.mean(fastq.decode_sanger(r.qual[0:50]))
        if avgQ50 < 20:
            numReadsQualFilt += 1
            continue

        n = r.name.split('_')
        cellGroup = n[1] + "_" + n[2]
        readcount = int(n[3])
        seqs, counts = UMIGrps.setdefault(cellGroup, [[], []])
        seqs.append(r.seq)
        counts.append(readcount)

    print("# of cell-UMI groups: " + str(len(UMIGrps)) + " (includes <" +
          str(readThres) + ")")

    read_dist = [sum(counts) for _, counts in UMIGrps.values()]

    plt.figure(figsize=(14, 10))
    plt.hist(read_dist, log=True)
    plt.ylabel("Frequency")
    plt.xlabel("Number of Reads")
    plt.title("Reads Per UMI")
    plt.savefig("collapsedUMIs_reads_per_umi.init.png")
    plt.close()

    # NOTE(review): readThresh (99th percentile / 10) is only reported here;
    # the filter below uses the readThres argument. Confirm which of the two
    # is meant to be applied.
    if read_dist:
        # Skipped when no read survived the quality filter, since a
        # percentile of an empty distribution is undefined.
        readThresh = np.percentile(read_dist, 99) / 10
        print("Filtering out UMIs with less than " + str(readThresh) +
              " reads")

    numBelowReadThres = 0
    numMaj = 0
    numCon = 0
    numSingles = 0
    counter = 1

    # The context manager guarantees the output file is closed even if
    # consensus finding raises part-way through.
    with open(outfile, 'w') as fh:
        fh.write("cellBC\tUMI\treadCount\tconsensusSeq\n")

        for k in UMIGrps:
            # Each UMI group consists of reads from the same molecule.
            seqs, counts = UMIGrps[k]
            grpSize = sum(counts)
            if grpSize < readThres:
                # Too few reads to include.
                numBelowReadThres += 1
                continue

            n = k.split("_")
            if len(seqs) == 1:
                # Trivial case: a single sequence is its own consensus.
                numSingles += 1
                fh.write("\t".join(
                    [str(n[0]), str(n[1]), str(counts[0]), seqs[0]]) + "\n")
            else:
                # To let more groups be settled by simple majority instead
                # of consensus finding, trim every sequence to the length of
                # the read at which the cumulative read count (reads ranked
                # by ascending length) first reaches 30% of the group total.
                s1 = pd.DataFrame({"seq": seqs, "readCount": counts})
                s1["seqLen"] = s1["seq"].str.len()
                s1 = s1.sort_values("seqLen").reset_index(drop=True)

                totalReads = s1["readCount"].sum()
                cReads = s1["readCount"].cumsum()
                rPctile = 0.3 * totalReads
                rPctileIndex = cReads[cReads >= rPctile].index[0]
                sLen = s1.loc[rPctileIndex, "seqLen"]
                s1["seq"] = s1["seq"].str[0:sLen]

                # Read counts per distinct trimmed sequence, largest first
                # (indexed by seq).
                s2 = s1.groupby(["seq"]).agg({
                    "readCount": "sum"
                }).sort_values("readCount", ascending=False)

                grpProp = s2.loc[s2.index[0], "readCount"] / float(totalReads)
                if grpProp > .50:
                    consensusSeq = s2.index[0]
                    numMaj += 1
                else:
                    consensusSeq = get_consensus(s2.index.tolist(),
                                                 s2["readCount"].tolist())
                    numCon += 1

                fh.write("\t".join([
                    str(n[0]),
                    str(n[1]),
                    str(totalReads), consensusSeq
                ]) + "\n")

            counter += 1
            if counter % 1000 == 0:
                print(str(counter) + " groups processed...")

    print("# of cell-UMI groups = " + str(len(UMIGrps)))
    print("# reads qual <20 (filtered) = " + str(numReadsQualFilt))
    print("# grps w/ reads<" + str(readThres) + " = " + str(numBelowReadThres))
    print("# grps singles = " + str(numSingles))
    print("# grps >0.5 = " + str(numMaj))
    print("# grps concensus = " + str(numCon))
def trim_mismatches_from_start(mapping, region_fetcher, type_counts):
    ''' Removes the leading run of Q30+ mismatches from an alignment (taken
        to be untemplated additions from reverse transcription) and records
        the ref/read base combination seen at every aligned position in
        type_counts.

        "Leading" is relative to the read: for a reverse-strand mapping the
        aligned pairs are visited in reverse order and bases are removed
        from the end of the stored sequence.

        Unmapped or indel-containing mappings are given to
        set_nongenomic_length(mapping, 0) and returned untouched. The input
        mapping itself is returned when there is nothing to trim; otherwise
        a freshly built pysam.AlignedRead is returned. A ValueError is
        raised when every visited position is a Q30+ mismatch.

        NOTE(review): this file contains an earlier definition with the same
        name; this later one replaces it when the module is loaded.
    '''
    if sam.contains_indel_pysam(mapping) or mapping.is_unmapped:
        set_nongenomic_length(mapping, 0)
        return mapping

    on_reverse_strand = mapping.is_reverse

    if on_reverse_strand:
        pairs_from_read_start = mapping.aligned_pairs[::-1]
        base_index = utilities.base_to_complement_index
    else:
        pairs_from_read_start = mapping.aligned_pairs
        base_index = utilities.base_to_index

    qualities = fastq.decode_sanger(mapping.qual)

    leading_mismatches = 0
    trim_point_found = False
    new_start = None

    for query_pos, ref_pos in pairs_from_read_start:
        if query_pos is None:
            # Should be unreachable, because alignments with indels were
            # filtered out at the top.
            continue

        if on_reverse_strand:
            pos_from_read_start = mapping.qlen - 1 - query_pos
        else:
            pos_from_read_start = query_pos

        ref_base = region_fetcher(mapping.tid, ref_pos, ref_pos + 1)
        read_base = mapping.seq[query_pos]
        quality = qualities[query_pos]

        type_counts[(
            mapping.qlen,
            pos_from_read_start,
            quality,
            base_index[ref_base],
            base_index[read_base],
        )] += 1

        if trim_point_found:
            continue

        if read_base != ref_base and quality >= 30:
            leading_mismatches += 1
        else:
            new_start = ref_pos
            trim_point_found = True

    if new_start is None:
        raise ValueError('first_ref_index not set')

    if not leading_mismatches:
        return mapping

    result = pysam.AlignedRead()
    result.qname = mapping.qname
    result.tid = mapping.tid

    # new_start holds the reference index aligned to the first base that
    # survives trimming. That is the new pos for a forward mapping; a
    # reverse mapping keeps its pos because its trimmed bases are on the
    # right-hand side.
    if on_reverse_strand:
        result.pos = mapping.pos
    else:
        result.pos = new_start

    result.is_reverse = mapping.is_reverse
    result.is_secondary = mapping.is_secondary
    result.mapq = mapping.mapq

    # leading_mismatches is strictly positive here, so the negative slice
    # bound can never be -0.
    if on_reverse_strand:
        kept = slice(None, -leading_mismatches)
    else:
        kept = slice(leading_mismatches, None)

    result.seq = mapping.seq[kept]
    result.qual = mapping.qual[kept]
    result.rnext = -1
    result.pnext = -1

    kept_length = len(mapping.seq) - leading_mismatches
    if on_reverse_strand:
        # Cigar blocks are removed from the end.
        new_cigar = sam.truncate_cigar_blocks_up_to(mapping.cigar,
                                                    kept_length)
    else:
        # Cigar blocks are removed from the beginning.
        new_cigar = sam.truncate_cigar_blocks_from_beginning(mapping.cigar,
                                                             kept_length)
    result.cigar = new_cigar

    return result