def _reference_end(read):
    '''Return the reference coordinate reached by the alignment of a read.

    Only CIGAR operations 0 and 2 (match and deletion in the pysam encoding)
    are summed, i.e. the ones that advance along the reference.
    '''
    return read.pos + sum(bl for (bt, bl) in read.cigar if bt in (0, 2))


def _block_slice_in_read(read, pos_ref, block_len):
    '''Find which slice of read.seq covers the block starting at pos_ref.

    Parameters:
        read: aligned read exposing .pos and .cigar as (operation, length)
            pairs. It is expected to start at or before pos_ref.
        pos_ref: reference coordinate at which the block starts.
        block_len: length of the block along the reference.

    Returns:
        The tuple (start, end) of indices into read.seq.
    '''
    pos_reft = read.pos
    start_found = False
    pos_read_start = 0
    pos_read_end = 0
    for (bt, bl) in read.cigar:
        if bt == 1:
            # Insertion: consumes read bases only
            if not start_found:
                pos_read_start += bl
            pos_read_end += bl

        elif bt == 2:
            # Deletion: consumes reference bases only
            if (not start_found) and (pos_reft + bl > pos_ref):
                start_found = True
            if pos_reft + bl > pos_ref + block_len:
                break
            pos_reft += bl

        else:
            # Everything else is treated as a match, consuming both
            if (not start_found) and (pos_reft + bl > pos_ref):
                pos_read_start += pos_ref - pos_reft
                start_found = True
            if pos_reft + bl > pos_ref + block_len:
                pos_read_end += pos_ref + block_len - pos_reft
                break

            if not start_found:
                pos_read_start += bl
            pos_read_end += bl
            pos_reft += bl

    return (pos_read_start, pos_read_end)


def _find_seed(cons, seed, min_identity=0.66):
    '''Locate seed inside cons, tolerating mismatches.

    An exact match is tried first. Failing that, every window of cons is
    compared to the seed and the best one is accepted only if at least
    min_identity of its positions agree.

    Returns:
        The start index of the seed in cons, or -1 if no acceptable match.
    '''
    import numpy as np

    pos_start = cons.find(seed)
    if pos_start != -1:
        return pos_start

    sl = len(seed)
    # A block shorter than the seed has no window to compare: np.argmax
    # would fail on an empty list
    if len(cons) < sl:
        return -1

    consm = np.array(list(cons), 'S1')
    seedm = np.array(list(seed), 'S1')
    n_matches = [(consm[i: i + sl] == seedm).sum()
                 for i in range(len(cons) + 1 - sl)]
    pos_start = int(np.argmax(n_matches))

    # Try to only add non-bogus stuff
    if n_matches[pos_start] < min_identity * sl:
        return -1
    return pos_start


def build_consensus(bamfilename, len_reference, VERBOSE=0,
                    block_len_initial=100,
                    reads_per_alignment=31,
                    accept_holes=False,
                    store_allele_counts=False):
    '''Build a consensus from premapped and divided reads.

    Three steps:
        1. collect reads uniformly across the fragment
        2. make local consensi
        3. join into fragmentwide consensus

    Blocks start block_len_initial // 2 bases apart, so that each local
    consensus overlaps the previous one and can be stacked onto it by
    looking for the last 20 bases of the consensus built so far.

    NOTE(review): build_local_consensus is not imported here; presumably it
    is defined at module level in this file.

    Parameters:
        bamfilename: path of the BAM file, opened with pysam.Samfile.
        len_reference: length of the reference the reads were mapped to.
        VERBOSE: verbosity level; progress is printed from level 2 up.
        block_len_initial: length of the blocks along the reference.
        reads_per_alignment: maximal number of reads per local consensus.
        accept_holes: if True, a block that cannot be stacked is appended
            after a spacer of 10 N instead of being dropped.
        store_allele_counts: if True, also collect the allele counts given
            back by build_local_consensus and return them.

    Returns:
        The consensus as a string, or the tuple (consensus, allcounts) if
        store_allele_counts is True.

    Raises:
        ValueError: if not even one local consensus could be built.
    '''
    if VERBOSE:
        print('Build consensus')

    import numpy as np
    import pysam
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio.Alphabet.IUPAC import ambiguous_dna

    half_block = block_len_initial // 2

    consensus = None
    consensi_local = []
    if store_allele_counts:
        allcounts_local = []
    pos_ref = 0
    block_len = block_len_initial
    with pysam.Samfile(bamfilename, 'rb') as bamfile:

        # Initial block
        # The first block has to make a consensus for the FIRST base, this
        # needs at least ONE read starting exactly at the first position.
        # Otherwise, position 2 is tried, and so on: that is the same as
        # taking the reads at the smallest start position, which needs a
        # single pass over the file instead of one pass per position.
        bamfile.reset()
        pos_first_block = None
        reads = []
        for read in bamfile:
            if not read.is_proper_pair:
                continue
            if not (0 <= read.pos < len_reference):
                continue
            if (pos_first_block is None) or (read.pos < pos_first_block):
                pos_first_block = read.pos
                reads = [read]
            elif read.pos == pos_first_block:
                reads.append(read)

        if reads:
            np.random.shuffle(reads)
            reads = reads[:reads_per_alignment]
            seqs = [SeqRecord(Seq(read.seq[:block_len], ambiguous_dna),
                              id=read.qname)
                    for read in reads]
            cons_local = build_local_consensus(
                seqs, VERBOSE=VERBOSE,
                store_allele_counts=store_allele_counts)
            if store_allele_counts:
                (cons_local, allcount_local) = cons_local
                allcounts_local.append(allcount_local)
            consensi_local.append(cons_local)

            # The next block starts at the first half-block boundary after
            # the start of the first block
            pos_ref += half_block * (1 + pos_first_block // half_block)
            if VERBOSE >= 2:
                print('Block n %s pos %s to %s block len %s' %
                      (len(consensi_local), pos_first_block,
                       pos_first_block + block_len, block_len))

            # Start consensus
            consensus = [consensi_local[0]]
            if store_allele_counts:
                allcounts = [allcounts_local[0]]

        # Divide reads by block (more efficient than scrolling the file every time)
        # FIXME: extract random subsample, assign to blocks, and only complete the missing blocks!
        n_blocks = max(0, (len_reference - pos_ref) // half_block)
        reads_by_block = [[] for _ in range(n_blocks)]
        # NOTE(review): this uses pos_ref, not the start of each block, so
        # the required coverage is the same for all blocks and reads can
        # hardly qualify for the last ones; pos_ref_tmp may have been meant.
        block_len_tmp = min(block_len, len_reference - pos_ref)
        bamfile.reset()
        for read in bamfile:
            if not read.is_proper_pair:
                continue
            read_start = read.pos
            read_end = _reference_end(read)
            # Only blocks that exist in reads_by_block are tried, so a read
            # reaching into a trailing partial block cannot index past the
            # end of the list.
            for i_block in range(n_blocks):
                pos_ref_tmp = pos_ref + i_block * half_block
                # NOTE(review): the window of 100 bases is hard-coded and
                # does not follow block_len_initial
                if (pos_ref_tmp - 100 < read_start <= pos_ref_tmp) and \
                   (read_end >= pos_ref_tmp + block_len_tmp):
                    reads_by_block[i_block].append(read)
                    break

        # Stack local consensi on top of the first one
        n_block = 1
        while pos_ref < len_reference:
            block_len = min(block_len, len_reference - pos_ref)
            if block_len < half_block:
                break
            if VERBOSE >= 2:
                print('Block n %s pos %s to %s block len %s' %
                      (len(consensi_local) + 1, pos_ref,
                       pos_ref + block_len, block_len))

            # Get reads that cover the whole block
            reads = reads_by_block[n_block - 1]
            n_block += 1

            # Internal coverage holes are not tolerated, but the last block
            # is allowed to be missing. However, we should try to squeeze out
            # all the bases by rescanning the reads a last time with less strict
            # criteria: if it has even one base more than what we have, add it
            full_cover = bool(reads)
            if not full_cover:
                bamfile.reset()
                reads = [read for read in bamfile
                         if read.is_proper_pair and
                         (read.pos <= pos_ref) and
                         (_reference_end(read) > pos_ref + half_block)]

                if not reads:
                    if (pos_ref + block_len < len_reference) and (VERBOSE >= 2):
                        print('WARNING: consensus looks interrupted in mid-way')
                    break

            # Take a random subsample of reads. If it is a problematic block,
            # not fully covered, all reads are trimmed first and the longest
            # ones are taken afterwards
            if full_cover:
                np.random.shuffle(reads)
                reads = reads[:reads_per_alignment]

            # Trim reads from the left to start all at the block start
            # NOTE: reads have been selected to start @ or before the block start!
            seqs = []
            for read in reads:
                (pos_read_start,
                 pos_read_end) = _block_slice_in_read(read, pos_ref, block_len)
                seqs.append(SeqRecord(Seq(read.seq[pos_read_start: pos_read_end],
                                          ambiguous_dna),
                                      id=read.qname))

            # If it's a problematic block, take longest reads
            if not full_cover:
                seqs.sort(key=len, reverse=True)
                seqs = seqs[:reads_per_alignment]

            # Make local consensus using a multiple sequence alignment
            # --------------
            # ----- ------
            # --------   ---
            #---------------
            cons_local = build_local_consensus(
                seqs, VERBOSE=VERBOSE,
                store_allele_counts=store_allele_counts,
                full_cover=full_cover)
            if store_allele_counts:
                (cons_local, allcount_local) = cons_local
                allcounts_local.append(allcount_local)
            consensi_local.append(cons_local)

            pos_ref += half_block

            # Join block <-- to the stack, like this:
            # ---------------------------
            #                        --------------------
            if consensus is None:
                consensus = [consensi_local[0]]
                if store_allele_counts:
                    allcounts = [allcounts_local[0]]
            else:
                cons = cons_local
                seed = consensus[-1][-20:]
                sl = len(seed)
                pos_start = _find_seed(cons, seed)
                if (pos_start == -1) and (VERBOSE >= 4):
                    print('Block n. %s: cannot stack to previous one!' %
                          len(consensi_local))

                if pos_start != -1:
                    # Only the part after the overlap is new
                    consensus.append(cons[pos_start + sl:])
                    if store_allele_counts:
                        allcounts.append(allcounts_local[-1][:, pos_start + sl:])

                elif accept_holes:
                    consensus.append('N' * 10)
                    consensus.append(cons)
                    if store_allele_counts:
                        # The spacer gets one count per position in the last
                        # row (presumably the N row; confirm against the
                        # alphabet used by build_local_consensus)
                        tmpall = np.zeros((allcounts_local[-1].shape[0], 10), int)
                        tmpall[-1] = 1
                        allcounts.append(tmpall)
                        allcounts.append(allcounts_local[-1])

    if consensus is None:
        raise ValueError('Consensus is still None: unable to build!')

    consensus = ''.join(consensus)

    if store_allele_counts:
        allcounts = np.concatenate(allcounts, axis=1)
        return (consensus, allcounts)

    return consensus
def _reference_end(read):
    '''Return the reference coordinate reached by the alignment of a read.

    Only CIGAR operations 0 and 2 (match and deletion in the pysam encoding)
    are summed, i.e. the ones that advance along the reference.
    '''
    return read.pos + sum(bl for (bt, bl) in read.cigar if bt in (0, 2))


def _block_slice_in_read(read, pos_ref, block_len):
    '''Find which slice of read.seq covers the block starting at pos_ref.

    Parameters:
        read: aligned read exposing .pos and .cigar as (operation, length)
            pairs. It is expected to start at or before pos_ref.
        pos_ref: reference coordinate at which the block starts.
        block_len: length of the block along the reference.

    Returns:
        The tuple (start, end) of indices into read.seq.
    '''
    pos_reft = read.pos
    start_found = False
    pos_read_start = 0
    pos_read_end = 0
    for (bt, bl) in read.cigar:
        if bt == 1:
            # Insertion: consumes read bases only
            if not start_found:
                pos_read_start += bl
            pos_read_end += bl

        elif bt == 2:
            # Deletion: consumes reference bases only
            if (not start_found) and (pos_reft + bl > pos_ref):
                start_found = True
            if pos_reft + bl > pos_ref + block_len:
                break
            pos_reft += bl

        else:
            # Everything else is treated as a match, consuming both
            if (not start_found) and (pos_reft + bl > pos_ref):
                pos_read_start += pos_ref - pos_reft
                start_found = True
            if pos_reft + bl > pos_ref + block_len:
                pos_read_end += pos_ref + block_len - pos_reft
                break

            if not start_found:
                pos_read_start += bl
            pos_read_end += bl
            pos_reft += bl

    return (pos_read_start, pos_read_end)


def _find_seed(cons, seed, min_identity=0.66):
    '''Locate seed inside cons, tolerating mismatches.

    An exact match is tried first. Failing that, every window of cons is
    compared to the seed and the best one is accepted only if at least
    min_identity of its positions agree.

    Returns:
        The start index of the seed in cons, or -1 if no acceptable match.
    '''
    import numpy as np

    pos_start = cons.find(seed)
    if pos_start != -1:
        return pos_start

    sl = len(seed)
    # A block shorter than the seed has no window to compare: np.argmax
    # would fail on an empty list
    if len(cons) < sl:
        return -1

    consm = np.array(list(cons), 'S1')
    seedm = np.array(list(seed), 'S1')
    n_matches = [(consm[i: i + sl] == seedm).sum()
                 for i in range(len(cons) + 1 - sl)]
    pos_start = int(np.argmax(n_matches))

    # Try to only add non-bogus stuff
    if n_matches[pos_start] < min_identity * sl:
        return -1
    return pos_start


def build_consensus(bamfilename, len_reference, VERBOSE=0,
                    block_len_initial=100,
                    reads_per_alignment=31,
                    accept_holes=False,
                    store_allele_counts=False):
    '''Build a consensus from premapped and divided reads.

    Three steps:
        1. collect reads uniformly across the fragment
        2. make local consensi
        3. join into fragmentwide consensus

    Blocks start block_len_initial // 2 bases apart, so that each local
    consensus overlaps the previous one and can be stacked onto it by
    looking for the last 20 bases of the consensus built so far.

    NOTE(review): build_local_consensus is not imported here; presumably it
    is defined at module level in this file.

    Parameters:
        bamfilename: path of the BAM file, opened with pysam.Samfile.
        len_reference: length of the reference the reads were mapped to.
        VERBOSE: verbosity level; progress is printed from level 2 up.
        block_len_initial: length of the blocks along the reference.
        reads_per_alignment: maximal number of reads per local consensus.
        accept_holes: if True, a block that cannot be stacked is appended
            after a spacer of 10 N instead of being dropped.
        store_allele_counts: if True, also collect the allele counts given
            back by build_local_consensus and return them.

    Returns:
        The consensus as a string, or the tuple (consensus, allcounts) if
        store_allele_counts is True.

    Raises:
        ValueError: if not even one local consensus could be built.
    '''
    if VERBOSE:
        print('Build consensus')

    import numpy as np
    import pysam
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio.Alphabet.IUPAC import ambiguous_dna

    half_block = block_len_initial // 2

    consensus = None
    consensi_local = []
    if store_allele_counts:
        allcounts_local = []
    pos_ref = 0
    block_len = block_len_initial
    with pysam.Samfile(bamfilename, 'rb') as bamfile:

        # Initial block
        # The first block has to make a consensus for the FIRST base, this
        # needs at least ONE read starting exactly at the first position.
        # Otherwise, position 2 is tried, and so on: that is the same as
        # taking the reads at the smallest start position, which needs a
        # single pass over the file instead of one pass per position.
        bamfile.reset()
        pos_first_block = None
        reads = []
        for read in bamfile:
            if not read.is_proper_pair:
                continue
            if not (0 <= read.pos < len_reference):
                continue
            if (pos_first_block is None) or (read.pos < pos_first_block):
                pos_first_block = read.pos
                reads = [read]
            elif read.pos == pos_first_block:
                reads.append(read)

        if reads:
            np.random.shuffle(reads)
            reads = reads[:reads_per_alignment]
            seqs = [SeqRecord(Seq(read.seq[:block_len], ambiguous_dna),
                              id=read.qname)
                    for read in reads]
            cons_local = build_local_consensus(
                seqs, VERBOSE=VERBOSE,
                store_allele_counts=store_allele_counts)
            if store_allele_counts:
                (cons_local, allcount_local) = cons_local
                allcounts_local.append(allcount_local)
            consensi_local.append(cons_local)

            # The next block starts at the first half-block boundary after
            # the start of the first block
            pos_ref += half_block * (1 + pos_first_block // half_block)
            if VERBOSE >= 2:
                print('Block n %s pos %s to %s block len %s' %
                      (len(consensi_local), pos_first_block,
                       pos_first_block + block_len, block_len))

            # Start consensus
            consensus = [consensi_local[0]]
            if store_allele_counts:
                allcounts = [allcounts_local[0]]

        # Divide reads by block (more efficient than scrolling the file every time)
        # FIXME: extract random subsample, assign to blocks, and only complete the missing blocks!
        n_blocks = max(0, (len_reference - pos_ref) // half_block)
        reads_by_block = [[] for _ in range(n_blocks)]
        # NOTE(review): this uses pos_ref, not the start of each block, so
        # the required coverage is the same for all blocks and reads can
        # hardly qualify for the last ones; pos_ref_tmp may have been meant.
        block_len_tmp = min(block_len, len_reference - pos_ref)
        bamfile.reset()
        for read in bamfile:
            if not read.is_proper_pair:
                continue
            read_start = read.pos
            read_end = _reference_end(read)
            # Only blocks that exist in reads_by_block are tried, so a read
            # reaching into a trailing partial block cannot index past the
            # end of the list.
            for i_block in range(n_blocks):
                pos_ref_tmp = pos_ref + i_block * half_block
                # NOTE(review): the window of 100 bases is hard-coded and
                # does not follow block_len_initial
                if (pos_ref_tmp - 100 < read_start <= pos_ref_tmp) and \
                   (read_end >= pos_ref_tmp + block_len_tmp):
                    reads_by_block[i_block].append(read)
                    break

        # Stack local consensi on top of the first one
        n_block = 1
        while pos_ref < len_reference:
            block_len = min(block_len, len_reference - pos_ref)
            if block_len < half_block:
                break
            if VERBOSE >= 2:
                print('Block n %s pos %s to %s block len %s' %
                      (len(consensi_local) + 1, pos_ref,
                       pos_ref + block_len, block_len))

            # Get reads that cover the whole block
            reads = reads_by_block[n_block - 1]
            n_block += 1

            # Internal coverage holes are not tolerated, but the last block
            # is allowed to be missing. However, we should try to squeeze out
            # all the bases by rescanning the reads a last time with less strict
            # criteria: if it has even one base more than what we have, add it
            full_cover = bool(reads)
            if not full_cover:
                bamfile.reset()
                reads = [read for read in bamfile
                         if read.is_proper_pair and
                         (read.pos <= pos_ref) and
                         (_reference_end(read) > pos_ref + half_block)]

                if not reads:
                    if (pos_ref + block_len < len_reference) and (VERBOSE >= 2):
                        print('WARNING: consensus looks interrupted in mid-way')
                    break

            # Take a random subsample of reads. If it is a problematic block,
            # not fully covered, all reads are trimmed first and the longest
            # ones are taken afterwards
            if full_cover:
                np.random.shuffle(reads)
                reads = reads[:reads_per_alignment]

            # Trim reads from the left to start all at the block start
            # NOTE: reads have been selected to start @ or before the block start!
            seqs = []
            for read in reads:
                (pos_read_start,
                 pos_read_end) = _block_slice_in_read(read, pos_ref, block_len)
                seqs.append(SeqRecord(Seq(read.seq[pos_read_start: pos_read_end],
                                          ambiguous_dna),
                                      id=read.qname))

            # If it's a problematic block, take longest reads
            if not full_cover:
                seqs.sort(key=len, reverse=True)
                seqs = seqs[:reads_per_alignment]

            # Make local consensus using a multiple sequence alignment
            # --------------
            # ----- ------
            # --------   ---
            #---------------
            cons_local = build_local_consensus(
                seqs, VERBOSE=VERBOSE,
                store_allele_counts=store_allele_counts,
                full_cover=full_cover)
            if store_allele_counts:
                (cons_local, allcount_local) = cons_local
                allcounts_local.append(allcount_local)
            consensi_local.append(cons_local)

            pos_ref += half_block

            # Join block <-- to the stack, like this:
            # ---------------------------
            #                        --------------------
            if consensus is None:
                consensus = [consensi_local[0]]
                if store_allele_counts:
                    allcounts = [allcounts_local[0]]
            else:
                cons = cons_local
                seed = consensus[-1][-20:]
                sl = len(seed)
                pos_start = _find_seed(cons, seed)
                if (pos_start == -1) and (VERBOSE >= 4):
                    print('Block n. %s: cannot stack to previous one!' %
                          len(consensi_local))

                if pos_start != -1:
                    # Only the part after the overlap is new
                    consensus.append(cons[pos_start + sl:])
                    if store_allele_counts:
                        allcounts.append(allcounts_local[-1][:, pos_start + sl:])

                elif accept_holes:
                    consensus.append('N' * 10)
                    consensus.append(cons)
                    if store_allele_counts:
                        # The spacer gets one count per position in the last
                        # row (presumably the N row; confirm against the
                        # alphabet used by build_local_consensus)
                        tmpall = np.zeros((allcounts_local[-1].shape[0], 10), int)
                        tmpall[-1] = 1
                        allcounts.append(tmpall)
                        allcounts.append(allcounts_local[-1])

    if consensus is None:
        raise ValueError('Consensus is still None: unable to build!')

    consensus = ''.join(consensus)

    if store_allele_counts:
        allcounts = np.concatenate(allcounts, axis=1)
        return (consensus, allcounts)

    return consensus
def build_consensus(bamfilename, len_reference, VERBOSE=0,
                    block_len=100,
                    reads_per_alignment=31,
                    deltamax=60):
    '''Build a consensus from mapped filtered reads.

    A first, possibly partially covered block is built from the reads that
    start earliest. Then the reference is walked block by block: for each
    block a local consensus is made from reads covering it fully and joined
    to the consensus built so far. If a block has no fully covering reads
    before the end of the reference, a final block is made from the reads
    that cover at least its start.

    NOTE(review): pileup_trim_reads_coverfull, pileup_trim_reads_coverstart
    and join_block_to_consensus are not imported here; presumably they are
    defined at module level in this file.

    Parameters:
        bamfilename: path of the BAM file, opened with pysam.Samfile.
        len_reference: length of the reference the reads were mapped to.
        VERBOSE: verbosity level; progress is printed from level 2 up.
        block_len: length of the blocks along the reference.
        reads_per_alignment: maximal number of reads per local consensus.
        deltamax: passed through to join_block_to_consensus.

    Returns:
        The consensus, as given back by build_local_consensus and
        join_block_to_consensus.

    Raises:
        ValueError: if no read is found to build the first block.
    '''
    if VERBOSE:
        print('Build consensus')

    from operator import itemgetter
    import numpy as np
    import pysam
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio.Alphabet.IUPAC import ambiguous_dna

    from hivwholeseq.utils.sequence import build_local_consensus

    half_block = block_len // 2

    with pysam.Samfile(bamfilename, 'rb') as bamfile:

        if VERBOSE >= 3:
            from hivwholeseq.utils.mapping import get_number_reads_open
            print('The bamfile has %s reads.' % get_number_reads_open(bamfile))

        # Get first block covered, even if partially, and record where each read started
        if VERBOSE >= 2:
            print('First block')
        seqs = []
        n_block = 0
        while not seqs:
            start_block = n_block * half_block
            for read in bamfile:
                if read.pos <= start_block:
                    # Pad on the left with N up to the start of the reference
                    # NOTE(review): if read.pos > block_len the slice end is
                    # negative and cuts from the end of the read instead
                    seqs.append((read.pos,
                                 ('N' * read.pos) + read.seq[:block_len - read.pos]))
            bamfile.reset()
            n_block += 1

            # Without this check, a file with no usable reads would keep
            # this loop running forever
            if (not seqs) and ((half_block == 0) or (start_block >= len_reference)):
                raise ValueError('No reads found to start the consensus: unable to build!')

        # If there are too many reads, take the reads that start earliest
        if len(seqs) > reads_per_alignment:
            # Shuffle first, so that ties in the (stable) sort are random
            np.random.shuffle(seqs)
            seqs.sort(key=itemgetter(0))
            seqs = seqs[:reads_per_alignment]

        seqrecs = [SeqRecord(Seq(s, ambiguous_dna), id=str(i), name=str(i))
                   for i, (pos, s) in enumerate(seqs)]
        consensus = build_local_consensus(seqrecs, VERBOSE=VERBOSE, full_cover=False)

        # Block, by block, make local alignment and join to previous consensus
        # There are two ways of finishing the loop:
        # 1. if we cover all the way to the end of the reference, good
        # 2. if we find no reads fully covering a block BEFORE that, add a final block
        while start_block < len_reference:
            edges = (start_block, min(len_reference, start_block + block_len))

            if VERBOSE >= 2:
                print('block n. %s region: %s' % (n_block, edges))

            seqs = pileup_trim_reads_coverfull(bamfile, edges, VERBOSE=VERBOSE)

            # If we do not find reads that fully cover, consider it the end of
            # the consensus, only the final block is missing
            if not seqs:
                break
            elif len(seqs) > reads_per_alignment:
                np.random.shuffle(seqs)
                seqs = seqs[:reads_per_alignment]

            # Make local consensus using a multiple sequence alignment
            # --------------
            # ----- ------
            # --------   ---
            #---------------
            seqrecs = [SeqRecord(Seq(s, ambiguous_dna), id=str(i), name=str(i))
                       for i, s in enumerate(seqs)]
            cons_block = build_local_consensus(seqrecs, VERBOSE=VERBOSE, full_cover=True)

            # Join to the rest of the consensus, like this:
            # ---------------------------
            #                        --------------------
            consensus = join_block_to_consensus(consensus, cons_block,
                                                VERBOSE=VERBOSE,
                                                deltamax=deltamax)

            # Blocks advance by two thirds of their length, so that they overlap
            start_block += 2 * block_len // 3
            n_block += 1

        # If we cover the whole reference, good
        # (the else of a while is run only if the loop was not broken)
        else:
            return consensus

        if VERBOSE >= 2:
            print('final block')

        # If we broke out of the while, a final block is needed
        seqs = pileup_trim_reads_coverstart(bamfile, start_block, VERBOSE=VERBOSE)

        # No read reaches this block at all: the consensus ends here
        if not seqs:
            if VERBOSE >= 2:
                print('WARNING: no reads for the final block')
            return consensus

        # Sort reads by length
        if len(seqs) > reads_per_alignment:
            np.random.shuffle(seqs)
            seqs.sort(key=len, reverse=True)
            seqs = seqs[:reads_per_alignment]

        # Complete with N, approximately
        # NOTE(review): the first read is the longest only if the list was
        # sorted above; otherwise longer reads are left as they are
        sl = len(seqs[0])
        seqs = [s + ('N' * (sl - len(s))) for s in seqs]

        seqrecs = [SeqRecord(Seq(s, ambiguous_dna), id=str(i), name=str(i))
                   for i, s in enumerate(seqs)]
        cons_block = build_local_consensus(seqrecs, VERBOSE=VERBOSE, full_cover=False)
        consensus = join_block_to_consensus(consensus, cons_block,
                                            VERBOSE=VERBOSE,
                                            deltamax=deltamax)

    return consensus
def build_consensus(bamfilename, len_reference, VERBOSE=0,
                    block_len=100,
                    reads_per_alignment=31,
                    deltamax=60):
    '''Build a consensus from mapped filtered reads.

    A first, possibly partially covered block is built from the reads that
    start earliest. Then the reference is walked block by block: for each
    block a local consensus is made from reads covering it fully and joined
    to the consensus built so far. If a block has no fully covering reads
    before the end of the reference, a final block is made from the reads
    that cover at least its start.

    NOTE(review): pileup_trim_reads_coverfull, pileup_trim_reads_coverstart
    and join_block_to_consensus are not imported here; presumably they are
    defined at module level in this file.

    Parameters:
        bamfilename: path of the BAM file, opened with pysam.Samfile.
        len_reference: length of the reference the reads were mapped to.
        VERBOSE: verbosity level; progress is printed from level 2 up.
        block_len: length of the blocks along the reference.
        reads_per_alignment: maximal number of reads per local consensus.
        deltamax: passed through to join_block_to_consensus.

    Returns:
        The consensus, as given back by build_local_consensus and
        join_block_to_consensus.

    Raises:
        ValueError: if no read is found to build the first block.
    '''
    if VERBOSE:
        print('Build consensus')

    from operator import itemgetter
    import numpy as np
    import pysam
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio.Alphabet.IUPAC import ambiguous_dna

    from hivwholeseq.utils.sequence import build_local_consensus

    half_block = block_len // 2

    with pysam.Samfile(bamfilename, 'rb') as bamfile:

        if VERBOSE >= 3:
            from hivwholeseq.utils.mapping import get_number_reads_open
            print('The bamfile has %s reads.' % get_number_reads_open(bamfile))

        # Get first block covered, even if partially, and record where each read started
        if VERBOSE >= 2:
            print('First block')
        seqs = []
        n_block = 0
        while not seqs:
            start_block = n_block * half_block
            for read in bamfile:
                if read.pos <= start_block:
                    # Pad on the left with N up to the start of the reference
                    # NOTE(review): if read.pos > block_len the slice end is
                    # negative and cuts from the end of the read instead
                    seqs.append((read.pos,
                                 ('N' * read.pos) + read.seq[:block_len - read.pos]))
            bamfile.reset()
            n_block += 1

            # Without this check, a file with no usable reads would keep
            # this loop running forever
            if (not seqs) and ((half_block == 0) or (start_block >= len_reference)):
                raise ValueError('No reads found to start the consensus: unable to build!')

        # If there are too many reads, take the reads that start earliest
        if len(seqs) > reads_per_alignment:
            # Shuffle first, so that ties in the (stable) sort are random
            np.random.shuffle(seqs)
            seqs.sort(key=itemgetter(0))
            seqs = seqs[:reads_per_alignment]

        seqrecs = [SeqRecord(Seq(s, ambiguous_dna), id=str(i), name=str(i))
                   for i, (pos, s) in enumerate(seqs)]
        consensus = build_local_consensus(seqrecs, VERBOSE=VERBOSE, full_cover=False)

        # Block, by block, make local alignment and join to previous consensus
        # There are two ways of finishing the loop:
        # 1. if we cover all the way to the end of the reference, good
        # 2. if we find no reads fully covering a block BEFORE that, add a final block
        while start_block < len_reference:
            edges = (start_block, min(len_reference, start_block + block_len))

            if VERBOSE >= 2:
                print('block n. %s region: %s' % (n_block, edges))

            seqs = pileup_trim_reads_coverfull(bamfile, edges, VERBOSE=VERBOSE)

            # If we do not find reads that fully cover, consider it the end of
            # the consensus, only the final block is missing
            if not seqs:
                break
            elif len(seqs) > reads_per_alignment:
                np.random.shuffle(seqs)
                seqs = seqs[:reads_per_alignment]

            # Make local consensus using a multiple sequence alignment
            # --------------
            # ----- ------
            # --------   ---
            #---------------
            seqrecs = [SeqRecord(Seq(s, ambiguous_dna), id=str(i), name=str(i))
                       for i, s in enumerate(seqs)]
            cons_block = build_local_consensus(seqrecs, VERBOSE=VERBOSE, full_cover=True)

            # Join to the rest of the consensus, like this:
            # ---------------------------
            #                        --------------------
            consensus = join_block_to_consensus(consensus, cons_block,
                                                VERBOSE=VERBOSE,
                                                deltamax=deltamax)

            # Blocks advance by two thirds of their length, so that they overlap
            start_block += 2 * block_len // 3
            n_block += 1

        # If we cover the whole reference, good
        # (the else of a while is run only if the loop was not broken)
        else:
            return consensus

        if VERBOSE >= 2:
            print('final block')

        # If we broke out of the while, a final block is needed
        seqs = pileup_trim_reads_coverstart(bamfile, start_block, VERBOSE=VERBOSE)

        # No read reaches this block at all: the consensus ends here
        if not seqs:
            if VERBOSE >= 2:
                print('WARNING: no reads for the final block')
            return consensus

        # Sort reads by length
        if len(seqs) > reads_per_alignment:
            np.random.shuffle(seqs)
            seqs.sort(key=len, reverse=True)
            seqs = seqs[:reads_per_alignment]

        # Complete with N, approximately
        # NOTE(review): the first read is the longest only if the list was
        # sorted above; otherwise longer reads are left as they are
        sl = len(seqs[0])
        seqs = [s + ('N' * (sl - len(s))) for s in seqs]

        seqrecs = [SeqRecord(Seq(s, ambiguous_dna), id=str(i), name=str(i))
                   for i, s in enumerate(seqs)]
        cons_block = build_local_consensus(seqrecs, VERBOSE=VERBOSE, full_cover=False)
        consensus = join_block_to_consensus(consensus, cons_block,
                                            VERBOSE=VERBOSE,
                                            deltamax=deltamax)

    return consensus