def build_coordinate_map(refseq, patseq, VERBOSE=0, score_gapopen=-20, **kwargs):
    '''Map reference coordinates onto patient-sequence coordinates.

    The two sequences are globally aligned; every alignment column in which
    both of them carry a residue yields one pair of positions.

    Parameters:
       refseq: reference sequence (first argument of the aligner)
       patseq: patient sequence (second argument of the aligner)
       VERBOSE (int): from 3 upwards the trimmed alignment is printed; this
         reads the `name` attribute of both sequences
       score_gapopen (int): gap opening score handed to the aligner
       **kwargs: passed to alignment function (e.g. alignment penalties)

    Returns:
       list of (position in reference, position in patient sequence) tuples
    '''
    from seqanpy import align_global

    (score, ali_ref, ali_pat) = align_global(refseq, patseq,
                                             score_gapopen=score_gapopen,
                                             **kwargs)

    # Columns spanned by the patient sequence, i.e. without its edge gaps
    col_first = len(ali_pat) - len(ali_pat.lstrip('-'))
    col_last = len(ali_pat.rstrip('-'))

    if VERBOSE >= 3:
        from hivwholeseq.utils.sequence import pretty_print_pairwise_ali
        pretty_print_pairwise_ali([ali_ref[col_first: col_last],
                                   ali_pat[col_first: col_last]],
                                  name1=refseq.name, name2=patseq.name)

    # Bijective map: only columns with a residue on both sides are recorded,
    # while each counter advances on the residues of its own sequence.
    # The reference counter starts at the first patient column.
    coord_pairs = []
    i_ref = col_first
    i_pat = 0
    for (char_ref, char_pat) in zip(ali_ref[col_first: col_last],
                                    ali_pat[col_first: col_last]):
        has_ref = char_ref != '-'
        has_pat = char_pat != '-'
        if has_ref and has_pat:
            coord_pairs.append((i_ref, i_pat))
        if has_ref:
            i_ref += 1
        if has_pat:
            i_pat += 1

    return coord_pairs
def merge_allele_counts(ref_genomewide, acs, VERBOSE=0):
    '''Merge the allele counts of all fragments into genome-wide coordinates.

    Note: we do not require full coverage of all fragments, the missing ones
    will just have zero counts. Sometimes, cherry-picking the data fragment
    by fragment might be a better choice.

    Parameters:
       ref_genomewide: genome-wide reference sequence
       acs: iterable of (fragment name, fragment reference, allele counts)
         triples; the counts are indexed as [read type, allele, position in
         the fragment]. Each fragment is searched for starting 1000 positions
         before the end of the previous one, so fragments are presumably
         expected in genomic order.
       VERBOSE (int): verbosity level

    Returns:
       integer array of shape
       (len(read_types), len(alpha), len(ref_genomewide))
    '''
    from hivwholeseq.utils.miseq import alpha, read_types
    from seqanpy import align_overlap

    # How far back from the end of the previous fragment the search starts
    window_back = 1000

    ac = np.zeros((len(read_types), len(alpha), len(ref_genomewide)), int)

    pos_ref = window_back
    for (fr, ref, acsi) in acs:

        # Find the coordinates. The window start is clamped at zero: a
        # negative slice start would silently select the tail of the genome.
        offset = max(0, pos_ref - window_back)
        (score, ali1, ali2) = align_overlap(ref_genomewide[offset:], ref)
        fr_start = len(ali2) - len(ali2.lstrip('-'))
        fr_end = len(ali2.rstrip('-'))

        if VERBOSE:
            print(' '.join(map(str, (fr, offset + fr_start, offset + fr_end))))

        # Scan the alignment
        pos_ref = offset + fr_start
        fr_start_ref = pos_ref
        fr_end_ref = pos_ref + fr_end - fr_start
        pos_fr = 0
        for (nuc_gw, nuc_fr) in zip(ali1[fr_start: fr_end],
                                    ali2[fr_start: fr_end]):
            # Gap in genomewide, ignore position
            if nuc_gw == '-':
                pos_fr += 1
                continue

            # Gap in fragment, ignore FIXME: probably we should put deletions
            if nuc_fr == '-':
                pos_ref += 1
                continue

            # Add the counts
            # NOTE: all fragments are treated the same, even in case of
            # coverage differences of orders of magnitude. This means, larger
            # coverage always wins. Maybe we want to implement this somewhat
            # differently
            ac[:, :, pos_ref] += acsi[:, :, pos_fr]
            pos_fr += 1
            pos_ref += 1

        if VERBOSE >= 3:
            from hivwholeseq.utils.sequence import pretty_print_pairwise_ali
            cons = alpha[ac.sum(axis=0).argmax(axis=0)]
            # The consensus lives in genome-wide coordinates, the alignment
            # in window coordinates: slice each one in its own frame.
            pretty_print_pairwise_ali((ali1[fr_start: fr_end],
                                       cons[fr_start_ref: fr_end_ref]),
                                      name1='gw', name2=fr, width=100)

    return ac
def on_click(event):
    '''Print the alignment of the clicked sequence against seq0.

    The label of the picked artist is read as an integer index into `seqs`;
    that sequence is globally aligned to `seq0` and the alignment is printed.
    `seq0`, `seqs`, `align_global` and `pretty_print_pairwise_ali` are taken
    from the enclosing scope.
    '''
    mouseevent = event.mouseevent
    label = event.artist.get_label()
    clicked_seq = seqs[int(label)]

    (score, ali_cons, ali_clicked) = align_global(seq0, clicked_seq,
                                                  score_gapopen=-20)
    pretty_print_pairwise_ali((ali_cons, ali_clicked),
                              name1='cons0', name2='clicked',
                              width=120)
def join_block_to_consensus(consensus, cons_block, VERBOSE=0, deltamax=60):
    '''Join a new block to an extant consensus.

    The block is ladder-aligned to the consensus. The result keeps the
    consensus up to the column where the block starts and the block from
    there on, with alignment gaps removed.

    Parameters:
       consensus (str): the consensus built so far
       cons_block (str): the new block to append
       VERBOSE (int): from 2 upwards a warning is printed if the block is
         not glued in, from 3 upwards the alignment is printed
       deltamax (int): maximal tolerated difference between the reference
         score of the overlap (3 per overlap column) and the alignment score

    Returns:
       the extended consensus, or the unchanged input consensus if the
       aligned block ends before the consensus does

    Raises:
       ValueError: if the score difference exceeds deltamax
    '''
    from seqanpy import align_ladder

    (score, ali1, ali2) = align_ladder(consensus, cons_block, score_gapopen=-10)

    if VERBOSE >= 3:
        from hivwholeseq.utils.sequence import pretty_print_pairwise_ali
        pretty_print_pairwise_ali([ali1, ali2], name1='consensus', name2='new block')

    # In very rare occasions (coverage holes), the second sequence is actually
    # shorter than the first, then we do not need to glue it in.
    # endswith (rather than ali2[-1]) also copes with an empty alignment.
    if ali2.endswith('-'):
        if VERBOSE >= 2:
            print('WARNING: the old block is longer than the new one (maybe low coverage)')
        return consensus

    end1 = len(ali1.rstrip('-'))
    start2 = len(ali2) - len(ali2.lstrip('-'))
    scoremax = 3 * (end1 - start2)
    delta = scoremax - score
    if delta > deltamax:
        raise ValueError('Too many mismatches in neighbouring local consensi! ('
                         + str(delta) + ', max ' + str(deltamax) + ')')

    return (ali1[:start2] + ali2[start2:]).replace('-', '')
def join_block_to_consensus(consensus, cons_block, VERBOSE=0, deltamax=60):
    '''Join a new block to an extant consensus.

    The block is ladder-aligned to the consensus. The result keeps the
    consensus up to the column where the block starts and the block from
    there on, with alignment gaps removed.

    Parameters:
       consensus (str): the consensus built so far
       cons_block (str): the new block to append
       VERBOSE (int): from 2 upwards a warning is printed if the block is
         not glued in, from 3 upwards the alignment is printed
       deltamax (int): maximal tolerated difference between the reference
         score of the overlap (3 per overlap column) and the alignment score

    Returns:
       the extended consensus, or the unchanged input consensus if the
       aligned block ends before the consensus does

    Raises:
       ValueError: if the score difference exceeds deltamax
    '''
    from seqanpy import align_ladder

    (score, ali1, ali2) = align_ladder(consensus, cons_block, score_gapopen=-10)

    if VERBOSE >= 3:
        from hivwholeseq.utils.sequence import pretty_print_pairwise_ali
        pretty_print_pairwise_ali([ali1, ali2], name1='consensus', name2='new block')

    # In very rare occasions (coverage holes), the second sequence is actually
    # shorter than the first, then we do not need to glue it in.
    # endswith (rather than ali2[-1]) also copes with an empty alignment.
    if ali2.endswith('-'):
        if VERBOSE >= 2:
            print('WARNING: the old block is longer than the new one (maybe low coverage)')
        return consensus

    end1 = len(ali1.rstrip('-'))
    start2 = len(ali2) - len(ali2.lstrip('-'))
    scoremax = 3 * (end1 - start2)
    delta = scoremax - score
    if delta > deltamax:
        raise ValueError('Too many mismatches in neighbouring local consensi! ('
                         + str(delta) + ', max ' + str(deltamax) + ')')

    return (ali1[:start2] + ali2[start2:]).replace('-', '')
def align_fragments(c1, c2, VERBOSE=0, name1='seq1', name2='seq2'):
    '''Align subsequent fragments and pair up the positions of their overlap.

    Parameters:
       c1 (str): first sequence
       c2 (str): second sequence
       VERBOSE (int): from 3 upwards the overlap alignment is printed
       name1 (str): label of the first sequence in the printout
       name2 (str): label of the second sequence in the printout

    Returns:
       (pos1, pos2): two integer arrays of equal length with one entry for
       every overlap column in which neither sequence has a gap. pos2 counts
       residues of c2 from its start; pos1 counts residues of c1 within the
       overlap, shifted by the alignment column at which c2 starts.
    '''
    import numpy as np
    from seqanpy import align_ladder
    from hivwholeseq.utils.sequence import pretty_print_pairwise_ali

    (score, a1, a2) = align_ladder(c1, c2, score_gapopen=-20)
    start2 = len(a2) - len(a2.lstrip('-'))
    end1 = len(a1.rstrip('-'))

    # Restrict both rows to the overlap
    a1 = a1[start2: end1]
    a2 = a2[start2: end1]

    if VERBOSE >= 3:
        # The labels used to be read from the undefined names fr1/fr2
        pretty_print_pairwise_ali((a1, a2), width=100,
                                  name1=name1, name2=name2)

    a1 = np.fromstring(a1, 'S1')
    a2 = np.fromstring(a2, 'S1')

    # Running residue index of each row within the overlap
    co1 = (a1 != '-').cumsum() - 1
    co2 = (a2 != '-').cumsum() - 1

    ind = (a1 != '-') & (a2 != '-')

    # NOTE(review): adding start2 turns co1 into a coordinate of c1 only if
    # the first row has no gaps before the overlap -- confirm
    pos1 = co1[ind] + start2
    pos2 = co2[ind]

    return (pos1, pos2)
def merge_sequences(seqs, skip_initial=30, accept_gaps=False, VERBOSE=0):
    '''Merge sequences with overlaps

    Consecutive sequences are ladder-aligned. Within each overlap the first
    and last `skip_initial` columns are taken from the sequence that is not
    at its edge there; the remaining middle part is taken half from the
    first and half from the second sequence.

    Parameters:
       seqs (list): sequences to merge, each an iterable of characters
       skip_initial (int): trim from the beginning of overlaps because we
         do not really trust those bases
       accept_gaps (bool): accept gaps in the overlaps
       VERBOSE (int): from 1 upwards progress is printed, from 2 upwards
         the checked part of every overlap as well

    Returns:
       str: the merged sequence. No input gives '', a single sequence is
       returned unchanged.

    Raises:
       ValueError: if the checked part of an overlap contains gaps and
         accept_gaps is false
    '''
    seqs = [''.join(seq) for seq in seqs]

    # Nothing to merge. Without this guard the code below fails with a
    # NameError, because the "last sequence" is only bound inside the loop.
    if len(seqs) < 2:
        return ''.join(seqs)

    from seqanpy import align_ladder

    left_trim = 0
    seqs_all = []
    for iov, (seq1, seq2) in enumerate(zip(seqs[:-1], seqs[1:])):
        if VERBOSE >= 1:
            print('Overlap n ' + str(iov + 1))

        (score, ali1, ali2) = align_ladder(seq1[left_trim:], seq2, score_gapopen=-20)
        start2 = len(ali2) - len(ali2.lstrip('-'))
        end1 = len(ali1.rstrip('-'))

        # Append first sequence until overlap
        seqs_all.append(ali1[:start2 + skip_initial])

        # Check overlap
        ov1 = ali1[start2 + skip_initial: end1 - skip_initial]
        ov2 = ali2[start2 + skip_initial: end1 - skip_initial]

        if VERBOSE >= 2:
            from hivwholeseq.utils.sequence import pretty_print_pairwise_ali
            pretty_print_pairwise_ali((ov1, ov2), width=100,
                                      name1='seq1', name2='seq2')

        if (not accept_gaps) and (('-' in ov1) or ('-' in ov2)):
            raise ValueError('Gaps in the overlap n. ' + str(iov + 1))

        # Trust the first sequence until half, then the other one
        i_mid = len(ov1) // 2
        seqs_all.append(ov1[:i_mid])
        seqs_all.append(ov2[i_mid:])

        # Set the left trim for the trailing sequence: the number of its
        # residues that have been consumed by this overlap
        left_trim = len(ali2[: end1 - skip_initial].replace('-', ''))

    if VERBOSE >= 1:
        print('Add last sequence')
    seqs_all.append(seqs[-1][left_trim:])

    return ''.join(seqs_all)
def build_coordinate_map(refseq, patseq, VERBOSE=0, score_gapopen=-20, **kwargs):
    '''Map reference coordinates onto patient-sequence coordinates.

    The two sequences are globally aligned; every alignment column in which
    both of them carry a residue yields one pair of positions.

    Parameters:
       refseq: reference sequence (first argument of the aligner)
       patseq: patient sequence (second argument of the aligner)
       VERBOSE (int): from 3 upwards the trimmed alignment is printed; this
         reads the `name` attribute of both sequences
       score_gapopen (int): gap opening score handed to the aligner
       **kwargs: passed to alignment function (e.g. alignment penalties)

    Returns:
       list of (position in reference, position in patient sequence) tuples
    '''
    from seqanpy import align_global

    (score, ali_ref, ali_pat) = align_global(refseq, patseq,
                                             score_gapopen=score_gapopen,
                                             **kwargs)

    # Columns spanned by the patient sequence, i.e. without its edge gaps
    col_first = len(ali_pat) - len(ali_pat.lstrip('-'))
    col_last = len(ali_pat.rstrip('-'))

    if VERBOSE >= 3:
        from hivwholeseq.utils.sequence import pretty_print_pairwise_ali
        pretty_print_pairwise_ali([ali_ref[col_first:col_last],
                                   ali_pat[col_first:col_last]],
                                  name1=refseq.name, name2=patseq.name)

    # Bijective map: only columns with a residue on both sides are recorded,
    # while each counter advances on the residues of its own sequence.
    # The reference counter starts at the first patient column.
    coord_pairs = []
    i_ref = col_first
    i_pat = 0
    for (char_ref, char_pat) in zip(ali_ref[col_first:col_last],
                                    ali_pat[col_first:col_last]):
        has_ref = char_ref != '-'
        has_pat = char_pat != '-'
        if has_ref and has_pat:
            coord_pairs.append((i_ref, i_pat))
        if has_ref:
            i_ref += 1
        if has_pat:
            i_pat += 1

    return coord_pairs
def check_reference_overlap(p, VERBOSE=0):
    '''Check whether the reference from the various fragments overlap correctly.

    The references of consecutive fragments (F1 to F6) are ladder-aligned.
    An overlap gets the status 'OK' if both sequences carry the same number
    of gaps in it and 'GAPS' otherwise. One summary line with all statuses
    is printed.

    Parameters:
       p: object providing get_reference(fragment name)
       VERBOSE (int): from 4 upwards the overlap alignments are printed

    Raises:
       ValueError: if any overlap has the status 'GAPS'

    The column widths `title_len` and `cell_len` are not defined here and
    are expected to exist at module level.
    '''
    from seqanpy import align_ladder
    from hivwholeseq.utils.sequence import pretty_print_pairwise_ali

    fragments = ['F' + str(i) for i in range(1, 7)]

    title = 'Overlaps'
    line = ('{:<' + str(title_len) + '}').format(title + ':')
    stati = []
    for (fr1, fr2) in zip(fragments[:-1], fragments[1:]):
        ref1 = p.get_reference(fr1)
        ref2 = p.get_reference(fr2)
        (score, ali1, ali2) = align_ladder(ref1, ref2, score_gapopen=-10, score_gapext=-1)
        start2 = len(ali2) - len(ali2.lstrip('-'))
        end1 = len(ali1.rstrip('-'))
        ov1 = ali1[start2: end1]
        ov2 = ali2[start2: end1]

        if VERBOSE >= 4:
            pretty_print_pairwise_ali((ov1, ov2),
                                      name1=fr1, name2=fr2,
                                      width=100)

        # No debugger breakpoint here: a failing overlap is reported through
        # the summary line and the ValueError below, so batch runs do not hang
        if ov1.count('-') == ov2.count('-'):
            status = 'OK'
        else:
            status = 'GAPS'

        cell = ('{:>' + str(cell_len - len(fr1) - 1) + '}').format(status)
        line = line + fr1 + ': ' + cell + '  '
        stati.append(status)

    print(line)

    if 'GAPS' in stati:
        raise ValueError('GAPS status found')
def check_reference_overlap(p, VERBOSE=0):
    '''Check whether the reference from the various fragments overlap correctly.

    The references of consecutive fragments (F1 to F6) are ladder-aligned.
    An overlap gets the status 'OK' if both sequences carry the same number
    of gaps in it and 'GAPS' otherwise. One summary line with all statuses
    is printed.

    Parameters:
       p: object providing get_reference(fragment name)
       VERBOSE (int): from 4 upwards the overlap alignments are printed

    Raises:
       ValueError: if any overlap has the status 'GAPS'

    The column widths `title_len` and `cell_len` are not defined here and
    are expected to exist at module level.
    '''
    from seqanpy import align_ladder
    from hivwholeseq.utils.sequence import pretty_print_pairwise_ali

    fragments = ['F' + str(i) for i in range(1, 7)]

    title = 'Overlaps'
    line = ('{:<' + str(title_len) + '}').format(title + ':')
    stati = []
    for (fr1, fr2) in zip(fragments[:-1], fragments[1:]):
        ref1 = p.get_reference(fr1)
        ref2 = p.get_reference(fr2)
        (score, ali1, ali2) = align_ladder(ref1, ref2, score_gapopen=-10, score_gapext=-1)
        start2 = len(ali2) - len(ali2.lstrip('-'))
        end1 = len(ali1.rstrip('-'))
        ov1 = ali1[start2: end1]
        ov2 = ali2[start2: end1]

        if VERBOSE >= 4:
            pretty_print_pairwise_ali((ov1, ov2),
                                      name1=fr1, name2=fr2,
                                      width=100)

        # No debugger breakpoint here: a failing overlap is reported through
        # the summary line and the ValueError below, so batch runs do not hang
        if ov1.count('-') == ov2.count('-'):
            status = 'OK'
        else:
            status = 'GAPS'

        cell = ('{:>' + str(cell_len - len(fr1) - 1) + '}').format(status)
        line = line + fr1 + ': ' + cell + '  '
        stati.append(status)

    print(line)

    if 'GAPS' in stati:
        raise ValueError('GAPS status found')
def check_protein(fea, seqgw, VERBOSE=0, delta_pos=2.5):
    '''Check a protein annotation

    The annotated region is extracted and checked at the nucleotide level
    (length, N, gaps), at the protein level (premature stops, X) and finally
    against the HXB2 reference of the same region.

    Parameters:
       fea: the annotation; its `extract` method and `id` are used
       seqgw: the sequence the annotation is extracted from
       VERBOSE (int): from 3 upwards the alignment to HXB2 is printed
       delta_pos (float): tolerated score loss per alignment column, relative
         to 3 per column

    Raises:
       ValueError: on the first check that fails
    '''
    seq = fea.extract(seqgw).seq

    # Nucleotide level
    if len(seq) % 3 != 0:
        raise ValueError('The length of ' + fea.id + ' is not a multiple of 3')
    for (symbol, complaint) in (('N', 'N nucleotides found in '),
                                ('-', 'Gaps found in ')):
        if symbol in seq:
            raise ValueError(complaint + fea.id)

    # Protein level: a stop is only tolerated as the very last residue
    prot = seq.translate()
    i_stop = prot.find('*')
    if i_stop not in (-1, len(prot) - 1):
        raise ValueError('Premature stops found in ' + fea.id)
    if 'X' in prot:
        raise ValueError('X amino acids found in ' + fea.id)

    # Compare to HXB2
    from hivwholeseq.reference import load_custom_reference
    ref = load_custom_reference('HXB2', region=fea.id)

    from seqanpy import align_global
    (score, alis, alir) = align_global(seq, ref, score_gapopen=-20)
    if VERBOSE >= 3:
        from hivwholeseq.utils.sequence import pretty_print_pairwise_ali
        pretty_print_pairwise_ali((alir, alis), name1='HXB2', name2='seq',
                                  width=100)

    n_columns = len(alis)
    delta = 3 * n_columns - score
    if delta > delta_pos * n_columns:
        raise ValueError('The sequence of ' + fea.id + ' looks different from HXB2')
def check_protein(fea, seqgw, VERBOSE=0, delta_pos=2.5):
    '''Check a protein annotation

    The annotated region is extracted and checked at the nucleotide level
    (length, N, gaps), at the protein level (premature stops, X) and finally
    against the HXB2 reference of the same region.

    Parameters:
       fea: the annotation; its `extract` method and `id` are used
       seqgw: the sequence the annotation is extracted from
       VERBOSE (int): from 3 upwards the alignment to HXB2 is printed
       delta_pos (float): tolerated score loss per alignment column, relative
         to 3 per column

    Raises:
       ValueError: on the first check that fails
    '''
    seq = fea.extract(seqgw).seq

    # Nucleotide level
    if len(seq) % 3 != 0:
        raise ValueError('The length of '+fea.id+' is not a multiple of 3')
    for (symbol, complaint) in (('N', 'N nucleotides found in '),
                                ('-', 'Gaps found in ')):
        if symbol in seq:
            raise ValueError(complaint+fea.id)

    # Protein level: a stop is only tolerated as the very last residue
    prot = seq.translate()
    i_stop = prot.find('*')
    if i_stop not in (-1, len(prot) - 1):
        raise ValueError('Premature stops found in '+fea.id)
    if 'X' in prot:
        raise ValueError('X amino acids found in '+fea.id)

    # Compare to HXB2
    from hivwholeseq.reference import load_custom_reference
    ref = load_custom_reference('HXB2', region=fea.id)

    from seqanpy import align_global
    (score, alis, alir) = align_global(seq, ref, score_gapopen=-20)
    if VERBOSE >= 3:
        from hivwholeseq.utils.sequence import pretty_print_pairwise_ali
        pretty_print_pairwise_ali((alir, alis), name1='HXB2', name2='seq',
                                  width=100)

    n_columns = len(alis)
    delta = 3 * n_columns - score
    if delta > delta_pos * n_columns:
        raise ValueError('The sequence of '+fea.id+' looks different from HXB2')
def align_to_reference(seq, refstr, VERBOSE=0, codon_align=False, require_full_cover=True):
    '''Align sequence to reference, stripping reference gaps

    Parameters:
       seq: the sequence record to align. It is iterated for its characters
         and its `seq.alphabet`, `id`, `name` and `description` are copied
         into the result.
       refstr (str): the reference sequence
       VERBOSE (int): from 2 upwards the alignment is printed
       codon_align (bool): realign the ungapped pair with
         `align_codon_pairwise`
       require_full_cover (bool): if true, sequences with gaps at the edges
         of the reference region are rejected; if false, a local alignment
         is used and uncovered edges are filled with N

    Returns:
       SeqRecord with the aligned sequence restricted to the columns in
       which the reference has no gap

    Raises:
       ValueError: if the sequence has more than 2 characters other than
         A, C, G, T and -, or (with require_full_cover) if it does not cover
         the region
    '''
    import numpy as np
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from seqanpy import align_overlap, align_local
    from hivwholeseq.utils.sequence import pretty_print_pairwise_ali

    seqstr = ''.join(seq).upper()

    # Everything that is not A, C, G, T or a gap counts as ambiguous
    n_amb = len(seqstr) - sum(map(seqstr.count, ('A', 'C', 'G', 'T', '-')))
    if n_amb > 2:
        raise ValueError('Too many ambiguous sites')

    def align_dna(seqstr, refstr, require_full_cover=True):
        # Returns the aligned sequence and reference, both spanning the
        # whole reference
        if require_full_cover:
            # Overlap alignment, trimmed to the columns spanned by the
            # reference
            (score, alis, alir) = align_overlap(seqstr, refstr)
            start = len(alir) - len(alir.lstrip('-'))
            end = len(alir.rstrip('-'))
            alist = alis[start: end]
            alirt = alir[start: end]
        else:
            # Local alignment: locate the aligned piece inside the reference
            # through its first and last 50 bases, then pad the sequence
            # with N and the reference with its own flanks
            (score, alis, alir) = align_local(seqstr, refstr)
            reftrim = alir.replace('-', '')
            start = refstr.find(reftrim[:50])
            end = refstr.rfind(reftrim[-50:]) + len(reftrim[-50:])
            alist = ('N' * start) + alis + ('N' * (len(refstr) - end))
            alirt = refstr[:start] + alir + refstr[end:]

        return (alist, alirt)

    (alis, alir) = align_dna(seqstr, refstr, require_full_cover=require_full_cover)

    if codon_align:
        # NOTE(review): align_codon_pairwise is not defined or imported in
        # this function -- presumably a module-level name, confirm
        (alis, alir) = align_codon_pairwise(alis.replace('-', ''), alir.replace('-', ''))

    if require_full_cover:
        # If the sequence is shorter than HXB2, skip
        if '-' in (alis[0], alis[-1]):
            raise ValueError('The sequence does not fully cover the region')

        # If the sequence has too much gapping close to the edges, it's also short
        if (alis[:15].count('-') > 5) or (alis[-15:].count('-') > 5):
            raise ValueError('The sequence does not fully cover the region')

    else:
        # Put N instead of gaps at the edges
        first_nongap = len(alis) - len(alis.lstrip('-'))
        last_nongap = len(alis.rstrip('-')) - 1
        alis = (('N' * first_nongap) + alis[first_nongap: last_nongap + 1] + ('N' * (len(alis) - 1 - last_nongap)))

    if VERBOSE >= 2:
        # NOTE(review): refname is not defined in this function -- presumably
        # a module-level name, confirm
        pretty_print_pairwise_ali((alis, alir), width=100, name2=refname, name1=seq.name)

    # Strip gaps in HXB2
    alism = np.fromstring(alis, 'S1')
    alirm = np.fromstring(alir, 'S1')
    ind = (alirm != '-')
    seq_aliref = ''.join(alism[ind])

    rec = SeqRecord(Seq(seq_aliref, seq.seq.alphabet), id=seq.id, name=seq.name, description=seq.description)

    return rec
# NOTE: Take only the most distant read of a pair print irp, dpair i = dpair.argmax() d = dpair[i] edge = edgepair[i] seq = seqpair[i] (score, ali1, ali2) = align_global(seq, consrec[edge[0]: edge[1]]) scoremax = 3 * len(ali1) delta = scoremax - score ali = [ali2, ali1] print 'Alignment to its own consensus (delta = '+str(delta)+')' pretty_print_pairwise_ali(ali, 'cons', 'read'+str(i+1)+' '+str(edge), len_name=25, width=90) print '' # Compare to all consensi and find the closest alifr = alis[fragment] alifrpw = [] for cons in alifr: alifrpw.append(align_overlap(cons.seq.ungap('-'), seq)) scores = map(itemgetter(0), alifrpw) indmax = np.argmax(scores) alimax = alifrpw[indmax][1:] start = len(alimax[1]) - len(alimax[1].lstrip('-')) end = len(alimax[1].rstrip('-')) alimax = [s[start: end] for s in alimax]
def merge_allele_counts(ref_genomewide, acs, VERBOSE=0):
    '''Merge the allele counts of all fragments into genome-wide coordinates.

    Note: we do not require full coverage of all fragments, the missing ones
    will just have zero counts. Sometimes, cherry-picking the data fragment
    by fragment might be a better choice.

    Parameters:
       ref_genomewide: genome-wide reference sequence
       acs: iterable of (fragment name, fragment reference, allele counts)
         triples; the counts are indexed as [read type, allele, position in
         the fragment]. Each fragment is searched for starting 1000 positions
         before the end of the previous one, so fragments are presumably
         expected in genomic order.
       VERBOSE (int): verbosity level

    Returns:
       integer array of shape
       (len(read_types), len(alpha), len(ref_genomewide))
    '''
    from hivwholeseq.utils.miseq import alpha, read_types
    from seqanpy import align_overlap

    # How far back from the end of the previous fragment the search starts
    window_back = 1000

    ac = np.zeros((len(read_types), len(alpha), len(ref_genomewide)), int)

    pos_ref = window_back
    for (fr, ref, acsi) in acs:

        # Find the coordinates. The window start is clamped at zero: a
        # negative slice start would silently select the tail of the genome.
        offset = max(0, pos_ref - window_back)
        (score, ali1, ali2) = align_overlap(ref_genomewide[offset:], ref)
        fr_start = len(ali2) - len(ali2.lstrip('-'))
        fr_end = len(ali2.rstrip('-'))

        if VERBOSE:
            print(' '.join(map(str, (fr, offset + fr_start, offset + fr_end))))

        # Scan the alignment
        pos_ref = offset + fr_start
        fr_start_ref = pos_ref
        fr_end_ref = pos_ref + fr_end - fr_start
        pos_fr = 0
        for (nuc_gw, nuc_fr) in zip(ali1[fr_start:fr_end],
                                    ali2[fr_start:fr_end]):
            # Gap in genomewide, ignore position
            if nuc_gw == '-':
                pos_fr += 1
                continue

            # Gap in fragment, ignore FIXME: probably we should put deletions
            if nuc_fr == '-':
                pos_ref += 1
                continue

            # Add the counts
            # NOTE: all fragments are treated the same, even in case of
            # coverage differences of orders of magnitude. This means, larger
            # coverage always wins. Maybe we want to implement this somewhat
            # differently
            ac[:, :, pos_ref] += acsi[:, :, pos_fr]
            pos_fr += 1
            pos_ref += 1

        if VERBOSE >= 3:
            from hivwholeseq.utils.sequence import pretty_print_pairwise_ali
            cons = alpha[ac.sum(axis=0).argmax(axis=0)]
            # The consensus lives in genome-wide coordinates, the alignment
            # in window coordinates: slice each one in its own frame.
            pretty_print_pairwise_ali(
                (ali1[fr_start:fr_end], cons[fr_start_ref:fr_end_ref]),
                name1='gw', name2=fr, width=100)

    return ac
def filter_contamination(
    bamfilename,
    bamfilename_out,
    contseqs,
    samplename,
    VERBOSE=0,
    deltascore_max_self=60,
    deltascore_max_other=24,
    maxreads=-1,
    **kwargs
):
    """Fish contaminated reads from mapped reads

    The function checks for a maximal distance to the expected consensus, and
    only if it's more than that it checks all other samples.

    Read pairs in which no read is flagged are written to bamfilename_out;
    as soon as one read of a pair is flagged as a contamination, the whole
    pair is written to a second file whose name is bamfilename_out without
    its last four characters plus "_trashed.bam".

    Args:
      bamfilename (str): the BAM file with the mapped reads
      bamfilename_out (str): the BAM file for the accepted read pairs
      contseqs (dict): sequences by sample name; must contain samplename,
        all other entries are the possible contamination sources
      samplename (str): the key of the expected consensus in contseqs
      VERBOSE (int): verbosity level
      deltascore_max_self (int): the maximal delta in alignment score to the
        consensus to be considered pure
      deltascore_max_other (int): the maximal delta in alignment score to any
        other sample to be considered a contamination
      maxreads (int): stop after this many read pairs (the default -1 never
        matches the pair counter, i.e. all pairs are scanned)
      **kwargs: passed down to the pairwise alignment function; score_match,
        if present, is also used to compute the maximal score (default 3)

    Returns:
      (n_good, n_cont): the number of accepted read pairs and a dict with
      the number of trashed pairs for each contamination source
    """
    import pysam
    from collections import defaultdict
    from operator import itemgetter
    from seqanpy import align_overlap

    from hivwholeseq.utils.mapping import pair_generator, get_number_reads

    # NOTE(review): sys, trim_align_overlap and pretty_print_pairwise_ali are
    # used below but not imported here -- presumably module-level names, confirm

    if "score_match" in kwargs:
        score_match = kwargs["score_match"]
    else:
        score_match = 3

    bamfilename_trash = bamfilename_out[:-4] + "_trashed.bam"

    # Work on a copy, so that removing the own consensus does not alter the
    # caller's dict
    contseqs = contseqs.copy()
    consseq = contseqs.pop(samplename)

    if VERBOSE >= 2:
        print "Scanning reads (" + str(get_number_reads(bamfilename) // 2) + ")"

    with pysam.Samfile(bamfilename, "rb") as bamfile:
        with pysam.Samfile(bamfilename_out, "wb", template=bamfile) as bamfileout, pysam.Samfile(
            bamfilename_trash, "wb", template=bamfile
        ) as bamfiletrash:
            n_good = 0
            n_cont = defaultdict(int)

            for irp, reads in enumerate(pair_generator(bamfile)):
                if irp == maxreads:
                    break

                # Progress counter every 100 pairs; from the second time on
                # the cursor is moved up one line first
                if VERBOSE >= 2:
                    if not ((irp + 1) % 100):
                        if not ((irp + 1) == 100):
                            sys.stdout.write("\x1b[1A")
                        print irp + 1

                for read in reads:
                    # Look for distance to the own consensus, it that's small move on
                    alignments_read = {}
                    deltas_read = {}
                    (score, alis1, alis2) = align_overlap(consseq, read.seq, **kwargs)
                    (alis1, alis2) = trim_align_overlap((alis1, alis2))
                    scoremax = len(alis1) * score_match
                    delta_read = scoremax - score
                    deltas_read[samplename] = delta_read
                    alignments_read[samplename] = (alis1, alis2)
                    if delta_read <= deltascore_max_self:
                        if VERBOSE >= 4:
                            print "Read is very close to its own consensus", scoremax, score, delta_read
                            pretty_print_pairwise_ali([alis1, alis2], width=90, name1="ref", name2="read")
                        continue

                    # Otherwise, move on to all other sequences and find the neighbour
                    for contname, contseq in contseqs.iteritems():
                        (score, ali1, ali2) = align_overlap(contseq, read.seq, **kwargs)
                        (ali1, ali2) = trim_align_overlap((ali1, ali2))
                        scoremax = len(ali1) * score_match
                        delta_read = scoremax - score
                        deltas_read[contname] = delta_read
                        alignments_read[contname] = (ali1, ali2)

                    if VERBOSE >= 5:
                        print samplename
                        for key, d in deltas_read.iteritems():
                            print key, d

                    (contname, delta_read) = min(deltas_read.iteritems(), key=itemgetter(1))

                    # Again, the correct consensus has precedence
                    if deltas_read[samplename] == delta_read:
                        contname = samplename

                    (ali1, ali2) = alignments_read[contname]

                    # NOTE: scoremax and score printed in the branches below
                    # still hold the values of the last sequence aligned in
                    # the loop above, not necessarily those of contname

                    # The read may be closest to its own consensus, if not very close
                    if contname == samplename:
                        if VERBOSE >= 4:
                            print "Read is closest to its consensus", scoremax, score, delta_read
                            pretty_print_pairwise_ali([ali1, ali2], width=90, name1="ref", name2="read")

                    # The read may come from another consensus (contamination)
                    elif delta_read <= deltascore_max_other:
                        n_cont[contname] += 1

                        bamfiletrash.write(reads[0])
                        bamfiletrash.write(reads[1])

                        if VERBOSE >= 2:
                            print "Contaminated read found! Good:", n_good, "cont:", sum(
                                n_cont.itervalues()
                            ), "sources:", n_cont

                        if VERBOSE >= 3:
                            print "Read is contaminated by", contname, scoremax, score, delta_read
                            pretty_print_pairwise_ali([alis1, alis2], width=90, name1="self", name2="read")
                            print ""
                            pretty_print_pairwise_ali([ali1, ali2], width=90, name1="ref", name2="read")

                        if VERBOSE >= 2:
                            print ""

                        # The pair is trashed, skip the other read
                        break

                    # Finally, the read is not really close to anything: accept
                    else:
                        if VERBOSE >= 4:
                            print "Read is close to nothing really", scoremax, score, delta_read
                            pretty_print_pairwise_ali([ali1, ali2], width=90, name1="ref", name2="read")

                # Reached only if the loop over the reads was not broken,
                # i.e. no read of the pair was flagged
                else:
                    n_good += 1
                    bamfileout.write(reads[0])
                    bamfileout.write(reads[1])

    n_cont = dict(n_cont)

    return (n_good, n_cont)
def merge_fragments(sequences, name='', VERBOSE=0):
    '''Merge references at overlapping pairs

    The six fragments F1 to F6 are joined one after the other. Each pair is
    ladder-aligned; in the overlap, mismatches are resolved by position:
    the first third follows the earlier fragment, the last third the later
    one and the middle third is set to N. Overlaps shorter than 50 columns
    are not trusted: the fragments are joined with ten N in between.

    Parameters:
       sequences: mapping from fragment name ('F1' to 'F6') to its sequence,
         given as an iterable of characters
       name (str): used as id and name of the result and in its description
       VERBOSE (int): from 3 upwards the pairwise alignments are printed

    Returns:
       SeqRecord with the genome-wide consensus
    '''
    import numpy as np
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import Seq
    from Bio.Alphabet.IUPAC import ambiguous_dna
    from seqanpy import align_ladder
    from hivwholeseq.utils.sequence import pretty_print_pairwise_ali

    consensus = []
    seq_old = ''.join(sequences['F1'])
    for i in range(5):
        name_old = 'F'+str(i+1)
        name_new = 'F'+str(i+2)
        is_last = (i == 4)

        seq_new = ''.join(sequences[name_new])
        (score, ali1, ali2) = align_ladder(seq_old, seq_new, score_gapopen=-10)

        if VERBOSE >= 3:
            pretty_print_pairwise_ali([ali1, ali2], name1=name_old, name2=name_new)

        # Overlap: the first sequence is better at the start, the second at the end
        end1 = len(ali1.rstrip('-'))
        start2 = len(ali2) - len(ali2.lstrip('-'))
        len_overlap = end1 - start2

        # There might a too short consensus, just join them with N
        if len_overlap < 50:
            consensus.append(seq_old)
            consensus.append('N' * 10)
            if is_last:
                consensus.append(seq_new)
            else:
                seq_old = seq_new
            continue

        overlap1 = np.fromstring(ali1[start2: end1], 'S1')
        overlap2 = np.fromstring(ali2[start2: end1], 'S1')
        overlap = overlap1.copy()
        ind_overlap_mismatch = (overlap1 != overlap2).nonzero()[0]
        for j in ind_overlap_mismatch:
            if j < len(overlap) // 3:
                # First third: keep the earlier fragment
                continue
            elif j < 2 * len(overlap) // 3:
                overlap[j] = 'N'
            else:
                overlap[j] = overlap2[j]
        overlap = overlap.tostring()

        consensus.append(ali1[:start2])
        consensus.append(overlap)
        if is_last:
            consensus.append(ali2[end1:])
        else:
            # Only the part beyond the overlap is still to be added
            seq_old = ali2[end1:].replace('-', '')

    consensus = ''.join(consensus)
    # The alphabet is the name imported above; the module IUPAC itself is
    # not bound in this function
    cons_rec = SeqRecord(Seq(consensus, ambiguous_dna),
                         id=name, name=name,
                         description=name+', genomewide')

    return cons_rec
def merge_fragments(sequences, name='', VERBOSE=0):
    '''Merge references at overlapping pairs

    The six fragments F1 to F6 are joined one after the other. Each pair is
    ladder-aligned; in the overlap, mismatches are resolved by position:
    the first third follows the earlier fragment, the last third the later
    one and the middle third is set to N. Overlaps shorter than 50 columns
    are not trusted: the fragments are joined with ten N in between.

    Parameters:
       sequences: mapping from fragment name ('F1' to 'F6') to its sequence,
         given as an iterable of characters
       name (str): used as id and name of the result and in its description
       VERBOSE (int): from 3 upwards the pairwise alignments are printed

    Returns:
       SeqRecord with the genome-wide consensus
    '''
    import numpy as np
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import Seq
    from Bio.Alphabet.IUPAC import ambiguous_dna
    from seqanpy import align_ladder
    from hivwholeseq.utils.sequence import pretty_print_pairwise_ali

    consensus = []
    seq_old = ''.join(sequences['F1'])
    for i in range(5):
        name_old = 'F' + str(i + 1)
        name_new = 'F' + str(i + 2)
        is_last = (i == 4)

        seq_new = ''.join(sequences[name_new])
        (score, ali1, ali2) = align_ladder(seq_old, seq_new, score_gapopen=-10)

        if VERBOSE >= 3:
            pretty_print_pairwise_ali([ali1, ali2], name1=name_old, name2=name_new)

        # Overlap: the first sequence is better at the start, the second at the end
        end1 = len(ali1.rstrip('-'))
        start2 = len(ali2) - len(ali2.lstrip('-'))
        len_overlap = end1 - start2

        # There might a too short consensus, just join them with N
        if len_overlap < 50:
            consensus.append(seq_old)
            consensus.append('N' * 10)
            if is_last:
                consensus.append(seq_new)
            else:
                seq_old = seq_new
            continue

        overlap1 = np.fromstring(ali1[start2:end1], 'S1')
        overlap2 = np.fromstring(ali2[start2:end1], 'S1')
        overlap = overlap1.copy()
        ind_overlap_mismatch = (overlap1 != overlap2).nonzero()[0]
        for j in ind_overlap_mismatch:
            if j < len(overlap) // 3:
                # First third: keep the earlier fragment
                continue
            elif j < 2 * len(overlap) // 3:
                overlap[j] = 'N'
            else:
                overlap[j] = overlap2[j]
        overlap = overlap.tostring()

        consensus.append(ali1[:start2])
        consensus.append(overlap)
        if is_last:
            consensus.append(ali2[end1:])
        else:
            # Only the part beyond the overlap is still to be added
            seq_old = ali2[end1:].replace('-', '')

    consensus = ''.join(consensus)
    # The alphabet is the name imported above; the module IUPAC itself is
    # not bound in this function
    cons_rec = SeqRecord(Seq(consensus, ambiguous_dna),
                         id=name, name=name,
                         description=name + ', genomewide')

    return cons_rec
def align_to_reference(seq, refstr, VERBOSE=0, codon_align=False, require_full_cover=True):
    '''Align sequence to reference, stripping reference gaps

    Parameters:
       seq: the sequence record to align. It is iterated for its characters
         and its `seq.alphabet`, `id`, `name` and `description` are copied
         into the result.
       refstr (str): the reference sequence
       VERBOSE (int): from 2 upwards the alignment is printed
       codon_align (bool): realign the ungapped pair with
         `align_codon_pairwise`
       require_full_cover (bool): if true, sequences with gaps at the edges
         of the reference region are rejected; if false, a local alignment
         is used and uncovered edges are filled with N

    Returns:
       SeqRecord with the aligned sequence restricted to the columns in
       which the reference has no gap

    Raises:
       ValueError: if the sequence has more than 2 characters other than
         A, C, G, T and -, or (with require_full_cover) if it does not cover
         the region
    '''
    import numpy as np
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from seqanpy import align_overlap, align_local
    from hivwholeseq.utils.sequence import pretty_print_pairwise_ali

    seqstr = ''.join(seq).upper()

    # Everything that is not A, C, G, T or a gap counts as ambiguous
    n_amb = len(seqstr) - sum(map(seqstr.count, ('A', 'C', 'G', 'T', '-')))
    if n_amb > 2:
        raise ValueError('Too many ambiguous sites')

    def align_dna(seqstr, refstr, require_full_cover=True):
        # Returns the aligned sequence and reference, both spanning the
        # whole reference
        if require_full_cover:
            # Overlap alignment, trimmed to the columns spanned by the
            # reference
            (score, alis, alir) = align_overlap(seqstr, refstr)
            start = len(alir) - len(alir.lstrip('-'))
            end = len(alir.rstrip('-'))
            alist = alis[start:end]
            alirt = alir[start:end]
        else:
            # Local alignment: locate the aligned piece inside the reference
            # through its first and last 50 bases, then pad the sequence
            # with N and the reference with its own flanks
            (score, alis, alir) = align_local(seqstr, refstr)
            reftrim = alir.replace('-', '')
            start = refstr.find(reftrim[:50])
            end = refstr.rfind(reftrim[-50:]) + len(reftrim[-50:])
            alist = ('N' * start) + alis + ('N' * (len(refstr) - end))
            alirt = refstr[:start] + alir + refstr[end:]

        return (alist, alirt)

    (alis, alir) = align_dna(seqstr, refstr, require_full_cover=require_full_cover)

    if codon_align:
        # NOTE(review): align_codon_pairwise is not defined or imported in
        # this function -- presumably a module-level name, confirm
        (alis, alir) = align_codon_pairwise(alis.replace('-', ''), alir.replace('-', ''))

    if require_full_cover:
        # If the sequence is shorter than HXB2, skip
        if '-' in (alis[0], alis[-1]):
            raise ValueError('The sequence does not fully cover the region')

        # If the sequence has too much gapping close to the edges, it's also short
        if (alis[:15].count('-') > 5) or (alis[-15:].count('-') > 5):
            raise ValueError('The sequence does not fully cover the region')

    else:
        # Put N instead of gaps at the edges
        first_nongap = len(alis) - len(alis.lstrip('-'))
        last_nongap = len(alis.rstrip('-')) - 1
        alis = (('N' * first_nongap) + alis[first_nongap:last_nongap + 1] + ('N' * (len(alis) - 1 - last_nongap)))

    if VERBOSE >= 2:
        # NOTE(review): refname is not defined in this function -- presumably
        # a module-level name, confirm
        pretty_print_pairwise_ali((alis, alir), width=100, name2=refname, name1=seq.name)

    # Strip gaps in HXB2
    alism = np.fromstring(alis, 'S1')
    alirm = np.fromstring(alir, 'S1')
    ind = (alirm != '-')
    seq_aliref = ''.join(alism[ind])

    rec = SeqRecord(Seq(seq_aliref, seq.seq.alphabet), id=seq.id, name=seq.name, description=seq.description)

    return rec
def filter_contamination(bamfilename, bamfilename_out, contseqs, samplename, VERBOSE=0,
                         deltascore_max_self=60, deltascore_max_other=24,
                         maxreads=-1, **kwargs):
    '''Fish contaminated reads from mapped reads

    The function checks for a maximal distance to the expected consensus, and
    only if it's more than that it checks all other samples.

    Read pairs in which no read is flagged are written to bamfilename_out;
    as soon as one read of a pair is flagged as a contamination, the whole
    pair is written to a second file whose name is bamfilename_out without
    its last four characters plus '_trashed.bam'.

    Args:
      bamfilename (str): the BAM file with the mapped reads
      bamfilename_out (str): the BAM file for the accepted read pairs
      contseqs (dict): sequences by sample name; must contain samplename,
        all other entries are the possible contamination sources
      samplename (str): the key of the expected consensus in contseqs
      VERBOSE (int): verbosity level
      deltascore_max_self (int): the maximal delta in alignment score to the
        consensus to be considered pure
      deltascore_max_other (int): the maximal delta in alignment score to any
        other sample to be considered a contamination
      maxreads (int): stop after this many read pairs (the default -1 never
        matches the pair counter, i.e. all pairs are scanned)
      **kwargs: passed down to the pairwise alignment function; score_match,
        if present, is also used to compute the maximal score (default 3)

    Returns:
      (n_good, n_cont): the number of accepted read pairs and a dict with
      the number of trashed pairs for each contamination source
    '''
    import pysam
    from collections import defaultdict
    from operator import itemgetter
    from seqanpy import align_overlap

    from hivwholeseq.utils.mapping import pair_generator, get_number_reads

    # NOTE(review): sys, trim_align_overlap and pretty_print_pairwise_ali are
    # used below but not imported here -- presumably module-level names, confirm

    if 'score_match' in kwargs:
        score_match = kwargs['score_match']
    else:
        score_match = 3

    bamfilename_trash = bamfilename_out[:-4] + '_trashed.bam'

    # Work on a copy, so that removing the own consensus does not alter the
    # caller's dict
    contseqs = contseqs.copy()
    consseq = contseqs.pop(samplename)

    if VERBOSE >= 2:
        print 'Scanning reads (' + str(
            get_number_reads(bamfilename) // 2) + ')'

    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        with pysam.Samfile(bamfilename_out, 'wb', template=bamfile) as bamfileout, \
             pysam.Samfile(bamfilename_trash, 'wb', template=bamfile) as bamfiletrash:
            n_good = 0
            n_cont = defaultdict(int)

            for irp, reads in enumerate(pair_generator(bamfile)):
                if irp == maxreads:
                    break

                # Progress counter every 100 pairs; from the second time on
                # the cursor is moved up one line first
                if VERBOSE >= 2:
                    if not ((irp + 1) % 100):
                        if not ((irp + 1) == 100):
                            sys.stdout.write('\x1b[1A')
                        print irp + 1

                for read in reads:
                    # Look for distance to the own consensus, it that's small move on
                    alignments_read = {}
                    deltas_read = {}
                    (score, alis1, alis2) = align_overlap(consseq, read.seq, **kwargs)
                    (alis1, alis2) = trim_align_overlap((alis1, alis2))
                    scoremax = len(alis1) * score_match
                    delta_read = scoremax - score
                    deltas_read[samplename] = delta_read
                    alignments_read[samplename] = (alis1, alis2)
                    if delta_read <= deltascore_max_self:
                        if VERBOSE >= 4:
                            print 'Read is very close to its own consensus', scoremax, score, delta_read
                            pretty_print_pairwise_ali([alis1, alis2], width=90, name1='ref', name2='read')
                        continue

                    # Otherwise, move on to all other sequences and find the neighbour
                    for contname, contseq in contseqs.iteritems():
                        (score, ali1, ali2) = align_overlap(contseq, read.seq, **kwargs)
                        (ali1, ali2) = trim_align_overlap((ali1, ali2))
                        scoremax = len(ali1) * score_match
                        delta_read = scoremax - score
                        deltas_read[contname] = delta_read
                        alignments_read[contname] = (ali1, ali2)

                    if VERBOSE >= 5:
                        print samplename
                        for key, d in deltas_read.iteritems():
                            print key, d

                    (contname, delta_read) = min(deltas_read.iteritems(), key=itemgetter(1))

                    # Again, the correct consensus has precedence
                    if deltas_read[samplename] == delta_read:
                        contname = samplename

                    (ali1, ali2) = alignments_read[contname]

                    # NOTE: scoremax and score printed in the branches below
                    # still hold the values of the last sequence aligned in
                    # the loop above, not necessarily those of contname

                    # The read may be closest to its own consensus, if not very close
                    if contname == samplename:
                        if VERBOSE >= 4:
                            print 'Read is closest to its consensus', scoremax, score, delta_read
                            pretty_print_pairwise_ali([ali1, ali2], width=90, name1='ref', name2='read')

                    # The read may come from another consensus (contamination)
                    elif (delta_read <= deltascore_max_other):
                        n_cont[contname] += 1

                        bamfiletrash.write(reads[0])
                        bamfiletrash.write(reads[1])

                        if VERBOSE >= 2:
                            print 'Contaminated read found! Good:', n_good, 'cont:', sum(
                                n_cont.itervalues()), 'sources:', n_cont

                        if VERBOSE >= 3:
                            print 'Read is contaminated by', contname, scoremax, score, delta_read
                            pretty_print_pairwise_ali([alis1, alis2], width=90, name1='self', name2='read')
                            print ''
                            pretty_print_pairwise_ali([ali1, ali2], width=90, name1='ref', name2='read')

                        if VERBOSE >= 2:
                            print ''

                        # The pair is trashed, skip the other read
                        break

                    # Finally, the read is not really close to anything: accept
                    else:
                        if VERBOSE >= 4:
                            print 'Read is close to nothing really', scoremax, score, delta_read
                            pretty_print_pairwise_ali([ali1, ali2], width=90, name1='ref', name2='read')

                # Reached only if the loop over the reads was not broken,
                # i.e. no read of the pair was flagged
                else:
                    n_good += 1
                    bamfileout.write(reads[0])
                    bamfileout.write(reads[1])

    n_cont = dict(n_cont)

    return (n_good, n_cont)
# Fragment of a larger script: do_genomewide, patient, pname, VERBOSE,
# use_save and the helpers called here are defined outside the visible code.
if do_genomewide:
    # Build a new genome-wide reference from the references of F1 to F6
    seqs = [patient.get_reference('F'+str(i)) for i in xrange(1, 7)]
    seq = merge_sequences_fragments(seqs, VERBOSE=VERBOSE)
    seq = SeqRecord(Seq(seq, ambiguous_dna), id=pname+'_genomewide', name=pname+'_genomewide', description='Genomewide reference for patient '+pname)

    # The genome-wide reference stored so far
    ref = patient.get_reference('genomewide')

    if VERBOSE >= 2:
        # Show old and new reference side by side
        from seqanpy import align_global
        from hivwholeseq.utils.sequence import pretty_print_pairwise_ali
        (score, ali1, ali2) = align_global(ref, seq, score_gapopen=-20)
        pretty_print_pairwise_ali((ali1, ali2), name1='Old ref', name2='New ref', width=100)

    # TODO: resplit sequences to make sure we cover the whole F5a, F3c,
    # etc. THIS CHANGES THE COORDINATES!

    if use_save:
        fn = patient.get_reference_filename('genomewide', 'fasta')
        fn_old = fn.replace('.fasta', '_old.fasta')
        # NOTE(review): save_protect presumably moves the existing file to
        # the *_old name before a new one is written -- confirm
        save_protect(fn, fn_old, VERBOSE=VERBOSE)