import seqanpy


def pairwise_align_dna(sequence, reference, regex_compiled, gene):
    """
    Pairwise align sequence to reference, to find reading frame and frame-shift in/dels
    :param sequence: (str) a query DNA sequence
    :param reference: (str) a reference DNA sequence (must start in reading frame 1)
    :param regex_compiled: (regex_obj) a compiled regex pattern
    :param gene: (str) the target gene (ENV, GAG, POL, etc.)
    :return: (str) aligned query sequence, (str) aligned ref sequence, (int) reading frame for query sequence
    """
    # do an overlap pairwise alignment so that the query sequence is not truncated
    if gene == "ENV":
        overlap = seqanpy.align_overlap(sequence, reference, band=-1, score_match=4,
                                        score_mismatch=-1, score_gapext=-3, score_gapopen=-14)
    else:
        # for other regions
        overlap = seqanpy.align_overlap(sequence, reference, band=-1, score_match=4,
                                        score_mismatch=-2, score_gapext=-3, score_gapopen=-14)

    overlap = list(overlap)
    seq_align = overlap[1]
    ref_align = overlap[2]

    # print(">sqseq1\n{}\n".format(seq_align))
    # print(">sqref1\n{}\n".format(ref_align))

    # get start position in the seq, if not starting at index 0
    if seq_align[0] == '-':
        seq_start = regex_compiled.search(seq_align).end()
    else:
        seq_start = 0

    # get end position in the seq, if not ending at the last index
    if seq_align[-1] == '-':
        # reverse the string and find the first non-gap character; multiply match.end() by -1 to get the non-reversed index
        seq_end = (regex_compiled.search(seq_align[::-1]).end()) * -1
    else:
        seq_end = None

    # ref start will be 0 for align_overlap
    if ref_align[0] == '-':
        ref_start = regex_compiled.search(ref_align).end()
    else:
        ref_start = 0

    # calculate reading frame (reference must start in frame 0)
    frame = (seq_start - ref_start) % 3

    # truncate the overlap alignment to the region of interest
    seq_align = seq_align[seq_start:seq_end]
    ref_align = ref_align[seq_start:seq_end]

    # print(">sqseq2\n{}\n".format(seq_align))
    # print(">sqref2\n{}\n".format(ref_align))

    return seq_align, ref_align, frame
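The snippet above expects a pre-compiled regex whose .end() gives the index of the first non-gap character; the actual pattern is not included here. A minimal usage sketch under that assumption (the pattern r"^-+", the toy sequences, and the gene label are illustrative, not from the original pipeline):

import re
import seqanpy

# hypothetical pattern: matches the leading run of gap characters, so .end()
# is the index of the first non-gap character
gap_run = re.compile(r"^-+")

query = "GGTCTAGCCTTC"            # toy query sequence
ref = "ATGGGTCTAGCCTTCTAA"        # toy reference starting at a codon boundary

seq_aln, ref_aln, frame = pairwise_align_dna(query, ref, gap_run, "GAG")
print(seq_aln)
print(ref_aln)
print(frame)                      # reading frame inferred from the start offset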
def get_consensus(self, region, PCR=1):
    '''Get consensus for this sample'''
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio import SeqIO
    from seqanpy import align_overlap

    (fragment, start, stop) = self.get_fragmented_roi(region, VERBOSE=0, include_genomewide=True)
    seq = SeqIO.read(self.get_consensus_filename(fragment, PCR=PCR), 'fasta')
    refseq = SeqIO.read(self.get_reference_filename(fragment, format='fasta'), 'fasta')[start:stop]

    score, ali1, ali2 = align_overlap(seq, refseq)
    start = len(ali2) - len(ali2.lstrip('-'))
    end = len(ali2.rstrip('-'))
    seq_region = ali1[start:end]

    seq = SeqRecord(Seq(seq_region, seq.seq.alphabet),
                    id=self.name, name=self.name, description=self.name)
    return seq
def merge_allele_counts(ref_genomewide, acs, VERBOSE=0):
    '''Merge the allele counts of all fragments

    Note: we do not require full coverage of all fragments; the missing ones
    will just have zero counts. Sometimes, cherry-picking the data fragment
    by fragment might be a better choice.
    '''
    from hivwholeseq.utils.miseq import alpha, read_types
    from seqanpy import align_overlap

    ac = np.zeros((len(read_types), len(alpha), len(ref_genomewide)), int)

    pos_ref = 1000
    for (fr, ref, acsi) in acs:

        # Find the coordinates
        (score, ali1, ali2) = align_overlap(ref_genomewide[pos_ref - 1000:],
                                            ref,
                                            #score_gapopen=-20,
                                           )
        fr_start = len(ali2) - len(ali2.lstrip('-'))
        fr_end = len(ali2.rstrip('-'))

        if VERBOSE:
            print fr, pos_ref - 1000 + fr_start, pos_ref - 1000 + fr_end

        # Scan the alignment
        pos_ref = pos_ref - 1000 + fr_start
        fr_start_ref = pos_ref
        fr_end_ref = pos_ref + fr_end - fr_start
        pos_fr = 0
        for pos_ali in xrange(fr_start, fr_end):

            # Gap in genomewide, ignore position
            if ali1[pos_ali] == '-':
                pos_fr += 1
                continue

            # Gap in fragment, ignore. FIXME: probably we should record deletions
            elif ali2[pos_ali] == '-':
                pos_ref += 1
                continue

            # Add the counts
            # NOTE: all fragments are treated the same, even in case of coverage
            # differences of orders of magnitude. This means larger coverage
            # always wins. Maybe we want to implement this somewhat differently.
            ac[:, :, pos_ref] += acsi[:, :, pos_fr]
            pos_fr += 1
            pos_ref += 1

        if VERBOSE >= 3:
            from hivwholeseq.utils.sequence import pretty_print_pairwise_ali
            cons = alpha[ac.sum(axis=0).argmax(axis=0)]
            pretty_print_pairwise_ali((ali1[fr_start: fr_end],
                                       cons[fr_start: fr_end]),
                                      name1='gw', name2=fr, width=100)

    return ac
def get_distance_reads_sequence(seq, reads, VERBOSE=0, score_match=3, score_mismatch=-3):
    '''Get the distance in alignment score between read pairs and a sequence'''
    from seqanpy import align_overlap

    seqs = ''.join(seq)
    deltas = []
    for irp, read_pair in enumerate(reads):
        d = 0
        for read in read_pair:
            (score, alis, alir) = align_overlap(seqs, read.seq,
                                                score_match=score_match,
                                                score_mismatch=score_mismatch)
            start = len(alir) - len(alir.lstrip('-'))
            end = len(alir.rstrip('-'))
            scoremax = score_match * (end - start)
            delta = scoremax - score
            d += delta
        deltas.append(d)

    return deltas
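Several of these snippets use the same distance measure: the gap between the best attainable score over the covered region (every position a match) and the score actually achieved. A minimal sketch of that idea on plain strings, passing the match score explicitly rather than relying on defaults (my_ref and my_read are made-up inputs):

from seqanpy import align_overlap

my_ref = 'ACCGTTACCGTAATGC'    # hypothetical reference
my_read = 'CGTTACCGTTAT'       # hypothetical read, one mismatch vs the reference

score, ali_ref, ali_read = align_overlap(my_ref, my_read,
                                         score_match=3, score_mismatch=-3)

# Trim to the region actually covered by the read
start = len(ali_read) - len(ali_read.lstrip('-'))
end = len(ali_read.rstrip('-'))

scoremax = 3 * (end - start)   # score if every covered position matched
delta = scoremax - score       # 0 for a perfect match, grows with mismatches/indels
print(delta)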
def check_suspect(reads, consensi_foreign, deltamax=30, VERBOSE=0):
    '''Check suspicious reads for closer distance to potential contaminants'''
    from seqanpy import align_overlap

    if VERBOSE >= 2:
        print 'Checking suspect read pair:', reads[0].qname,

    for consensus in consensi_foreign:
        conss = ''.join(consensus)
        delta_foreign = 0
        for read in reads:
            (score, ali1, ali2) = align_overlap(conss, read.seq)
            scoremax = 3 * len(ali2.strip('-'))
            delta_foreign += scoremax - score

        # We classify as trash all reads that are within a basin of another seq
        if delta_foreign < deltamax:
            if VERBOSE >= 2:
                print ''
                print consensus.name, delta_foreign
            return True
        else:
            if VERBOSE >= 2:
                print 'OK',

    if VERBOSE >= 2:
        print ''

    return False
def ammend_fasta():
    with open(fname_unspliced, 'w') as ofile:
        strain_by_protein = defaultdict(dict)
        for seq in SeqIO.parse(fname, 'fasta'):
            SeqIO.write(seq, ofile, 'fasta')
            prot = seq.name.split('_')[0]
            seq_name = seq.description.split('|')[1]
            strain_by_protein[prot][seq_name] = seq

        joined_seqs = defaultdict(dict)
        splice_pairs = [('M', 'M1', 'M2'), ('NS', 'NS1', 'NS2')]
        #splice_pairs = [('M', 'M1', 'BM2')]
        from seqanpy import align_overlap
        for c, a, b in splice_pairs:
            for strain in strain_by_protein[a]:
                seq1 = strain_by_protein[a][strain]
                new_id = c + seq1.id.lstrip(a)
                new_description = c + seq1.description.lstrip(a)
                new_description = new_description.replace(a, c)
                new_name = c + seq1.name.lstrip(a)
                new_name = new_name.replace(a, c)
                try:
                    score, ali1, ali2 = align_overlap(seq1.seq, strain_by_protein[b][strain].seq,
                                                      score_gapopen=-20, score_gapext=0)
                    ali_array = np.array([np.fromstring(x, 'S1') for x in [ali1, ali2]])
                    tmpseq = np.copy(ali_array[0])
                    tmpseq[ali_array[0] == '-'] = ali_array[1][ali_array[0] == '-']
                    joined_seqs[c][strain] = SeqRecord.SeqRecord(seq=Seq.Seq("".join(tmpseq)),
                                                                 id=new_id,
                                                                 description=new_description,
                                                                 name=new_name)
                    SeqIO.write(joined_seqs[c][strain], ofile, 'fasta')
                except:
                    print(seq1.name, "doesn't have a partner")
def determine_lineage(seq):
    fields = map(lambda x: x.strip(), seq.description.split('|'))
    tmp_lineage = (fields[2], fields[4])
    if tmp_lineage in patterns:
        print fields[0], "\n\tgisaid defined lineage:", tmp_lineage, '->', patterns[tmp_lineage]
        return patterns[tmp_lineage]
    else:
        scores = []
        for olineage, oseq in outgroups.iteritems():
            if (params.aligner == "seqan"):
                from seqanpy import align_overlap
                tmp_aln = align_overlap(str(oseq.seq), str(seq.seq).replace('-', '').upper(),
                                        score_gapopen=-10, score_gapext=-1)
                tmp_aln = np.array([np.fromstring(tmp_aln[1], dtype='|S1'),
                                    np.fromstring(tmp_aln[2], dtype='|S1')])
            if (params.aligner == "mafft"):
                SeqIO.write([oseq, seq], "temp_in.fasta", "fasta")
                os.system("mafft --auto temp_in.fasta > temp_out.fasta 2>tmp")
                tmp_aln = np.array(AlignIO.read('temp_out.fasta', 'fasta'))
            scores.append((olineage, (tmp_aln[0] == tmp_aln[1]).sum()))
        scores.sort(key=lambda x: x[1], reverse=True)
        if scores[0][1] > 0.85 * len(seq):
            print fields[0], tmp_lineage, len(seq), "\n\t lineage based on similarity:", scores[0][0], "\n\t", scores
            return scores[0][0]
        else:
            print fields[0], tmp_lineage, len(seq), "\n\t other: best scores:", scores[0]
            return 'other'
def align_to_initial(reads, p):
    from hivevo.sequence import alphal
    from seqanpy import align_overlap

    seg, start, stop = 'F1', int(p.annotation['p17'].location.start) - 20, int(p.annotation['p17'].location.end) + 7
    ref_seq = "".join(p.get_initial_sequence(seg)[start:stop])
    aft = p.get_allele_frequency_trajectories('F1')[:, :, start:stop]
    founder_indices = np.array([alphal.index(nuc) for nuc in ref_seq])
    for read in reads:
        score, ali_ref, ali_read = align_overlap(ref_seq, str(read.seq.ungap('-')))
        tmp_ali = np.vstack([np.fromstring(a, 'S1') for a in (ali_ref, ali_read)])
        try:
            unconserved = ~((aft[:, :4, :].max(axis=1) > 0.99).all(axis=0))
            reference_aln = (tmp_ali[0] != '-')
            ungapped = (~np.any(tmp_ali == '-', axis=0))[reference_aln]
            unamb = (~np.any(tmp_ali == 'N', axis=0))[reference_aln]
            good_positions = ungapped & unamb & unconserved
            read_indices = np.array([alphal.index(nuc) for nuc in tmp_ali[1]])[reference_aln][good_positions]
            read.prob = np.sum(np.log(aft[:, read_indices, good_positions] + 0.001), axis=1)

            good_positions = ungapped & unamb
            good_positions[:20] = False
            good_positions[-7:] = False
            read_indices = np.array([alphal.index(nuc) for nuc in tmp_ali[1]])[reference_aln][good_positions]
            read.distance = np.mean((founder_indices[good_positions] != read_indices))
        except:
            import ipdb; ipdb.set_trace()
def get_gene_positions_in_fragment(gene, fragment, gwseq, fragseq, VERBOSE=0):
    '''Get the coordinates of a gene within a fragment'''
    from seqanpy import align_overlap

    # Find coordinates of gene in reference
    feagene = gwseq.features[map(attrgetter('id'), gwseq.features).index(gene)]
    gene_start = feagene.location.nofuzzy_start
    gene_end = feagene.location.nofuzzy_end

    # Sanity check on coordinates
    feafrag = gwseq.features[map(attrgetter('id'), gwseq.features).index(fragment)]
    fragrefgw = feafrag.extract(gwseq)
    if len(fragseq) != len(fragrefgw):
        raise ValueError('Problem with coordinates between fragment and genomewide.')

    # Find coordinates of gene in fragment
    frag_start = feafrag.location.nofuzzy_start
    frag_end = feafrag.location.nofuzzy_end

    # complete gene
    if (frag_start <= gene_start) and (frag_end >= gene_end):
        if VERBOSE >= 2:
            print 'Complete gene found'
        positions = np.arange(gene_start, gene_end) - frag_start

    # start of gene
    elif (frag_start <= gene_start):
        if VERBOSE >= 2:
            print 'WARNING: only gene start found'
        positions = np.arange(gene_start, frag_end) - frag_start
        if len(positions) % 3:
            positions = positions[:-(len(positions) % 3)]

    # end of gene
    elif (frag_end >= gene_end):
        if VERBOSE >= 2:
            print 'WARNING: only gene end found'
        positions = np.arange(frag_start, gene_end) - frag_start
        if len(positions) % 3:
            positions = positions[len(positions) % 3:]

    # middle of gene: guess reading frame
    else:
        if VERBOSE >= 2:
            print 'WARNING: only gene middle found'
        prot = feagene.extract(gwseq).seq.translate()
        ali_score = []
        for rf_start in xrange(3):
            tmpseq = fragseq[rf_start:].seq
            if len(tmpseq) % 3:
                tmpseq = tmpseq[:-(len(tmpseq) % 3)]
            tmpprot = tmpseq.translate()
            (score, ali1, ali2) = align_overlap(tmpprot, prot)
            ali_score.append(score)
        rf_start = np.argmax(ali_score)
        positions = np.arange(frag_start + rf_start, frag_end) - frag_start
        if len(positions) % 3:
            positions = positions[:-(len(positions) % 3)]

    return positions
def get_gene_positions_in_fragment(gene, fragment, gwseq, fragseq, VERBOSE=0): """Get the coordinates of a gene within a fragment""" # Find coordinates of gene in reference feagene = gwseq.features[map(attrgetter("id"), gwseq.features).index(gene)] gene_start = feagene.location.nofuzzy_start gene_end = feagene.location.nofuzzy_end # Sanity check on coordinates feafrag = refseq.features[map(attrgetter("id"), gwseq.features).index(fragment)] fragrefgw = feafrag.extract(gwseq) if len(fragseq) != len(fragrefgw): raise ValueError("Problem with coordinates between fragment and genomewide.") # Find coordinates of gene in fragment frag_start = feafrag.location.nofuzzy_start frag_end = feafrag.location.nofuzzy_end # complete gene if (frag_start <= gene_start) and (frag_end >= gene_end): if VERBOSE >= 2: print "Complete gene found" positions = np.arange(gene_start, gene_end) - frag_start # start of gene elif frag_start <= gene_start: if VERBOSE >= 2: print "WARNING: only gene start found" positions = np.arange(gene_start, frag_end) - frag_start if len(positions) % 3: positions = positions[: -(len(positions) % 3)] # end of gene elif frag_end >= gene_end: if VERBOSE >= 2: print "WARNING: only gene end found" positions = np.arange(frag_start, gene_end) - frag_start if len(positions) % 3: positions = positions[len(positions) % 3 :] # middle of gene: guess reading frame else: if VERBOSE >= 2: print "WARNING: only gene middle found" prot = feagene.extract(gwseq).seq.translate() ali_score = [] for rf_start in xrange(3): tmpseq = fragseq[rf_start:].seq if len(tmpseq) % 3: tmpseq = tmpseq[: -(len(tmpseq) % 3)] tmpprot = tmpseq.translate() (score, ali1, ali2) = align_overlap(tmpprot, prot) ali_score.append(score) rf_start = np.argmax(ali_score) positions = np.arange(frag_start + rf_start, frag_end) - frag_start if len(positions) % 3: positions = positions[: -(len(positions) % 3)] return positions
def trim_to_refseq(seq, refseq):
    '''Trim sequence to a reference sequence'''
    from seqanpy import align_overlap

    (score, ali1, ali2) = align_overlap(seq, refseq, score_gapopen=-20)
    start = len(ali2) - len(ali2.lstrip('-'))
    end = len(ali2.rstrip('-'))

    return seq[start: end]
def align_pairwise(seq1, seq2):
    try:
        from seqanpy import align_overlap
        return align_overlap(seq1, seq2, **scoring_params)
    except ImportError:
        from Bio import pairwise2
        aln = pairwise2.align.globalms(seq1, seq2,
                                       scoring_params['score_match'],
                                       scoring_params['score_mismatch'],
                                       scoring_params['score_gapopen'],
                                       scoring_params['score_gapext'],
                                       penalize_end_gaps=False,
                                       one_alignment_only=True)[0]
        return aln[2], aln[0], aln[1]
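align_pairwise relies on a module-level scoring_params dict that is not part of the snippet. A sketch of how it could be wired up; the key names follow the calls above, while the values are placeholders rather than the original project's settings:

scoring_params = {'score_match': 3,
                  'score_mismatch': -3,
                  'score_gapopen': -10,
                  'score_gapext': -1}

# Works with either backend: seqanpy if importable, Bio.pairwise2 otherwise.
score, aln1, aln2 = align_pairwise('ACCGTTA', 'ACGTTA')
print(score)
print(aln1)
print(aln2)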
def align_dna(seqstr, refstr, require_full_cover=True):
    from seqanpy import align_overlap, align_local

    if require_full_cover:
        (score, alis, alir) = align_overlap(seqstr, refstr)
        start = len(alir) - len(alir.lstrip('-'))
        end = len(alir.rstrip('-'))
        alist = alis[start: end]
        alirt = alir[start: end]
    else:
        (score, alis, alir) = align_local(seqstr, refstr)
        reftrim = alir.replace('-', '')
        start = refstr.find(reftrim[:50])
        end = refstr.rfind(reftrim[-50:]) + len(reftrim[-50:])
        alist = ('N' * start) + alis + ('N' * (len(refstr) - end))
        alirt = refstr[:start] + alir + refstr[end:]

    return (alist, alirt)
# Script
if __name__ == '__main__':

    # Try import
    import seqanpy as sap

    # Global pairwise alignment
    seq1 = 'AAAGGTCTA'
    seq2 = 'AAATCGA'
    output = sap.align_global(seq1, seq2, band=5)
    print output

    # Overlap pairwise alignment
    seq1 = 'AAAGGTCTA'
    seq2 = 'ATCT'
    output = sap.align_overlap(seq1, seq2)
    print output

    # Overlap pairwise alignment cutting flanks
    seq1 = 'AAAGGTCTA'
    seq2 = 'ATCT'
    output = sap.align_overlap(seq1, seq2, cut_flanks=True)
    print output

    # Ladder pairwise alignment
    seq1 = 'AAAGGTCTA'
    seq2 = 'TCTAGGGAAACCC'
    output = sap.align_ladder(seq1, seq2)
    print output

    # Local pairwise alignment
def overlap_test():
    print('Test align_overlap')
    import seqanpy
    (score, ali1, ali2) = seqanpy.align_overlap('ACCGT', 'CCG')
    assert ali1 == 'ACCGT'
    assert ali2 == '-CCG-'
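The lstrip/rstrip idiom that recurs throughout these snippets trims an overlap alignment down to the region covered by the second sequence. A small illustrative companion to the test above (not part of the original test suite) shows what it does to the '-CCG-' case:

def overlap_trim_test():
    import seqanpy
    (score, ali1, ali2) = seqanpy.align_overlap('ACCGT', 'CCG')
    start = len(ali2) - len(ali2.lstrip('-'))   # 1: one leading gap in '-CCG-'
    end = len(ali2.rstrip('-'))                 # 4: one trailing gap stripped
    assert ali1[start:end] == 'CCG'
    assert ali2[start:end] == 'CCG'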
scoremax = 3 * len(ali1)
delta = scoremax - score
ali = [ali2, ali1]

print 'Alignment to its own consensus (delta = '+str(delta)+')'
pretty_print_pairwise_ali(ali, 'cons', 'read'+str(i+1)+' '+str(edge),
                          len_name=25, width=90)
print ''

# Compare to all consensi and find the closest
alifr = alis[fragment]
alifrpw = []
for cons in alifr:
    alifrpw.append(align_overlap(cons.seq.ungap('-'), seq))
scores = map(itemgetter(0), alifrpw)
indmax = np.argmax(scores)
alimax = alifrpw[indmax][1:]
start = len(alimax[1]) - len(alimax[1].lstrip('-'))
end = len(alimax[1].rstrip('-'))
alimax = [s[start: end] for s in alimax]
score = scores[indmax]
scoremax = 3 * len(alimax[0])
delta = scoremax - score
name1 = ' '.join(['cons '] + alifr[indmax].name.split('_')[::2])
name2 = ' '.join(['read'+str(i+1), pname, sample['patient sample']])

print 'Alignment to best consensus (delta = '+str(delta)+')'
pretty_print_pairwise_ali(alimax, name1, name2,
                          len_name=25, width=90)
def get_minimal_distance_hist(bamfilename, consensi, maxreads=1000, VERBOSE=0):
    '''Get histogram of minimal distance of reads from consensi'''
    conssi = map(''.join, consensi)
    m = np.zeros(len(consensi), int)
    n_good = 0
    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        for irp, reads in enumerate(pair_generator(bamfile)):
            if n_good == maxreads:
                break

            if VERBOSE >= 3:
                print n_good + 1, 'Checking mindist for:', reads[0].qname,

            # Assign names
            (read1, read2) = reads
            i_fwd = reads[0].is_reverse

            # Check a few things to make sure we are looking at paired reads
            if read1.qname != read2.qname:
                raise ValueError('Read pair '+str(irp)+': reads have different names!')

            # Ignore unmapped reads
            if read1.is_unmapped or read2.is_unmapped:
                if VERBOSE >= 2:
                    print 'Read pair '+read1.qname+': unmapped'
                continue

            # Ignore not properly paired reads (this includes mates sitting on
            # different fragments)
            if (not read1.is_proper_pair) or (not read2.is_proper_pair):
                if VERBOSE >= 2:
                    print 'Read pair '+read1.qname+': not properly paired'
                continue

            n_good += 1

            # Get all distances
            ds_pair = np.zeros_like(m)
            for ic, consensus in enumerate(consensi):
                conss = conssi[ic]
                dpair = 0
                for read in reads:
                    seq = read.seq
                    ali = align_overlap(conss, seq)

                    # NOTE: it is possible that we start before conss' start or end after
                    # its end, but that IS evidence that it's not contamination from there.
                    pos = conss.find(ali[1].replace('-', ''))
                    alim0 = np.fromstring(ali[1], 'S1')
                    alim1 = np.fromstring(ali[2], 'S1')

                    # Score subst
                    d = ((alim0 != alim1) & (alim0 != '-') & (alim1 != '-')).sum()

                    # Score insertions
                    gaps = alim0 == '-'
                    if gaps.sum():
                        n_gaps_borders = np.diff(gaps).sum()
                        n_gaps_borders += alim0[0] == '-'
                        n_gaps_borders += alim0[-1] == '-'
                        n_insertions = n_gaps_borders // 2
                        d += n_insertions

                    # Score deletions
                    gaps = alim1 == '-'
                    if gaps.sum():
                        n_gaps_borders = np.diff(gaps).sum()
                        n_gaps_borders -= alim1[0] == '-'
                        n_gaps_borders -= alim1[-1] == '-'
                        n_deletions = n_gaps_borders // 2
                        d += n_deletions

                    dpair += d

                ds_pair[ic] = dpair

            if VERBOSE >= 3:
                print 'OK',

            m[ds_pair.argmin()] += 1

            if VERBOSE >= 3:
                print ''

    return m
def annotate_sequence(seqrecord, additional_edges={}, additional_features=['chunk'], VERBOSE=0):
    '''Annotate a consensus with the genes and stuff (in place)'''
    # TODO: what do we do with genes that do not start/end where they are
    # supposed to? Do we follow biology and track their new locations?
    from Bio.SeqFeature import SeqFeature, FeatureLocation, CompoundLocation
    from hivwholeseq.utils.genome_info import gene_edges, RNA_structure_edges, \
        other_edges, find_region_edges, find_region_edges_multiple, locate_gene

    edge_dict = {'gene': gene_edges,
                 'RNA structure': RNA_structure_edges,
                 'other': other_edges}
    edge_dict.update(additional_edges)
    additional_features = ['protein'] + additional_features
    features = edge_dict.keys() + additional_features

    if VERBOSE:
        print 'Features:', ', '.join(features)

    smat = np.array(seqrecord)

    for feature_type in edge_dict:
        edges_all = edge_dict[feature_type]
        print feature_type, edge_dict[feature_type].keys()
        for name, edges in edges_all.iteritems():
            if VERBOSE >= 2:
                print name,

            # Skip a feature if it's present already
            if name in map(lambda x: x.id, seqrecord.features):
                if VERBOSE >= 2:
                    print 'already present.'
                continue

            # Behave differently for unsplit regions and split ones
            if len(edges) == 2:
                # LTR problems with F6
                if 'F6' in name:
                    pos_edge = find_region_edges(smat[6000::], [edges[0], None])
                    pos_edge[0] += 6000
                elif feature_type == 'genes':
                    pos_edge = locate_gene(smat, name, output_compact=True)
                else:
                    pos_edge = find_region_edges(smat, edges)

                # Cut the primers for some features
                if (None not in pos_edge) and name in ['V1', 'V3', 'V4', 'V5']:
                    pos_edge[0] += len(edges[0])
                    pos_edge[1] -= len(edges[1])

                # Cut only the right primer for V2
                if (None not in pos_edge) and name in ['V2']:
                    pos_edge[1] -= len(edges[1])

                if pos_edge[0] is None:
                    if name not in ['F1', "LTR5'"]:
                        print 'WARNING: start not found'
                    pos_edge[0] = 0

                if pos_edge[1] is None:
                    if name not in ['F6', "LTR3'"]:
                        print 'WARNING: end not found'
                    pos_edge[1] = len(smat)

                location = FeatureLocation(*pos_edge)

            else:
                if feature_type == 'genes':
                    pos_edges = [locate_gene(smat, name+suff, output_compact=True)
                                 for suff in ('1', '2')]
                else:
                    pos_edges = find_region_edges_multiple(smat, edges, min_distance=1)
                locations = [FeatureLocation(*pos_edge) for pos_edge in pos_edges]
                location = CompoundLocation(locations)

            if VERBOSE >= 2:
                print 'found:', location

            feature = SeqFeature(location, type=feature_type, id=name, strand=1)
            seqrecord.features.append(feature)

    # Add proteins and other features from HXB2
    from operator import attrgetter
    from seqanpy import align_overlap
    from hivwholeseq.utils.genome_info import proteins, chunks
    from hivwholeseq.reference import load_custom_reference

    additional_features_dict = {}
    if 'protein' in additional_features:
        additional_features_dict['protein'] = proteins
    if 'chunk' in additional_features:
        additional_features_dict['chunk'] = chunks

    ref_ann = load_custom_reference('HXB2', 'gb')

    for feagroup, additional_features_grp in additional_features_dict.iteritems():
        for feaname in additional_features_grp:
            if VERBOSE >= 2:
                print feaname,

            fea = ref_ann.features[map(attrgetter('id'), ref_ann.features).index(feaname)]
            seq = fea.extract(ref_ann)
            (score, ali1, ali2) = align_overlap(seqrecord, seq, score_gapopen=-20)
            start = len(ali2) - len(ali2.lstrip('-'))
            end = len(ali2.rstrip('-'))
            end -= ali1[start: end].count('-')
            location = FeatureLocation(start, end)

            if VERBOSE >= 2:
                print 'found:', location

            feature = SeqFeature(location, type=feagroup, id=feaname, strand=1)
            seqrecord.features.append(feature)
def filter_contamination(bamfilename, bamfilename_out, contseqs, samplename, VERBOSE=0,
                         deltascore_max_self=60, deltascore_max_other=24,
                         maxreads=-1, **kwargs):
    '''Fish contaminated reads from mapped reads

    The function checks the distance to the expected consensus first, and only
    if the read is further than the threshold does it check all other samples.

    Args:
      deltascore_max_self (int): the maximal delta in alignment score to the
        consensus to be considered pure
      deltascore_max_other (int): the maximal delta in alignment score to any
        other sample to be considered a contamination
      **kwargs: passed down to the pairwise alignment function
    '''
    import sys
    import pysam
    from collections import defaultdict
    from operator import itemgetter
    from seqanpy import align_overlap
    from hivwholeseq.utils.mapping import pair_generator, get_number_reads

    if 'score_match' in kwargs:
        score_match = kwargs['score_match']
    else:
        score_match = 3

    bamfilename_trash = bamfilename_out[:-4] + '_trashed.bam'

    contseqs = contseqs.copy()
    consseq = contseqs.pop(samplename)

    if VERBOSE >= 2:
        print 'Scanning reads (' + str(get_number_reads(bamfilename) // 2) + ')'

    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        with pysam.Samfile(bamfilename_out, 'wb', template=bamfile) as bamfileout, \
             pysam.Samfile(bamfilename_trash, 'wb', template=bamfile) as bamfiletrash:

            n_good = 0
            n_cont = defaultdict(int)

            for irp, reads in enumerate(pair_generator(bamfile)):
                if irp == maxreads:
                    break

                if VERBOSE >= 2:
                    if not ((irp + 1) % 100):
                        if not ((irp + 1) == 100):
                            sys.stdout.write('\x1b[1A')
                        print irp + 1

                for read in reads:

                    # Look for distance to the own consensus; if that's small, move on
                    alignments_read = {}
                    deltas_read = {}
                    (score, alis1, alis2) = align_overlap(consseq, read.seq, **kwargs)
                    (alis1, alis2) = trim_align_overlap((alis1, alis2))
                    scoremax = len(alis1) * score_match
                    delta_read = scoremax - score
                    deltas_read[samplename] = delta_read
                    alignments_read[samplename] = (alis1, alis2)
                    if delta_read <= deltascore_max_self:
                        if VERBOSE >= 4:
                            print 'Read is very close to its own consensus', scoremax, score, delta_read
                            pretty_print_pairwise_ali([alis1, alis2], width=90,
                                                      name1='ref', name2='read')
                        continue

                    # Otherwise, move on to all other sequences and find the neighbour
                    for contname, contseq in contseqs.iteritems():
                        (score, ali1, ali2) = align_overlap(contseq, read.seq, **kwargs)
                        (ali1, ali2) = trim_align_overlap((ali1, ali2))
                        scoremax = len(ali1) * score_match
                        delta_read = scoremax - score
                        deltas_read[contname] = delta_read
                        alignments_read[contname] = (ali1, ali2)

                    if VERBOSE >= 5:
                        print samplename
                        for key, d in deltas_read.iteritems():
                            print key, d

                    (contname, delta_read) = min(deltas_read.iteritems(), key=itemgetter(1))

                    # Again, the correct consensus has precedence
                    if deltas_read[samplename] == delta_read:
                        contname = samplename

                    (ali1, ali2) = alignments_read[contname]

                    # The read may be closest to its own consensus, if not very close
                    if contname == samplename:
                        if VERBOSE >= 4:
                            print 'Read is closest to its consensus', scoremax, score, delta_read
                            pretty_print_pairwise_ali([ali1, ali2], width=90,
                                                      name1='ref', name2='read')

                    # The read may come from another consensus (contamination)
                    elif (delta_read <= deltascore_max_other):
                        n_cont[contname] += 1
                        bamfiletrash.write(reads[0])
                        bamfiletrash.write(reads[1])

                        if VERBOSE >= 2:
                            print 'Contaminated read found! Good:', n_good, 'cont:', sum(n_cont.itervalues()), 'sources:', n_cont

                        if VERBOSE >= 3:
                            print 'Read is contaminated by', contname, scoremax, score, delta_read
                            pretty_print_pairwise_ali([alis1, alis2], width=90,
                                                      name1='self', name2='read')
                            print ''
                            pretty_print_pairwise_ali([ali1, ali2], width=90,
                                                      name1='ref', name2='read')

                        if VERBOSE >= 2:
                            print ''

                        break

                    # Finally, the read is not really close to anything: accept
                    else:
                        if VERBOSE >= 4:
                            print 'Read is close to nothing really', scoremax, score, delta_read
                            pretty_print_pairwise_ali([ali1, ali2], width=90,
                                                      name1='ref', name2='read')

                else:
                    n_good += 1
                    bamfileout.write(reads[0])
                    bamfileout.write(reads[1])

    n_cont = dict(n_cont)

    return (n_good, n_cont)
# Script
if __name__ == '__main__':

    # Try import
    import seqanpy as sap

    # Global pairwise alignment
    seq1 = 'AAAGGTCTA'
    seq2 = 'AAATCGA'
    output = sap.align_global(seq1, seq2, band=5)
    print output

    # Overlap pairwise alignment
    seq1 = 'AAAGGTCTA'
    seq2 = 'ATCT'
    output = sap.align_overlap(seq1, seq2)
    print output

    # Ladder pairwise alignment
    seq1 = 'AAAGGTCTA'
    seq2 = 'TCTAGGGAAACCC'
    output = sap.align_ladder(seq1, seq2)
    print output

    # Local pairwise alignment
    seq1 = 'AAAGGTCTACCGTAGCCT'
    seq2 = 'AAGTCTAC'
    output = sap.align_local(seq1, seq2)
    print output
    ]
    seq = ''.join(seql)
    s, a1, a2 = align_local(prot['seq_aa'], seq)
    scores.append(s)
chain = list(struc.get_chains())[np.argmax(s)]
seql = [d3to1.get(r.get_resname(), 'O') for r in chain.get_residues()]
seq = ''.join(seql)

# Flag all mutations
for m, mut in muts.iterrows():
    mutations.at[m, 'PDB_fn'] = fns[protname]
    mutations.at[m, 'PDB_id'] = fns[protname].split('_')[1].upper().split('.')[0]
    mutations.at[m, 'PDB_chain'] = chain.id

    s, a1, a2 = align_overlap(seq, mut['context_protein'])

    # The focal allele is always small and is the only such letter
    pos_in_context = 4
    pos = a2.find(mut['context_protein'][pos_in_context])
    pos -= a1[:pos].count('-')
    mutations.at[m, 'PDB_pos_in_chain'] = pos
    mutations.at[m, 'PDB_allele'] = seq[pos]

    print('Ref: ' + mut['context_protein'])
    print('PDB: ' + seq[pos - 4:pos] + seq[pos].lower() + seq[pos + 1:pos + 5])

#mutations.to_csv('../data/mutations_highvariance_summary.tsv', sep='\t', index=True)

print('Load multiple sequence alignments')
from Bio import AlignIO

protname = 'NS1'
patient = load_patient(pname)
refseqgw = patient.get_reference('genomewide')

for fragment in fragments:
    if VERBOSE >= 1:
        print pname, fragment

    if VERBOSE >= 2:
        print 'Cutting out fragment', fragment

    # Get start coordinate
    if fragment == 'F1':
        start = 0
    else:
        prfwd = primers_outer[fragment][0]
        (score, ali1, ali2) = align_overlap(refseqgw, prfwd, score_gapopen=-20)
        start = len(ali2) - len(ali2.lstrip('-')) + len(prfwd)

    # Get end coordinate
    if fragment == 'F6':
        end = len(refseqgw)
    else:
        prrev = primers_outer[fragment][1]
        (score, ali1, ali2) = align_overlap(refseqgw, prrev, score_gapopen=-20)
        end = len(ali2) - len(ali2.lstrip('-'))

    refseq = refseqgw[start: end]
    refseq.id = patient.code+'_ref_'+fragment
    refseq.name = refseq.id
    refseq.description = 'Patient '+patient.code+', initial reference '+fragment