def FilterAlignedPairForPositions(seq1, seq2, method):
    """Reduce a pair of aligned coding sequences to selected positions.

    Available filters (``method``):

    all
        return both sequences unchanged.
    codon1, codon2, codon3
        keep only the 1st, 2nd or 3rd position of every codon.
    d4
        keep only third codon positions where both codons encode the
        same amino acid and the third position is fourfold degenerate
        in both sequences.

    Both sequences are read in frame starting at index 0. For the
    ``codonN`` filters each sequence is sliced independently; for ``d4``
    codons are paired by their offset in ``seq1``.

    Returns a tuple ``(filtered_seq1, filtered_seq2)``. Except for
    ``all`` (which returns the inputs untouched) both elements are
    strings.

    Raises ValueError if ``method`` is not one of the filters above.
    """
    if method == "all":
        return seq1, seq2

    # 0-based offset of the requested position within each codon
    codon_offsets = {"codon1": 0, "codon2": 1, "codon3": 2}
    if method in codon_offsets:
        offset = codon_offsets[method]
        return "".join(seq1[offset::3]), "".join(seq2[offset::3])

    if method == "d4":
        kept1 = []
        kept2 = []
        for start in range(0, len(seq1), 3):
            codon1 = seq1[start:start + 3]
            codon2 = seq2[start:start + 3]
            try:
                # only the amino acid and the degeneracy of the third
                # position are needed
                aa1, _, _, deg13 = Genomics.GetDegeneracy(codon1)
                aa2, _, _, deg23 = Genomics.GetDegeneracy(codon2)
            except KeyError:
                # codon unknown to Genomics.GetDegeneracy: skip the pair
                continue
            if aa1 == aa2 and deg13 == 4 and deg23 == 4:
                kept1.append(codon1[2])
                kept2.append(codon2[2])
        return "".join(kept1), "".join(kept2)

    # previously an unknown filter fell through and returned None
    raise ValueError("unknown filter method: %r" % (method,))
def loadSequence(self, sequence, seqtype="na"):
    """Load codon-based nucleotide counts from a coding sequence.

    The sequence is first passed to
    ``SequencePropertiesLength.loadSequence`` together with ``seqtype``,
    then upper-cased and read codon by codon starting at index 0.

    Sets the following attributes:

    mNStopCodons
        number of codons for which ``Genomics.IsStopCodon`` is true.
    mCounts
        list of three dicts (one per codon position) mapping
        A, C, G, T, X, N to the number of occurrences at that position.
        Every codon is counted here, including stop codons and codons
        that cannot be classified (e.g. GNN), so this is not the sum of
        the per-degeneracy counts.
    mCountsDegeneracy
        ``mCountsDegeneracy[position][degeneracy][letter]`` with
        position in 0..2 and degeneracy in 0..4; letters are those of
        ``Bio.Alphabet.IUPAC.extended_dna``.

    Raises ValueError if the sequence length is not a multiple of 3.
    Raises KeyError if the sequence contains a letter other than
    A, C, G, T, X or N.
    """
    SequencePropertiesLength.loadSequence(self, sequence, seqtype)

    if len(sequence) % 3:
        raise ValueError(
            '''sequence length is not a multiple of 3 (length=%i)''' %
            (len(sequence)))

    # uppercase all letters
    sequence = sequence.upper()

    self.mNStopCodons = 0

    # one independent counter dict per codon position
    self.mCounts = [
        {'A': 0, 'C': 0, 'G': 0, 'T': 0, 'X': 0, 'N': 0}
        for _ in (0, 1, 2)]

    # nucleotide counts for each position per degeneracy (0..4)
    self.mCountsDegeneracy = [
        [dict((letter, 0)
              for letter in Bio.Alphabet.IUPAC.extended_dna.letters)
         for _ in range(5)]
        for _ in (0, 1, 2)]

    # Walk the sequence by codon start offset instead of building a list
    # of codons. ``range`` replaces the Python-2-only ``xrange`` so the
    # method also runs on Python 3.
    for start in range(0, len(sequence), 3):
        codon = sequence[start:start + 3]

        for pos in (0, 1, 2):
            self.mCounts[pos][codon[pos]] += 1

        if Genomics.IsStopCodon(codon):
            self.mNStopCodons += 1
            continue

        try:
            aa, deg1, deg2, deg3 = Genomics.GetDegeneracy(codon)
            for pos, degree in enumerate((deg1, deg2, deg3)):
                self.mCountsDegeneracy[pos][degree][codon[pos]] += 1
        except KeyError:
            # codon (or one of its letters) is not covered by the
            # degeneracy tables: leave it out of the per-degeneracy counts
            pass
def countSites(model):
    """Count the expected synonymous/nonsynonymous sites in a grammar.

    Codon frequencies are taken from
    ``model.evaluateTerminalFrequencies()[('COD0', 'COD1', 'COD2')]``;
    each key is joined into a single upper-case codon string.

    For every codon known to ``Genomics.GetDegeneracy`` and each of its
    three positions, a position with degeneracy ``k`` contributes
    ``(k - 1) / 3`` synonymous and ``1 - (k - 1) / 3`` nonsynonymous
    sites, weighted by the codon frequency. Codons for which
    ``Genomics.GetDegeneracy`` raises KeyError are ignored.

    Returns a tuple ``(n, s)``: the number of nonsynonymous and
    synonymous sites. An assertion checks that ``n + s`` rounds to 3.00.
    """
    xpi = model.evaluateTerminalFrequencies()[('COD0', 'COD1', 'COD2')]

    # Translate the terminal keys to upper-case codon strings. The former
    # second pass that deleted and re-inserted every key while iterating
    # over the same dictionary was a no-op on these already normalised
    # keys and is unsafe on Python 3, so it has been dropped.
    pi = {}
    for codon, f in xpi.items():
        pi["".join(codon).upper()] = f

    # number of nonsynonymous / synonymous sites
    n, s = 0.0, 0.0

    for codon, freq in pi.items():
        try:
            degeneracy = Genomics.GetDegeneracy(codon)
        except KeyError:
            continue

        # entries 1..3 hold the degeneracy of the three codon positions
        # (entry 0 is presumably the amino acid - confirm in Genomics)
        for x in range(1, 4):
            d = (degeneracy[x] - 1.0) / 3.0
            s += freq * d
            n += freq * (1.0 - d)

    assert (float("%5.2f" % (n + s)) == 3.0), \
        "expected 3 sites per codon, got %f" % (n + s)

    return n, s
def Load(self, in_sequence):
    """Load codon-based sequence properties from a coding sequence.

    The sequence is upper-cased and read codon by codon starting at
    index 0. ``self.Update()`` is called once all counts are collected.

    Sets the following attributes:

    mLength
        length of the sequence in nucleotides.
    mNCodons
        number of complete codons (``mLength // 3``).
    mNStopCodons
        number of codons for which ``Genomics.IsStopCodon`` is true.
    mCountsAA
        counts per amino acid, keyed by the letters of
        ``Bio.Alphabet.IUPAC.extended_protein``.
    mCounts
        list of three dicts (one per codon position) mapping
        A, C, G, T, X, N to the number of occurrences at that position.
        Every nucleotide is counted here, including those in stop codons
        and in codons that cannot be classified (e.g. GNN), so this is
        not the sum of the per-degeneracy counts.
    mCountsDegeneracy
        ``mCountsDegeneracy[position][degeneracy][letter]`` with
        position in 0..2 and degeneracy in 0..4; letters are those of
        ``Bio.Alphabet.IUPAC.extended_dna``.

    If the sequence length is not a multiple of 3, the bases of the
    trailing incomplete codon are added to ``mCounts`` but the codon is
    not classified any further.

    Raises KeyError if the sequence contains a letter other than
    A, C, G, T, X or N.
    """
    # uppercase all letters
    sequence = in_sequence.upper()

    self.mLength = len(sequence)
    # floor division keeps the codon count an integer on Python 2 and 3
    self.mNCodons = len(sequence) // 3
    self.mNStopCodons = 0

    # counts of amino acids
    self.mCountsAA = {}
    for letter in Bio.Alphabet.IUPAC.extended_protein.letters:
        self.mCountsAA[letter] = 0

    # one independent counter dict per codon position
    self.mCounts = [
        {'A': 0, 'C': 0, 'G': 0, 'T': 0, 'X': 0, 'N': 0}
        for _ in (0, 1, 2)]

    # nucleotide counts for each position per degeneracy (0..4)
    self.mCountsDegeneracy = [
        [dict((letter, 0)
              for letter in Bio.Alphabet.IUPAC.extended_dna.letters)
         for _ in range(5)]
        for _ in (0, 1, 2)]

    for start in range(0, len(sequence), 3):
        codon = sequence[start:start + 3]

        # enumerate() stops at the end of a short trailing codon instead
        # of indexing past it
        for pos, base in enumerate(codon):
            self.mCounts[pos][base] += 1

        if len(codon) < 3:
            # incomplete trailing codon: nothing to classify
            continue

        if Genomics.IsStopCodon(codon):
            self.mNStopCodons += 1
            continue

        try:
            aa, deg1, deg2, deg3 = Genomics.GetDegeneracy(codon)
            for pos, degree in enumerate((deg1, deg2, deg3)):
                self.mCountsDegeneracy[pos][degree][codon[pos]] += 1
            self.mCountsAA[aa] += 1
        except KeyError:
            # codon, letter or amino acid not covered by the tables:
            # leave it out of the remaining counts
            pass

    self.Update()