# Build per-position masks over the consensus sequence: positions belonging
# to codons that contain a gap, positions belonging to stop codons, and the
# remaining "good" positions.

# Keep only the first 2718 columns of the allele frequencies
# NOTE(review): 2718 = 3 * 906, presumably the coding length of the gene in
# alignment coordinates -- confirm against the reference.
afs = afs[:, :2718]

# Find consensus sequence (most frequent symbol of alpha at each position)
consind = afs.argmax(axis=0)
consensus = alpha[consind]

# Some seqs are not viable and include frameshifts that mess up the
# translation, hence we restrict to positions for which gaps are a minority
is_gap = consensus == '-'

# Exclude full codons: find the codons that contain at least one gap and
# flag all three of their positions.
# Floor division keeps the codon indices integral (usable as array indices)
# regardless of whether true division is in effect.
tmp = np.unique(is_gap.nonzero()[0] // 3)
is_gap[np.concatenate([tmp * 3, tmp * 3 + 1, tmp * 3 + 2])] = True

# Exclude stop codons: tmp holds codon indices, expanded to the three
# nucleotide positions of each codon.
is_stop = np.zeros_like(is_gap)
tmp = (translate(consensus) == '*').nonzero()[0]
is_stop[np.concatenate([tmp * 3, tmp * 3 + 1, tmp * 3 + 2])] = True

## Plot base prevalence
#for i in xrange(4):
#    plt.plot(np.arange(len(consensus)), afs[i],
#             lw=1.5, alpha=0.5)
#plt.xlim(0, 2600)
#plt.ylim(-0.05, 1.25)
#plt.xlabel('position in '+gene)
#plt.ylabel('allele frequency')
#plt.legend(alpha, loc=9)

# Good are codons with no gaps and no stops
# (~ is the logical NOT for boolean arrays; NumPy rejects unary minus on them)
is_good = (~is_gap) & (~is_stop)
afs = afs[:, :2718] # Find consensus sequence consind = afs.argmax(axis=0) consensus = alpha[consind] # Some seqs are not viable and include frameshifts that mess up the # translation, hence we restrict to positions for which gaps are a minority is_gap = consensus == '-' # Exclude full codons tmp = np.unique(is_gap.nonzero()[0] / 3) is_gap[np.concatenate([tmp * 3, tmp * 3 + 1, tmp * 3 + 2])] = True # Exclude stop codons is_stop = np.zeros_like(is_gap) tmp = (translate(consensus) == '*').nonzero()[0] is_stop[np.concatenate([tmp * 3, tmp * 3 + 1, tmp * 3 + 2])] = True # Good are codons with no gaps and no stops is_good = (-is_gap) & (-is_stop) # For each codon, calculate the entropy msa = msa[:, is_good] consaa = translate(consensus[is_good]) entropy = np.zeros(len(consaa)) from collections import Counter for i, aa in enumerate(consaa): tmp = msa[:, i * 3:(i + 1) * 3] count = Counter(map(''.join, tmp)) abundances = [] for (cod, abundance) in count.iteritems():
# Keep only the alleles whose nucleotide differs from the consensus at the
# same position, i.e. actual mutations.
is_mut = np.array(
    [alpha[al[0]] != consensus[al[1]] for al in alleles],
    dtype=bool)
alleles = alleles[is_mut]

# Keep only synonymous
# (all_cla starts with one empty list per class)
all_cla = dict((x, []) for x in classes)
if len(alleles) > 0:
    is_syn = np.zeros(len(alleles), dtype=bool)
    for j, al in enumerate(alleles):
        mut, pos = alpha[al[0]], al[1]

        # Consensus codon containing pos; 3 * (pos // 3) is its first
        # position
        codcons = consensus[3 * (pos // 3):3 * (pos // 3) + 3]

        # Same codon carrying the mutation
        cod = codcons.copy()
        cod[pos % 3] = mut

        # Synonymous if both codons translate to the same amino acid
        aacons = translate(codcons)
        aa = translate(cod)
        is_syn[j] = (aacons == aa)

    alleles = alleles[is_syn]

# Test more stringently for synonymity
# (we want to avoid double-hits in one single codon)
if len(alleles) > 0:
    seqs = np.array(p.seqs_from_visit(p.visit[i]))
    is_single = np.zeros(len(alleles), dtype=bool)
    for j, al in enumerate(alleles):
        mut, pos = alpha[al[0]], al[1]

        # Check whether sequences have double-hits
        codcons = consensus[3 * (pos // 3):3 * (pos // 3) + 3]
if not len(alleles): continue ############################################################### # Filter only synonymous/nonsynonymous changes ############################################################### # First test is_syn = np.zeros(len(alleles), bool) for j, al in enumerate(alleles): pos = al[1] mut = alpha[al[0]] codcons = consensus[pos - pos % 3:pos - pos % 3 + 3] cod = codcons.copy() cod[pos % 3] = mut is_syn[j] = (translate(codcons) == translate(cod)) alleles_syn = alleles[is_syn] alleles_nonsyn = alleles[-is_syn] # Test more stringently (avoid double-hits in one single codon) if len(alleles_syn): seqs = np.array(p.seqs_from_visit(p.visit[i])) is_single = np.zeros(len(alleles_syn), bool) for j, al in enumerate(alleles_syn): pos = al[1] mut = alpha[al[0]] # Check whether sequences have double-hits # If a double mutant is *ever* observed, discard the allele. # Note: This is a very conservative measure and must # be avoided when estimating densities.