def Translation(seq):
    '''Translate a DNA sequence into a protein string, one codon at a time.

    The sequence is read in frame from index 0; trailing bases that do not
    form a complete codon are ignored.

    Each codon is looked up with Codon2AA2 and mapped as follows:
      * 'J' (stop codon)            -> '*'
      * 'Z' (IUPAC ambiguous codon) -> expanded with IUPAC_3mer, then
            - the shared amino acid if every expansion agrees,
            - 'X' (any amino acid) if the expansions disagree,
            - '*' if IUPAC_3mer returns no expansion at all
      * anything else               -> the value returned by Codon2AA2

    Returns the protein as a string; an input shorter than one codon
    yields the empty string.
    '''
    residues = []
    # Floor division keeps the codon count an int on Python 3 as well;
    # with true division range() would be handed a float and raise TypeError.
    coding_end = (len(seq) // 3) * 3
    for start in range(0, coding_end, 3):
        codon = seq[start:start + 3]
        # Look the codon up once instead of once per branch test.
        aa = Codon2AA2(codon)
        if aa == 'J':
            # Stop codon is written as '*'.
            aa = '*'
        elif aa == 'Z':
            # Ambiguous codon: collect the distinct amino acids that its
            # concrete expansions encode.
            candidates = set(Codon2AA2(kmer) for kmer in IUPAC_3mer(codon))
            if len(candidates) > 1:
                aa = 'X'  # X represents any amino acid
            elif len(candidates) == 1:
                aa = candidates.pop()
            else:
                # No expansion was returned; treat the codon as a stop.
                aa = '*'
        residues.append(aa)
    return ''.join(residues)
def IUPAC_3mer(seq):
    '''Expand a 3-base IUPAC codon into its concrete, non-stop 3-mers.

    Each of the first three positions of seq is looked up in _IUPAC and
    every combination of the listed bases is formed, with the first
    position varying slowest. Combinations for which Codon2AA2 returns
    "J" are dropped, so the returned list may be empty.
    '''
    # Lazily enumerate every combination in first/second/third order.
    combinations = (
        base1 + base2 + base3
        for base1 in _IUPAC[seq[0]]
        for base2 in _IUPAC[seq[1]]
        for base3 in _IUPAC[seq[2]]
    )
    # Keep only the combinations that are not reported as "J".
    return [codon for codon in combinations if Codon2AA2(codon) != "J"]
def GetEDP(seq, transcript_len):
    '''Return the amino-acid entropy density (EDP) of seq as a string.

    The sequence is read in frame from index 0. Every amino acid in
    _AA_list starts with a pseudo-count of 1e-9, then each codon adds to
    the counts:
      * codons for which Codon2AA2 returns "J" (stop) are skipped;
      * codons for which it returns "Z" (IUPAC ambiguous) are expanded with
        IUPAC_3mer and one count is shared equally among the expansions;
        if there are no expansions the codon is skipped like a stop;
      * any other codon adds one count to its amino acid.
    Sequences of length 3 or less contribute no counts at all.

    Counts are normalised to frequencies p, each amino acid gets the
    entropy term -p * log2(p), and every term is divided by the total
    entropy H. Densities below 1e-7 are reported as 0.

    Parameters:
      seq            -- nucleotide sequence (indexable, sliceable).
      transcript_len -- not used by this function; kept so existing
                        callers keep working.

    Returns the densities joined by tabs, in the iteration order of the
    count dictionary (i.e. following _AA_list on Python 3.7+).
    '''
    Codon = {}
    for aa in _AA_list:
        Codon[aa] = 1e-9
    # NOTE(review): the 20 presumably equals len(_AA_list) so that the
    # frequencies sum to 1 -- confirm against the definition of _AA_list.
    sum_codon = 1e-9 * 20

    if len(seq) > 3:
        # Floor division keeps the bound an int on Python 3; with true
        # division range() would be handed a float and raise TypeError.
        coding_end = (len(seq) // 3) * 3
        for start in range(0, coding_end, 3):
            codon = seq[start:start + 3]
            # Look the codon up once instead of once per branch test.
            aa = Codon2AA2(codon)
            if aa == "J":
                continue
            if aa == "Z":
                # consider the IUPAC codon
                expansions = IUPAC_3mer(codon)
                if not expansions:
                    # IUPAC_3mer drops stop codons, so an empty list means
                    # every expansion was a stop. Skip it; counting it in
                    # sum_codon without adding any amino-acid count would
                    # leave the frequencies summing to less than 1.
                    continue
                share = 1.0 / len(expansions)
                for kmer in expansions:
                    Codon[Codon2AA2(kmer)] += share
            else:
                Codon[aa] += 1.0
            sum_codon += 1.0

    # Per-amino-acid entropy term and the total entropy H.
    H = 0.0
    entropy = {}
    for aa, count in Codon.items():
        freq = count / sum_codon
        entropy[aa] = -freq * np.log2(freq)
        H += entropy[aa]

    # Normalise by H and clamp negligible densities to 0.
    densities = []
    for aa in entropy:
        density = entropy[aa] / H
        if density < 1e-7:
            density = 0
        densities.append(str(density))
    return "\t".join(densities)
def SixMer2AA(seq):
    '''Translate a 6-mer into two residues by looking up each half.

    The first three bases and the next three bases are each passed to
    Codon2AA2, and the two results are concatenated in that order.
    '''
    first_residue = Codon2AA2(seq[:3])
    second_residue = Codon2AA2(seq[3:6])
    return first_residue + second_residue