def pseudoTranslate(self, transl_table=1, out_type="standard", code=None):
    """Returns a pseudo protein alignment from *self*, a DNA alignment.

    The result is of datatype standard instead of protein, which allows
    the use of special recodings, like distinguishing between two types
    of serines, like in :meth:`Alignment.recode23aa()`.

    *self* is translated using :attr:`Code(transl_table).code`.

    Alternatively, the genetic code can be provided through the parameter
    *code*.  If such a code is provided, the value of *transl_table* is
    ignored and the datatype of the result is forced to "standard".
    The parameter *code* can take two types of values:

    1) It can be a string naming the code to use, as defined in Biopython's
       `CodonTable.unambiguous_dna_by_name.keys()`
    2) It can be a dictionary *code* whose keys are codons, and the values
       are the corresponding amino-acids.  All triplets present in the
       matrix should also be present in the code dictionary, except
       triplets of indels.  Codons and the corresponding amino-acids are
       expected to be in lower case.

    It may be possible to use a code based on another codon length than 3,
    but this has not been tested as of June 2012.

    At the moment, we can only do translations where the sequences are
    phased with the coding frame, ie the first sequence position is the
    first position of the codon, and the last sequence position should be
    a last codon position.

    The default behaviour is to use translation table 1, that is the
    standard genetic code.  Other available translation tables, this week::

        if transl_table == 1: # standard
        elif transl_table == 2: # vertebrate mito
        elif transl_table == 4: # Mold, Protozoan,
                                # and Coelenterate Mitochondrial Code
                                # and the Mycoplasma/Spiroplasma Code
        elif transl_table == 5: # invertebrate mito
        elif transl_table == 9: # echinoderm mito

        and now 6, 10, 11, 12, 13, 14, 21.

    (These are found in p4.GeneticCode.py or in :class:`Code`)

    *transl_table* may also be provided as text consisting in
    blank-separated elements.  Each element consists in n characters,
    where n is the number of defined codons.  The first element lists the
    coded (pseudo-)amino-acids.  The second element describes whether a
    codon can be a start codon ('M') or not ('-').  The other elements
    correspond to the (pseudo-)nucleotides at the successive codon
    positions.  Example::

        FFJJZZZZYY**CC*WBBBBPPPPHHQQUUUUIIIMTTTTNNKKXXOOVVVVAAAADDEEGGGG
        ---M---------------M------------MMMM---------------M------------
        TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
        TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
        TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG

    Returns a modified duplicate of *self* (made with ``self.dupe()``);
    *self* itself is left untouched.

    Raises :class:`P4Error` if *self* is not a DNA alignment, if a provided
    *code* is empty, if the alignment length is not a multiple of the codon
    length, or if a codon is incomplete (partial indel) or absent from the
    code.
    """

    gm = ['p4.alignment_recoding.pseudoTranslate()']
    if self.dataType != 'dna':
        gm.append("Self should be a DNA alignment")
        raise P4Error(gm)

    if code is None:
        code = Code(transl_table, in_type="dna", out_type=out_type).code
        codelength = Code(transl_table).codelength
    else:
        if isinstance(code, str):
            code = getBiopythonCode(code)  # defined in code_utils.py
        if not code:
            gm.append("The provided genetic code is empty")
            raise P4Error(gm)
        # We assume that the "codons" have all the same length,
        # and we look at the first codon in the dictionary to know this length.
        # (next(iter(...)) rather than keys()[0]: dict views are not
        # subscriptable in Python 3.)
        codelength = len(next(iter(code)))
        # We use standard type, because, depending on the code used to make
        # the translation, we may get something that contains symbols not
        # corresponding to normal amino-acids.
        out_type = "standard"

    if self.length % codelength != 0:
        gm.append("The length of self should be a multiple of %i" % codelength)
        raise P4Error(gm)

    ali = self.dupe()
    ali.dataType = out_type
    # Floor division: the length must stay an int (it is used with range()
    # and list repetition below); exactness is guaranteed by the check above.
    ali.length = self.length // codelength
    ali.symbols = CAT(sorted(set(code.values())))
    ali.equates = {}
    ali.dim = len(ali.symbols)
    ali.nexusSets = None
    ali.parts = []
    ali.excludeDelete = None

    for seq in ali.sequences:
        # Initialize an all-gap sequence.
        seq.sequence = ['-'] * ali.length
        seq.dataType = out_type

    fullIndel = '-' * codelength
    for i, (dnaSeqObj, protSeqObj) in enumerate(zip(self.sequences, ali.sequences)):
        # the original sequence
        dnaSeq = dnaSeqObj.sequence
        # the future pseudo-translation
        pseudoProtSeq = protSeqObj.sequence
        for j in range(ali.length):
            dnaPos = j * codelength
            theCodon = dnaSeq[dnaPos:dnaPos + codelength]
            if theCodon in code:
                pseudoProtSeq[j] = code[theCodon]
            elif theCodon == fullIndel:
                # full indel
                pseudoProtSeq[j] = '-'
            elif theCodon.count('-'):
                # partial indel
                gm.append(
                    " seq %i, position %4i, dnaSeq %4i, codon '%s' is incomplete"
                    % (i, j, dnaPos, theCodon))
                raise P4Error(gm)
            else:
                # Should we use a CodonTranslationError (defined in
                # code_utils.py) here ?
                gm.append(
                    " seq %i position %4i, dnaSeq %4i, codon '%s' is not a known codon"
                    % (i, j, dnaPos, theCodon))
                raise P4Error(gm)

    for seq in ali.sequences:
        # Convert from list to string.
        seq.sequence = CAT(seq.sequence)
    return ali
def pseudoTranslate(self, transl_table=1, out_type="standard", code=None):
    """Returns a pseudo protein alignment from *self*, a DNA alignment.

    The result is of datatype standard instead of protein, which allows
    the use of special recodings, like distinguishing between two types
    of serines, like in :meth:`Alignment.recode23aa()`.

    *self* is translated using :attr:`Code(transl_table).code`.

    Alternatively, the genetic code can be provided through the parameter
    *code*.  If such a code is provided, the value of *transl_table* is
    ignored and the datatype of the result is forced to "standard".
    The parameter *code* can take two types of values:

    1) It can be a string naming the code to use, as defined in Biopython's
       `CodonTable.unambiguous_dna_by_name.keys()`
    2) It can be a dictionary *code* whose keys are codons, and the values
       are the corresponding amino-acids.  All triplets present in the
       matrix should also be present in the code dictionary, except
       triplets of indels.  Codons and the corresponding amino-acids are
       expected to be in lower case.

    It may be possible to use a code based on another codon length than 3,
    but this has not been tested as of June 2012.

    At the moment, we can only do translations where the sequences are
    phased with the coding frame, ie the first sequence position is the
    first position of the codon, and the last sequence position should be
    a last codon position.

    The default behaviour is to use translation table 1, that is the
    standard genetic code.  Other available translation tables, this week::

        if transl_table == 1: # standard
        elif transl_table == 2: # vertebrate mito
        elif transl_table == 4: # Mold, Protozoan,
                                # and Coelenterate Mitochondrial Code
                                # and the Mycoplasma/Spiroplasma Code
        elif transl_table == 5: # invertebrate mito
        elif transl_table == 9: # echinoderm mito

        and now 6, 10, 11, 12, 13, 14, 21.

    (These are found in p4.GeneticCode.py or in :class:`Code`)

    *transl_table* may also be provided as text consisting in
    blank-separated elements.  Each element consists in n characters,
    where n is the number of defined codons.  The first element lists the
    coded (pseudo-)amino-acids.  The second element describes whether a
    codon can be a start codon ('M') or not ('-').  The other elements
    correspond to the (pseudo-)nucleotides at the successive codon
    positions.  Example::

        FFJJZZZZYY**CC*WBBBBPPPPHHQQUUUUIIIMTTTTNNKKXXOOVVVVAAAADDEEGGGG
        ---M---------------M------------MMMM---------------M------------
        TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
        TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
        TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG

    Returns a modified duplicate of *self* (made with ``self.dupe()``);
    *self* itself is left untouched.

    Raises :class:`P4Error` if *self* is not a DNA alignment, if a provided
    *code* is empty, if the alignment length is not a multiple of the codon
    length, or if a codon is incomplete (partial indel) or absent from the
    code.
    """

    gm = ['p4.alignment_recoding.pseudoTranslate()']
    if self.dataType != 'dna':
        gm.append("Self should be a DNA alignment")
        raise P4Error(gm)

    if code is None:
        code = Code(transl_table, in_type="dna", out_type=out_type).code
        codelength = Code(transl_table).codelength
    else:
        if isinstance(code, str):
            code = getBiopythonCode(code)  # defined in code_utils.py
        if not code:
            gm.append("The provided genetic code is empty")
            raise P4Error(gm)
        # We assume that the "codons" have all the same length,
        # and we look at the first codon in the dictionary to know this length.
        # (next(iter(...)) rather than keys()[0]: dict views are not
        # subscriptable in Python 3.)
        codelength = len(next(iter(code)))
        # We use standard type, because, depending on the code used to make
        # the translation, we may get something that contains symbols not
        # corresponding to normal amino-acids.
        out_type = "standard"

    if self.length % codelength != 0:
        gm.append("The length of self should be a multiple of %i" % codelength)
        raise P4Error(gm)

    ali = self.dupe()
    ali.dataType = out_type
    # Floor division: the length must stay an int (it is used with range()
    # and list repetition below); exactness is guaranteed by the check above.
    ali.length = self.length // codelength
    ali.symbols = CAT(sorted(set(code.values())))
    ali.equates = {}
    ali.dim = len(ali.symbols)
    ali.nexusSets = None
    ali.parts = []
    ali.excludeDelete = None

    for seq in ali.sequences:
        # Initialize an all-gap sequence.
        seq.sequence = ['-'] * ali.length
        seq.dataType = out_type

    fullIndel = '-' * codelength
    for i, (dnaSeqObj, protSeqObj) in enumerate(zip(self.sequences, ali.sequences)):
        # the original sequence
        dnaSeq = dnaSeqObj.sequence
        # the future pseudo-translation
        pseudoProtSeq = protSeqObj.sequence
        for j in range(ali.length):
            dnaPos = j * codelength
            theCodon = dnaSeq[dnaPos:dnaPos + codelength]
            if theCodon in code:
                pseudoProtSeq[j] = code[theCodon]
            elif theCodon == fullIndel:
                # full indel
                pseudoProtSeq[j] = '-'
            elif theCodon.count('-'):
                # partial indel
                gm.append(
                    " seq %i, position %4i, dnaSeq %4i, codon '%s' is incomplete"
                    % (i, j, dnaPos, theCodon))
                raise P4Error(gm)
            else:
                # Should we use a CodonTranslationError (defined in
                # code_utils.py) here ?
                gm.append(
                    " seq %i position %4i, dnaSeq %4i, codon '%s' is not a known codon"
                    % (i, j, dnaPos, theCodon))
                raise P4Error(gm)

    for seq in ali.sequences:
        # Convert from list to string.
        seq.sequence = CAT(seq.sequence)
    return ali