def recode_sequence(sequence, converter, positions=None, code="Standard"):
    """Recode *sequence*, motif by motif, using the rules in *converter*.

    Uses the correspondence rules provided by the dictionary *converter* to
    produce a recoded version of *sequence*, and returns it.

    *converter* maps motifs (codons) to their replacement. The size of the
    motifs is taken from the first key of the dictionary, so all keys are
    assumed to have the same length.

    *positions* determines which codon positions are recoded. Positions are
    1-based (they are tested against ``range(1, subst_size + 1)``).
    By default, all positions are recoded. When *positions* is given, the
    entries of *converter* are rewritten in place so that only the selected
    positions differ from the original codon.

    *code* is either a string, which is passed to ``getBiopythonCode``, or a
    dictionary (this is checked with an ``assert``).

    Raises ``P4Error`` if the length of *sequence* is not a multiple of the
    motif size. If a codon cannot be built, a message is written to stderr,
    a warning is issued and the codon is replaced by indels; if even the
    indel codon cannot be built, the process exits with status 1.
    """
    gm = ['p4.code_utils.recode_sequence()']
    if isinstance(code, str):
        code = getBiopythonCode(code)
    else:
        msg = "code must be a dictionary, or a string naming the code in Biopython."
        assert isinstance(code, dict), msg
    # To get the size of the motifs being substituted, we look at the first one in the dictionary.
    # NOTE(review): indexing ``keys()`` only works where it returns a list
    # (Python 2); on Python 3 this raises TypeError.
    subst_size = len(converter.keys()[0])
    if len(sequence) % subst_size != 0:
        gm.append("The length of the sequence should be a multiple of %i" % subst_size)
        raise P4Error(gm)
    if positions is not None:
        # Filter the converter.
        # NOTE(review): this overwrites the entries of the caller's
        # dictionary; the caller sees the filtered rules afterwards.
        for codon in converter.keys():
            convert = converter[codon]
            # Replace the positions to be recoded by the converted codon, but keep the others.
            converter[codon] = CAT([
                (convert[i - 1] if i in positions else codon[i - 1])
                for i in range(1, subst_size + 1)
            ])
    # Build the recoded version of the sequence.
    new_seq = ""
    # Loop over the codons (triplets, if subst_size == 3).
    for i in range(len(sequence) // subst_size):
        try:
            # Make a Codon instance (to convert it afterwards).
            codon = Codon(sequence[(subst_size * i):(subst_size * (i + 1))], code)
        except CodonTranslationError, e:
            # Report the failing slice, then fall back on an all-indel motif.
            sys.stderr.write("%s\nProblem at sequence slice %i:%i\n" % (e, subst_size * i, subst_size * (i + 1)))
            warnings.warn("We will replace the codon by indels.\n")
            try:
                codon = Codon("-" * subst_size, code)
            except CodonTranslationError, e:
                # Even the indel motif is rejected: give up.
                sys.stderr.write(
                    "We still don't know how to translate the codon. "
                    "Bad implementation?\n")
                sys.exit(1)
    # NOTE(review): in the code visible here, nothing is appended to new_seq
    # and nothing is returned, although the docstring announces a return
    # value. The end of the function looks truncated -- TODO confirm against
    # the complete source.
def getDegenerateSitesMask(self, transl_table=1, code=None, all_3rd_positions=False):
    """Return a mask of the sites contributing to codon degeneracy.

    The mask is a string of "0" and "1", one character per site, built
    triplet by triplet. It is intended to be used for submatrix extraction
    using the noLRSall3 method, using :meth:`Alignment.getSubsetUsingMask`
    (with the option *inverse=True*, to get the degeneracy-free sites).

    If *all_3rd_positions* is set to True, then the mask includes all 3rd
    codon positions regardless of their effective contribution to codon
    degeneracy.

    The matrix is expected to start at a first codon position and stop at a
    third codon position. Sites beyond the last complete triplet are not
    represented in the mask.

    *transl_table* is an integer used to determine under which genetic code
    the codons are to be interpreted. The default value of 1 corresponds to
    the standard genetic code. Other values can be found in
    p4.GeneticCode.py

    Alternatively, the genetic code can be provided directly, using a
    dictionary *code* whose keys are codons, and the values are the
    corresponding amino-acids. All triplets present in the matrix should
    also be present in the code dictionary, except triplets of indels.
    Codons and the corresponding amino-acids are expected to be in lower
    case. If such a code is provided, the value of *transl_table* is
    ignored.

    A triplet that is not in the code but contains an 'n' is treated as
    coding the unknown amino-acid 'x'. Any other unknown triplet raises
    ``P4Error``.

    The name of this method noLRSall3 comes from its effect in the case of
    the standard genetic code: it discards the sites participating in first
    position degeneracy for leucine (L) and arginine (R), first and second
    position degeneracy for serine (S), as well as all third codon
    positions where degeneracy is observed (or all of them if
    *all_3rd_positions* is True).

    Depending on the genetic code used, the type of amino-acid affected
    could be different.

    The goal of the submatrix extraction using the produced mask is to
    remove the sites that could have been affected by composition bias:
    mutations within a set of synonymous codons are more likely to favour
    the codons that conform to the general nucleotide composition.
    However, one could argue that this bias is less likely to have played
    when the observed codons differ by more than one nucleotide and at
    least a non-synonymous mutation has to occur to bridge the gap. With
    the standard genetic code, this occurs for serine codons. Indeed, the
    minimal mutation paths connecting the serine AGY and TCN codon
    categories are
    AGY (serine) <-> TGY (cysteine) <-> TCY (serine)
    and
    AGY (serine) <-> ACY (threonine) <-> TCY (serine)

    The current implementation (as of june 2012) does not check that a
    mutational path between synonymous codons exists, that consists only in
    synonymous point mutations. This may be considered as a bug, because
    you may not want AGY and TCN (or other similar cases that could occur
    with different genetic codes) to be considered as a single degeneracy
    continuum.
    """
    gm = ["Alignment.getDegenerateSitesMask()"]
    if code is None:
        # Use the generalized Code class defined in code_utils.py
        code = Code(transl_table).code

    # Floor division keeps n_codons an integer whatever the Python version.
    n_codons = self.length // 3
    triplet_masks = []

    # Loop over the successive triplets of sites.
    for c in range(n_codons):
        # 3 alignment slices. One for each codon position.
        slices = [self.sequenceSlice(3 * c + offset) for offset in range(3)]
        # The different codons found for the current triplet of sites.
        # These are not Codon instances, this probably doesn't deal properly
        # with ambiguity codes.
        codons = {("%s%s%s" % nnn).lower() for nnn in zip(*slices)}

        # Record the amino-acids coded at the 3 nucleotides site, and the
        # codons used for each of them.
        aas_codons = {}
        for codon in codons:
            if codon == '---':
                aa = '-'
            elif codon in code:
                aa = code[codon]
            elif 'n' in codon:
                # This is a simplification. Some "degenerate" codons
                # can still code an unambiguous amino-acid.
                aa = 'x'
            else:
                gm.append("Codon %s is not defined in the chosen code "
                          "or translation table." % codon)
                gm.append("%s" % str(code))
                raise P4Error(gm)
            aas_codons.setdefault(aa, []).append(codon)

        # The 3rd position starts out masked when all_3rd_positions is set,
        # so that it is masked even in triplets where no amino-acid is
        # encoded by several codons.
        codon_mask = [False, False, bool(all_3rd_positions)]
        for synonymous in aas_codons.values():
            if len(synonymous) < 2:
                # A single codon for this amino-acid: no degeneracy here.
                continue
            # A position is degenerate when the codons used for the same
            # amino-acid show more than one nucleotide at that position.
            for pos in range(3):
                if not codon_mask[pos]:
                    codon_mask[pos] = len({cod[pos] for cod in synonymous}) > 1
            if all(codon_mask):
                # All positions of the triplet have been found to contribute
                # to some codon degeneracy. No need to search further.
                break

        triplet_masks.append(
            "".join("1" if flagged else "0" for flagged in codon_mask))

    return "".join(triplet_masks)
def _parse_treefinder_report(line):
    """Extract the fields of a TreeFinder report line into a dictionary."""

    def between(start_key, end_key):
        # Text following *start_key*, up to the character that precedes
        # *end_key* (that character is the field separator).
        begin = line.index(start_key) + len(start_key)
        return line[begin:line.index(end_key) - 1]

    rd = {}
    # The likelihood runs up to the first comma of the line.
    rd["Likelihood"] = float(
        line[line.index("Likelihood->") + len("Likelihood->"):line.index(",")])
    rd["Phylogeny"] = between("Phylogeny->", "SubstitutionModel->")
    rd["SubstitutionModel"] = between("SubstitutionModel->",
                                      "OSubstitutionModel->")
    rd["OSubstitutionModel"] = between("OSubstitutionModel->",
                                       "OEdgeOptimizationOff->")
    rd["Nsites"] = int(between("NSites->", "NParameters->"))
    rd["NParameters"] = int(between("NParameters->", "AIC->"))
    rd["AIC"] = float(between("AIC->", "AICc->"))
    rd["AICc"] = float(between("AICc->", "HQ->"))
    rd["HQ"] = float(between("HQ->", "BIC->"))
    rd["BIC"] = float(between("BIC->", "Checksum->"))
    rd["LikelihoodTime"] = float(between("LikelihoodTime->",
                                         "LikelihoodMemory->"))
    # Last field: the final three characters of the line are dropped
    # (presumably closing delimiters and the newline -- not verified here).
    rd["LikelihoodMemory"] = int(
        line[line.index("LikelihoodMemory->") + len("LikelihoodMemory->"):-3])
    return rd


def treeFinderMAPAnalysis(alignment, groups, gamma=True, invariant=True,
                          bootstrap=False, nreplicates=100, remove_files=False,
                          run_analysis=True, verbose=False):
    """Estimate a Maximum Likelihood tree with TreeFinder under a MAP model.

    Uses TreeFinder to estimate a Maximum Likelihood tree using the MAP
    substitution model for grouped amino-acids.

    - *alignment*: p4 alignment object of original (un-recoded) protein data
      from which the "groups" are derived
    - *groups*: list of grouped amino-acids, possibly resulting from
      :meth:`Alignment.getKosiolAISGroups()` or
      :meth:`Alignment.getMinmaxChiSqGroups()`
    - *gamma*: include gamma distribution of among-site rate variation
    - *invariant*: include a proportion of invariant sites
    - *bootstrap*: run bootstrap analysis
    - *nreplicates*: number of bootstrap replicates
    - *remove_files*: remove analysis files. Only available if
      run_analysis=True
    - *run_analysis*: run the analysis if TreeFinder in $PATH, else just
      write the control file
    - *verbose*: report progress, draw the resulting tree and print its
      likelihood

    When the analysis is run, the files ``tf_data.phy``, ``tf_control.tl``
    and ``tf_reconstruction.output`` are written in the current directory,
    and a tuple ``(tree, results)`` is returned, where *results* is a
    dictionary of the values read from the TreeFinder report. When
    *run_analysis* is False, the TreeFinder commands are printed and
    ``(None, None)`` is returned.

    Raises ``P4Error`` on invalid arguments, if ``tf`` is not found, or if
    TreeFinder exits with a non-zero status.
    """
    gm = ["p4.alignment_recoding.treeFinderMAPAnalysis()"]

    if not isinstance(alignment, Alignment):
        gm.append("alignment must be a Alignment object")
        raise P4Error(gm)
    if alignment.dataType != "protein":
        gm.append("alignment should be the original protein data from "
                  "which the groups were defined. Doing nothing.")
        raise P4Error(gm)
    # Name each flag so that the message identifies the faulty parameter.
    flags = (
        ("gamma", gamma),
        ("invariant", invariant),
        ("bootstrap", bootstrap),
        ("remove_files", remove_files),
        ("run_analysis", run_analysis),
        ("verbose", verbose),
    )
    for flag_name, flag_value in flags:
        if not isinstance(flag_value, bool):
            gm.append("%s value must be either True or False" % flag_name)
            raise P4Error(gm)
    if not isinstance(nreplicates, int):
        gm.append("nreplicates must be an integer")
        raise P4Error(gm)
    if run_analysis and not func.which2("tf"):
        gm.append("tf (treefinder) is not in your $PATH. "
                  "Cannot run analysis")
        raise P4Error(gm)

    datafile_name = "tf_data.phy"
    tl_file = "tf_control.tl"

    # tf commands
    tls = (
        'ReconstructPhylogeny[ "%(datafile)s", '
        'SubstitutionModel->MAP[%(map)s][Optimum,Optimum]%(ifH)s, '
        'WithEdgeSupport->%(bootstrap)s%(nreplicates)s ], '
        '"%(outfile)s",SaveReport'
    )

    # Rate heterogeneity suffix, indexed by (gamma, invariant).
    heterogeneity = {
        (True, True): ":GI[Optimum]",
        (True, False): ":G[Optimum]",
        (False, True): ":I[Optimum]",
        (False, False): "",
    }
    od = {
        "datafile": datafile_name,
        "ifH": heterogeneity[(gamma, invariant)],
        "bootstrap": "True" if bootstrap else "False",
        "nreplicates": ",NReplicates->%i" % nreplicates if bootstrap else "",
        "outfile": "tf_reconstruction.output",
        "map": ",".join('"%s"' % group.upper() for group in groups),
    }

    if not run_analysis:
        print(tls % od)
        return (None, None)

    # Write data file
    alignment.writePhylip(datafile_name)
    # Write control file
    with open(tl_file, "w") as fh:
        fh.write(tls % od)

    # stderr is merged into stdout when verbose, and discarded otherwise.
    sink = None if verbose else open(os.devnull, "w")
    try:
        child = subprocess.Popen(
            ["tf", tl_file],
            stderr=subprocess.STDOUT if verbose else sink)
        if verbose:
            print("Running TreeFinder, this could take some time...", end=' ')
            sys.stdout.flush()
        child.communicate()
    finally:
        if sink is not None:
            sink.close()
    if verbose:
        print("done.")
        sys.stdout.flush()

    if child.returncode != 0:
        gm.append("TreeFinder returned error code %s" % (child.returncode))
        raise P4Error(gm)

    # The values of interest are all on the second line of the report.
    with open(od["outfile"], "r") as fh:
        line = fh.readlines()[1]
    rd = _parse_treefinder_report(line)

    # Make a tree object: turn the TreeFinder description into Newick.
    tree = rd["Phylogeny"].replace("{", "(")
    tree = tree.replace("}", ")")
    tree = tree.replace("\"", "")
    tree = tree + ";"
    if bootstrap:
        # Tree viewer has the brlen before bootstrap value plus an extra colon
        # turn "xxx):0.00001:87.999,yyy" into "xxx)87.999:0.00001,yyy"
        patt = re.compile(r"\):([0-9]+\.[0-9e-]+):([0-9]+\.[0-9e-]*)")
        repl = r")\2:\1"
        tree = re.sub(patt, repl, tree)

    # The tree is given as a string, not as a file name: silence the
    # corresponding warning, and restore the setting even if reading fails.
    origw = var.warnReadNoFile
    var.warnReadNoFile = False
    try:
        read(tree)
    finally:
        var.warnReadNoFile = origw
    result_tree = var.trees.pop()

    if bootstrap:
        # Round floats to percentages
        for node in result_tree.iterInternalsNoRoot():
            node.name = "%2.f" % float(node.name)

    if remove_files:
        os.remove(tl_file)
        os.remove(datafile_name)
        os.remove(od["outfile"])

    if verbose:
        print("\n")
        result_tree.draw()
        print("\nLikelihood: %.4f\n" % rd["Likelihood"])

    return result_tree, rd
def pseudoTranslate(self, transl_table=1, out_type="standard", code=None):
    """Returns a pseudo protein alignment from *self*, a DNA alignment.

    The result is of datatype standard instead of protein, which allows
    the use of special recodings, like distinguishing between two types
    of serines, like in :meth:`Alignment.recode23aa()`.

    *self* is translated using :attribute:`Code(transl_table).code`.

    Alternatively, the genetic code can be provided through the parameter
    *code*. If such a code is provided, the value of *transl_table* is
    ignored and the datatype of the result is always "standard".
    The parameter *code* can take two types of values:

    1) It can be a string naming the code to use, as defined in Biopython's
       `CodonTable.unambiguous_dna_by_name.keys()`
    2) It can be a dictionary *code* whose keys are codons, and the values
       are the corresponding amino-acids. All triplets present in the
       matrix should also be present in the code dictionary, except
       triplets of indels. Codons and the corresponding amino-acids are
       expected to be in lower case.

    It may be possible to use a code based on another codon length as 3,
    but this has not been tested as of June 2012.

    At the moment, we can only do translations where the sequences are
    phased with the coding frame, ie the first sequence position is the
    first position of the codon, and the last sequence position should be
    a last codon position.

    The default behaviour is to use translation table 1, that is the
    standard genetic code. Other available translation tables, this week::

        if transl_table == 1: # standard
        elif transl_table == 2: # vertebrate mito
        elif transl_table == 4: # Mold, Protozoan,
                                # and Coelenterate Mitochondrial Code
                                # and the Mycoplasma/Spiroplasma Code
        elif transl_table == 5: # invertebrate mito
        elif transl_table == 9: # echinoderm mito

        and now 6, 10, 11, 12, 13, 14, 21.

    (These are found in p4.GeneticCode.py or in :class:`Code`)

    *transl_table* may also be provided as text consisting in
    blank-separated elements. Each element consists in n characters, where
    n is the number of defined codons. The first element lists the coded
    (pseudo-)amino-acids. The second element describes whether a codon can
    be a start codon ('M') or not ('-'). The other elements correspond to
    the (pseudo-)nucleotides at the successive codon positions.
    Example::

        FFJJZZZZYY**CC*WBBBBPPPPHHQQUUUUIIIMTTTTNNKKXXOOVVVVAAAADDEEGGGG
        ---M---------------M------------MMMM---------------M------------
        TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
        TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
        TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG

    Raises ``P4Error`` if *self* is not a DNA alignment, if the provided
    code is empty, if the length of *self* is not a multiple of the codon
    length, or if a codon is incomplete or unknown.
    """
    gm = ['p4.alignment_recoding.pseudoTranslate()']
    if self.dataType != 'dna':
        gm.append("Self should be a DNA alignment")
        raise P4Error(gm)

    if code is None:
        code = Code(transl_table, in_type="dna", out_type=out_type).code
        codelength = Code(transl_table).codelength
    else:
        if isinstance(code, str):
            code = getBiopythonCode(code)  # defined in code_utils.py
        if not code:
            gm.append("The provided code is empty.")
            raise P4Error(gm)
        # We assume that the "codons" have all the same length,
        # and we look at the first codon in the dictionary to know this
        # length. (next(iter(...)) works with lists and with dict views.)
        codelength = len(next(iter(code)))
        # We use standard type, because, depending on the code used to make
        # the translation, we may get something that contains symbols not
        # corresponding to normal amino-acids.
        out_type = "standard"

    if self.length % codelength != 0:
        gm.append("The length of self should be a multiple of %i" % codelength)
        raise P4Error(gm)

    # Floor division keeps the length an integer whatever the Python version.
    n_codons = self.length // codelength

    ali = self.dupe()
    ali.dataType = out_type
    ali.length = n_codons
    ali.symbols = "".join(sorted(set(code.values())))
    ali.equates = {}
    ali.dim = len(ali.symbols)
    ali.nexusSets = None
    ali.parts = []
    ali.excludeDelete = None

    full_indel = '-' * codelength
    pairs = zip(self.sequences, ali.sequences)
    for i, (dna, pseudo) in enumerate(pairs):
        # the original sequence
        dnaSeq = dna.sequence
        # the future pseudo-translation
        translated = []
        for j in range(n_codons):
            start = j * codelength
            theCodon = dnaSeq[start:start + codelength]
            if theCodon in code:
                translated.append(code[theCodon])
            elif theCodon == full_indel:
                # full indel
                translated.append('-')
            elif '-' in theCodon:
                # partial indel
                gm.append(
                    " seq %i, position %4i, dnaSeq %4i, codon '%s' is incomplete"
                    % (i, j, start, theCodon))
                raise P4Error(gm)
            else:
                # Should we use a CodonTranslationError (defined in
                # code_utils.py) here ?
                gm.append(
                    " seq %i position %4i, dnaSeq %4i, codon '%s' is not a known codon"
                    % (i, j, start, theCodon))
                raise P4Error(gm)
        pseudo.sequence = "".join(translated)
        pseudo.dataType = out_type
    return ali