def get_response_content(fs):
    """
    Report a nucleotide stationary distribution and amino acid energies.
    The nucleotide and amino acid weights read from the form are passed to
    DirectProtein.get_nt_distribution_and_aa_energies, and both of the
    returned sequences are written out one entry per line.
    @param fs: an object exposing the nucleotides and aminoacids form fields
    @return: the response text
    """
    # parse the user supplied weights into dictionaries keyed by letter
    nt_weights = SnippetUtil.get_distribution(
            fs.nucleotides, 'nucleotide', nt_letters)
    aa_weights = SnippetUtil.get_distribution(
            fs.aminoacids, 'amino acid', aa_letters)
    # put the weights into lists that follow the letter orderings
    mutation_weights = [nt_weights[nt] for nt in nt_letters]
    aa_weight_list = [aa_weights[aa] for aa in aa_letters]
    nt_stationary, energies = (
            DirectProtein.get_nt_distribution_and_aa_energies(
                mutation_weights, aa_weight_list))
    # assemble the report as a list of lines
    lines = ['nucleotide stationary distribution:']
    for nt, value in zip(nt_letters, nt_stationary):
        lines.append('%s : %s' % (nt, value))
    # a blank line separates the two sections
    lines.append('')
    lines.append('amino acid energies:')
    for aa, value in zip(aa_letters, energies):
        lines.append('%s : %s' % (aa, value))
    # every line, including the last one, is newline terminated
    return ''.join(line + '\n' for line in lines)
def get_response_content(fs):
    """
    Run a broyden2 search driven by the given distributions.
    @param fs: an object exposing the nucleotides and aminoacids form fields
    @raise HandlingError: if the solver call raises any exception;
    the message is the original error text followed by the history
    reported by the objective function
    """
    # get the nucleotide distribution
    nt_to_probability = SnippetUtil.get_distribution(
            fs.nucleotides, 'nucleotide', nt_letters)
    # get the amino acid distribution
    aa_to_probability = SnippetUtil.get_distribution(
            fs.aminoacids, 'amino acid', aa_letters)
    # convert the dictionaries to lists
    observed_nt_stationary_distribution = [
            nt_to_probability[nt] for nt in nt_letters]
    aa_distribution = [aa_to_probability[aa] for aa in aa_letters]
    # define the objective function
    objective_function = MyCodonObjective(
            aa_distribution, observed_nt_stationary_distribution)
    # The search starts from the halpern_bruno_nt_estimate result,
    # expressed as three log ratios relative to the first entry.
    initial_stationary_guess = halpern_bruno_nt_estimate(
            nt_to_probability, aa_to_probability)
    A, C, G, T = initial_stationary_guess
    initial_guess = (math.log(C/A), math.log(G/A), math.log(T/A))
    iterations = 20
    try:
        # NOTE(review): the result is computed but never reported,
        # so the function returns None on success;
        # confirm whether an output step is missing.
        best = scipy.optimize.nonlin.broyden2(
                objective_function, initial_guess, iterations)
    except Exception as e:
        # The 'as' spelling matches the other handlers in this code
        # and is accepted by both python 2.6+ and python 3.
        debugging_information = objective_function.get_history()
        raise HandlingError(str(e) + '\n' + debugging_information)
def get_response_content(fs):
    """
    Analyze nexus data using a mixture of two HKY85 rate matrices.
    @param fs: an object exposing the nexus, weight_a, weight_b, kappa_a,
    kappa_b, frequency_a and frequency_b form fields
    @return: the text produced by do_analysis followed by a newline
    @raise HandlingError: if the nexus data cannot be loaded
    """
    # load the tree and alignment from the nexus text
    nexus = Nexus.Nexus()
    try:
        nexus.load(StringIO(fs.nexus))
    except Nexus.NexusError as e:
        raise HandlingError(e)
    # collect the per-component parameters
    weights = [fs.weight_a, fs.weight_b]
    kappas = [fs.kappa_a, fs.kappa_b]
    # parse both nucleotide distributions before building any rate matrix
    distributions = [
            SnippetUtil.get_distribution(
                frequency_string, 'nucleotide', list('ACGT'))
            for frequency_string in (fs.frequency_a, fs.frequency_b)]
    # build one unscaled HKY85 rate matrix object per component
    components = [
            RateMatrix.get_unscaled_hky85_rate_matrix(distribution, kappa)
            for distribution, kappa in zip(distributions, kappas)]
    # rescale the weights by their sum to get the mixture proportions
    total = sum(weights)
    proportions = [weight / total for weight in weights]
    # create and normalize the mixture model
    model = SubModel.MixtureModel(proportions, components)
    model.normalize()
    return do_analysis(model, nexus.alignment, nexus.tree) + '\n'
def get_response_content(fs):
    """
    Analyze nexus data using a mixture of two HKY85 rate matrices.
    @param fs: an object exposing the nexus, weight_a, weight_b, kappa_a,
    kappa_b, frequency_a and frequency_b form fields
    @return: the text produced by do_analysis followed by a newline
    @raise HandlingError: if the nexus data cannot be loaded
    """
    # load the tree and alignment from the nexus text
    nexus = Nexus.Nexus()
    try:
        nexus.load(StringIO(fs.nexus))
    except Nexus.NexusError as e:
        raise HandlingError(e)
    # collect the per-component parameters
    weights = [fs.weight_a, fs.weight_b]
    kappas = [fs.kappa_a, fs.kappa_b]
    # parse both nucleotide distributions before building any rate matrix
    distributions = [
            SnippetUtil.get_distribution(
                frequency_string, 'nucleotide', list('ACGT'))
            for frequency_string in (fs.frequency_a, fs.frequency_b)]
    # build one unscaled HKY85 rate matrix object per component
    components = [
            RateMatrix.get_unscaled_hky85_rate_matrix(distribution, kappa)
            for distribution, kappa in zip(distributions, kappas)]
    # rescale the weights by their sum to get the mixture proportions
    total = sum(weights)
    proportions = [weight / total for weight in weights]
    # create and normalize the mixture model
    model = SubModel.MixtureModel(proportions, components)
    model.normalize()
    return do_analysis(model, nexus.alignment, nexus.tree) + '\n'
def get_response_content(fs):
    """
    Run a broyden2 search driven by the given distributions.
    @param fs: an object exposing the nucleotides and aminoacids form fields
    @raise HandlingError: if the solver call raises any exception;
    the message is the original error text followed by the history
    reported by the objective function
    """
    # get the nucleotide distribution
    nt_to_probability = SnippetUtil.get_distribution(
            fs.nucleotides, "nucleotide", nt_letters)
    # get the amino acid distribution
    aa_to_probability = SnippetUtil.get_distribution(
            fs.aminoacids, "amino acid", aa_letters)
    # convert the dictionaries to lists
    observed_nt_stationary_distribution = [
            nt_to_probability[nt] for nt in nt_letters]
    aa_distribution = [aa_to_probability[aa] for aa in aa_letters]
    # define the objective function
    objective_function = MyCodonObjective(
            aa_distribution, observed_nt_stationary_distribution)
    # The search starts from the halpern_bruno_nt_estimate result,
    # expressed as three log ratios relative to the first entry.
    initial_stationary_guess = halpern_bruno_nt_estimate(
            nt_to_probability, aa_to_probability)
    A, C, G, T = initial_stationary_guess
    initial_guess = (math.log(C / A), math.log(G / A), math.log(T / A))
    iterations = 20
    try:
        # NOTE(review): the result is computed but never reported,
        # so the function returns None on success;
        # confirm whether an output step is missing.
        best = scipy.optimize.nonlin.broyden2(
                objective_function, initial_guess, iterations)
    except Exception as e:
        # The 'as' spelling matches the other handlers in this code
        # and is accepted by both python 2.6+ and python 3.
        debugging_information = objective_function.get_history()
        raise HandlingError(str(e) + "\n" + debugging_information)
def get_response_content(fs):
    """
    Compute a codon distribution from nucleotide and amino acid weights.
    Each non-stop codon gets the weight of its amino acid multiplied by
    the codon's share of the nucleotide weight products summed over all
    codons of that amino acid; the weights are then rescaled by their sum.
    @param fs: an object exposing the nucleotides and amino_acids form fields
    @return: one 'codon : probability' line per codon in sorted codon order,
    followed by an extra newline
    @raise HandlingError: if an amino acid has nonzero weight while one of
    its codons contains a nucleotide of zero weight
    """
    # get the nucleotide distribution
    nt_distribution = SnippetUtil.get_distribution(
            fs.nucleotides, 'nucleotide', Codon.g_nt_letters)
    # get the amino acid distribution
    aa_distribution = SnippetUtil.get_distribution(
            fs.amino_acids, 'amino acid', Codon.g_aa_letters)
    # Assert that the nucleotide distribution
    # is compatible with the amino acid distribution.
    # According to the Halpern-Bruno assumptions, there should be no codon bias.
    # This means that if a nucleotide has a frequency of zero,
    # then the amino acid coded by each codon containing that nucleotide
    # must also have a frequency of zero.
    for aa, codons in Codon.g_aa_letter_to_codons.items():
        if not aa_distribution[aa]:
            continue
        for codon in codons:
            if not all(nt_distribution[nt] for nt in codon):
                raise HandlingError(
                        'the given amino acid and nucleotide distributions '
                        'are incompatible with the assumption '
                        'of no codon bias')

    def nt_product(codon):
        # product of the nucleotide weights along one codon
        return np.prod([nt_distribution[nt] for nt in codon])

    # get the unnormalized codon weights
    aa_to_sibling_sum = {}
    codon_to_weight = {}
    for codon in Codon.g_non_stop_codons:
        aa = Codon.g_codon_to_aa_letter[codon]
        # the sibling sum depends only on the amino acid,
        # so it is computed once per amino acid and reused
        if aa not in aa_to_sibling_sum:
            aa_to_sibling_sum[aa] = sum(
                    nt_product(sibling)
                    for sibling in Codon.g_aa_letter_to_codons[aa])
        sibling_nt_weight_sum = aa_to_sibling_sum[aa]
        if sibling_nt_weight_sum:
            weight = aa_distribution[aa] * nt_product(codon)
            codon_to_weight[codon] = weight / sibling_nt_weight_sum
        else:
            # Every codon of this amino acid contains a zero weight
            # nucleotide, so the compatibility check above guarantees
            # that the amino acid weight is zero as well.
            # Use zero directly instead of evaluating 0/0.
            codon_to_weight[codon] = 0.0
    total_weight = sum(codon_to_weight.values())
    # return the codon distribution
    lines = [
            '%s : %s\n' % (codon, codon_to_weight[codon] / total_weight)
            for codon in sorted(codon_to_weight)]
    return ''.join(lines) + '\n'
def get_response_content(fs):
    """
    Show a GY94 codon rate matrix as tab separated text.
    @param fs: an object exposing the weights, kappa and omega form fields
    @return: one line per codon in Codon.g_sorted_non_stop_codons,
    each holding that codon's row of rates
    """
    ordered_codons = Codon.g_sorted_non_stop_codons
    # parse the codon weights supplied by the user
    codon_weights = SnippetUtil.get_distribution(
            fs.weights, 'codon', ordered_codons)
    # the rates are looked up by (row codon, column codon) pairs
    rates = RateMatrix.get_gy94_rate_matrix(
            codon_weights, fs.kappa, fs.omega)
    # format each row as tab separated values
    rows = []
    for row_codon in ordered_codons:
        cells = [str(rates[(row_codon, col_codon)])
                for col_codon in ordered_codons]
        rows.append('\t'.join(cells))
    return ''.join(row + '\n' for row in rows)
def get_response_content(fs):
    """
    Compute a codon distribution from nucleotide and amino acid weights.
    Each non-stop codon gets the weight of its amino acid multiplied by
    the codon's share of the nucleotide weight products summed over all
    codons of that amino acid; the weights are then rescaled by their sum.
    @param fs: an object exposing the nucleotides and amino_acids form fields
    @return: one 'codon : probability' line per codon in sorted codon order,
    followed by an extra newline
    @raise HandlingError: if an amino acid has nonzero weight while one of
    its codons contains a nucleotide of zero weight
    """
    # get the nucleotide distribution
    nt_distribution = SnippetUtil.get_distribution(
            fs.nucleotides, 'nucleotide', Codon.g_nt_letters)
    # get the amino acid distribution
    aa_distribution = SnippetUtil.get_distribution(
            fs.amino_acids, 'amino acid', Codon.g_aa_letters)
    # Assert that the nucleotide distribution
    # is compatible with the amino acid distribution.
    # According to the Halpern-Bruno assumptions, there should be no codon bias.
    # This means that if a nucleotide has a frequency of zero,
    # then the amino acid coded by each codon containing that nucleotide
    # must also have a frequency of zero.
    for aa, codons in Codon.g_aa_letter_to_codons.items():
        if not aa_distribution[aa]:
            continue
        for codon in codons:
            if not all(nt_distribution[nt] for nt in codon):
                raise HandlingError(
                        'the given amino acid and nucleotide distributions '
                        'are incompatible with the assumption '
                        'of no codon bias')

    def nt_product(codon):
        # product of the nucleotide weights along one codon
        return np.prod([nt_distribution[nt] for nt in codon])

    # get the unnormalized codon weights
    aa_to_sibling_sum = {}
    codon_to_weight = {}
    for codon in Codon.g_non_stop_codons:
        aa = Codon.g_codon_to_aa_letter[codon]
        # the sibling sum depends only on the amino acid,
        # so it is computed once per amino acid and reused
        if aa not in aa_to_sibling_sum:
            aa_to_sibling_sum[aa] = sum(
                    nt_product(sibling)
                    for sibling in Codon.g_aa_letter_to_codons[aa])
        sibling_nt_weight_sum = aa_to_sibling_sum[aa]
        if sibling_nt_weight_sum:
            weight = aa_distribution[aa] * nt_product(codon)
            codon_to_weight[codon] = weight / sibling_nt_weight_sum
        else:
            # Every codon of this amino acid contains a zero weight
            # nucleotide, so the compatibility check above guarantees
            # that the amino acid weight is zero as well.
            # Use zero directly instead of evaluating 0/0.
            codon_to_weight[codon] = 0.0
    total_weight = sum(codon_to_weight.values())
    # return the codon distribution
    lines = [
            '%s : %s\n' % (codon, codon_to_weight[codon] / total_weight)
            for codon in sorted(codon_to_weight)]
    return ''.join(lines) + '\n'
def get_response_content(fs):
    """
    Show an HKY85 nucleotide rate matrix as tab separated text.
    @param fs: an object exposing the weights, kappa and scaled form fields
    @return: four lines, one per nucleotide in ACGT order,
    each holding that nucleotide's row of rates
    """
    nucleotides = 'ACGT'
    # parse the nucleotide weights supplied by the user
    nt_weights = SnippetUtil.get_distribution(
            fs.weights, 'nucleotide', list('ACGT'))
    # build the rate matrix object, normalizing it only on request
    hky = RateMatrix.get_unscaled_hky85_rate_matrix(nt_weights, fs.kappa)
    if fs.scaled:
        hky.normalize()
    # the rates are looked up by (row nucleotide, column nucleotide) pairs
    rates = hky.get_dictionary_rate_matrix()
    rows = []
    for row_nt in nucleotides:
        cells = [str(rates[(row_nt, col_nt)]) for col_nt in nucleotides]
        rows.append('\t'.join(cells))
    return ''.join(row + '\n' for row in rows)
def get_response_content(fs):
    """
    Show an HKY85 nucleotide rate matrix as tab separated text.
    @param fs: an object exposing the weights, kappa and scaled form fields
    @return: four lines, one per nucleotide in ACGT order,
    each holding that nucleotide's row of rates
    """
    nucleotides = 'ACGT'
    # parse the nucleotide weights supplied by the user
    nt_weights = SnippetUtil.get_distribution(
            fs.weights, 'nucleotide', list('ACGT'))
    # build the rate matrix object, normalizing it only on request
    hky = RateMatrix.get_unscaled_hky85_rate_matrix(nt_weights, fs.kappa)
    if fs.scaled:
        hky.normalize()
    # the rates are looked up by (row nucleotide, column nucleotide) pairs
    rates = hky.get_dictionary_rate_matrix()
    rows = []
    for row_nt in nucleotides:
        cells = [str(rates[(row_nt, col_nt)]) for col_nt in nucleotides]
        rows.append('\t'.join(cells))
    return ''.join(row + '\n' for row in rows)
def get_response_content(fs):
    """
    Write one summary of a direct protein rate matrix.
    The summary is chosen by the first true flag among
    srm, urm, cstat, astat, nstat and sf; if none is true
    then the response is a single newline.
    @param fs: an object exposing the nucleotides, aminoacids and kappa
    form fields together with the output selection flags
    @return: the selected summary followed by an extra newline
    """
    # mutation process nucleotide weights, keyed by nucleotide
    nt_weights = SnippetUtil.get_distribution(
            fs.nucleotides, 'nucleotide', nt_ordered)
    # selection process amino acid energies, keyed by amino acid
    aa_energies = SnippetUtil.get_dictionary(
            fs.aminoacids, 'amino acid', 'energy', aa_ordered)
    # the rate matrix object takes its inputs as ordered lists
    model = DirectProtein.DirectProteinRateMatrix(
            fs.kappa,
            [nt_weights[nt] for nt in nt_ordered],
            [aa_energies[aa] for aa in aa_ordered])
    # collect the items to report, one per output line
    items = []
    if fs.srm:
        # scaled rate matrix: normalize before extracting the rows
        model.normalize()
        items.append(MatrixUtil.m_to_string(
            model.get_row_major_rate_matrix()))
    elif fs.urm:
        # unscaled rate matrix
        items.append(MatrixUtil.m_to_string(
            model.get_row_major_rate_matrix()))
    elif fs.cstat:
        # codon stationary distribution
        by_codon = model.get_codon_distribution()
        for codon in codons_ordered:
            items.append('%s : %s' % (codon, by_codon[codon]))
    elif fs.astat:
        # amino acid stationary distribution
        by_aa = model.get_aa_distribution()
        for aa in aa_ordered:
            items.append('%s : %s' % (aa, by_aa[aa]))
    elif fs.nstat:
        # nucleotide stationary distribution
        by_nt = model.get_nt_distribution()
        for nt in nt_ordered:
            items.append('%s : %s' % (nt, by_nt[nt]))
    elif fs.sf:
        # rate matrix scaling factor
        items.append(model.get_expected_rate())
    # each item is converted to text and newline terminated
    return ''.join('%s\n' % (item,) for item in items) + '\n'
def get_response_content(fs):
    """
    Get the rates from get_mle_rates for a nucleotide alignment on a tree
    and report them through get_stockholm_string.
    @param fs: an object exposing the tree, weights, alignment and kappa
    form fields
    @return: the text from get_stockholm_string followed by a newline
    @raise HandlingError: if the alignment is rejected
    """
    # parse and validate the tree
    tree = Newick.parse(fs.tree, Newick.NewickTree)
    tree.assert_valid()
    # parse the nucleotide weights
    nt_weights = SnippetUtil.get_distribution(
            fs.weights, 'nucleotide', list('ACGT'))
    # read the alignment and require it to be a nucleotide alignment
    try:
        alignment = Fasta.Alignment(StringIO(fs.alignment))
        alignment.force_nucleotide()
    except Fasta.AlignmentError as e:
        raise HandlingError(e)
    # rebuild the HKY85 rates as a normalized FastRateMatrix
    hky = RateMatrix.get_unscaled_hky85_rate_matrix(nt_weights, fs.kappa)
    fast_matrix = RateMatrix.FastRateMatrix(
            hky.get_row_major_rate_matrix(), list('ACGT'))
    fast_matrix.normalize()
    # get the rates and format the response
    rates = get_mle_rates(tree, alignment, fast_matrix)
    return get_stockholm_string(tree, alignment, rates) + '\n'
def get_response_content(fs):
    """
    Get the rates from get_mle_rates for a nucleotide alignment on a tree
    and report them through get_stockholm_string.
    @param fs: an object exposing the tree, weights, alignment and kappa
    form fields
    @return: the text from get_stockholm_string followed by a newline
    @raise HandlingError: if the alignment is rejected
    """
    # parse and validate the tree
    tree = Newick.parse(fs.tree, Newick.NewickTree)
    tree.assert_valid()
    # parse the nucleotide weights
    nt_weights = SnippetUtil.get_distribution(
            fs.weights, 'nucleotide', list('ACGT'))
    # read the alignment and require it to be a nucleotide alignment
    try:
        alignment = Fasta.Alignment(StringIO(fs.alignment))
        alignment.force_nucleotide()
    except Fasta.AlignmentError as e:
        raise HandlingError(e)
    # rebuild the HKY85 rates as a normalized FastRateMatrix
    hky = RateMatrix.get_unscaled_hky85_rate_matrix(nt_weights, fs.kappa)
    fast_matrix = RateMatrix.FastRateMatrix(
            hky.get_row_major_rate_matrix(), list('ACGT'))
    fast_matrix.normalize()
    # get the rates and format the response
    rates = get_mle_rates(tree, alignment, fast_matrix)
    return get_stockholm_string(tree, alignment, rates) + '\n'
def get_response(fs):
    """
    Simulate an alignment under a mixture of two HKY85 rate matrices.
    @param fs: a FieldStorage object containing the cgi arguments
    @return: the response text; the fasta alignment if fs.fasta is set,
    the nexus text if fs.nex is set, and otherwise the empty string
    @raise HandlingError: if the tree has a syntax error
    or if the simulation fails
    """
    # parse the tree
    try:
        tree = Newick.parse(fs.tree, Newick.NewickTree)
        tree.assert_valid()
    except Newick.NewickSyntaxError as e:
        raise HandlingError(str(e))
    # get the mixture weights
    mixture_weights = [fs.weight_a, fs.weight_b]
    # get the kappa values
    kappa_values = [fs.kappa_a, fs.kappa_b]
    # get the nucleotide distributions
    frequency_strings = (fs.frequency_a, fs.frequency_b)
    nucleotide_distributions = [
            SnippetUtil.get_distribution(
                nt_string, 'nucleotide', list('ACGT'))
            for nt_string in frequency_strings]
    # create the nucleotide HKY rate matrix objects
    rate_matrix_objects = [
            RateMatrix.get_unscaled_hky85_rate_matrix(nt_distribution, kappa)
            for nt_distribution, kappa in zip(
                nucleotide_distributions, kappa_values)]
    # create the mixture proportions
    weight_sum = sum(mixture_weights)
    mixture_proportions = [weight / weight_sum for weight in mixture_weights]
    # create and normalize the mixture model
    mixture_model = SubModel.MixtureModel(
            mixture_proportions, rate_matrix_objects)
    mixture_model.normalize()
    # simulate the alignment
    try:
        alignment = PhyLikelihood.simulate_alignment(
                tree, mixture_model, fs.ncols)
    except PhyLikelihood.SimulationError as e:
        raise HandlingError(e)
    # get the output string
    output_string = ''
    if fs.fasta:
        # the output is the alignment, one fasta entry per tip of the tree
        output_string = '\n'.join(
                alignment.get_fasta_sequence(node.name)
                for node in tree.gen_tips())
    elif fs.nex:
        # the output is the alignment and the tree
        nexus = Nexus.Nexus()
        nexus.tree = tree
        nexus.alignment = alignment
        # record the parameters of each mixture category as a comment
        for i in range(2):
            arr = [
                    'weight: %s' % mixture_weights[i],
                    'kappa: %s' % kappa_values[i]]
            nexus.add_comment('category %d: %s' % (i + 1, ', '.join(arr)))
        output_string = str(nexus)
    # TODO use the correct filename extension in the output.
    # The previous code computed a filename that was never used,
    # so that dead computation has been removed.
    return output_string
def get_response_content(fs):
    """
    Estimate a codon stationary distribution and the nucleotide
    stationary distribution that it implies.
    If fs.corrected is set the estimate comes from a broyden2 search
    followed by the DirectProtein helpers; otherwise if fs.hb is set
    it is computed from the weights directly.
    If neither flag is set both reported distributions are empty.
    @param fs: an object exposing the nucleotides and aminoacids form
    fields together with the corrected and hb flags
    @return: the response text
    """
    # parse the user supplied weights into dictionaries keyed by letter
    nt_to_weight = SnippetUtil.get_distribution(
            fs.nucleotides, 'nucleotide', nt_letters)
    aa_to_weight = SnippetUtil.get_distribution(
            fs.aminoacids, 'amino acid', aa_letters)
    # put the weights into lists that follow the letter orderings
    stationary_nt_distribution = [nt_to_weight[nt] for nt in nt_letters]
    aa_distribution = [aa_to_weight[aa] for aa in aa_letters]
    # these stay empty unless one of the two methods is selected
    codon_distribution = []
    implied_stationary_nt_distribution = []
    if fs.corrected:
        # search over three parameters starting from the origin
        objective = MyObjective(aa_distribution, stationary_nt_distribution)
        x, y, z = scipy.optimize.nonlin.broyden2(objective, (0, 0, 0), 20)
        # the mutation weights are 1 and the exponentials of the solution
        mutation_distribution = normalized(
                (1, math.exp(x), math.exp(y), math.exp(z)))
        # Given the mutation distribution and the amino acid distribution,
        # get the stationary distribution and the amino acid energies.
        implied_stationary_nt_distribution, energies = (
                DirectProtein.get_nt_distribution_and_aa_energies(
                    mutation_distribution, aa_distribution))
        # kappa doesn't matter because we are only concerned
        # with stationary distributions
        dpm = DirectProtein.DirectProteinRateMatrix(
                1.0, mutation_distribution, energies)
        codon_distribution = dpm.get_stationary_distribution()
    elif fs.hb:
        def nt_product(codon):
            # product of the nucleotide weights along one codon
            return np.prod([nt_to_weight[nt] for nt in codon])

        # weight each codon by its amino acid weight times its share of
        # the nucleotide products over the codons of that amino acid
        raw_weights = []
        for codon in codons:
            aa = Codon.g_codon_to_aa_letter[codon]
            siblings = Codon.g_aa_letter_to_codons[aa]
            aa_weight = aa_to_weight[aa]
            own_product = nt_product(codon)
            sibling_total = sum(nt_product(sibling) for sibling in siblings)
            raw_weights.append(aa_weight * own_product / sibling_total)
        codon_distribution = normalized(raw_weights)
        # accumulate the codon probabilities onto their nucleotides
        nt_mass = dict(zip(nt_letters, [0] * 4))
        for codon, p in zip(codons, codon_distribution):
            for nt in codon:
                nt_mass[nt] += p
        implied_stationary_nt_distribution = normalized(
                nt_mass[nt] for nt in nt_letters)
    # assemble the report as a list of lines
    lines = ['estimated codon stationary distribution:']
    for codon, p in zip(codons, codon_distribution):
        lines.append('%s : %s' % (codon, p))
    # a blank line separates the two sections
    lines.append('')
    lines.append('implied nucleotide stationary distribution:')
    for nt, p in zip(nt_letters, implied_stationary_nt_distribution):
        lines.append('%s : %s' % (nt, p))
    # every line, including the last one, is newline terminated
    return ''.join(line + '\n' for line in lines)
def get_response_content(fs):
    """
    Estimate a codon stationary distribution and the nucleotide
    stationary distribution that it implies.
    If fs.corrected is set the estimate comes from a broyden2 search
    followed by the DirectProtein helpers; otherwise if fs.hb is set
    it is computed from the weights directly.
    If neither flag is set both reported distributions are empty.
    @param fs: an object exposing the nucleotides and aminoacids form
    fields together with the corrected and hb flags
    @return: the response text
    """
    # parse the user supplied weights into dictionaries keyed by letter
    nt_to_weight = SnippetUtil.get_distribution(
            fs.nucleotides, 'nucleotide', nt_letters)
    aa_to_weight = SnippetUtil.get_distribution(
            fs.aminoacids, 'amino acid', aa_letters)
    # put the weights into lists that follow the letter orderings
    stationary_nt_distribution = [nt_to_weight[nt] for nt in nt_letters]
    aa_distribution = [aa_to_weight[aa] for aa in aa_letters]
    # these stay empty unless one of the two methods is selected
    codon_distribution = []
    implied_stationary_nt_distribution = []
    if fs.corrected:
        # search over three parameters starting from the origin
        objective = MyObjective(aa_distribution, stationary_nt_distribution)
        x, y, z = scipy.optimize.nonlin.broyden2(objective, (0, 0, 0), 20)
        # the mutation weights are 1 and the exponentials of the solution
        mutation_distribution = normalized(
                (1, math.exp(x), math.exp(y), math.exp(z)))
        # Given the mutation distribution and the amino acid distribution,
        # get the stationary distribution and the amino acid energies.
        implied_stationary_nt_distribution, energies = (
                DirectProtein.get_nt_distribution_and_aa_energies(
                    mutation_distribution, aa_distribution))
        # kappa doesn't matter because we are only concerned
        # with stationary distributions
        dpm = DirectProtein.DirectProteinRateMatrix(
                1.0, mutation_distribution, energies)
        codon_distribution = dpm.get_stationary_distribution()
    elif fs.hb:
        def nt_product(codon):
            # product of the nucleotide weights along one codon
            return np.prod([nt_to_weight[nt] for nt in codon])

        # weight each codon by its amino acid weight times its share of
        # the nucleotide products over the codons of that amino acid
        raw_weights = []
        for codon in codons:
            aa = Codon.g_codon_to_aa_letter[codon]
            siblings = Codon.g_aa_letter_to_codons[aa]
            aa_weight = aa_to_weight[aa]
            own_product = nt_product(codon)
            sibling_total = sum(nt_product(sibling) for sibling in siblings)
            raw_weights.append(aa_weight * own_product / sibling_total)
        codon_distribution = normalized(raw_weights)
        # accumulate the codon probabilities onto their nucleotides
        nt_mass = dict(zip(nt_letters, [0] * 4))
        for codon, p in zip(codons, codon_distribution):
            for nt in codon:
                nt_mass[nt] += p
        implied_stationary_nt_distribution = normalized(
                nt_mass[nt] for nt in nt_letters)
    # assemble the report as a list of lines
    lines = ['estimated codon stationary distribution:']
    for codon, p in zip(codons, codon_distribution):
        lines.append('%s : %s' % (codon, p))
    # a blank line separates the two sections
    lines.append('')
    lines.append('implied nucleotide stationary distribution:')
    for nt, p in zip(nt_letters, implied_stationary_nt_distribution):
        lines.append('%s : %s' % (nt, p))
    # every line, including the last one, is newline terminated
    return ''.join(line + '\n' for line in lines)
def get_response(fs):
    """
    Simulate an alignment under a mixture of two HKY85 rate matrices.
    @param fs: a FieldStorage object containing the cgi arguments
    @return: the response text; the fasta alignment if fs.fasta is set,
    the nexus text if fs.nex is set, and otherwise the empty string
    @raise HandlingError: if the tree has a syntax error
    or if the simulation fails
    """
    # parse the tree
    try:
        tree = Newick.parse(fs.tree, Newick.NewickTree)
        tree.assert_valid()
    except Newick.NewickSyntaxError as e:
        raise HandlingError(str(e))
    # get the mixture weights
    mixture_weights = [fs.weight_a, fs.weight_b]
    # get the kappa values
    kappa_values = [fs.kappa_a, fs.kappa_b]
    # get the nucleotide distributions
    frequency_strings = (fs.frequency_a, fs.frequency_b)
    nucleotide_distributions = [
            SnippetUtil.get_distribution(
                nt_string, 'nucleotide', list('ACGT'))
            for nt_string in frequency_strings]
    # create the nucleotide HKY rate matrix objects
    rate_matrix_objects = [
            RateMatrix.get_unscaled_hky85_rate_matrix(nt_distribution, kappa)
            for nt_distribution, kappa in zip(
                nucleotide_distributions, kappa_values)]
    # create the mixture proportions
    weight_sum = sum(mixture_weights)
    mixture_proportions = [weight / weight_sum for weight in mixture_weights]
    # create and normalize the mixture model
    mixture_model = SubModel.MixtureModel(
            mixture_proportions, rate_matrix_objects)
    mixture_model.normalize()
    # simulate the alignment
    try:
        alignment = PhyLikelihood.simulate_alignment(
                tree, mixture_model, fs.ncols)
    except PhyLikelihood.SimulationError as e:
        raise HandlingError(e)
    # get the output string
    output_string = ''
    if fs.fasta:
        # the output is the alignment, one fasta entry per tip of the tree
        output_string = '\n'.join(
                alignment.get_fasta_sequence(node.name)
                for node in tree.gen_tips())
    elif fs.nex:
        # the output is the alignment and the tree
        nexus = Nexus.Nexus()
        nexus.tree = tree
        nexus.alignment = alignment
        # record the parameters of each mixture category as a comment
        for i in range(2):
            arr = [
                    'weight: %s' % mixture_weights[i],
                    'kappa: %s' % kappa_values[i]]
            nexus.add_comment('category %d: %s' % (i+1, ', '.join(arr)))
        output_string = str(nexus)
    # TODO use the correct filename extension in the output.
    # The previous code computed a filename that was never used,
    # so that dead computation has been removed.
    return output_string