def main():
    """
    Run the command line demo on the built-in Brown example data.

    The example alignment and tree are loaded, a normalized HKY85 rate
    matrix with uniform nucleotide frequencies and kappa of 2 is built,
    and the results of get_mle_rates and get_stockholm_string are
    printed to stdout along with progress messages.
    """
    # load the example alignment from its embedded fasta text
    print('creating the alignment...')
    fasta_text = Fasta.brown_example_alignment.strip()
    alignment = Fasta.Alignment(StringIO(fasta_text))
    # load the example tree from its embedded newick text
    print('creating the tree...')
    tree = Newick.parse(Newick.brown_example_tree, Newick.NewickTree)
    # build the normalized HKY85 rate matrix
    print('creating the rate matrix object...')
    kappa = 2.0
    nt_frequencies = {'A': .25, 'C': .25, 'G': .25, 'T': .25}
    unscaled_hky = RateMatrix.get_unscaled_hky85_rate_matrix(
        nt_frequencies, kappa)
    rate_matrix = RateMatrix.FastRateMatrix(
        unscaled_hky.get_row_major_rate_matrix(), list('ACGT'))
    rate_matrix.normalize()
    # estimate the rates and report them
    print('getting the mle rates...')
    mle_rates = get_mle_rates(tree, alignment, rate_matrix)
    print('mle rates:')
    print(mle_rates)
    print('stockholm string:')
    print(get_stockholm_string(tree, alignment, mle_rates))
def get_response_content(fs):
    """
    Simulate an ancestral alignment and return it as fasta text.

    @param fs: form data with a newick tree string (fs.tree)
    and fasta alignment text (fs.fasta)
    @return: one fasta record per tree node, in tree preorder
    @raise HandlingError: if the alignment is rejected or the simulation fails
    """
    # parse and validate the tree
    tree = Newick.parse(fs.tree, Newick.NewickTree)
    tree.assert_valid()
    # parse the alignment and require nucleotide data
    try:
        alignment = Fasta.Alignment(fs.fasta.splitlines())
        alignment.force_nucleotide()
    except Fasta.AlignmentError as e:
        raise HandlingError(e)
    # build the Jukes-Cantor rate matrix object
    states = list('ACGT')
    jc_row_major = MatrixUtil.dict_to_row_major(
        RateMatrix.get_jukes_cantor_rate_matrix(), states, states)
    model = RateMatrix.RateMatrix(jc_row_major, states)
    # replace the alignment by the one that includes simulated ancestors
    try:
        alignment = PhyLikelihood.simulate_ancestral_alignment(
            tree, alignment, model)
    except PhyLikelihood.SimulationError as e:
        raise HandlingError(e)
    # the record order follows a preorder traversal of the tree
    records = [
        alignment.get_fasta_sequence(node.name)
        for node in tree.preorder()]
    return '\n'.join(records) + '\n'
def __call__(self, X_logs):
    """
    Evaluate the objective at a vector of log branch rates.

    The vth entry of X_logs is the log of the rate of the branch above v.
    Each branch length in self.B is multiplied by the corresponding rate
    before the likelihood is computed under a Jukes-Cantor model.
    @param X_logs: vector of branch rate logs
    @return: the negative log likelihood, which is the quantity to minimize
    """
    rates = [math.exp(log_rate) for log_rate in X_logs]
    # scale each branch length by the rate of its child vertex
    scaled_lengths = {}
    for v_parent, v_child in self.R:
        edge = frozenset((v_parent, v_child))
        scaled_lengths[edge] = rates[v_child] * self.B[edge]
    # round trip through newick to get a tree the likelihood code accepts
    tree = Newick.parse(
        FtreeIO.RBN_to_newick(self.R, scaled_lengths, self.N_leaves),
        Newick.NewickTree)
    # build the Jukes-Cantor rate matrix object
    states = list('ACGT')
    jc_row_major = MatrixUtil.dict_to_row_major(
        RateMatrix.get_jukes_cantor_rate_matrix(), states, states)
    model = RateMatrix.RateMatrix(jc_row_major, states)
    return -PhyLikelihood.get_log_likelihood(tree, self.alignment, model)
def demo_rejection_sampling():
    """
    Print a comparison of rejection sampling and Nielsen sampling.

    Both samplers are asked n times for a path from state A to state C
    under the Jukes-Cantor rate matrix.
    For each sampler the accepted paths are tallied by event count,
    time of first event, and time spent in each state,
    and the per-path averages are printed to stdout.
    """
    path_length = 2
    jukes_cantor_rate_matrix = RateMatrix.get_jukes_cantor_rate_matrix()
    states = 'ACGT'
    n = 100000
    initial_state = 'A'
    terminal_state = 'C'

    def make_tally():
        # running totals over the accepted paths of one sampler
        return {
            'events': 0,
            'paths': 0,
            'first_time': 0,
            'dwell': dict((c, 0) for c in states),
        }

    def record(tally, events):
        # a sampler returns None when no path was produced
        if events is None:
            return
        assert events
        tally['paths'] += 1
        tally['events'] += len(events)
        first_time, _first_state = events[0]
        tally['first_time'] += first_time
        # bracket the events with the endpoints to get dwell intervals
        extended = (
            [(0, initial_state)] + events + [(path_length, terminal_state)])
        for (t0, state0), (t1, _state1) in zip(extended[:-1], extended[1:]):
            tally['dwell'][state0] += t1 - t0

    rejection = make_tally()
    nielsen = make_tally()
    for _ in range(n):
        record(rejection, get_rejection_sample(
            initial_state, terminal_state, states,
            path_length, jukes_cantor_rate_matrix))
        record(nielsen, get_nielsen_sample(
            initial_state, terminal_state, states,
            path_length, jukes_cantor_rate_matrix))

    transition_matrix = RateMatrix.get_jukes_cantor_transition_matrix(
        path_length)
    expected_fraction = transition_matrix[(initial_state, terminal_state)]
    rejection_paths = float(rejection['paths'])
    nielsen_paths = float(nielsen['paths'])
    print('testing the rejection sampling:')
    print('expected fraction: %s' % (expected_fraction,))
    print('observed fraction: %s' % (rejection['paths'] / float(n),))
    print('comparing rejection sampling and nielsen sampling:')
    rejection_method_fraction = rejection['events'] / rejection_paths
    nielsen_method_fraction = nielsen['events'] / nielsen_paths
    print('rejection method fraction: %s' % (rejection_method_fraction,))
    print('nielsen method fraction: %s' % (nielsen_method_fraction,))
    print('comparing time of first event:')
    print('rejection method first event time mean: %s' % (
        rejection['first_time'] / rejection_paths,))
    print('nielsen method first event time mean: %s' % (
        nielsen['first_time'] / nielsen_paths,))
    print('comparing the duration spent in each state:')
    print('rejection:')
    for state, t in rejection['dwell'].items():
        print('\t%s: %f' % (state, t / rejection_paths))
    print('nielsen:')
    for state, t in nielsen['dwell'].items():
        print('\t%s: %f' % (state, t / nielsen_paths))
def get_response_content(fs):
    """
    Draw a tree whose branches are colored by a simulated nucleotide history.

    @param fs: form data with a newick tree string (fs.tree),
    lines pairing a node name with a nucleotide (fs.column),
    and an image format key (fs.imageformat)
    @return: the value returned by DrawTreeImage.get_tree_image
    @raise HandlingError: for negative branch lengths, invalid or duplicated
    column entries, unknown or internal node names, or a drawing error
    """
    # get a properly formatted newick tree with branch lengths
    tree = Newick.parse(fs.tree, SpatialTree.SpatialTree)
    tree.assert_valid()
    if tree.has_negative_branch_lengths():
        raise HandlingError(
            'drawing a tree with negative branch lengths is not implemented')
    tree.add_branch_lengths()
    # read the column text into a map from node name to nucleotide
    accepted_symbols = list('acgtACGT')
    name_to_nucleotide = {}
    for line in iterutils.stripped_lines(fs.column.splitlines()):
        name, symbol = SnippetUtil.get_state_value_pair(line)
        if symbol not in accepted_symbols:
            raise HandlingError('"%s" is not a valid nucleotide' % symbol)
        if name in name_to_nucleotide:
            raise HandlingError('the name "%s" was duplicated' % name)
        name_to_nucleotide[name] = symbol.upper()
    # attach the nucleotides to the tips of the tree
    for name, nucleotide in name_to_nucleotide.items():
        try:
            node = tree.get_unique_node(name)
        except Newick.NewickSearchError as e:
            raise HandlingError(e)
        if node.children:
            raise HandlingError(
                'constraints on internal nodes are not implemented')
        node.state = nucleotide
    # build the Jukes-Cantor rate matrix object
    states = list('ACGT')
    jc_row_major = MatrixUtil.dict_to_row_major(
        RateMatrix.get_jukes_cantor_rate_matrix(), states, states)
    model = RateMatrix.RateMatrix(jc_row_major, states)
    # simulate the ancestral nucleotides
    model.simulate_ancestral_states(tree)
    # simulate a path on each branch;
    # this splits the branch into a chain of colored nodes
    for node in tree.gen_non_root_nodes():
        simulate_branch_path(tree, node)
    # lay out the tree and draw it
    EqualArcLayout.do_layout(tree)
    try:
        ext = Form.g_imageformat_to_ext[fs.imageformat]
        return DrawTreeImage.get_tree_image(tree, (640, 480), ext)
    except CairoUtil.CairoUtilError as e:
        raise HandlingError(e)
def test_simulation(self):
    """
    Check that ancestral simulation runs on the Brown example alignment.

    No property of the result is asserted;
    the test passes when the simulation completes without raising.
    """
    # parse the primate example tree, which has named internal nodes
    tree = Newick.parse(
        '(((Human:0.1, Chimpanzee:0.2)to-chimp:0.8, Gorilla:0.3)to-gorilla:0.7, Orangutan:0.4, Gibbon:0.5)all;',
        Newick.NewickTree)
    tree.assert_valid()
    # load the example alignment
    alignment = Fasta.Alignment(StringIO(Fasta.brown_example_alignment))
    # build the Jukes-Cantor rate matrix object
    states = list('ACGT')
    jc_row_major = MatrixUtil.dict_to_row_major(
        RateMatrix.get_jukes_cantor_rate_matrix(), states, states)
    model = RateMatrix.RateMatrix(jc_row_major, states)
    # simulate ancestral states
    simulated_alignment = simulate_ancestral_alignment(tree, alignment, model)
def gen_distance_matrices(self, count, max_steps):
    """
    Yield (ordered sequence list, distance matrix) pairs.

    Alignments are simulated on self.tree under a Jukes-Cantor model.
    A sample is rejected when its estimated distance matrix has
    an off-diagonal entry that is zero or infinite;
    the rejection and acceptance counters on self are updated accordingly.
    The generator stops once self.get_complexity() reaches max_steps
    or once the requested number of samples has been accepted.
    @param count: the requested number of distance matrices
    @param max_steps: an upper bound on the allowed number of steps
    """
    # build the Jukes-Cantor rate matrix object
    states = list('ACGT')
    jc_row_major = MatrixUtil.dict_to_row_major(
        RateMatrix.get_jukes_cantor_rate_matrix(), states, states)
    model = RateMatrix.RateMatrix(jc_row_major, states)
    # record the requested number of samples
    self.requested_matrix_count = count
    infinity = float('inf')
    # do some rejection sampling
    while True:
        if (self.get_complexity() >= max_steps
                or self.accepted_sample_count >= count):
            break
        # simulate an alignment from the tree
        alignment = PhyLikelihood.simulate_alignment(
            self.tree, model, self.sequence_length)
        # put the sequences into the order given by self.ordered_names
        name_to_sequence = dict(zip(alignment.headers, alignment.sequences))
        sequence_list = [name_to_sequence[name] for name in self.ordered_names]
        # estimate the distance matrix
        distance_matrix = JC69.get_ML_distance_matrix(sequence_list)
        # look for degenerate off-diagonal entries
        off_diagonal = [
            value
            for i, row in enumerate(distance_matrix)
            for j, value in enumerate(row)
            if i != j]
        # a zero entry takes precedence over an infinite entry
        if any(value == 0.0 for value in off_diagonal):
            self.rejected_zero_sample_count += 1
        elif any(value == infinity for value in off_diagonal):
            self.rejected_inf_sample_count += 1
        else:
            self.accepted_sample_count += 1
            yield sequence_list, distance_matrix
def test_likelihood(self):
    """
    Check the Jukes-Cantor log likelihood of the Brown example data.
    """
    # parse the example tree
    tree = Newick.parse(Newick.brown_example_tree, Newick.NewickTree)
    tree.assert_valid()
    # load the example alignment
    alignment = Fasta.Alignment(StringIO(Fasta.brown_example_alignment))
    # build the Jukes-Cantor rate matrix object
    states = list('ACGT')
    jc_row_major = MatrixUtil.dict_to_row_major(
        RateMatrix.get_jukes_cantor_rate_matrix(), states, states)
    model = RateMatrix.RateMatrix(jc_row_major, states)
    # compare the log likelihood to the known value
    self.assertAlmostEqual(
        get_log_likelihood(tree, alignment, model), -4146.26547208)
def test_jukes_cantor_rejection(self):
    """
    Check the acceptance frequency of plain rejection sampling.

    The number of accepted A to C paths out of n attempts is treated as
    a binomial count whose success probability is the Jukes-Cantor
    transition probability for the path length.
    The test fails when the count is three or more standard deviations
    from its expectation, so it can fail by chance on rare occasions.
    """
    path_length = 1
    jukes_cantor_rate_matrix = RateMatrix.get_jukes_cantor_rate_matrix()
    states = 'ACGT'
    n = 200
    # count the attempts for which a path was returned
    observed = 0
    for _ in range(n):
        events = get_rejection_sample(
            'A', 'C', states, path_length, jukes_cantor_rate_matrix)
        if events is not None:
            observed += 1
    transition_matrix = RateMatrix.get_jukes_cantor_transition_matrix(
        path_length)
    p = transition_matrix[('A', 'C')]
    expected = n * p
    variance = n * p * (1 - p)
    errstr = 'observed: %f expected: %f' % (observed, expected)
    # assertTrue replaces the deprecated failUnless alias
    self.assertTrue(
        abs(observed - expected) < 3 * math.sqrt(variance), errstr)
def simulate_branch_path(tree, node):
    """
    Simulate the nucleotide history on the branch between a node and its parent.

    The simulated path is conditional on the states at both ends of the branch.
    Each substitution event becomes a new node inserted on the branch,
    and each branch segment is colored by the state held along it.
    Purines are red and pyrimidines are blue;
    A and T use the more saturated shade while G and C use the paler shade.
    @param tree: a SpatialTree with simulated nucleotides at each node
    @param node: the node that defines the branch on which to simulate a history
    """
    nucleotide_to_color = {
        'A': 'FF4444',
        'G': 'FF8888',
        'T': '4444FF',
        'C': '8888FF',
    }
    # the segment that ends at the node carries the node's own state
    node.branch_color = nucleotide_to_color[node.state]
    rate_matrix = RateMatrix.get_jukes_cantor_rate_matrix()
    initial_state = node.parent.state
    terminal_state = node.state
    states = 'ACGT'
    # keep sampling until the sampler returns a path
    while True:
        events = PathSampler.get_nielsen_sample(
            initial_state, terminal_state, states, node.blen, rate_matrix)
        if events is not None:
            break
    # insert one node per event, walking from the parent towards the node
    upstream = node.parent
    previous_time = 0
    for event_time, event_state in events:
        inserted = SpatialTree.SpatialTreeNode()
        inserted.name = node.name
        inserted.state = event_state
        # the segment above the new node carries the upstream state
        inserted.branch_color = nucleotide_to_color[upstream.state]
        tree.insert_node(
            inserted, upstream, node,
            (event_time - previous_time) / float(node.blen))
        previous_time = event_time
        upstream = inserted
def main():
    """
    Run the command line demo on the built-in Brown example data.

    The example alignment and tree are loaded, a normalized HKY85 rate
    matrix with uniform nucleotide frequencies and kappa of 2 is built,
    and the results of get_mle_rates and get_stockholm_string are
    printed to stdout along with progress messages.
    """
    # load the example alignment from its embedded fasta text
    print('creating the alignment...')
    fasta_text = Fasta.brown_example_alignment.strip()
    alignment = Fasta.Alignment(StringIO(fasta_text))
    # load the example tree from its embedded newick text
    print('creating the tree...')
    tree = Newick.parse(Newick.brown_example_tree, Newick.NewickTree)
    # build the normalized HKY85 rate matrix
    print('creating the rate matrix object...')
    kappa = 2.0
    nt_frequencies = {'A': .25, 'C': .25, 'G': .25, 'T': .25}
    unscaled_hky = RateMatrix.get_unscaled_hky85_rate_matrix(
        nt_frequencies, kappa)
    rate_matrix = RateMatrix.FastRateMatrix(
        unscaled_hky.get_row_major_rate_matrix(), list('ACGT'))
    rate_matrix.normalize()
    # estimate the rates and report them
    print('getting the mle rates...')
    mle_rates = get_mle_rates(tree, alignment, rate_matrix)
    print('mle rates:')
    print(mle_rates)
    print('stockholm string:')
    print(get_stockholm_string(tree, alignment, mle_rates))
def deserialize_mixture_model(xml_string):
    """
    Convert the xml string to a mixture model.

    The root element supplies kappa.
    Each child of the root supplies a category weight and a
    "distribution" element whose children pair a symbol with a weight.
    Symbol weights are rescaled to sum to one within each category,
    and category weights are rescaled to sum to one overall.
    @param xml_string: an xml string defining the mixture model
    @return: the mixture model after its normalize method has been called
    """
    # read the variables that define the model
    root = ET.parse(StringIO(xml_string)).getroot()
    kappa = float(root.get("kappa"))
    category_weights = []
    nt_dicts = []
    for category in root:
        category_weights.append(float(category.get("weight")))
        nt_dict = dict(
            (terminal.get("symbol"), float(terminal.get("weight")))
            for terminal in category.find("distribution"))
        # rescale the symbol weights in place so that they sum to one
        symbol_total = sum(nt_dict.values())
        for nt in nt_dict:
            nt_dict[nt] /= symbol_total
        nt_dicts.append(nt_dict)
    # create one HKY85 rate matrix per category
    rate_matrix_objects = [
        RateMatrix.get_unscaled_hky85_rate_matrix(nt_dict, kappa)
        for nt_dict in nt_dicts]
    # rescale the category weights so that they sum to one
    weight_total = float(sum(category_weights))
    category_distribution = [w / weight_total for w in category_weights]
    mixture_model = SubModel.MixtureModel(
        category_distribution, rate_matrix_objects)
    mixture_model.normalize()
    return mixture_model
def simulate_branch_path(tree, node):
    """
    Simulate the nucleotide history on the branch between a node and its parent.

    The simulated path is conditional on the states at both ends of the branch.
    Each substitution event becomes a new node inserted on the branch,
    and each branch segment is colored by the state held along it.
    Purines are red and pyrimidines are blue;
    A and T use the more saturated shade while G and C use the paler shade.
    @param tree: a SpatialTree with simulated nucleotides at each node
    @param node: the node that defines the branch on which to simulate a history
    """
    nucleotide_to_color = {
        'A': 'FF4444',
        'G': 'FF8888',
        'T': '4444FF',
        'C': '8888FF',
    }
    # the segment that ends at the node carries the node's own state
    node.branch_color = nucleotide_to_color[node.state]
    rate_matrix = RateMatrix.get_jukes_cantor_rate_matrix()
    initial_state = node.parent.state
    terminal_state = node.state
    states = 'ACGT'
    # keep sampling until the sampler returns a path
    while True:
        events = PathSampler.get_nielsen_sample(
            initial_state, terminal_state, states, node.blen, rate_matrix)
        if events is not None:
            break
    # insert one node per event, walking from the parent towards the node
    upstream = node.parent
    previous_time = 0
    for event_time, event_state in events:
        inserted = SpatialTree.SpatialTreeNode()
        inserted.name = node.name
        inserted.state = event_state
        # the segment above the new node carries the upstream state
        inserted.branch_color = nucleotide_to_color[upstream.state]
        tree.insert_node(
            inserted, upstream, node,
            (event_time - previous_time) / float(node.blen))
        previous_time = event_time
        upstream = inserted
def create_rate_matrix(distribution, kappa, f):
    """
    Create an unscaled nucleotide rate matrix with a WAG-like parameter.

    The rate from a to b is pb**f / pa**(1-f), or just pb when f is one,
    multiplied by kappa for the pairs AG, GA, CT and TC.
    The parameter f does not affect the stationary distribution.
    @param distribution: a dictionary mapping a nucleotide to its frequency
    @param kappa: the transition / transversion substitution rate ratio
    @param f: a WAG-like parameter between zero and one
    @return: a nucleotide rate matrix object
    """
    assert len(distribution) == 4
    assert set(distribution) == set('ACGT')
    assert abs(sum(distribution.values()) - 1.0) < .0000001
    transition_pairs = ('AG', 'GA', 'CT', 'TC')
    # fill in the off-diagonal entries of the unscaled rate matrix
    rate_matrix = {}
    for na, pa in distribution.items():
        for nb, pb in distribution.items():
            if na == nb:
                continue
            rate = pb if f == 1 else (pb**f) / (pa**(1-f))
            if na + nb in transition_pairs:
                rate *= kappa
            rate_matrix[(na, nb)] = rate
    # fill in the diagonal so that each row sums to zero
    for na in distribution:
        row_total = sum(
            rate_matrix[(na, nb)] for nb in distribution if nb != na)
        rate_matrix[(na, na)] = -row_total
    # wrap the row major form of the matrix in a rate matrix object
    ordered_states = list('ACGT')
    return RateMatrix.RateMatrix(
        MatrixUtil.dict_to_row_major(
            rate_matrix, ordered_states, ordered_states),
        ordered_states)
def test_hky_nielsen(self):
    """
    Give modified rejection sampling a chance to fail.

    It should give the same results as vanilla rejection sampling.
    The number of events per sampled path is collected for each sampler
    under a normalized HKY85 model,
    and the two collections are compared with a Mann-Whitney U test.
    The test fails when the p-value is below .001,
    so it can fail by chance on rare occasions.
    """
    distribution = {'A': .2, 'C': .3, 'G': .3, 'T': .2}
    kappa = 2
    rate_matrix_object = RateMatrix.get_unscaled_hky85_rate_matrix(
        distribution, kappa)
    rate_matrix_object.normalize()
    rate_matrix = rate_matrix_object.get_dictionary_rate_matrix()
    path_length = 2
    initial_state = 'A'
    terminal_state = 'C'
    states = 'ACGT'
    iterations = 200
    # collect event counts until enough rejection sampled paths are accepted
    rejection_changes = []
    while len(rejection_changes) < iterations:
        rejection_events = get_rejection_sample(
            initial_state, terminal_state, states, path_length, rate_matrix)
        if rejection_events is not None:
            rejection_changes.append(len(rejection_events))
    # collect event counts until enough nielsen sampled paths are accepted
    nielsen_changes = []
    while len(nielsen_changes) < iterations:
        nielsen_events = get_nielsen_sample(
            initial_state, terminal_state, states, path_length, rate_matrix)
        if nielsen_events is not None:
            nielsen_changes.append(len(nielsen_events))
    t, p = scipy.stats.mannwhitneyu(rejection_changes, nielsen_changes)
    # assertFalse replaces the deprecated failIf alias
    self.assertFalse(p < .001)
def get_response_content(fs):
    """
    Analyze nexus data under a two component HKY85 mixture model.

    @param fs: form data with nexus text (fs.nexus) and, for each of the
    components a and b, a mixture weight, a kappa value,
    and a nucleotide frequency string
    @return: the text returned by do_analysis followed by a newline
    @raise HandlingError: if the nexus data cannot be loaded
    or if the mixture weights sum to zero
    """
    # read the nexus data
    nexus = Nexus.Nexus()
    try:
        nexus.load(StringIO(fs.nexus))
    except Nexus.NexusError as e:
        raise HandlingError(e)
    # gather the per component parameters
    mixture_weights = [fs.weight_a, fs.weight_b]
    kappa_values = [fs.kappa_a, fs.kappa_b]
    nucleotide_distributions = [
        SnippetUtil.get_distribution(nt_string, 'nucleotide', list('ACGT'))
        for nt_string in (fs.frequency_a, fs.frequency_b)]
    # create the nucleotide HKY rate matrix objects
    rate_matrix_objects = [
        RateMatrix.get_unscaled_hky85_rate_matrix(nt_distribution, kappa)
        for nt_distribution, kappa
        in zip(nucleotide_distributions, kappa_values)]
    # Create the mixture proportions.
    # The float conversion keeps the division from truncating under
    # Python 2 should the form ever supply integer weights.
    weight_sum = float(sum(mixture_weights))
    if weight_sum == 0:
        raise HandlingError('the mixture weights must not sum to zero')
    mixture_proportions = [weight / weight_sum for weight in mixture_weights]
    # create and normalize the mixture model
    mixture_model = SubModel.MixtureModel(
        mixture_proportions, rate_matrix_objects)
    mixture_model.normalize()
    # return the results
    return do_analysis(mixture_model, nexus.alignment, nexus.tree) + '\n'
def test_hky_uniformization(self):
    """
    Give uniformization a chance to fail.

    It should give the same results as modified rejection sampling.
    The number of events per sampled path is collected for each sampler
    under a normalized HKY85 model,
    and the two collections are compared with a Mann-Whitney U test.
    The test fails when the p-value is below .001,
    so it can fail by chance on rare occasions.
    """
    distribution = {'A': .2, 'C': .3, 'G': .3, 'T': .2}
    kappa = 2
    rate_matrix_object = RateMatrix.get_unscaled_hky85_rate_matrix(
        distribution, kappa)
    rate_matrix_object.normalize()
    rate_matrix = rate_matrix_object.get_dictionary_rate_matrix()
    path_length = 2
    initial_state = 'A'
    terminal_state = 'C'
    states = 'ACGT'
    iterations = 200
    # collect event counts until enough nielsen sampled paths are accepted
    nielsen_changes = []
    while len(nielsen_changes) < iterations:
        nielsen_events = get_nielsen_sample(
            initial_state, terminal_state, states, path_length, rate_matrix)
        if nielsen_events is not None:
            nielsen_changes.append(len(nielsen_events))
    # every uniformization call contributes one event count
    uniformization_changes = []
    for _ in range(iterations):
        uniformization_events = get_uniformization_sample(
            initial_state, terminal_state, states, path_length, rate_matrix)
        uniformization_changes.append(len(uniformization_events))
    # look for a statistically significant difference in the event counts
    t, p = scipy.stats.mannwhitneyu(uniformization_changes, nielsen_changes)
    # assertFalse replaces the deprecated failIf alias
    self.assertFalse(p < .001, p)
def get_sample_mixture_model():
    """
    Build the three category HKY85 mixture behind the default nexus data.

    @return: a mixture model that is used to generate the default nexus data
    """
    kappa = 2
    category_distribution = [.1, .4, .5]
    # one nucleotide distribution per mixture category
    nt_dicts = [
        {'A': .1, 'C': .4, 'G': .4, 'T': .1},
        {'A': .2, 'C': .3, 'G': .3, 'T': .2},
        {'A': .25, 'C': .25, 'G': .25, 'T': .25},
    ]
    # create one HKY85 rate matrix per category
    rate_matrix_objects = [
        RateMatrix.get_unscaled_hky85_rate_matrix(nt_dict, kappa)
        for nt_dict in nt_dicts]
    mixture_model = SubModel.MixtureModel(
        category_distribution, rate_matrix_objects)
    mixture_model.normalize()
    return mixture_model
def get_response_content(fs):
    """
    Analyze nexus data under a two component HKY85 mixture model.

    @param fs: form data with nexus text (fs.nexus) and, for each of the
    components a and b, a mixture weight, a kappa value,
    and a nucleotide frequency string
    @return: the text returned by do_analysis followed by a newline
    @raise HandlingError: if the nexus data cannot be loaded
    or if the mixture weights sum to zero
    """
    # read the nexus data
    nexus = Nexus.Nexus()
    try:
        nexus.load(StringIO(fs.nexus))
    except Nexus.NexusError as e:
        raise HandlingError(e)
    # gather the per component parameters
    mixture_weights = [fs.weight_a, fs.weight_b]
    kappa_values = [fs.kappa_a, fs.kappa_b]
    nucleotide_distributions = [
        SnippetUtil.get_distribution(nt_string, 'nucleotide', list('ACGT'))
        for nt_string in (fs.frequency_a, fs.frequency_b)]
    # create the nucleotide HKY rate matrix objects
    rate_matrix_objects = [
        RateMatrix.get_unscaled_hky85_rate_matrix(nt_distribution, kappa)
        for nt_distribution, kappa
        in zip(nucleotide_distributions, kappa_values)]
    # Create the mixture proportions.
    # The float conversion keeps the division from truncating under
    # Python 2 should the form ever supply integer weights.
    weight_sum = float(sum(mixture_weights))
    if weight_sum == 0:
        raise HandlingError('the mixture weights must not sum to zero')
    mixture_proportions = [weight / weight_sum for weight in mixture_weights]
    # create and normalize the mixture model
    mixture_model = SubModel.MixtureModel(
        mixture_proportions, rate_matrix_objects)
    mixture_model.normalize()
    # return the results
    return do_analysis(mixture_model, nexus.alignment, nexus.tree) + '\n'
def get_response_content(fs):
    """
    Analyze an alignment under a mixture of three nucleotide rate matrices.

    @param fs: form data with a newick tree string (fs.tree),
    fasta alignment text (fs.alignment),
    and a weight and a matrix for each of the components a, b and c
    @return: the text returned by do_analysis followed by a newline
    @raise HandlingError: if a matrix is not 4x4, if the alignment is
    rejected, or if the mixture weights sum to zero
    """
    # get the tree
    tree = Newick.parse(fs.tree, Newick.NewickTree)
    tree.assert_valid()
    # get the mixture weights and the matrices
    weights = [fs.weight_a, fs.weight_b, fs.weight_c]
    matrices = [fs.matrix_a, fs.matrix_b, fs.matrix_c]
    for R in matrices:
        if R.shape != (4, 4):
            raise HandlingError(
                'expected each nucleotide rate matrix to be 4x4')
    # get the nucleotide alignment
    try:
        alignment = Fasta.Alignment(fs.alignment.splitlines())
        alignment.force_nucleotide()
    except Fasta.AlignmentError as e:
        raise HandlingError(e)
    # Create the mixture proportions.
    # The float conversion keeps the division from truncating under
    # Python 2 should the form ever supply integer weights.
    weight_sum = float(sum(weights))
    if weight_sum == 0:
        raise HandlingError('the mixture weights must not sum to zero')
    mixture_proportions = [weight / weight_sum for weight in weights]
    # create the rate matrix objects
    ordered_states = list('ACGT')
    rate_matrix_objects = [
        RateMatrix.RateMatrix(R.tolist(), ordered_states)
        for R in matrices]
    # create and normalize the mixture model
    mixture_model = SubModel.MixtureModel(
        mixture_proportions, rate_matrix_objects)
    mixture_model.normalize()
    # return the analysis text
    return do_analysis(mixture_model, alignment, tree) + '\n'
def get_response_content(fs):
    """
    Simulate an ancestral alignment and return it as fasta text.

    @param fs: form data with a newick tree string (fs.tree)
    and fasta alignment text (fs.fasta)
    @return: one fasta record per tree node, in tree preorder
    @raise HandlingError: if the alignment is rejected or the simulation fails
    """
    # parse and validate the tree
    tree = Newick.parse(fs.tree, Newick.NewickTree)
    tree.assert_valid()
    # parse the alignment and require nucleotide data
    try:
        alignment = Fasta.Alignment(fs.fasta.splitlines())
        alignment.force_nucleotide()
    except Fasta.AlignmentError as e:
        raise HandlingError(e)
    # build the Jukes-Cantor rate matrix object
    states = list('ACGT')
    jc_row_major = MatrixUtil.dict_to_row_major(
        RateMatrix.get_jukes_cantor_rate_matrix(), states, states)
    model = RateMatrix.RateMatrix(jc_row_major, states)
    # replace the alignment by the one that includes simulated ancestors
    try:
        alignment = PhyLikelihood.simulate_ancestral_alignment(
            tree, alignment, model)
    except PhyLikelihood.SimulationError as e:
        raise HandlingError(e)
    # the record order follows a preorder traversal of the tree
    records = [
        alignment.get_fasta_sequence(node.name)
        for node in tree.preorder()]
    return '\n'.join(records) + '\n'
def get_response_content(fs):
    """
    Report the maximum likelihood distance between a pair of sequences.

    @param fs: form data with fasta text for two sequences (fs.fasta),
    a rate matrix (fs.matrix), and one state per line (fs.states)
    @return: a line of text reporting the distance found by the optimizer
    @raise HandlingError: if the alignment is rejected or is not a pair,
    or if the states are not unique or do not match the matrix rows
    """
    # read the alignment, which must hold exactly two sequences
    try:
        alignment = Fasta.Alignment(fs.fasta.splitlines())
    except Fasta.AlignmentError as e:
        raise HandlingError('fasta alignment error: ' + str(e))
    if alignment.get_sequence_count() != 2:
        raise HandlingError('expected a sequence pair')
    # read the rate matrix and the ordered states
    R = fs.matrix
    ordered_states = Util.get_stripped_lines(fs.states.splitlines())
    if len(ordered_states) != len(R):
        raise HandlingError(
            'the number of ordered states must be the same '
            'as the number of rows in the rate matrix')
    if len(set(ordered_states)) != len(ordered_states):
        raise HandlingError('the ordered states must be unique')
    # create the objective function from the rate matrix object
    rate_matrix_object = RateMatrix.RateMatrix(R.tolist(), ordered_states)
    objective = Objective(alignment.sequences, rate_matrix_object)
    # Use golden section search to find the mle distance.
    # The bracket is just a suggestion.
    mle_distance = optimize.golden(objective, brack=(0.51, 2.01))
    # write the response
    return 'maximum likelihood distance: %s\n' % (mle_distance,)
def create_rate_matrix(kappa, nt_distribution):
    """
    Create a nucleotide rate matrix with unit expected rate.

    The rate from a source to a sink is proportional to the sink frequency.
    For the pairs in g_transitions the rate is also multiplied by
    1 + kappa / (source frequency + sink frequency).
    @param kappa: adjusts for the transition rate differing from the transversion rate
    @param nt_distribution: ordered ACGT nucleotide probabilities
    @return: a rate matrix object with one expected nucleotide substitution per time unit
    """
    nucleotides = 'ACGT'
    # make some assertions about the distribution
    for p in nt_distribution:
        assert p >= 0
    assert len(nt_distribution) == 4
    assert RateMatrix.almost_equal(sum(nt_distribution), 1.0)
    # define the purine and pyrimidine totals
    A, C, G, T = nt_distribution
    R = float(A + G)
    Y = float(C + T)
    # make some more assertions about the distribution and about kappa
    assert A + G > 0
    assert C + T > 0
    assert kappa > max(-Y, -R)
    # get the normalization constant
    pyrimidine_term = 4 * T * C * (1 + kappa / Y)
    purine_term = 4 * A * G * (1 + kappa / R)
    transversion_term = 4 * Y * R
    normalization_constant = pyrimidine_term + purine_term + transversion_term
    # adjust the normalization constant to correct what might be an error in the paper
    normalization_constant /= 2
    # fill in every entry; the diagonal is overwritten afterwards
    dict_rate_matrix = {}
    for source_index, source in enumerate(nucleotides):
        for sink_index, sink in enumerate(nucleotides):
            key = (source, sink)
            if key in g_transitions:
                pair_frequency = (
                    nt_distribution[source_index] + nt_distribution[sink_index])
                coefficient = 1 + kappa / pair_frequency
            else:
                coefficient = 1.0
            dict_rate_matrix[key] = (
                coefficient * nt_distribution[sink_index]
                / normalization_constant)
    # set the diagonal so that each row sums to zero
    for source in nucleotides:
        dict_rate_matrix[(source, source)] = -sum(
            dict_rate_matrix[(source, sink)]
            for sink in nucleotides if source != sink)
    # convert the dictionary rate matrix to a row major rate matrix
    row_major = MatrixUtil.dict_to_row_major(
        dict_rate_matrix, nucleotides, nucleotides)
    # check the expected rate before returning the rate matrix object
    rate_matrix_object = RateMatrix.RateMatrix(row_major, nucleotides)
    expected_rate = rate_matrix_object.get_expected_rate()
    if not RateMatrix.almost_equal(expected_rate, 1.0):
        assert False, 'the rate is %f but should be 1.0' % expected_rate
    return rate_matrix_object
def get_response_content(fs):
    """
    Report the state pairs that violate detailed balance for a rate matrix.

    For every ordered pair of states (i, j) the products pi_i * r_ij and
    pi_j * r_ji are compared exactly, where pi is the stationary
    distribution computed from the matrix.
    Each mismatch becomes a table row, and the rows are listed from the
    largest to the smallest absolute log ratio of the two products.
    When a product is not strictly positive its logarithm is undefined,
    so the mismatch is reported with an infinite log ratio.
    @param fs: form data providing the rate matrix (fs.matrix)
    @return: an html document as a string
    @raise HandlingError: if the stationary distribution cannot be computed
    """
    # read the matrix from the form data
    R = fs.matrix
    # get the stationary distribution of the rate matrix
    try:
        v = RateMatrix.get_stationary_distribution(R.tolist())
    except RateMatrix.RateMatrixError as e:
        msg = 'error calculating the stationary distribution: ' + str(e)
        raise HandlingError(msg)
    # for each pair of entries, check the detailed balance equation
    table_rows = []
    for i, pi_i in enumerate(v):
        for j, pi_j in enumerate(v):
            r_ij = R[i][j]
            r_ji = R[j][i]
            forward_flux = pi_i * r_ij
            backward_flux = pi_j * r_ji
            if forward_flux != backward_flux:
                if forward_flux > 0 and backward_flux > 0:
                    log_ratio = abs(
                        math.log(forward_flux) - math.log(backward_flux))
                else:
                    # math.log would raise a domain error here
                    log_ratio = float('inf')
                table_rows.append([log_ratio, pi_i, pi_j, r_ij, r_ji])
    # write the html response
    lines = []
    if table_rows:
        # get the detailed balance html rows, largest discrepancy first
        detailed_balance_rows = [
            ''.join('<td>' + str(value) + '</td>' for value in row)
            for row in sorted(table_rows, reverse=True)]
        # get the header row
        header_entries = [
            'abs(log(π<sub>i</sub>r<sub>ij</sub>)-log(π<sub>j</sub>r<sub>ji</sub>))',
            'π<sub>i</sub>',
            'π<sub>j</sub>',
            'r<sub>ij</sub>',
            'r<sub>ji</sub>',
        ]
        header_row = ''.join('<th>%s</th>' % entry for entry in header_entries)
        # the explanatory paragraph belongs inside the document body
        lines.append('<html>')
        lines.append('<body>')
        lines.append('<p>')
        lines.append('This table shows each state pair for which the detailed balance equation is not satisfied exactly.')
        lines.append('</p>')
        lines.append('<table>')
        lines.append('<tr>' + header_row + '</tr>')
        for row in detailed_balance_rows:
            lines.append('<tr>' + row + '</tr>')
        lines.append('</table>')
        lines.append('</body>')
        lines.append('</html>')
    else:
        lines.append('<html><body>')
        lines.append('All detailed balance equations are satisfied for this rate matrix.')
        lines.append('</body></html>')
    # return the response
    return '\n'.join(lines) + '\n'
def create_rate_matrix(kappa, nt_distribution):
    """
    Create a nucleotide rate matrix with unit expected rate.

    The rate from a source to a sink is proportional to the sink frequency.
    For the pairs in g_transitions the rate is also multiplied by
    1 + kappa / (source frequency + sink frequency).
    @param kappa: adjusts for the transition rate differing from the transversion rate
    @param nt_distribution: ordered ACGT nucleotide probabilities
    @return: a rate matrix object with one expected nucleotide substitution per time unit
    """
    nucleotides = 'ACGT'
    # make some assertions about the distribution
    for p in nt_distribution:
        assert p >= 0
    assert len(nt_distribution) == 4
    assert RateMatrix.almost_equal(sum(nt_distribution), 1.0)
    # define the purine and pyrimidine totals
    A, C, G, T = nt_distribution
    R = float(A + G)
    Y = float(C + T)
    # make some more assertions about the distribution and about kappa
    assert A + G > 0
    assert C + T > 0
    assert kappa > max(-Y, -R)
    # get the normalization constant
    pyrimidine_term = 4 * T * C * (1 + kappa / Y)
    purine_term = 4 * A * G * (1 + kappa / R)
    transversion_term = 4 * Y * R
    normalization_constant = pyrimidine_term + purine_term + transversion_term
    # adjust the normalization constant to correct what might be an error in the paper
    normalization_constant /= 2
    # fill in every entry; the diagonal is overwritten afterwards
    dict_rate_matrix = {}
    for source_index, source in enumerate(nucleotides):
        for sink_index, sink in enumerate(nucleotides):
            key = (source, sink)
            if key in g_transitions:
                pair_frequency = (
                    nt_distribution[source_index] + nt_distribution[sink_index])
                coefficient = 1 + kappa / pair_frequency
            else:
                coefficient = 1.0
            dict_rate_matrix[key] = (
                coefficient * nt_distribution[sink_index]
                / normalization_constant)
    # set the diagonal so that each row sums to zero
    for source in nucleotides:
        dict_rate_matrix[(source, source)] = -sum(
            dict_rate_matrix[(source, sink)]
            for sink in nucleotides if source != sink)
    # convert the dictionary rate matrix to a row major rate matrix
    row_major = MatrixUtil.dict_to_row_major(
        dict_rate_matrix, nucleotides, nucleotides)
    # check the expected rate before returning the rate matrix object
    rate_matrix_object = RateMatrix.RateMatrix(row_major, nucleotides)
    expected_rate = rate_matrix_object.get_expected_rate()
    if not RateMatrix.almost_equal(expected_rate, 1.0):
        assert False, 'the rate is %f but should be 1.0' % expected_rate
    return rate_matrix_object
def get_response_content(fs):
    """
    Report the stationary distribution of a rate matrix.

    @param fs: form data providing the rate matrix (fs.matrix)
    @return: the entries of the stationary distribution, one per line
    @raise HandlingError: if the stationary distribution cannot be computed
    """
    rows = fs.matrix.tolist()
    try:
        stationary = RateMatrix.get_stationary_distribution(rows)
    except RateMatrix.RateMatrixError as e:
        raise HandlingError(
            'error calculating the stationary distribution: ' + str(e))
    # one entry per line
    return '\n'.join(map(str, stationary)) + '\n'
def get_response_content(fs):
    """
    Show the GY94 codon rate matrix defined by weights, kappa, and omega.
    @param fs: form data with codon weights, kappa, and omega
    @return: one tab separated line of rates per codon
    """
    codons = Codon.g_sorted_non_stop_codons
    # turn the user supplied weights into a codon distribution
    codon_distribution = SnippetUtil.get_distribution(
            fs.weights, 'codon', codons)
    # the rates are looked up by (source codon, sink codon) pairs
    rates = RateMatrix.get_gy94_rate_matrix(
            codon_distribution, fs.kappa, fs.omega)
    # write one row of text per source codon
    lines = []
    for source in codons:
        cells = [str(rates[(source, sink)]) for sink in codons]
        lines.append('\t'.join(cells) + '\n')
    return ''.join(lines)
def get_response_content(fs):
    """
    Normalize the submitted rate matrix.
    @param fs: form data whose matrix attribute is the rate matrix
    @return: the normalized rate matrix as text followed by a newline
    """
    matrix = fs.matrix
    # the rate matrix object wants state names, so use the row indices
    index_names = [str(i) for i in range(len(matrix))]
    rate_matrix_object = RateMatrix.RateMatrix(matrix.tolist(), index_names)
    rate_matrix_object.normalize()
    # convert back to row major form for display
    normalized = rate_matrix_object.get_row_major_rate_matrix()
    return MatrixUtil.m_to_string(normalized) + '\n'
def get_response_content(fs):
    """
    Report the state pairs that violate the detailed balance equations.
    Every ordered pair of states is checked, so an unbalanced pair
    appears once for each direction.
    @param fs: form data whose matrix attribute is the rate matrix
    @return: an html document listing the violations, or a short html
    document saying that there are none
    """
    # read the matrix from the form data
    R = fs.matrix
    # get the stationary distribution of the rate matrix
    try:
        v = RateMatrix.get_stationary_distribution(R.tolist())
    except RateMatrix.RateMatrixError as e:
        msg = 'error calculating the stationary distribution: ' + str(e)
        raise HandlingError(msg)
    # for each pair of entries, check the detailed balance equation
    table_rows = []
    for i, pi_i in enumerate(v):
        for j, pi_j in enumerate(v):
            r_ij = R[i][j]
            r_ji = R[j][i]
            forward = pi_i * r_ij
            backward = pi_j * r_ji
            if forward == backward:
                continue
            if forward > 0 and backward > 0:
                discrepancy = abs(math.log(forward) - math.log(backward))
            else:
                # The logarithm is undefined for a non-positive product,
                # for example when the rate is zero in only one direction.
                # Report an infinite discrepancy instead of letting
                # math.log raise a ValueError.
                discrepancy = float('inf')
            table_rows.append([discrepancy, pi_i, pi_j, r_ij, r_ji])
    # a balanced matrix gets a short message
    if not table_rows:
        lines = [
                '<html><body>',
                'All detailed balance equations are satisfied for this rate matrix.',
                '</body></html>']
        return '\n'.join(lines) + '\n'
    # get the detailed balance html rows with the worst violations first
    detailed_balance_rows = []
    for row in reversed(sorted(table_rows)):
        detailed_balance_rows.append(
                ''.join('<td>' + str(value) + '</td>' for value in row))
    # get the header row
    header_entries = [
            'abs(log(π<sub>i</sub>r<sub>ij</sub>)-log(π<sub>j</sub>r<sub>ji</sub>))',
            'π<sub>i</sub>',
            'π<sub>j</sub>',
            'r<sub>ij</sub>',
            'r<sub>ji</sub>']
    header_row = ''.join('<th>%s</th>' % entry for entry in header_entries)
    # The explanatory paragraph goes inside the body;
    # it used to be written before the opening html tag.
    lines = [
            '<html>',
            '<body>',
            '<p>',
            'This table shows each state pair for which the detailed balance equation is not satisfied exactly.',
            '</p>',
            '<table>',
            '<tr>' + header_row + '</tr>']
    for row in detailed_balance_rows:
        lines.append('<tr>' + row + '</tr>')
    lines.extend(['</table>', '</body>', '</html>'])
    # return the response
    return '\n'.join(lines) + '\n'
def demo_uniformization():
    """
    Print a uniformization sample for a normalized HKY85 example.
    The sampled path starts at A, ends at C, and has length 2.
    """
    nt_distribution = {'A':.2,'C':.3,'G':.3,'T':.2}
    kappa = 2
    # build the normalized rate matrix as a dictionary
    hky = RateMatrix.get_unscaled_hky85_rate_matrix(nt_distribution, kappa)
    hky.normalize()
    rates = hky.get_dictionary_rate_matrix()
    # sample the events and show them
    events = get_uniformization_sample('A', 'C', 'ACGT', 2, rates)
    print(events)
def get_response_content(fs):
    """
    Compute the expected rate of the submitted rate matrix.
    @param fs: form data whose matrix attribute is the rate matrix
    @return: the expected rate as a string followed by a newline
    """
    matrix = fs.matrix
    # the states are just the row indices
    indices = range(len(matrix))
    try:
        expected_rate = RateMatrix.RateMatrix(
                matrix.tolist(), indices).get_expected_rate()
    except RateMatrix.RateMatrixError as e:
        raise HandlingError('error calculating the expected rate: ' + str(e))
    return str(expected_rate) + '\n'
def get_response_content(fs):
    """
    Compute the Jukes-Cantor log likelihood of an alignment on a tree.
    @param fs: form data with a newick tree string and a fasta string
    @return: the log likelihood as a string followed by a newline
    """
    # parse and validate the tree
    tree = Newick.parse(fs.tree, Newick.NewickTree)
    tree.assert_valid()
    # parse the alignment, reporting alignment problems as handled errors
    try:
        alignment = Fasta.Alignment(fs.fasta.splitlines())
        alignment.force_nucleotide()
    except Fasta.AlignmentError as e:
        raise HandlingError(e)
    # build the Jukes-Cantor rate matrix object over the ACGT states
    jc_rates = RateMatrix.get_jukes_cantor_rate_matrix()
    states = list('ACGT')
    jc_model = RateMatrix.RateMatrix(
            MatrixUtil.dict_to_row_major(jc_rates, states, states), states)
    # evaluate the log likelihood and format the response
    log_likelihood = PhyLikelihood.get_log_likelihood(
            tree, alignment, jc_model)
    return str(log_likelihood) + '\n'
def get_form():
    """
    @return: the body of a form
    """
    # the default matrix is the sample codon rate matrix
    sample_rates = RateMatrix.get_sample_codon_rate_matrix()
    # the labels are the distinct source states in sorted order
    sources = set()
    for source, sink in sample_rates:
        sources.add(source)
    labels = sorted(sources)
    default_matrix = np.array(
            MatrixUtil.dict_to_row_major(sample_rates, labels, labels))
    # the form has a single matrix field
    return [
            Form.Matrix('matrix', 'rate matrix',
                default_matrix, MatrixUtil.assert_rate_matrix)]
def get_response_content(fs):
    """
    Show the HKY85 rate matrix defined by nucleotide weights and kappa.
    The matrix is normalized only when the scaled option is set.
    @param fs: form data with weights, kappa, and the scaled flag
    @return: one tab separated line of rates per nucleotide, in ACGT order
    """
    nucleotides = 'ACGT'
    # turn the user supplied weights into a nucleotide distribution
    nt_distribution = SnippetUtil.get_distribution(
            fs.weights, 'nucleotide', list(nucleotides))
    # build the rate matrix and optionally normalize it
    hky = RateMatrix.get_unscaled_hky85_rate_matrix(nt_distribution, fs.kappa)
    if fs.scaled:
        hky.normalize()
    rates = hky.get_dictionary_rate_matrix()
    # write one row of text per source nucleotide
    out = StringIO()
    for source in nucleotides:
        cells = [str(rates[(source, sink)]) for sink in nucleotides]
        out.write('\t'.join(cells) + '\n')
    return out.getvalue()
def get_form():
    """
    @return: the body of a form
    """
    # use the sample codon rate matrix as the default matrix
    default_rates = RateMatrix.get_sample_codon_rate_matrix()
    # order the rows and columns by the sorted distinct source states
    state_names = sorted(set(source for source, sink in default_rates))
    row_major = MatrixUtil.dict_to_row_major(
            default_rates, state_names, state_names)
    matrix_field = Form.Matrix(
            'matrix', 'rate matrix',
            np.array(row_major), MatrixUtil.assert_rate_matrix)
    return [matrix_field]
def get_response_content(fs):
    """
    Draw a tree whose branches show a simulated nucleotide history.
    The user constrains the nucleotides at some of the tips,
    and the rest of the history is simulated under Jukes-Cantor.
    @param fs: form data with the tree, the constraint column text,
    and the image format
    @return: the tree image
    """
    # get a properly formatted newick tree with branch lengths
    tree = Newick.parse(fs.tree, SpatialTree.SpatialTree)
    tree.assert_valid()
    if tree.has_negative_branch_lengths():
        raise HandlingError(
                'drawing a tree with negative branch lengths '
                'is not implemented')
    tree.add_branch_lengths()
    # read the nucleotide constraint for each named node
    allowed = list('acgtACGT')
    name_to_nucleotide = {}
    for line in iterutils.stripped_lines(fs.column.splitlines()):
        name, letter = SnippetUtil.get_state_value_pair(line)
        if letter not in allowed:
            raise HandlingError('"%s" is not a valid nucleotide' % letter)
        if name in name_to_nucleotide:
            raise HandlingError('the name "%s" was duplicated' % name)
        # constraints are stored in upper case
        name_to_nucleotide[name] = letter.upper()
    # augment the tips with the nucleotide letters
    for name, letter in name_to_nucleotide.items():
        try:
            node = tree.get_unique_node(name)
        except Newick.NewickSearchError as e:
            raise HandlingError(e)
        if node.children:
            raise HandlingError(
                    'constraints on internal nodes are not implemented')
        node.state = letter
    # get the Jukes-Cantor rate matrix object
    states = list('ACGT')
    jc_rates = RateMatrix.get_jukes_cantor_rate_matrix()
    jc_model = RateMatrix.RateMatrix(
            MatrixUtil.dict_to_row_major(jc_rates, states, states), states)
    # simulate the ancestral nucleotides
    jc_model.simulate_ancestral_states(tree)
    # simulate a path on each branch
    # this breaks up the branch into a linear sequence of nodes and adds color
    for node in tree.gen_non_root_nodes():
        simulate_branch_path(tree, node)
    # do the layout
    EqualArcLayout.do_layout(tree)
    # draw the image, reporting drawing problems as handled errors
    try:
        ext = Form.g_imageformat_to_ext[fs.imageformat]
        return DrawTreeImage.get_tree_image(tree, (640, 480), ext)
    except CairoUtil.CairoUtilError as e:
        raise HandlingError(e)
def get_response_content(fs):
    """
    Estimate the maximum likelihood rates under a normalized HKY85 model.
    @param fs: form data with the tree, nucleotide weights,
    alignment, and kappa
    @return: a stockholm string followed by a newline
    """
    nucleotides = list('ACGT')
    # parse and validate the tree
    tree = Newick.parse(fs.tree, Newick.NewickTree)
    tree.assert_valid()
    # turn the user supplied weights into a nucleotide distribution
    nt_distribution = SnippetUtil.get_distribution(
            fs.weights, 'nucleotide', nucleotides)
    # parse the alignment, reporting alignment problems as handled errors
    try:
        alignment = Fasta.Alignment(StringIO(fs.alignment))
        alignment.force_nucleotide()
    except Fasta.AlignmentError as e:
        raise HandlingError(e)
    # build the normalized fast rate matrix
    hky = RateMatrix.get_unscaled_hky85_rate_matrix(nt_distribution, fs.kappa)
    rate_matrix = RateMatrix.FastRateMatrix(
            hky.get_row_major_rate_matrix(), list('ACGT'))
    rate_matrix.normalize()
    # estimate the rates and format them together with the alignment
    mle_rates = get_mle_rates(tree, alignment, rate_matrix)
    return get_stockholm_string(tree, alignment, mle_rates) + '\n'
def get_form():
    """
    @return: the body of a form
    """
    # the default matrix is the sample codon rate matrix
    # with rows and columns ordered like the sorted non-stop codons
    codons = Codon.g_sorted_non_stop_codons
    default_matrix = MatrixUtil.dict_to_row_major(
            RateMatrix.get_sample_codon_rate_matrix(), codons, codons)
    # the form has a matrix field and a category limit
    matrix_field = Form.Matrix(
            'matrix', 'codon rate matrix',
            default_matrix, MatrixUtil.assert_rate_matrix)
    category_field = Form.Integer(
            'maxcategories', 'maximum number of categories', 5, low=2)
    return [matrix_field, category_field]
def get_form():
    """
    @return: the body of a form
    """
    # look up the sample rates, then lay them out in sorted codon order
    sample_rates = RateMatrix.get_sample_codon_rate_matrix()
    ordered_codons = Codon.g_sorted_non_stop_codons
    row_major = MatrixUtil.dict_to_row_major(
            sample_rates, ordered_codons, ordered_codons)
    # collect the form objects one at a time
    form_objects = []
    form_objects.append(Form.Matrix(
        'matrix', 'codon rate matrix',
        row_major, MatrixUtil.assert_rate_matrix))
    form_objects.append(Form.Integer(
        'maxcategories', 'maximum number of categories', 5, low=2))
    return form_objects
def gen_distance_matrices(self, count, max_steps):
    """
    Yield (ordered sequence list, distance matrix) pairs.
    A sample whose estimated distance matrix has a zero or an infinite
    off-diagonal entry is counted as rejected and is not yielded.
    The generator stops when enough samples have been accepted
    or when the reported complexity reaches the step limit.
    @param count: the requested number of distance matrices
    @param max_steps: an upper bound on the allowed number of steps
    """
    # define the jukes cantor model over the ACGT states
    states = list('ACGT')
    jc_rates = RateMatrix.get_jukes_cantor_rate_matrix()
    model = RateMatrix.RateMatrix(
            MatrixUtil.dict_to_row_major(jc_rates, states, states), states)
    # record the requested number of samples
    self.requested_matrix_count = count
    infinity = float('inf')
    # do some rejection sampling
    while True:
        if self.get_complexity() >= max_steps or self.accepted_sample_count >= count:
            break
        # simulate an alignment from the tree
        alignment = PhyLikelihood.simulate_alignment(
                self.tree, model, self.sequence_length)
        # put the simulated sequences into the requested name order
        name_to_sequence = dict(zip(alignment.headers, alignment.sequences))
        sequence_list = [name_to_sequence[name] for name in self.ordered_names]
        # get the estimated distance matrix
        distance_matrix = JC69.get_ML_distance_matrix(sequence_list)
        # gather the off-diagonal entries to look for degeneracies
        off_diagonal = [
                value
                for i, row in enumerate(distance_matrix)
                for j, value in enumerate(row)
                if i != j]
        # a zero entry takes precedence over an infinite entry
        if any(value == 0.0 for value in off_diagonal):
            self.rejected_zero_sample_count += 1
        elif any(value == infinity for value in off_diagonal):
            self.rejected_inf_sample_count += 1
        else:
            self.accepted_sample_count += 1
            yield sequence_list, distance_matrix
def get_response_content(fs):
    """
    Analyze nexus data under a two-category HKY85 mixture model.
    The model parameters are read from hyphy variables.
    @param fs: form data with the nexus text and the hyphy text
    @return: the analysis results followed by a newline
    """
    # read the nexus data
    nexus = Nexus.Nexus()
    try:
        nexus.load(StringIO(fs.nexus))
    except Nexus.NexusError as e:
        raise HandlingError(e)
    # read the hyphy variables
    ns = Hyphy.get_hyphy_namespace(StringIO(fs.hyphy))
    # get the mixture weights
    mixture_weights = [ns.P, 1.0 - ns.P]
    # the second distribution uses variable names with a "2" suffix
    nucleotide_distributions = [
            dict((nt, getattr(ns, "eqFreq" + nt + suffix)) for nt in "ACGT")
            for suffix in ("", "2")]
    # create the normalized nucleotide HKY rate matrix objects
    rate_matrix_objects = []
    for nt_distribution in nucleotide_distributions:
        hky = RateMatrix.get_unscaled_hky85_rate_matrix(
                nt_distribution, ns.kappa)
        hky.normalize()
        rate_matrix_objects.append(hky)
    # create the mixture proportions
    total_weight = sum(mixture_weights)
    mixture_proportions = [w / total_weight for w in mixture_weights]
    # scale each rate matrix object by its branch length ratio
    tree_names = ("givenTree", "otherTree")
    for hky, tree_name in zip(rate_matrix_objects, tree_names):
        nexus_tree = nexus.tree
        hyphy_tree = getattr(ns, tree_name)
        # the ratio is measured on the human branch of each tree
        try:
            nexus_human = nexus_tree.get_unique_node("Human")
        except Newick.NewickSearchError as e:
            raise HandlingError("nexus tree error: %s" % e)
        try:
            hyphy_human = hyphy_tree.get_unique_node("HUMAN")
        except Newick.NewickSearchError as e:
            raise HandlingError("hyphy tree error: %s" % e)
        hky.rescale(hyphy_human.blen / nexus_human.blen)
    # create the mixture model
    mixture_model = SubModel.MixtureModel(
            mixture_proportions, rate_matrix_objects)
    # return the results
    return do_analysis(mixture_model, nexus.alignment, nexus.tree) + "\n"
def get_response_content(fs):
    """
    Estimate the maximum likelihood distance between each sequence pair.
    @param fs: form data with a fasta alignment, a rate matrix,
    and the ordered states of the rate matrix
    @return: a title line followed by the distance matrix text
    """
    # read the alignment
    try:
        alignment = Fasta.Alignment(StringIO(fs.fasta))
    except Fasta.AlignmentError as e:
        raise HandlingError('fasta alignment error: ' + str(e))
    if alignment.get_sequence_count() < 2:
        raise HandlingError('expected at least two sequences')
    # read the rate matrix and its ordered states
    R = fs.matrix
    ordered_states = Util.get_stripped_lines(StringIO(fs.states))
    if len(ordered_states) != len(R):
        raise HandlingError(
                'the number of ordered states must be the same '
                'as the number of rows in the rate matrix')
    if len(set(ordered_states)) != len(ordered_states):
        raise HandlingError('the ordered states must be unique')
    # create the rate matrix object using the ordered states
    rate_matrix_object = RateMatrix.RateMatrix(R.tolist(), ordered_states)
    # start from a square matrix of zeros
    n = alignment.get_sequence_count()
    distances = [[0] * n for _ in range(n)]
    for i, sequence_a in enumerate(alignment.sequences):
        for j, sequence_b in enumerate(alignment.sequences):
            # each unordered pair is handled once
            if i >= j:
                continue
            # create the objective function using the sequence pair
            objective = Objective((sequence_a, sequence_b), rate_matrix_object)
            # Use golden section search to find the mle distance.
            # The bracket is just a suggestion.
            mle_distance = optimize.golden(objective, brack=(0.51, 2.01))
            # the distance matrix is symmetric
            distances[i][j] = distances[j][i] = mle_distance
    # write the response
    lines = [
            'maximum likelihood distance matrix:',
            MatrixUtil.m_to_string(distances)]
    return '\n'.join(lines) + '\n'
def __call__(self, rate):
    """
    Return the negative likelihood of a column.
    The negative likelihood is computed using the tree, matrix, and rate.
    A zero rate is handled without the rate matrix: the likelihood is one
    when all of the tips share a single state and zero otherwise.
    @param rate: the rate of the rate matrix
    @return: the negative likelihood of the column
    """
    if not rate:
        # count the distinct states among the tips
        states = set(tip.state for tip in self.tree.gen_tips())
        likelihood = 1 if len(states) == 1 else 0
    else:
        self.rate_matrix.set_rate(rate)
        likelihood = RateMatrix.get_likelihood(self.tree, self.rate_matrix)
    return -likelihood
def get_sample_mixture_model():
    """
    @return: a mixture model that is used to generate the default nexus data
    """
    kappa = 2
    # the proportion of each of the three categories
    category_distribution = [.1, .4, .5]
    # the ACGT frequencies of each category
    acgt_frequencies = [
            (.1, .4, .4, .1),
            (.2, .3, .3, .2),
            (.25, .25, .25, .25)]
    # each category gets its own HKY85 rate matrix with a shared kappa
    rate_matrix_objects = []
    for frequencies in acgt_frequencies:
        nt_dict = dict(zip('ACGT', frequencies))
        rate_matrix_objects.append(
                RateMatrix.get_unscaled_hky85_rate_matrix(nt_dict, kappa))
    # combine the categories and normalize the mixture as a whole
    mixture_model = SubModel.MixtureModel(
            category_distribution, rate_matrix_objects)
    mixture_model.normalize()
    return mixture_model
def get_response_content(fs):
    """
    Show the symmetric component of each rate matrix of a mixture model.
    @param fs: form data whose model attribute is the serialized model
    @return: an html document with one table per mixture category
    """
    # deserialize the xml data to create a DirectProteinMixture
    try:
        mixture_model = DirectProtein.deserialize_mixture_model(fs.model)
    except ValueError as e:
        raise HandlingError(e)
    # Normalize the mixture model to have an expected rate of one
    # substitution per unit of branch length.
    mixture_model.normalize()
    # the html header
    lines = [
            '<html>',
            '<head>',
            '<style type="text/css">td{font-size:x-small;}</style>',
            '</head>',
            '<body>']
    # NOTE(review): codons is not assigned anywhere in this function;
    # presumably it is a module level name -- confirm.
    for category_i, matrix_object in enumerate(mixture_model.rate_matrices):
        codon_v = matrix_object.get_stationary_distribution()
        rates = matrix_object.dictionary_rate_matrix
        # divide each rate by the ratio of square root stationary probabilities
        symmetric_matrix = {}
        for ca, pa in zip(codons, codon_v):
            for cb, pb in zip(codons, codon_v):
                symmetric_matrix[(ca, cb)] = (
                        rates[(ca, cb)] / (math.sqrt(pb) / math.sqrt(pa)))
        # one titled table per category
        lines.append('the symmetric component of the rate matrix')
        lines.append('for category %d:' % (category_i + 1))
        lines.append('<table>')
        lines.append(
                RateMatrix.codon_rate_matrix_to_html_string(symmetric_matrix))
        lines.append('</table>')
        lines.append('<br/><br/>')
    # the html footer
    lines.append('</body>')
    lines.append('</html>')
    # return the response
    return '\n'.join(lines) + '\n'
def get_response_content(fs):
    """
    Report the maximum likelihood rates in stockholm format.
    The rates are estimated under a normalized HKY85 rate matrix.
    @param fs: form data with the tree, nucleotide weights,
    alignment, and kappa
    @return: a stockholm string followed by a newline
    """
    # get the validated tree
    tree = Newick.parse(fs.tree, Newick.NewickTree)
    tree.assert_valid()
    # get the nucleotide distribution
    distribution = SnippetUtil.get_distribution(
            fs.weights, 'nucleotide', list('ACGT'))
    # get the nucleotide alignment
    try:
        alignment = Fasta.Alignment(StringIO(fs.alignment))
        alignment.force_nucleotide()
    except Fasta.AlignmentError as e:
        raise HandlingError(e)
    # get the unscaled rates in row major form
    unscaled = RateMatrix.get_unscaled_hky85_rate_matrix(
            distribution, fs.kappa)
    row_major = unscaled.get_row_major_rate_matrix()
    # wrap the rates in a fast rate matrix and normalize it
    fast_matrix = RateMatrix.FastRateMatrix(row_major, list('ACGT'))
    fast_matrix.normalize()
    # estimate the rates and build the response
    rates = get_mle_rates(tree, alignment, fast_matrix)
    stockholm = get_stockholm_string(tree, alignment, rates)
    return stockholm + '\n'
def get_response(fs):
    """
    Simulate an alignment from a two-category HKY85 mixture model.
    @param fs: a FieldStorage object containing the cgi arguments
    @return: the response text; this is the fasta alignment when
    the fasta option is set, the nexus text when the nex option is set,
    and an empty string otherwise
    """
    # parse the tree
    try:
        tree = Newick.parse(fs.tree, Newick.NewickTree)
        tree.assert_valid()
    except Newick.NewickSyntaxError as e:
        raise HandlingError(str(e))
    # get the mixture weights
    mixture_weights = [fs.weight_a, fs.weight_b]
    # get the kappa values
    kappa_values = [fs.kappa_a, fs.kappa_b]
    # get the nucleotide distributions
    frequency_strings = (fs.frequency_a, fs.frequency_b)
    nucleotide_distributions = []
    for nt_string in frequency_strings:
        d = SnippetUtil.get_distribution(nt_string, 'nucleotide', list('ACGT'))
        nucleotide_distributions.append(d)
    # create the nucleotide HKY rate matrix objects
    rate_matrix_objects = []
    for nt_distribution, kappa in zip(nucleotide_distributions, kappa_values):
        rate_matrix_object = RateMatrix.get_unscaled_hky85_rate_matrix(
                nt_distribution, kappa)
        rate_matrix_objects.append(rate_matrix_object)
    # create the mixture proportions
    weight_sum = sum(mixture_weights)
    mixture_proportions = [weight / weight_sum for weight in mixture_weights]
    # create the mixture model
    mixture_model = SubModel.MixtureModel(
            mixture_proportions, rate_matrix_objects)
    # normalize the mixture model
    mixture_model.normalize()
    # simulate the alignment
    try:
        alignment = PhyLikelihood.simulate_alignment(
                tree, mixture_model, fs.ncols)
    except PhyLikelihood.SimulationError as e:
        raise HandlingError(e)
    # get the output string
    output_string = ''
    if fs.fasta:
        # the output is the alignment
        arr = []
        for node in tree.gen_tips():
            arr.append(alignment.get_fasta_sequence(node.name))
        output_string = '\n'.join(arr)
    elif fs.nex:
        # the output is the alignment and the tree
        nexus = Nexus.Nexus()
        nexus.tree = tree
        nexus.alignment = alignment
        # describe each mixture category in a nexus comment
        for i in range(2):
            arr = []
            arr.append('weight: %s' % mixture_weights[i])
            arr.append('kappa: %s' % kappa_values[i])
            nexus.add_comment('category %d: %s' % (i+1, ', '.join(arr)))
        output_string = str(nexus)
    # A download filename used to be computed here but was never used;
    # that dead code has been removed.
    #TODO use the correct filename extension in the output
    return output_string
def get_response_content(fs):
    """
    Sample a data set and write a BEAST xml document that describes it.
    A coalescent tree, branch rates, and an alignment are sampled in
    that order, and some statistics of the sample are written into
    xml comments near the top of the document.
    @param fs: form data; the nleaves and nsites attributes are read
    @return: the xml text
    """
    # init the response and get the user variables
    out = StringIO()
    nleaves = fs.nleaves
    nvertices = nleaves * 2 - 1
    # nbranches is referenced only by the commented-out optimization below
    nbranches = nvertices - 1
    nsites = fs.nsites
    # sample the coalescent tree with timelike branch lengths
    R, B = kingman.sample(fs.nleaves)
    r = Ftree.R_to_root(R)
    # get the leaf vertex names
    # NOTE(review): string.uppercase exists only in Python 2
    N = dict(zip(range(nleaves), string.uppercase[:nleaves]))
    N_leaves = dict(N)
    # get the internal vertex names
    # an internal vertex is named by joining the sorted names of its leaves
    v_to_leaves = R_to_v_to_leaves(R)
    for v, leaves in sorted(v_to_leaves.items()):
        if len(leaves) > 1:
            N[v] = ''.join(sorted(N[leaf] for leaf in leaves))
    # get vertex ages
    v_to_age = kingman.RB_to_v_to_age(R, B)
    # sample the rates on the branches
    b_to_rate = sample_b_to_rate(R)
    xycorr = get_correlation(R, b_to_rate)
    # define B_subs in terms of substitutions instead of time
    B_subs = dict((p, t * b_to_rate[p]) for p, t in B.items())
    # sample the alignment
    v_to_seq = sample_v_to_seq(R, B_subs, nsites)
    # get the log likelihood; this is kind of horrible
    pairs = [(N[v], ''.join(v_to_seq[v])) for v in range(nleaves)]
    headers, sequences = zip(*pairs)
    alignment = Fasta.create_alignment(headers, sequences)
    newick_string = FtreeIO.RBN_to_newick(R, B_subs, N_leaves)
    tree = Newick.parse(newick_string, Newick.NewickTree)
    dictionary_rate_matrix = RateMatrix.get_jukes_cantor_rate_matrix()
    ordered_states = list('ACGT')
    row_major_rate_matrix = MatrixUtil.dict_to_row_major(
            dictionary_rate_matrix, ordered_states, ordered_states)
    rate_matrix_object = RateMatrix.RateMatrix(
            row_major_rate_matrix, ordered_states)
    ll = PhyLikelihood.get_log_likelihood(
            tree, alignment, rate_matrix_object)
    # get ll when rates are all 1.0
    # this uses the timelike branch lengths B instead of B_subs
    newick_string = FtreeIO.RBN_to_newick(R, B, N_leaves)
    tree = Newick.parse(newick_string, Newick.NewickTree)
    ll_unity = PhyLikelihood.get_log_likelihood(
            tree, alignment, rate_matrix_object)
    # get ll when rates are numerically optimized
    # TODO incorporate the result into the xml file
    # TODO speed up the likelihood evaluation (beagle? C module?)
    #f = Opt(R, B, N_leaves, alignment)
    #X_logs = [0.0] * nbranches
    #result = scipy.optimize.fmin(f, X_logs, full_output=True)
    #print result
    #
    print >> out, '<?xml version="1.0"?>'
    print >> out, '<beast>'
    print >> out
    # record the sampled quantities as xml comments
    print >> out, '<!-- actual rate autocorrelation', xycorr, '-->'
    print >> out, '<!-- actual root height', v_to_age[r], '-->'
    print >> out, '<!-- actual log likelihood', ll, '-->'
    print >> out, '<!-- ll if rates were unity', ll_unity, '-->'
    print >> out
    print >> out, '<!--'
    print >> out, 'predefine the taxa as in'
    print >> out, 'http://beast.bio.ed.ac.uk/Introduction_to_XML_format'
    print >> out, '-->'
    print >> out, get_leaf_taxon_defn(list(string.uppercase[:nleaves]))
    print >> out
    print >> out, '<!--'
    print >> out, 'define the alignment as in'
    print >> out, 'http://beast.bio.ed.ac.uk/Introduction_to_XML_format'
    print >> out, '-->'
    # NOTE(review): leaves is the variable left over from the last iteration
    # of the vertex naming loop above, so it holds the leaves of the
    # greatest key of v_to_leaves -- presumably the root; confirm.
    print >> out, get_alignment_defn(leaves, N, v_to_seq)
    print >> out
    print >> out, '<!--'
    print >> out, 'specify the starting tree as in'
    print >> out, 'http://beast.bio.ed.ac.uk/Tutorial_4'
    print >> out, '-->'
    print >> out, get_starting_tree_defn(R, B, N_leaves)
    print >> out
    print >> out, '<!--'
    print >> out, 'connect the tree model as in'
    print >> out, 'http://beast.bio.ed.ac.uk/Tutorial_4'
    print >> out, '-->'
    print >> out, g_tree_model_defn
    print >> out
    print >> out, g_uncorrelated_relaxed_clock_info
    print >> out
    # the string literal below is disabled code; it is never executed
    """
    print >> out, '<!--'
    print >> out, 'create a list of taxa for which to constrain the mrca as in'
    print >> out, 'http://beast.bio.ed.ac.uk/Tutorial_3.1'
    print >> out, '-->'
    for v, leaves in sorted(v_to_leaves.items()):
        if len(leaves) > 1:
            print >> out, get_mrca_subset_defn(N, v, leaves)
    print >> out
    print >> out, '<!--'
    print >> out, 'create a tmrcaStatistic that will record the height as in'
    print >> out, 'http://beast.bio.ed.ac.uk/Tutorial_3.1'
    print >> out, '-->'
    for v, leaves in sorted(v_to_leaves.items()):
        if len(leaves) > 1:
            print >> out, get_mrca_stat_defn(N[v])
    """
    print >> out
    print >> out, g_likelihood_info
    print >> out
    print >> out, '<!--'
    print >> out, 'run the mcmc'
    print >> out, 'http://beast.bio.ed.ac.uk/Tutorial_3.1'
    print >> out, '-->'
    print >> out, get_mcmc_defn(v_to_leaves, v_to_age, N)
    print >> out
    print >> out, '</beast>'
    # return the response
    return out.getvalue()