def get_response_content(fs):
    """
    Simulate ancestral sequences under Jukes-Cantor and report them as fasta.
    @param fs: an object providing the newick string as fs.tree
    and the fasta text as fs.fasta
    @return: fasta records joined by newlines, in tree preorder,
    followed by a trailing newline
    """
    # Parse and validate the tree before touching the alignment.
    tree = Newick.parse(fs.tree, Newick.NewickTree)
    tree.assert_valid()
    # Parse the alignment; alignment problems are reported as HandlingError.
    try:
        observed = Fasta.Alignment(fs.fasta.splitlines())
        observed.force_nucleotide()
    except Fasta.AlignmentError as e:
        raise HandlingError(e)
    # Build the Jukes-Cantor rate matrix object over the nucleotide states.
    states = list('ACGT')
    jc_dict = RateMatrix.get_jukes_cantor_rate_matrix()
    jc_rows = MatrixUtil.dict_to_row_major(jc_dict, states, states)
    jc_model = RateMatrix.RateMatrix(jc_rows, states)
    # Simulate; simulation problems are reported as HandlingError.
    try:
        augmented = PhyLikelihood.simulate_ancestral_alignment(
                tree, observed, jc_model)
    except PhyLikelihood.SimulationError as e:
        raise HandlingError(e)
    # Emit one fasta record per node using the tree preorder.
    records = [
            augmented.get_fasta_sequence(node.name)
            for node in tree.preorder()]
    return '\n'.join(records) + '\n'
def get_response_content(fs):
    """
    Simulate a pair of nucleotide sequences under an F84 rate matrix.
    @param fs: an object providing the nucleotide weights fs.A, fs.C,
    fs.G and fs.T together with fs.kappa, fs.distance and fs.length
    @return: the simulated pair as a fasta string followed by a newline
    @raise HandlingError: if the weights sum to zero, if the purine or
    pyrimidine frequency is not positive, or if kappa is too small
    """
    # read the nucleotide weights
    nt_weights = [fs.A, fs.C, fs.G, fs.T]
    # Compute the normalizing constant once and reject a zero total
    # explicitly instead of failing with a ZeroDivisionError.
    total_weight = float(sum(nt_weights))
    if not total_weight:
        raise HandlingError('the nucleotide weights must not sum to zero')
    # convert the nucleotide weights to probabilities
    nt_probs = [x / total_weight for x in nt_weights]
    # Assert that the kappa value and the nucleotide
    # probabilities are compatible.
    A, C, G, T = nt_probs
    R = A + G
    Y = C + T
    if R <= 0:
        raise HandlingError('the frequency of a purine must be positive')
    if Y <= 0:
        raise HandlingError('the frequency of a pyrimidine must be positive')
    if fs.kappa <= max(-Y, -R):
        msg_a = 'kappa must be greater than max(-R, -Y) '
        msg_b = 'where R and Y are the purine and pyrimidine frequencies'
        raise HandlingError(msg_a + msg_b)
    # Create the rate matrix object
    # which is automatically scaled to a rate of 1.0.
    model = F84.create_rate_matrix(fs.kappa, nt_probs)
    # simulate a pair of sequences
    sequence_pair = PairLikelihood.simulate_sequence_pair(
            fs.distance, model, fs.length)
    # Write the pair as fasta text and round-trip it through an
    # alignment object so the output uses the library's fasta formatting.
    aln = StringIO()
    print >> aln, '>first'
    print >> aln, ''.join(sequence_pair[0])
    print >> aln, '>second'
    print >> aln, ''.join(sequence_pair[1])
    return Fasta.Alignment(StringIO(aln.getvalue())).to_fasta_string() + '\n'
def simulate_ancestral_alignment(tree, alignment, substitution_model):
    """
    Simulate ancestral sequences column by column given the tip sequences.
    @param tree: a newick tree with branch lengths
    @param alignment: a Fasta Alignment object with headers that match the tree tip names
    @param substitution_model: a way to simulate ancestral states from a tree given its leaf states
    @return: a Fasta Alignment object holding the original sequences
    followed by the simulated ancestral sequences
    @raise SimulationError: if a branch length is missing or not positive,
    if an internal node is unnamed, or if a tip name has no aligned sequence
    """
    # Check the branch lengths, reading each one only once.
    for node in tree.gen_non_root_nodes():
        branch_length = node.get_branch_length()
        if branch_length is None or branch_length <= 0:
            raise SimulationError('all branch lengths should be positive')
    # Internal node names become the fasta headers of the simulated rows.
    for node in tree.gen_internal_nodes():
        if not node.name:
            raise SimulationError('all internal nodes should be named')
    simulated_ancestors = dict(
            (node.name, []) for node in tree.gen_internal_nodes())
    for col in alignment.columns:
        name_to_letter = dict(zip(alignment.headers, col))
        # Augment each tip with its corresponding letter.
        for tip in tree.gen_tips():
            try:
                tip.state = name_to_letter[tip.name]
            except KeyError:
                # Report a tip/header mismatch through the exception type
                # that callers of this function already handle.
                raise SimulationError(
                        'no aligned sequence was found for the tip %r' % (
                            tip.name,))
        # Do the simulation.
        substitution_model.simulate_ancestral_states(tree)
        name_state_pairs = [
                (node.name, node.state)
                for node in tree.gen_internal_nodes_preorder()]
        # Add this simulated column.
        for name, state in name_state_pairs:
            simulated_ancestors[name].append(state)
    # Create an alignment object from the simulated sequences.
    sio = StringIO()
    print >> sio, alignment.to_fasta_string()
    for header, sequence in simulated_ancestors.items():
        print >> sio, '>' + header
        print >> sio, ''.join(sequence)
    fasta_string = sio.getvalue()
    return Fasta.Alignment(StringIO(fasta_string))
def main():
    """
    Estimate the mle rates for the bundled Brown example data
    and print them together with the stockholm string.
    """
    # Build the alignment from the example fasta text.
    print('creating the alignment...')
    fasta_text = Fasta.brown_example_alignment.strip()
    alignment = Fasta.Alignment(StringIO(fasta_text))
    # Build the tree from the example newick text.
    print('creating the tree...')
    tree = Newick.parse(Newick.brown_example_tree, Newick.NewickTree)
    # Build a normalized HKY85 rate matrix with uniform frequencies.
    print('creating the rate matrix object...')
    kappa = 2.0
    distribution = dict((nt, .25) for nt in 'ACGT')
    unscaled = RateMatrix.get_unscaled_hky85_rate_matrix(distribution, kappa)
    rate_matrix = RateMatrix.FastRateMatrix(
            unscaled.get_row_major_rate_matrix(), list('ACGT'))
    rate_matrix.normalize()
    # Estimate the rates and report them.
    print('getting the mle rates...')
    mle_rates = get_mle_rates(tree, alignment, rate_matrix)
    print('mle rates:')
    print(mle_rates)
    print('stockholm string:')
    print(get_stockholm_string(tree, alignment, mle_rates))
def get_response_content(fs):
    """
    Analyze an alignment on a tree under a three component mixture model.
    @param fs: an object providing fs.tree, fs.alignment, the weights
    fs.weight_a, fs.weight_b, fs.weight_c and the matrices
    fs.matrix_a, fs.matrix_b, fs.matrix_c
    @return: the string returned by do_analysis followed by a newline
    @raise HandlingError: if a matrix is not 4x4, if the alignment is
    invalid, or if the mixture weights sum to zero
    """
    # get the tree
    tree = Newick.parse(fs.tree, Newick.NewickTree)
    tree.assert_valid()
    # get the mixture weights
    weights = [fs.weight_a, fs.weight_b, fs.weight_c]
    # get the matrices
    matrices = [fs.matrix_a, fs.matrix_b, fs.matrix_c]
    for R in matrices:
        if R.shape != (4, 4):
            msg = 'expected each nucleotide rate matrix to be 4x4'
            raise HandlingError(msg)
    # get the nucleotide alignment
    try:
        alignment = Fasta.Alignment(fs.alignment.splitlines())
        alignment.force_nucleotide()
    except Fasta.AlignmentError as e:
        raise HandlingError(e)
    # Create the mixture proportions.
    # The sum is converted to a float so that integer weights are not
    # truncated by integer division, and a zero sum is reported explicitly.
    weight_sum = float(sum(weights))
    if not weight_sum:
        raise HandlingError('the mixture weights must not sum to zero')
    mixture_proportions = [weight / weight_sum for weight in weights]
    # create the rate matrix objects
    ordered_states = list('ACGT')
    rate_matrix_objects = [
            RateMatrix.RateMatrix(R.tolist(), ordered_states)
            for R in matrices]
    # create the mixture model
    mixture_model = SubModel.MixtureModel(
            mixture_proportions, rate_matrix_objects)
    # normalize the mixture model
    mixture_model.normalize()
    # return the html string
    return do_analysis(mixture_model, alignment, tree) + '\n'
def get_response_content(fs):
    """
    Estimate the maximum likelihood distance between two aligned sequences.
    @param fs: an object providing the fasta text as fs.fasta, the rate
    matrix as fs.matrix and the state names, one per line, as fs.states
    @return: a single line of text reporting the estimated distance
    """
    # Parse the alignment, which must consist of exactly two sequences.
    try:
        alignment = Fasta.Alignment(fs.fasta.splitlines())
    except Fasta.AlignmentError as e:
        raise HandlingError('fasta alignment error: ' + str(e))
    if alignment.get_sequence_count() != 2:
        raise HandlingError('expected a sequence pair')
    # The states label the rows of the rate matrix, so there must be
    # one distinct state per row.
    rate_array = fs.matrix
    state_names = Util.get_stripped_lines(fs.states.splitlines())
    if len(state_names) != len(rate_array):
        msg_a = 'the number of ordered states must be the same '
        msg_b = 'as the number of rows in the rate matrix'
        raise HandlingError(msg_a + msg_b)
    if len(set(state_names)) != len(state_names):
        raise HandlingError('the ordered states must be unique')
    model = RateMatrix.RateMatrix(rate_array.tolist(), state_names)
    # Minimize the objective by golden section search.
    # The bracket is only a starting suggestion for the search.
    objective = Objective(alignment.sequences, model)
    mle_distance = optimize.golden(objective, brack=(0.51, 2.01))
    # Format the response.
    out = StringIO()
    print >> out, 'maximum likelihood distance:', mle_distance
    return out.getvalue()
def test_simulation(self):
    """
    Check that ancestral simulation runs on the Brown example alignment
    using a tree whose internal nodes are all named.
    """
    # The internal node names are required by the simulation.
    tree_string = '(((Human:0.1, Chimpanzee:0.2)to-chimp:0.8, Gorilla:0.3)to-gorilla:0.7, Orangutan:0.4, Gibbon:0.5)all;'
    tree = Newick.parse(tree_string, Newick.NewickTree)
    tree.assert_valid()
    # Load the example alignment.
    alignment = Fasta.Alignment(StringIO(Fasta.brown_example_alignment))
    # Build the Jukes-Cantor rate matrix object.
    states = list('ACGT')
    jc_rows = MatrixUtil.dict_to_row_major(
            RateMatrix.get_jukes_cantor_rate_matrix(), states, states)
    jc_model = RateMatrix.RateMatrix(jc_rows, states)
    # Only completion without an exception is checked here.
    simulate_ancestral_alignment(tree, alignment, jc_model)
def test_likelihood(self):
    """
    Check the Jukes-Cantor log likelihood of the Brown example data
    against a stored reference value.
    """
    # Load the example tree and alignment.
    tree = Newick.parse(Newick.brown_example_tree, Newick.NewickTree)
    tree.assert_valid()
    alignment = Fasta.Alignment(StringIO(Fasta.brown_example_alignment))
    # Build the Jukes-Cantor rate matrix object.
    states = list('ACGT')
    jc_rows = MatrixUtil.dict_to_row_major(
            RateMatrix.get_jukes_cantor_rate_matrix(), states, states)
    jc_model = RateMatrix.RateMatrix(jc_rows, states)
    # Compare the computed value with the reference value.
    observed = get_log_likelihood(tree, alignment, jc_model)
    self.assertAlmostEqual(observed, -4146.26547208)
def get_response_content(fs):
    """
    Analyze an alignment on a tree under a serialized Direct RNA mixture model.
    @param fs: an object providing fs.tree, fs.alignment and fs.model
    @return: the string returned by do_analysis followed by a newline
    """
    # Parse and validate the tree.
    tree = Newick.parse(fs.tree, Newick.NewickTree)
    tree.assert_valid()
    # Parse the nucleotide alignment, reporting problems as HandlingError.
    try:
        nt_alignment = Fasta.Alignment(fs.alignment.splitlines())
        nt_alignment.force_nucleotide()
    except Fasta.AlignmentError as e:
        raise HandlingError(e)
    # Rebuild the mixture model from its serialized form and normalize it.
    model = DirectRna.deserialize_mixture_model(fs.model)
    model.normalize()
    # Run the analysis and terminate the report with a newline.
    report = do_analysis(model, nt_alignment, tree)
    return report + '\n'
def get_response_content(fs):
    """
    Compute the matrix of pairwise JC69 maximum likelihood distances.
    @param fs: an object providing the fasta text as fs.fasta and the
    replacement for infinite distances as fs.infinity
    @return: the distance matrix as a string followed by a newline
    """
    # Parse the alignment, which needs at least two sequences.
    try:
        alignment = Fasta.Alignment(fs.fasta.splitlines())
    except Fasta.AlignmentError as e:
        raise HandlingError('fasta alignment error: ' + str(e))
    if alignment.get_sequence_count() < 2:
        raise HandlingError('expected at least two sequences')
    # Build the matrix row by row,
    # substituting the requested representation for infinite distances.
    inf = float('inf')
    distance_rows = [
            [fs.infinity if x == inf else x for x in row]
            for row in JC69.get_ML_distance_matrix(alignment.sequences)]
    return MatrixUtil.m_to_string(distance_rows) + '\n'
def get_response_content(fs):
    """
    Compute a matrix of pairwise maximum likelihood distances.
    @param fs: an object providing the fasta text as fs.fasta, the rate
    matrix as fs.matrix and the state names, one per line, as fs.states
    @return: a string consisting of a title line followed by the matrix
    """
    # Parse the alignment, which needs at least two sequences.
    try:
        alignment = Fasta.Alignment(StringIO(fs.fasta))
    except Fasta.AlignmentError as e:
        raise HandlingError('fasta alignment error: ' + str(e))
    if alignment.get_sequence_count() < 2:
        raise HandlingError('expected at least two sequences')
    # The states label the rows of the rate matrix, so there must be
    # one distinct state per row.
    rate_array = fs.matrix
    state_names = Util.get_stripped_lines(StringIO(fs.states))
    if len(state_names) != len(rate_array):
        msg_a = 'the number of ordered states must be the same '
        msg_b = 'as the number of rows in the rate matrix'
        raise HandlingError(msg_a + msg_b)
    if len(set(state_names)) != len(state_names):
        raise HandlingError('the ordered states must be unique')
    model = RateMatrix.RateMatrix(rate_array.tolist(), state_names)
    # Fill a symmetric matrix with a zero diagonal,
    # estimating each unordered pair of sequences only once.
    n = alignment.get_sequence_count()
    distance_rows = [[0] * n for i in range(n)]
    for i, sequence_a in enumerate(alignment.sequences):
        for j, sequence_b in enumerate(alignment.sequences):
            if j <= i:
                continue
            # Minimize the pair objective by golden section search.
            # The bracket is only a starting suggestion for the search.
            objective = Objective((sequence_a, sequence_b), model)
            mle_distance = optimize.golden(objective, brack=(0.51, 2.01))
            distance_rows[i][j] = mle_distance
            distance_rows[j][i] = mle_distance
    # Format the response.
    out = StringIO()
    print >> out, 'maximum likelihood distance matrix:'
    print >> out, MatrixUtil.m_to_string(distance_rows)
    return out.getvalue()
def get_response_content(fs):
    """
    Numerically estimate the F84 parameters for a pair of sequences.
    @param fs: an object providing the fasta text as fs.fasta
    @return: a multi-line string reporting the estimated distance, kappa,
    nucleotide frequencies and the log likelihood
    """
    # Parse the alignment, which must hold exactly two sequences.
    try:
        alignment = Fasta.Alignment(StringIO(fs.fasta))
    except Fasta.AlignmentError as e:
        raise HandlingError('alignment error: ' + str(e))
    if len(alignment.sequences) != 2:
        raise HandlingError('expected a pair of sequences')
    # A change in the column count after forcing nucleotides means the
    # input was not a gapless unambiguous nucleotide alignment.
    columns_before = alignment.get_column_count()
    try:
        alignment.force_nucleotide()
    except Fasta.AlignmentError as e:
        raise HandlingError('nucleotide alignment error: ' + str(e))
    if alignment.get_column_count() != columns_before:
        msg = 'expected a gapless unambiguous nucleotide alignment'
        raise HandlingError(msg)
    # Minimize the F84 objective with a numeric optimizer.
    objective = F84.Objective(alignment.sequences)
    initial_guess = list(objective.get_initial_parameters())
    optimum = scipy.optimize.fmin(
            objective, initial_guess, ftol=1e-10, disp=0)
    distance, kappa, wC, wG, wT = optimum
    # Convert the optimized parameters into a nucleotide distribution
    # and evaluate the log likelihood at the optimum.
    nt_distribution = F84.parameters_to_distribution((wC, wG, wT))
    A, C, G, T = nt_distribution
    model = F84.create_rate_matrix(kappa, nt_distribution)
    log_likelihood = PairLikelihood.get_log_likelihood(
            distance, alignment.sequences, model)
    # Format the response.
    out = StringIO()
    print >> out, 'ML distance:', distance
    print >> out, 'ML kappa:', kappa
    print >> out, 'ML A frequency:', A
    print >> out, 'ML C frequency:', C
    print >> out, 'ML G frequency:', G
    print >> out, 'ML T frequency:', T
    print >> out, 'log likelihood:', log_likelihood
    return out.getvalue()
def get_response_content(fs):
    """
    Compute the Jukes-Cantor log likelihood of an alignment on a tree.
    @param fs: an object providing the newick string as fs.tree
    and the fasta text as fs.fasta
    @return: the log likelihood as a string followed by a newline
    """
    # Parse and validate the tree.
    tree = Newick.parse(fs.tree, Newick.NewickTree)
    tree.assert_valid()
    # Parse the alignment, reporting problems as HandlingError.
    try:
        nt_alignment = Fasta.Alignment(fs.fasta.splitlines())
        nt_alignment.force_nucleotide()
    except Fasta.AlignmentError as e:
        raise HandlingError(e)
    # Build the Jukes-Cantor rate matrix object.
    states = list('ACGT')
    jc_dict = RateMatrix.get_jukes_cantor_rate_matrix()
    jc_rows = MatrixUtil.dict_to_row_major(jc_dict, states, states)
    jc_model = RateMatrix.RateMatrix(jc_rows, states)
    # Evaluate the likelihood and format the response.
    value = PhyLikelihood.get_log_likelihood(tree, nt_alignment, jc_model)
    return str(value) + '\n'
def simulate_alignment(tree, substitution_model, ncolumns, seed=None):
    """
    Simulate an alignment of tip sequences, one column at a time.
    When a seed is given the state of the global random module is saved
    first and restored afterwards, even if the simulation raises.
    @param tree: a newick tree with branch lengths
    @param substitution_model: a way to simulate states on a tree
    @param ncolumns: the number of columns to simulate
    @param seed: a random number seed
    @return: a Fasta Alignment object of the simulated sequences
    @raise SimulationError: if a branch length is missing or not positive,
    or if the leaf names are missing or not unique
    """
    # Check the input, reading each branch length only once.
    for node in tree.gen_non_root_nodes():
        branch_length = node.get_branch_length()
        if branch_length is None or branch_length <= 0:
            raise SimulationError('all branch lengths should be positive')
    tip_names = [node.name for node in tree.gen_tips()]
    if not all(tip_names):
        raise SimulationError('each leaf should have a name')
    if len(tip_names) != len(set(tip_names)):
        raise SimulationError('each leaf should have a unique name')
    simulated_sequences = dict((name, []) for name in tip_names)
    # Save the rng state and seed the rng if we are using a seed.
    use_seed = seed is not None
    if use_seed:
        old_rng_state = random.getstate()
        random.seed(seed)
    try:
        # Simulate the states on the tree.
        for column_index in range(ncolumns):
            substitution_model.simulate_states(tree)
            for node in tree.gen_tips():
                simulated_sequences[node.name].append(node.state)
    finally:
        # Restore the rng state even when the simulation fails, so that
        # a failed call does not leave the global rng seeded.
        if use_seed:
            random.setstate(old_rng_state)
    # Create an alignment object from the simulated sequences.
    sio = StringIO()
    for header, sequence in simulated_sequences.items():
        print >> sio, '>' + header
        print >> sio, ''.join(sequence)
    fasta_string = sio.getvalue()
    return Fasta.Alignment(StringIO(fasta_string))
def get_response_content(fs):
    """
    Report the closed form F84 estimates for a pair of sequences.
    @param fs: an object providing the fasta text as fs.fasta
    @return: a multi-line string reporting the estimated distance, kappa,
    nucleotide frequencies and the log likelihood
    """
    # Parse the alignment, which must hold exactly two sequences.
    try:
        alignment = Fasta.Alignment(fs.fasta.splitlines())
    except Fasta.AlignmentError as e:
        raise HandlingError('alignment error: ' + str(e))
    if len(alignment.sequences) != 2:
        raise HandlingError('expected a pair of sequences')
    # A change in the column count after forcing nucleotides means the
    # input was not a gapless unambiguous nucleotide alignment.
    columns_before = alignment.get_column_count()
    try:
        alignment.force_nucleotide()
    except Fasta.AlignmentError as e:
        raise HandlingError('nucleotide alignment error: ' + str(e))
    if alignment.get_column_count() != columns_before:
        msg = 'expected a gapless unambiguous nucleotide alignment'
        raise HandlingError(msg)
    # Get the closed form estimates.
    estimates = F84.get_closed_form_estimates(alignment.sequences)
    distance, kappa, A, C, G, T = estimates
    # Evaluate the log likelihood at the estimates.
    model = F84.create_rate_matrix(kappa, (A, C, G, T))
    log_likelihood = PairLikelihood.get_log_likelihood(
            distance, alignment.sequences, model)
    # Format the response.
    out = StringIO()
    print >> out, 'distance:', distance
    print >> out, 'kappa:', kappa
    print >> out, 'A frequency:', A
    print >> out, 'C frequency:', C
    print >> out, 'G frequency:', G
    print >> out, 'T frequency:', T
    print >> out, 'log likelihood:', log_likelihood
    return out.getvalue()
def get_response_content(fs):
    """
    Estimate mle rates under HKY85 and report them as a stockholm string.
    @param fs: an object providing fs.tree, fs.weights, fs.alignment
    and fs.kappa
    @return: the stockholm string followed by a newline
    """
    # Parse and validate the tree.
    tree = Newick.parse(fs.tree, Newick.NewickTree)
    tree.assert_valid()
    # Read the nucleotide distribution from the weights.
    nt_states = list('ACGT')
    distribution = SnippetUtil.get_distribution(
            fs.weights, 'nucleotide', nt_states)
    # Parse the nucleotide alignment, reporting problems as HandlingError.
    try:
        nt_alignment = Fasta.Alignment(StringIO(fs.alignment))
        nt_alignment.force_nucleotide()
    except Fasta.AlignmentError as e:
        raise HandlingError(e)
    # Build the normalized HKY85 rate matrix from the distribution and kappa.
    unscaled = RateMatrix.get_unscaled_hky85_rate_matrix(
            distribution, fs.kappa)
    rate_matrix = RateMatrix.FastRateMatrix(
            unscaled.get_row_major_rate_matrix(), list('ACGT'))
    rate_matrix.normalize()
    # Estimate the rates and format the response.
    mle_rates = get_mle_rates(tree, nt_alignment, rate_matrix)
    return get_stockholm_string(tree, nt_alignment, mle_rates) + '\n'
def get_response_content(fs):
    """
    Describe whether the input is a valid alignment and list sequence lengths.
    @param fs: an object providing the fasta text as fs.fasta
    @return: a multi-line string with the validity messages followed by
    one line per sequence giving its header and length
    """
    out = StringIO()
    # First check whether the input parses as an alignment at all.
    try:
        alignment = Fasta.Alignment(fs.fasta.splitlines())
    except Fasta.AlignmentError as e:
        alignment = None
        print >> out, 'This is not a valid alignment:', e
    else:
        print >> out, 'This is a valid alignment.'
    # Then check whether it can be treated as a nucleotide alignment,
    # counting the columns that had to be removed to make it one.
    if alignment:
        try:
            columns_before = len(alignment.columns)
            alignment.force_nucleotide()
            columns_removed = columns_before - len(alignment.columns)
        except Fasta.AlignmentError as e:
            print >> out, 'This is not a valid nucleotide alignment:', e
        else:
            if columns_removed:
                # The trailing comma continues this line with the next print.
                print >> out, ('After removing %d' % columns_removed),
                print >> out, 'columns this is a valid nucleotide alignment.'
            else:
                print >> out, 'This is a valid nucleotide alignment.'
    # Finally report the length of each sequence read from the raw text.
    for header, seq in Fasta.gen_header_sequence_pairs(StringIO(fs.fasta)):
        print >> out, '%s: %d' % (header, len(seq))
    return out.getvalue()
def get_response_content(fs):
    """
    Report the JC69 maximum likelihood distance between two sequences.
    @param fs: an object providing the fasta text as fs.fasta
    @return: a single line of text reporting the distance estimate
    """
    # Parse the alignment, which must hold exactly two sequences.
    try:
        alignment = Fasta.Alignment(fs.fasta.splitlines())
    except Fasta.AlignmentError as e:
        raise HandlingError('alignment error: ' + str(e))
    if len(alignment.sequences) != 2:
        raise HandlingError('expected a pair of sequences')
    # A change in the column count after forcing nucleotides means the
    # input was not a gap-free unambiguous nucleotide alignment.
    columns_before = alignment.get_column_count()
    try:
        alignment.force_nucleotide()
    except Fasta.AlignmentError as e:
        raise HandlingError('nucleotide alignment error: ' + str(e))
    if alignment.get_column_count() != columns_before:
        msg = 'expected a gap-free unambiguous nucleotide alignment'
        raise HandlingError(msg)
    # Estimate the distance and format the response.
    first, second = alignment.sequences
    mle = JC69.get_ML_distance(first, second)
    return 'ML distance estimate: %f\n' % mle
def get_response_content(fs):
    """
    Count the positions at which two aligned sequences differ.
    @param fs: an object providing the fasta text as fs.fasta
    @return: a single line of text reporting the difference count
    """
    # Parse the alignment, which must hold exactly two sequences.
    try:
        alignment = Fasta.Alignment(fs.fasta.splitlines())
    except Fasta.AlignmentError as e:
        raise HandlingError('alignment error: ' + str(e))
    if len(alignment.sequences) != 2:
        raise HandlingError('expected a pair of sequences')
    # A change in the column count after forcing nucleotides means the
    # input was not a gapless unambiguous nucleotide alignment.
    columns_before = alignment.get_column_count()
    try:
        alignment.force_nucleotide()
    except Fasta.AlignmentError as e:
        raise HandlingError('nucleotide alignment error: ' + str(e))
    if alignment.get_column_count() != columns_before:
        msg = 'expected a gapless unambiguous nucleotide alignment'
        raise HandlingError(msg)
    # Walk the two sequences in lockstep and count the mismatches.
    sequence_pair = alignment.sequences
    count = sum(a != b for a, b in zip(*sequence_pair))
    return 'difference count: %d\n' % count
def load(self, lines):
    """
    Read a tree and a nucleotide alignment from nexus data.
    The parsed tree is stored as self.tree, the alignment as
    self.alignment, and whole-line comments go through self.add_comment.
    @param lines: lines of nexus data
    """
    # Sort the lines into the block they belong to.
    taxa_lines = []
    tree_lines = []
    character_lines = []
    block_openers = {
            ('BEGIN', 'TAXA;'): taxa_lines,
            ('BEGIN', 'TREES;'): tree_lines,
            ('BEGIN', 'CHARACTERS;'): character_lines,
            }
    active_block = None
    for line in iterutils.stripped_lines(lines):
        # Only a comment that spans exactly one whole line is recognized.
        # Nested comments and multi-line comments
        # are not correctly processed here.
        if line.startswith('[') and line.endswith(']'):
            self.add_comment(line[1:-1])
            continue
        key = tuple(line.upper().split())
        if key in block_openers:
            active_block = block_openers[key]
        elif key == ('END;',):
            active_block = None
        elif active_block is not None:
            active_block.append(line)
    # Both the tree and the character data are required.
    if not tree_lines:
        raise NexusError('TREES was not found')
    if not character_lines:
        raise NexusError('CHARACTERS was not found')
    # The newick string is whatever follows the single equals sign.
    nexus_tree_string = ''.join(tree_lines)
    if nexus_tree_string.count(';') != 1:
        raise NexusError(
                'expected exactly one semicolon in the nexus TREES block')
    if nexus_tree_string.count('=') != 1:
        raise NexusError(
                'expected exactly one equals sign in the nexus TREES block')
    newick_string = nexus_tree_string.partition('=')[2]
    self.tree = Newick.parse(newick_string, Newick.NewickTree)
    # Collect the lines that follow the MATRIX keyword,
    # skipping the DIMENSIONS and FORMAT lines wherever they appear.
    matrix_lines = []
    found_matrix = False
    for line in character_lines:
        upper_line = line.upper()
        if upper_line.startswith(('DIMENSIONS', 'FORMAT')):
            continue
        if upper_line.startswith('MATRIX'):
            found_matrix = True
            continue
        if found_matrix:
            matrix_lines.append(line.replace(';', ' '))
    if not matrix_lines:
        raise NexusError('no alignment was found')
    # The matrix is a whitespace separated list alternating between
    # taxon names and sequences.
    tokens = ' '.join(matrix_lines).split()
    if len(tokens) % 2:
        raise NexusError(
                'expected the alignment to be a list of (taxon, sequence) pairs'
                )
    # Write the pairs out as fasta text and parse that as the alignment.
    alignment_out = StringIO()
    for header, sequence in iterutils.chopped(tokens, 2):
        sequence = sequence.upper()
        unexpected_letters = set(sequence) - set('ACGT')
        if unexpected_letters:
            raise NexusError('unexpected sequence character(s): %s' % list(unexpected_letters))
        print >> alignment_out, '>%s' % header
        print >> alignment_out, sequence
    self.alignment = Fasta.Alignment(StringIO(alignment_out.getvalue()))
def make_sample_alignment():
    """
    Build an alignment object from the example aligned fasta text.
    @return: a Fasta Alignment object
    """
    sample_lines = Fasta.example_fasta_aligned.splitlines()
    return Fasta.Alignment(sample_lines)