def get_response_content(fs):
    """
    Sample sequences on the given tree and return them as FASTA text.
    @param fs: form data; its tree, order, and length attributes are read
    @return: the alignment as a FASTA string followed by a newline
    @raise HandlingError: if a nonempty list of ordered names does not
    name exactly the tips of the tree
    """
    # get the tree
    tree = Newick.parse(fs.tree, Newick.NewickTree)
    tree.assert_valid()
    # get the sequence order if it exists
    ordered_names = Util.get_stripped_lines(fs.order.splitlines())
    if ordered_names:
        # a user-supplied order must name exactly the tips of the tree
        observed_name_set = set(ordered_names)
        expected_name_set = set(tip.get_name() for tip in tree.gen_tips())
        extra_names = observed_name_set - expected_name_set
        missing_names = expected_name_set - observed_name_set
        if extra_names:
            msg_a = 'the list of ordered names includes these names '
            msg_b = 'not found in the tree: %s' % str(tuple(extra_names))
            raise HandlingError(msg_a + msg_b)
        if missing_names:
            msg_a = 'the tree includes these names not found in the list '
            msg_b = 'of ordered names: %s' % str(tuple(missing_names))
            raise HandlingError(msg_a + msg_b)
    else:
        # no order was supplied, so fall back to the order in which
        # the tree yields its tips
        ordered_names = [tip.get_name() for tip in tree.gen_tips()]
    # do the sampling
    sampled_sequences = JC69.sample_sequences(tree, ordered_names, fs.length)
    alignment = Fasta.create_alignment(ordered_names, sampled_sequences)
    # return the response
    return alignment.to_fasta_string() + '\n'
def get_amino_acid_alignment(table):
    """
    Translate a table of codons into an amino acid alignment.
    The first row is a header consisting of five fixed column labels
    followed by one column per taxon.
    Each remaining row supplies one codon per taxon,
    where the entry 'ND' (case insensitive) is translated to a gap.
    @param table: a table of data in some random format sent by Ferran Casals
    @return: a Fasta amino acid alignment object
    @raise HandlingError: if the table is malformed, or if a codon
    is a stop codon or is not recognized
    """
    if len(table) < 2:
        raise HandlingError('the data table should have at least two rows')
    first_row = table[0]
    if len(first_row) < 6:
        raise HandlingError(
                'the first row of the table has %d columns '
                'but at least six were expected' % len(first_row))
    # check the fixed header labels in column order, ignoring case
    expected_headers = (
            ('variant',
                'expected the first column to be the variant'),
            ('chr',
                'expected the second column to be the chromosome'),
            ('position',
                'expected the third column to be the position'),
            ('Amino Acid Change',
                'expected the fourth column to be the amino acid change'),
            ('alleles',
                'expected the fifth column to be the nucleotide change'),
            )
    for i, (label, msg) in enumerate(expected_headers):
        if first_row[i].upper() != label.upper():
            raise HandlingError(msg)
    remaining_rows = table[1:]
    for row in remaining_rows:
        if len(row) != len(first_row):
            raise HandlingError(
                    'each row should have the same number of columns')
    # get the ordered taxa
    taxa = first_row[5:]
    if len(set(taxa)) != len(taxa):
        raise HandlingError('the same taxon appears in more than one column')
    # get the sequence of codons for each taxon;
    # the transpose is materialized as a list before slicing because
    # the result of zip cannot be subscripted when it is an iterator
    codon_sequences = list(zip(*remaining_rows))[5:]
    # convert codon sequences to amino acid sequences
    aa_sequences = []
    for codon_sequence in codon_sequences:
        aa_list = []
        for codon in codon_sequence:
            codon = codon.upper()
            if codon == 'ND':
                aa = '-'
            elif codon in Codon.g_non_stop_codons:
                aa = Codon.g_codon_to_aa_letter[codon]
            elif codon in Codon.g_stop_codons:
                raise HandlingError(
                        'one of the codons is a stop codon: %s' % codon)
            else:
                raise HandlingError(
                        'one of the codons is invalid: %s' % codon)
            aa_list.append(aa)
        aa_sequences.append(''.join(aa_list))
    # return the alignment
    return Fasta.create_alignment(taxa, aa_sequences)
def get_amino_acid_alignment(table):
    """
    Translate a table of codons into an amino acid alignment.
    The first row is a header consisting of five fixed column labels
    followed by one column per taxon.
    Each remaining row supplies one codon per taxon,
    where the entry 'ND' (case insensitive) is translated to a gap.
    @param table: a table of data in some random format sent by Ferran Casals
    @return: a Fasta amino acid alignment object
    @raise HandlingError: if the table is malformed, or if a codon
    is a stop codon or is not recognized
    """
    if len(table) < 2:
        raise HandlingError('the data table should have at least two rows')
    first_row = table[0]
    if len(first_row) < 6:
        raise HandlingError(
                'the first row of the table has %d columns '
                'but at least six were expected' % len(first_row))
    # check the fixed header labels in column order, ignoring case
    expected_headers = (
            ('variant',
                'expected the first column to be the variant'),
            ('chr',
                'expected the second column to be the chromosome'),
            ('position',
                'expected the third column to be the position'),
            ('Amino Acid Change',
                'expected the fourth column to be the amino acid change'),
            ('alleles',
                'expected the fifth column to be the nucleotide change'),
            )
    for i, (label, msg) in enumerate(expected_headers):
        if first_row[i].upper() != label.upper():
            raise HandlingError(msg)
    remaining_rows = table[1:]
    for row in remaining_rows:
        if len(row) != len(first_row):
            raise HandlingError(
                    'each row should have the same number of columns')
    # get the ordered taxa
    taxa = first_row[5:]
    if len(set(taxa)) != len(taxa):
        raise HandlingError('the same taxon appears in more than one column')
    # get the sequence of codons for each taxon;
    # the transpose is materialized as a list before slicing because
    # the result of zip cannot be subscripted when it is an iterator
    codon_sequences = list(zip(*remaining_rows))[5:]
    # convert codon sequences to amino acid sequences
    aa_sequences = []
    for codon_sequence in codon_sequences:
        aa_list = []
        for codon in codon_sequence:
            codon = codon.upper()
            if codon == 'ND':
                aa = '-'
            elif codon in Codon.g_non_stop_codons:
                aa = Codon.g_codon_to_aa_letter[codon]
            elif codon in Codon.g_stop_codons:
                raise HandlingError(
                        'one of the codons is a stop codon: %s' % codon)
            else:
                raise HandlingError(
                        'one of the codons is invalid: %s' % codon)
            aa_list.append(aa)
        aa_sequences.append(''.join(aa_list))
    # return the alignment
    return Fasta.create_alignment(taxa, aa_sequences)
def get_response_content(fs):
    """
    Sample a data set and write it as a BEAST xml document.
    A tree is sampled with kingman.sample, branch rates are sampled
    with sample_b_to_rate, and sequences are sampled on the tree whose
    branch lengths have been multiplied by those rates.
    The sampled rate autocorrelation, root height, and log likelihoods
    are recorded as xml comments near the top of the document.
    @param fs: form data; its nleaves and nsites attributes are read
    @return: the xml document as a string
    """
    # init the response and get the user variables
    out = StringIO()
    nleaves = fs.nleaves
    nsites = fs.nsites
    # sample the coalescent tree with timelike branch lengths
    R, B = kingman.sample(nleaves)
    r = Ftree.R_to_root(R)
    # the leaf vertices are 0..nleaves-1;
    # this explicit list is what gets passed to the alignment writer,
    # rather than relying on a name left over from a loop
    leaf_vertices = list(range(nleaves))
    # get the leaf vertex names
    N = dict(zip(leaf_vertices, string.uppercase[:nleaves]))
    N_leaves = dict(N)
    # get the internal vertex names
    v_to_leaves = R_to_v_to_leaves(R)
    for v, v_leaves in sorted(v_to_leaves.items()):
        if len(v_leaves) > 1:
            N[v] = ''.join(sorted(N[leaf] for leaf in v_leaves))
    # get vertex ages
    v_to_age = kingman.RB_to_v_to_age(R, B)
    # sample the rates on the branches
    b_to_rate = sample_b_to_rate(R)
    xycorr = get_correlation(R, b_to_rate)
    # define B_subs in terms of substitutions instead of time
    B_subs = dict((p, t * b_to_rate[p]) for p, t in B.items())
    # sample the alignment
    v_to_seq = sample_v_to_seq(R, B_subs, nsites)
    # get the log likelihood; this is kind of horrible
    pairs = [(N[v], ''.join(v_to_seq[v])) for v in leaf_vertices]
    headers, sequences = zip(*pairs)
    alignment = Fasta.create_alignment(headers, sequences)
    newick_string = FtreeIO.RBN_to_newick(R, B_subs, N_leaves)
    tree = Newick.parse(newick_string, Newick.NewickTree)
    dictionary_rate_matrix = RateMatrix.get_jukes_cantor_rate_matrix()
    ordered_states = list('ACGT')
    row_major_rate_matrix = MatrixUtil.dict_to_row_major(
            dictionary_rate_matrix, ordered_states, ordered_states)
    rate_matrix_object = RateMatrix.RateMatrix(
            row_major_rate_matrix, ordered_states)
    ll = PhyLikelihood.get_log_likelihood(
            tree, alignment, rate_matrix_object)
    # get ll when rates are all 1.0
    newick_string = FtreeIO.RBN_to_newick(R, B, N_leaves)
    tree = Newick.parse(newick_string, Newick.NewickTree)
    ll_unity = PhyLikelihood.get_log_likelihood(
            tree, alignment, rate_matrix_object)
    # get ll when rates are numerically optimized
    # TODO incorporate the result into the xml file
    # TODO speed up the likelihood evaluation (beagle? C module?)
    # sketch of the optimization, where nbranches is 2*nleaves - 2:
    #f = Opt(R, B, N_leaves, alignment)
    #X_logs = [0.0] * nbranches
    #result = scipy.optimize.fmin(f, X_logs, full_output=True)
    #print result
    #
    print >> out, '<?xml version="1.0"?>'
    print >> out, '<beast>'
    print >> out
    print >> out, '<!-- actual rate autocorrelation', xycorr, '-->'
    print >> out, '<!-- actual root height', v_to_age[r], '-->'
    print >> out, '<!-- actual log likelihood', ll, '-->'
    print >> out, '<!-- ll if rates were unity', ll_unity, '-->'
    print >> out
    print >> out, '<!--'
    print >> out, 'predefine the taxa as in'
    print >> out, 'http://beast.bio.ed.ac.uk/Introduction_to_XML_format'
    print >> out, '-->'
    print >> out, get_leaf_taxon_defn(list(string.uppercase[:nleaves]))
    print >> out
    print >> out, '<!--'
    print >> out, 'define the alignment as in'
    print >> out, 'http://beast.bio.ed.ac.uk/Introduction_to_XML_format'
    print >> out, '-->'
    print >> out, get_alignment_defn(leaf_vertices, N, v_to_seq)
    print >> out
    print >> out, '<!--'
    print >> out, 'specify the starting tree as in'
    print >> out, 'http://beast.bio.ed.ac.uk/Tutorial_4'
    print >> out, '-->'
    print >> out, get_starting_tree_defn(R, B, N_leaves)
    print >> out
    print >> out, '<!--'
    print >> out, 'connect the tree model as in'
    print >> out, 'http://beast.bio.ed.ac.uk/Tutorial_4'
    print >> out, '-->'
    print >> out, g_tree_model_defn
    print >> out
    print >> out, g_uncorrelated_relaxed_clock_info
    print >> out
    # The following two sections are disabled.
    #print >> out, '<!--'
    #print >> out, 'create a list of taxa for which to constrain the mrca as in'
    #print >> out, 'http://beast.bio.ed.ac.uk/Tutorial_3.1'
    #print >> out, '-->'
    #for v, leaves in sorted(v_to_leaves.items()):
    #    if len(leaves) > 1:
    #        print >> out, get_mrca_subset_defn(N, v, leaves)
    #print >> out
    #print >> out, '<!--'
    #print >> out, 'create a tmrcaStatistic that will record the height as in'
    #print >> out, 'http://beast.bio.ed.ac.uk/Tutorial_3.1'
    #print >> out, '-->'
    #for v, leaves in sorted(v_to_leaves.items()):
    #    if len(leaves) > 1:
    #        print >> out, get_mrca_stat_defn(N[v])
    print >> out
    print >> out, g_likelihood_info
    print >> out
    print >> out, '<!--'
    print >> out, 'run the mcmc'
    print >> out, 'http://beast.bio.ed.ac.uk/Tutorial_3.1'
    print >> out, '-->'
    print >> out, get_mcmc_defn(v_to_leaves, v_to_age, N)
    print >> out
    print >> out, '</beast>'
    # return the response
    return out.getvalue()