def setUp(self):
    """Create two taxon sets, each paired with a constant-sequence DNA matrix.

    ``tb1``/``cb1`` hold taxa ``T01``..``T10`` with all-"A" sequences;
    ``tb2``/``cb2`` hold taxa ``T01``..``T20`` with all-"C" sequences.
    """

    def build_matrix(taxa, taxon_count, matrix_label, symbols):
        # Populate ``taxa`` with numbered taxa, then give every taxon the
        # same sequence in a new matrix bound to that taxon set.
        for number in range(1, taxon_count + 1):
            taxa.new_taxon(label="T%02d" % number)
        matrix = dendropy.DnaCharacterMatrix(taxon_set=taxa, label=matrix_label)
        for taxon in taxa:
            matrix.append_taxon_sequence(taxon, state_symbols=symbols)
        return matrix

    self.tb1 = dendropy.TaxonSet(label="TI1")
    self.cb1 = build_matrix(self.tb1, 10, "TI1, CA1", "AAAAAAAAAA")

    self.tb2 = dendropy.TaxonSet(label="TI2")
    self.cb2 = build_matrix(self.tb2, 20, "TI2, CA2", "CCCCCCCCCC")
def testIncompatibleRead(self):
    """Reading the NEXUS file with ``matrix_offset=1`` must raise ValueError."""
    matrix = dendropy.DnaCharacterMatrix()
    read_kwargs = {"matrix_offset": 1}
    self.assertRaises(
        ValueError,
        matrix.read_from_path,
        self.data_path,
        "nexus",
        **read_kwargs
    )
def testCopyConstruction(self):
    """Copy-construct the first dataset matrix and compare it to the source.

    The comparison is made with ``distinct_taxa=False`` and
    ``distinct_state_alphabets=False``.
    """
    original = self.dataset.char_matrices[0]
    duplicate = dendropy.DnaCharacterMatrix(original)
    self.assertDistinctButEqualDiscreteCharMatrix(
        original,
        duplicate,
        distinct_taxa=False,
        distinct_state_alphabets=False
    )
def testFromDnaCharMatrix(self):
    """Build a DNA matrix from ``char_matrix1`` and compare the two."""
    clone = dendropy.DnaCharacterMatrix(self.char_matrix1)
    # Comparison options, gathered in one place for readability.
    comparison = dict(
        char_type=dendropy.DnaCharacterMatrix,
        distinct_state_alphabets=False,
        distinct_taxa=False
    )
    self.assertDistinctButEqual(self.char_matrix1, clone, **comparison)
def testInitRead(self):
    """Construct a DNA matrix directly from a NEXUS stream and compare it
    against the first matrix of the reference dataset (distinct taxa
    expected).
    """
    # The original passed a bare ``open(...)`` to the constructor and never
    # closed it; the ``with`` block releases the handle once the matrix has
    # been built.
    # NOTE(review): mode "rU" is kept as in the original; it is rejected by
    # Python 3.11+ -- switch to "r" if this suite is run on Python 3.
    with open(self.data_path, "rU") as src:
        c = dendropy.DnaCharacterMatrix(stream=src, schema="nexus")
    self.assertDistinctButEqual(
        self.reference_dataset.char_matrices[0],
        c,
        char_type=dendropy.DnaCharacterMatrix,
        distinct_state_alphabets=False,
        distinct_taxa=True
    )
def testNonIndexedRead(self):
    """Read the NEXUS file into an empty DNA matrix (no matrix offset given)
    and compare it against the first matrix of the reference dataset.
    """
    matrix = dendropy.DnaCharacterMatrix()
    matrix.read_from_path(self.data_path, "nexus")
    expected = self.reference_dataset.char_matrices[0]
    self.assertDistinctButEqual(
        expected,
        matrix,
        char_type=dendropy.DnaCharacterMatrix,
        distinct_state_alphabets=False,
        distinct_taxa=True
    )
def testSameTaxaInit(self):
    """Construct a DNA matrix from a NEXUS stream while passing in the
    reference matrix's taxon set, then compare against the reference matrix
    (taxa not expected to be distinct; character types ignored).
    """
    # The original passed a bare ``open(...)`` to the constructor and never
    # closed it; the ``with`` block releases the handle once the matrix has
    # been built.
    # NOTE(review): mode "rU" is kept as in the original; it is rejected by
    # Python 3.11+ -- switch to "r" if this suite is run on Python 3.
    with open(self.data_path, "rU") as src:
        c = dendropy.DnaCharacterMatrix(
            stream=src,
            schema="nexus",
            taxon_set=self.reference_dataset.char_matrices[0].taxon_set
        )
    self.assertDistinctButEqual(
        self.reference_dataset.char_matrices[0],
        c,
        char_type=dendropy.DnaCharacterMatrix,
        distinct_state_alphabets=False,
        distinct_taxa=False,
        ignore_chartypes=True
    )
def testSameTaxaRead(self):
    """Read the NEXUS file into an empty DNA matrix while passing in the
    reference matrix's taxon set, then compare against the reference matrix
    (taxa not expected to be distinct).
    """
    matrix = dendropy.DnaCharacterMatrix()
    shared_taxa = self.reference_dataset.char_matrices[0].taxon_set
    matrix.read_from_path(self.data_path, schema="nexus", taxon_set=shared_taxa)
    expected = self.reference_dataset.char_matrices[0]
    self.assertDistinctButEqual(
        expected,
        matrix,
        char_type=dendropy.DnaCharacterMatrix,
        distinct_state_alphabets=False,
        distinct_taxa=False
    )
def generate_char_matrix(seq_len, tree_model, seq_model, mutation_rate=1.0, root_states=None, char_matrix=None, rng=None):
    """
    Wrapper to conveniently generate characters simulated under the given
    tree and character model.

    `seq_len`       : length of sequence (number of characters)
    `tree_model`    : dendropy.trees.Tree object
    `seq_model`     : dendropy.seqmodel.SeqModel object
    `mutation_rate` : mutation *modifier* rate (should be 1.0 if branch
                      lengths on tree reflect true expected number of
                      changes)
    `root_states`   : vector of root states (length must equal `seq_len`);
                      forwarded to the sequence evolver
    `char_matrix`   : dendropy.CharacterMatrix object.
                      if given, new sequences for taxa on `tree_model`
                      leaf_nodes will be appended to existing sequences of
                      corresponding taxa in char_matrix; if not, a new
                      dendropy.CharacterMatrix object will be created
    `rng`           : random number generator; if not given, `GLOBAL_RNG`
                      will be used

    Returns: a dendropy.CharacterMatrix object.

    Since characters will be appended to existing sequences, you can simulate
    sequences under a mixed model by calling this method multiple times with
    different character models and/or different mutation rates, passing in
    the same `char_matrix` object each time.
    """
    seq_evolver = SeqEvolver(seq_model=seq_model, mutation_rate=mutation_rate)
    # Forward the caller's root states; the previous code hard-coded None
    # here, so the documented `root_states` argument had no effect.
    tree = seq_evolver.evolve_states(
        tree=tree_model,
        seq_len=seq_len,
        root_states=root_states,
        rng=rng)
    char_map = seq_evolver.compose_char_map(tree, tree.taxon_set)
    if char_matrix is None:
        char_matrix = dendropy.DnaCharacterMatrix()
        char_matrix.taxon_set = tree_model.taxon_set
    if char_matrix.taxon_set is None:
        char_matrix.taxon_set = tree_model.taxon_set
    else:
        # The matrix and the tree must share the very same taxon set object.
        assert char_matrix.taxon_set is tree_model.taxon_set, "conflicting taxon sets"
    # Append the new characters to any sequences already in the matrix.
    char_matrix.extend_map(
        other_map=char_map,
        overwrite_existing=False,
        extend_existing=True)
    return char_matrix
def setUpClass(cls):
    """Build the shared fixture: a DNA matrix with one "AACGT" sequence per
    label in ``cls.original_labels``, stored as ``cls.data``.
    """
    matrix = dendropy.DnaCharacterMatrix()
    cls.original_labels = [
        "a0_123456789_1",
        "a0_123456789_2",
        "a0_123456789_3",
        "a0_123456789_4",
        "a0_123456789_5",
        "b0_123456789_1",
        "b0_123456789_2",
        "b0_123456789_3",
        "b0_123456789_4",
    ]
    namespace = matrix.taxon_namespace
    alphabet = matrix.default_state_alphabet
    for taxon_label in cls.original_labels:
        taxon = namespace.require_taxon(label=taxon_label)
        # A fresh state lookup per taxon, exactly as many calls as labels.
        matrix[taxon] = alphabet.get_states_for_symbols("AACGT")
    cls.data = matrix
def run_method(method, tree, seqs, threshold=None):
    """Reconstruct a tree with the named method and report runtime/accuracy.

    Parameters
    ----------
    method : str
        One of ``"RaXML"``, ``"SNJ"``, ``"NJ"``, ``"STR + NJ"``,
        ``"STR + SNJ"`` or ``"STR + RaXML"``.
    tree : reference tree
        Its ``taxon_namespace`` supplies the taxa (and their order); the
        reconstructed tree is compared against it.
    seqs : mapping
        Indexed by taxon label; each value is iterated and the ``symbol``
        attribute of every element is read.
    threshold : optional
        Forwarded as ``threshhold`` to the ``"STR + ..."`` methods; unused by
        the others.

    Returns
    -------
    list
        ``[method, str(threshold), runtime, RF, F1]`` where ``runtime`` is
        the wall-clock seconds spent in the reconstruction call and
        ``RF``/``F1`` come from ``reconstruct_tree.compare_trees``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names (previously this
        surfaced later as an ``UnboundLocalError``).
    """
    namespace = tree.taxon_namespace
    labels = [taxon.label for taxon in namespace]

    # One row of state symbols per taxon, in namespace order.
    ch_arr = np.array([[state.symbol for state in seqs[label]] for label in labels])
    # Treat uracil as thymine.
    ch_arr[ch_arr == "U"] = "T"

    # DendroPy matrix view of the same data (consumed by plain RAxML).
    ch_dendro = dendropy.DnaCharacterMatrix()
    ch_dendro.taxon_namespace = namespace
    for row in range(len(labels)):
        ch_dendro.new_sequence(namespace[row], ch_arr[row, :].tolist())

    similarity = reconstruct_tree.HKY_similarity_matrix_missing_data

    # In every branch the timer starts after the method object is built, so
    # only the reconstruction call itself is measured.
    if method == "RaXML":
        raxml_HKY = reconstruct_tree.RAxML()
        start_time = time.time()
        tree_rec = raxml_HKY(ch_dendro, raxml_args="-T 2 --HKY85 -c 1")
    elif method == "SNJ":
        snj = reconstruct_tree.SpectralNeighborJoining(similarity)
        start_time = time.time()
        tree_rec = snj(ch_arr, namespace)
    elif method == "NJ":
        nj = reconstruct_tree.NeighborJoining(similarity)
        start_time = time.time()
        tree_rec = nj(ch_arr, namespace)
    elif method in ("STR + NJ", "STR + SNJ", "STR + RaXML"):
        if method == "STR + NJ":
            inner_method = reconstruct_tree.NeighborJoining
        elif method == "STR + SNJ":
            inner_method = reconstruct_tree.SpectralNeighborJoining
        else:
            inner_method = reconstruct_tree.RAxML
        # Only the RAxML-backed variant takes extra command-line arguments.
        extra_args = {}
        if method == "STR + RaXML":
            extra_args["raxml_args"] = "-T 2 --HKY85 -c 1"
        spectral_method = reconstruct_tree.SpectralTreeReconstruction(
            inner_method, similarity)
        start_time = time.time()
        tree_rec = spectral_method.deep_spectral_tree_reonstruction(
            ch_arr,
            similarity,
            taxon_namespace=namespace,
            threshhold=threshold,
            min_split=5,
            **extra_args)
    else:
        raise ValueError("unknown reconstruction method: %r" % (method,))

    runtime = time.time() - start_time
    RF, F1 = reconstruct_tree.compare_trees(tree_rec, tree)

    print(method)
    if threshold is not None:
        print(threshold)
    print("--- %s seconds ---" % runtime)
    print("RF = ", RF)
    print("F1% = ", F1)
    return ([method, str(threshold), runtime, RF, F1])
def hky85_chars(seq_len, tree_model, mutation_rate=1.0, kappa=1.0, base_freqs=[0.25, 0.25, 0.25, 0.25], root_states=None, char_matrix=None, retain_sequences_on_tree=False, rng=None):
    """
    Convenience function to wrap generation of characters (as a
    |DnaCharacterMatrix| object) based on the HKY model.

    Parameters
    ----------

    seq_len       : int
        Length of sequence (number of characters).
    tree_model    : |Tree|
        Tree on which to simulate.
    mutation_rate : float
        Mutation *modifier* rate (should be 1.0 if branch lengths on tree
        reflect true expected number of changes).
    kappa : float
        Forwarded to the ``Hky85`` model as its ``kappa`` parameter.
    base_freqs : list
        Forwarded to the ``Hky85`` model as its ``base_freqs`` parameter
        (default: four equal frequencies of 0.25). The default list is
        shared between calls and is not modified here.
    root_states : list
        Vector of root states (length must equal ``seq_len``).
    char_matrix   : |DnaCharacterMatrix|
        If given, new sequences for taxa on ``tree_model`` leaf_nodes will be
        appended to existing sequences of corresponding taxa in char_matrix;
        if not, a new |DnaCharacterMatrix| object will be created.
    retain_sequences_on_tree : bool
        If |False|, sequence annotations will be cleared from tree after
        simulation. Set to |True| if you want to, e.g., evolve and accumulate
        different sequences on tree, or retain information for other purposes.
    rng           : random number generator
        If not given, 'GLOBAL_RNG' will be used.

    Returns
    -------
    d : |DnaCharacterMatrix|
        The simulated alignment.

    Since characters will be appended to existing sequences, you can simulate
    sequences under a mixed model by calling this method multiple times with
    different character model parameter values and/or different mutation
    rates, passing in the same ``char_matrix`` object each time.
    """
    if char_matrix is None:
        char_matrix = dendropy.DnaCharacterMatrix(
            taxon_namespace=tree_model.taxon_namespace)
    else:
        # The matrix and the tree must share the very same namespace object.
        assert char_matrix.taxon_namespace is tree_model.taxon_namespace
    state_alphabet = char_matrix.default_state_alphabet
    seq_model = Hky85(
        kappa=kappa,
        base_freqs=base_freqs,
        state_alphabet=state_alphabet)
    # ``retain_sequences_on_tree`` was previously accepted but never passed
    # on, so requesting retention had no effect.
    return simulate_discrete_chars(
        seq_len=seq_len,
        tree_model=tree_model,
        seq_model=seq_model,
        mutation_rate=mutation_rate,
        root_states=root_states,
        char_matrix=char_matrix,
        retain_sequences_on_tree=retain_sequences_on_tree,
        rng=rng)
def simulate_discrete_chars(seq_len, tree_model, seq_model, mutation_rate=1.0, root_states=None, char_matrix=None, retain_sequences_on_tree=False, rng=None):
    """
    Wrapper to conveniently generate characters simulated under the given
    tree and character model.

    Since characters will be appended to existing sequences, you can simulate
    sequences under a mixed model by calling this method multiple times with
    different character models and/or different mutation rates, passing in
    the same ``char_matrix`` object each time.

    Parameters
    ----------

    seq_len       : int
        Length of sequence (number of characters).
    tree_model    : |Tree|
        Tree on which to simulate.
    seq_model     : dendropy.model.discrete.DiscreteCharacterEvolutionModel
        The character substitution model under which to evolve the
        characters.
    mutation_rate : float
        Mutation *modifier* rate (should be 1.0 if branch lengths on tree
        reflect true expected number of changes).
    root_states : list
        Vector of root states (length must equal ``seq_len``); forwarded to
        the character evolver.
    char_matrix   : |DnaCharacterMatrix|
        If given, new sequences for taxa on ``tree_model`` leaf_nodes will be
        appended to existing sequences of corresponding taxa in char_matrix;
        if not, a new |DnaCharacterMatrix| object will be created.
    retain_sequences_on_tree : bool
        If |False|, sequence annotations will be cleared from tree after
        simulation. Set to |True| if you want to, e.g., evolve and accumulate
        different sequences on tree, or retain information for other purposes.
    rng           : random number generator
        If not given, 'GLOBAL_RNG' will be used.

    Returns
    -------
    d : a dendropy.datamodel.CharacterMatrix object.
    """
    seq_evolver = DiscreteCharacterEvolver(
        seq_model=seq_model,
        mutation_rate=mutation_rate)
    # Forward the caller's root states; the previous code hard-coded None
    # here, so the documented ``root_states`` argument had no effect.
    tree = seq_evolver.evolve_states(
        tree=tree_model,
        seq_len=seq_len,
        root_states=root_states,
        rng=rng)
    if char_matrix is None:
        char_matrix = dendropy.DnaCharacterMatrix(
            taxon_namespace=tree_model.taxon_namespace)
    else:
        # The matrix and the tree must share the very same namespace object.
        assert char_matrix.taxon_namespace is tree_model.taxon_namespace, "conflicting taxon sets"
    seq_evolver.extend_char_matrix_with_characters_on_tree(
        char_matrix=char_matrix,
        tree=tree)
    if not retain_sequences_on_tree:
        seq_evolver.clean_tree(tree)
    return char_matrix