def testSimpleCopyStandardMatrix(self):
    """Wrap a standard matrix in a DataSet, copy-construct it, and compare."""
    source_matrix = datagen.reference_standard_matrix()
    original = dendropy.DataSet(source_matrix)

    # The first data set must hold the very same matrix object it was given.
    self.assertEqual(len(original.char_matrices), 1)
    self.assertIs(original.char_matrices[0], source_matrix)

    # The copy must be equal in content but share no objects with the source.
    duplicate = dendropy.DataSet(original)
    self.assertDistinctButEqual(original, duplicate, ignore_chartypes=True)
def test_missing_tips(self):
    """A tree lacking taxon 'd' is reported as missing it and is rejected."""
    fasta_src = ('>a\n' 'ATCG\n'
                 '>b\n' 'ATCG\n'
                 '>c\n' 'ATCG\n'
                 '>d\n' 'ATCG\n')
    newick_src = '((a,b),c);'

    seq_ds = dendropy.DataSet()
    seq_ds.read_from_string(fasta_src, schema='fasta', data_type='dna')

    tree_ds = dendropy.DataSet()
    tree_ds.read_from_string(newick_src, schema='newick')

    tree_taxa = tree_ds.tree_lists[-1].taxon_set
    extra, missing = treeholder.check_taxon_labels(tree_taxa, seq_ds)
    self.assertEqual(len(missing), 1)
    self.assertEqual(len(extra), 0)
    self.assertEqual(missing[0], 'd')

    # Feeding the same tree through the reader must raise the mismatch error.
    newick_stream = StringIO()
    newick_stream.write(newick_src)
    self.assertRaises(TaxaLabelsMismatchError,
                      treeholder.read_trees_into_dataset,
                      seq_ds,
                      newick_stream,
                      starting_tree=True)
def testSimpleCopyDnaMatrix(self):
    """Wrap a DNA matrix in a DataSet, copy-construct it, and compare."""
    source_matrix = datagen.reference_dna_matrix()
    original = dendropy.DataSet(source_matrix)

    # The first data set must hold the very same matrix object it was given.
    self.assertEqual(len(original.char_matrices), 1)
    self.assertIs(original.char_matrices[0], source_matrix)

    # The copy must be equal in content but share no objects with the source.
    duplicate = dendropy.DataSet(original)
    self.assertDistinctButEqual(original, duplicate)
def generate_pruned_trees(src_trees_fname, num_reps, num_trees_per_rep):
    """Build matching data sets of trees before and after pruning.

    For each of ``num_reps`` replicates, ``num_trees_per_rep`` trees are
    sampled at random from the NEXUS source file, a random subset of taxa is
    chosen, and ``paup.prune_taxa_from_trees`` removes that subset.

    Returns a 5-tuple ``(taxa, pruned_taxa, retained_taxa, input_dataset,
    output_dataset)``; ``pruned_taxa`` and ``retained_taxa`` each hold one
    list per replicate, and the data sets hold one tree list per replicate.
    """
    rng = random.Random()
    trees = dendropy.TreeList.get_from_path(
        src=pathmap.tree_source_path(src_trees_fname),
        schema='nexus')
    taxa = trees.taxon_set

    input_dataset = dendropy.DataSet(attached_taxon_set=taxa)
    output_dataset = dendropy.DataSet(attached_taxon_set=taxa)
    pruned_taxa = []
    retained_taxa = []

    for _ in range(num_reps):
        # Sample first, then copy each tree so all copies share ``taxa``.
        sampled = rng.sample(trees, num_trees_per_rep)
        sub_trees = dendropy.TreeList(
            [dendropy.Tree(t, taxon_set=taxa) for t in sampled],
            taxon_set=taxa)

        sub_size = rng.randint(5, len(taxa) - 5)
        assert sub_size > 0
        assert sub_size < len(taxa)
        sub_taxa = rng.sample(taxa, sub_size)
        assert len(sub_taxa) > 4
        assert len(sub_taxa) < len(taxa)

        taxa_to_prune = sub_taxa
        taxa_to_retain = [t for t in taxa if t not in sub_taxa]
        pruned_trees = paup.prune_taxa_from_trees(sub_trees, taxa_to_prune)
        pruned_taxa.append(taxa_to_prune)
        retained_taxa.append(taxa_to_retain)

        assert sub_trees.taxon_set is taxa
        input_dataset.add_tree_list(sub_trees)
        assert pruned_trees.taxon_set is taxa
        output_dataset.add_tree_list(pruned_trees)

    # Unpruned trees must still carry a taxon for every member of ``taxa``.
    for tree_list in input_dataset.tree_lists:
        assert tree_list.taxon_set is taxa
        for tree in tree_list:
            assert tree.taxon_set is taxa
            count = sum(1 for nd in tree.postorder_node_iter()
                        if nd.taxon is not None)
            assert count == len(taxa)

    for tree_list in output_dataset.tree_lists:
        assert tree_list.taxon_set is taxa
        for tree in tree_list:
            assert tree.taxon_set is taxa

    return taxa, pruned_taxa, retained_taxa, input_dataset, output_dataset
def generate( self, trees, dataset=None, taxon_namespace=None, input_sequences=None, **kwargs): args=self._compose_arguments() # with open("x.txt", "w") as inputf: with self.get_tempfile() as inputf: if input_sequences is not None: input_sequences.write_to_stream(inputf, schema="phylip",) inputf.write("{}\n".format(len(trees))) trees.write_to_stream(inputf, "newick", suppress_rooting=True, suppress_internal_node_labels=True) inputf.flush() args.append(inputf.name) # print("seq-gen args: = %s" % " ".join(args)) run = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdout, stderr = processio.communicate(run) if stderr or run.returncode != 0: raise RuntimeError("Seq-gen error: %s" % stderr) if taxon_namespace is None: taxon_namespace = trees.taxon_namespace if dataset is None: dataset = dendropy.DataSet(**kwargs) if taxon_namespace is not None: dataset.attach_taxon_namespace(taxon_namespace) dataset.read(data=stdout, schema="nexus") return dataset
def generate_hky_dataset(seq_len, tree_model, mutation_rate=1.0, kappa=1.0, base_freqs=[0.25, 0.25, 0.25, 0.25], root_states=None, dataset=None, rng=None):
    """Deprecated wrapper around ``charsim.hky85_chars``.

    Emits a deprecation warning, creates a DNA character matrix in
    ``dataset`` (a new ``dendropy.DataSet`` when ``dataset`` is ``None``)
    bound to ``tree_model.taxon_namespace``, passes it to
    ``charsim.hky85_chars`` as the target matrix, and returns the data set.
    """
    deprecate.dendropy_deprecation_warning(
        preamble="Deprecated since DendroPy 4: The 'dendropy.seqsim.generate_hky_dataset()' function has been replaced with 'dendropy.simulate.charsim.hky85_chars()'.",
        old_construct="from dendropy import seqsim\ndataset = seqsim.generate_hky_dataset(...)",
        new_construct="import dendropy\nfrom dendropy.simulate import charsim\ndataset = dendropy.DataSet()\nchar_matrix = charsim.hky85_chars(...)\ndataset.add_char_matrix(char_matrix)")

    target = dendropy.DataSet() if dataset is None else dataset
    dna_matrix = target.new_char_matrix(
        char_matrix_type="dna",
        taxon_namespace=tree_model.taxon_namespace)

    # The simulator receives the freshly created matrix as its target.
    sim_kwargs = dict(
        seq_len=seq_len,
        tree_model=tree_model,
        mutation_rate=mutation_rate,
        kappa=kappa,
        base_freqs=base_freqs,
        root_states=root_states,
        char_matrix=dna_matrix,
        rng=rng)
    charsim.hky85_chars(**sim_kwargs)
    return target
def test_basic_read(self):
    """Read a NEXUS file with multiple character blocks into an empty DataSet."""
    src_path = pathmap.char_source_path(
        "standard-test-chars-multiple-char-blocks.1.basic.nexus")
    ds = dendropy.DataSet()
    counts = ds.read(path=src_path, schema="nexus")
    self.assertEqual(counts, (1, 0, 4))
    self.verify_dataset(ds)
def testBindAndUnbind(self):
    """Reads share one taxon set while attached and add a new one after detaching."""
    d = dendropy.DataSet(attach_taxon_set=True)
    self.assertEqual(len(d.taxon_sets), 1)
    self.assertIs(d.taxon_sets[0], d.attached_taxon_set)

    mixed_src = pathmap.mixed_source_path(
        'reference_single_taxonset_dataset.nex')
    d.read_from_path(mixed_src, "nexus")
    _LOG.info(d.taxon_sets[0].description(2))
    self.assertEqual(len(d.taxon_sets[0]), 33)

    # Further tree files must be folded into the attached taxon set.
    tree_sources = (
        ('pythonidae.mle.nex', "nexus"),
        ('pythonidae.reference-trees.newick', "newick"),
    )
    for tree_fname, schema in tree_sources:
        d.read_from_path(pathmap.tree_source_path(tree_fname), schema)
        self.assertEqual(len(d.taxon_sets), 1)
        self.assertEqual(len(d.taxon_sets[0]), 33)

    # Once detached, the next read creates its own taxon set.
    d.detach_taxon_set()
    fasta_src = pathmap.char_source_path('caenophidia_mos.chars.fasta')
    d.read_from_path(fasta_src, "proteinfasta")
    self.assertEqual(len(d.taxon_sets), 2)
    self.assertEqual(len(d.taxon_sets[0]), 33)
    self.assertEqual(len(d.taxon_sets[1]), 114)
def test_basic_new_char_matrix(self):
    """``new_char_matrix`` accepts type names or classes and registers each matrix.

    Six matrices are created, three by string name and three by class. The
    data set must hold exactly those objects, in creation order, each with
    the expected concrete type and label.
    """
    ds = dendropy.DataSet()
    item_labels = ["a", "b", "c", "d", "e", "f"]
    cm_types = [
        "dna",
        "protein",
        "standard",
        dendropy.DnaCharacterMatrix,
        dendropy.ProteinCharacterMatrix,
        dendropy.StandardCharacterMatrix,
    ]
    expected_cm_types = [
        dendropy.DnaCharacterMatrix,
        dendropy.ProteinCharacterMatrix,
        dendropy.StandardCharacterMatrix,
        dendropy.DnaCharacterMatrix,
        dendropy.ProteinCharacterMatrix,
        dendropy.StandardCharacterMatrix,
    ]
    item_list = []
    # Loop variables get their own names so they never rebind the lists
    # being iterated.
    for item_label, cm_type in zip(item_labels, cm_types):
        item = ds.new_char_matrix(label=item_label, char_matrix_type=cm_type)
        item_list.append(item)
    self.assertEqual(len(ds.char_matrices), len(item_labels))
    self.assertEqual(len(ds.char_matrices), len(item_list))
    for t1, t2, label, expected_cm_type in zip(
            ds.char_matrices, item_list, item_labels, expected_cm_types):
        self.assertIsInstance(t1, expected_cm_type)
        self.assertIs(t1, t2)
        self.assertEqual(t1.label, label)
def generate_sequences(self, species_name, samples_per_pop=10, seq_len=2000, use_seq_gen=True):
    """Simulate sequences on a freshly generated gene tree and return a DataSet.

    The population tree and gene tree are regenerated first. Seq-Gen is used
    when both the module-level ``SEQGEN`` flag and ``use_seq_gen`` are truthy;
    otherwise ``seqsim.generate_hky_dataset`` is called with fixed uniform
    HKY parameters.
    """
    self.generate_pop_tree(species_name=species_name,
                           samples_per_pop=samples_per_pop)
    self.generate_gene_tree(species_name=species_name,
                            samples_per_pop=samples_per_pop)
    d = dendropy.DataSet(self.mutation_tree.taxon_set)

    if not (SEQGEN and use_seq_gen):
        # Fallback simulator: fixed rate, kappa and base frequencies.
        return seqsim.generate_hky_dataset(
            seq_len=seq_len,
            tree_model=self.mutation_tree,
            mutation_rate=1.0,
            kappa=1.0,
            base_freqs=[0.25, 0.25, 0.25, 0.25],
            root_states=None,
            dataset=d,
            rng=self.rng)

    sg = seqgen.SeqGen()
    sg.seqgen_path = self.seqgen_path
    sg.num_replicates = 1
    sg.quiet = True
    sg.rng = self.rng
    sg.seq_len = seq_len
    sg.char_model = 'HKY'
    sg.ti_tv = float(self.kappa) / 2
    sg.state_freqs = self.base_freqs
    sg.trees = [self.mutation_tree]
    return sg.generate_dataset(dataset=d)
def testBasicDocumentFromInit(self):
    """A DataSet built from a Newick stream must equal the reference data set."""
    newick_fname = datagen.reference_trees_filename(schema="newick")
    src_stream = pathmap.tree_source_stream(newick_fname)
    loaded = dendropy.DataSet(stream=src_stream, schema="newick")
    self.assertDistinctButEqual(
        self.reference_dataset,
        loaded,
        ignore_taxon_order=True)
def generate_sequences(self, species_name, samples_per_pop=10, seq_len=2000, use_seq_gen=True):
    """Simulate sequences on a freshly generated gene tree and return a DataSet.

    The population tree and gene tree are regenerated first. Seq-Gen is used
    when ``self.use_seq_gen is True``; otherwise characters are simulated
    with ``discrete.hky85_chars`` and added to the data set.

    NOTE(review): the ``use_seq_gen`` parameter is never read here; only the
    instance attribute ``self.use_seq_gen`` selects the simulator. Confirm
    whether that is intended.
    """
    self.generate_pop_tree(species_name=species_name,
                           samples_per_pop=samples_per_pop)
    self.generate_gene_tree(species_name=species_name,
                            samples_per_pop=samples_per_pop)
    d = dendropy.DataSet(self.mutation_tree.taxon_namespace)

    if self.use_seq_gen is not True:
        # Built-in simulator: fixed rate, kappa and base frequencies.
        simulated = discrete.hky85_chars(
            seq_len=seq_len,
            tree_model=self.mutation_tree,
            mutation_rate=1.0,
            kappa=1.0,
            base_freqs=[0.25, 0.25, 0.25, 0.25],
            root_states=None,
            rng=self.rng)
        d.add_char_matrix(simulated)
    else:
        sg = seqgen.SeqGen()
        sg.seqgen_path = self.seqgen_path
        sg.num_replicates = 1
        sg.quiet = True
        sg.rng = self.rng
        sg.seq_len = seq_len
        sg.char_model = 'HKY'
        sg.ti_tv = float(self.kappa) / 2
        sg.state_freqs = self.base_freqs
        sg.trees = [self.mutation_tree]
        d = sg.generate_dataset(dataset=d)
    return d
def testBindToSpecifiedTaxonSet(self):
    """Reads reuse an explicitly attached TaxonSet until it is detached."""
    d = dendropy.DataSet()
    t = dendropy.TaxonSet()
    d.attach_taxon_set(t)
    self.assertEqual(len(d.taxon_sets), 1)
    self.assertIs(d.taxon_sets[0], d.attached_taxon_set)
    self.assertIs(d.attached_taxon_set, t)

    # Each entry: (path resolver, file name, schema). Paths are resolved just
    # before the corresponding read.
    attached_sources = (
        (pathmap.mixed_source_path,
         'reference_single_taxonset_dataset.nex', "nexus"),
        (pathmap.tree_source_path, 'pythonidae.mle.nex', "nexus"),
        (pathmap.tree_source_path,
         'pythonidae.reference-trees.newick', "newick"),
    )
    for resolve, fname, schema in attached_sources:
        d.read_from_path(resolve(fname), schema)
        self.assertEqual(len(d.taxon_sets), 1)
        self.assertEqual(len(d.taxon_sets[0]), 33)

    # Once detached, the next read creates its own taxon set.
    d.detach_taxon_set()
    fasta_src = pathmap.char_source_path('caenophidia_mos.chars.fasta')
    d.read_from_path(fasta_src, "proteinfasta")
    self.assertEqual(len(d.taxon_sets), 2)
    self.assertEqual(len(d.taxon_sets[0]), 33)
    self.assertEqual(len(d.taxon_sets[1]), 114)
def testBindAndUnbind(self):
    """Reads reuse an attached TaxonNamespace until it is detached."""
    d = dendropy.DataSet()
    t = dendropy.TaxonNamespace()
    d.attach_taxon_namespace(t)
    self.assertEqual(len(d.taxon_namespaces), 1)
    self.assertIs(d.taxon_namespaces[0], d.attached_taxon_namespace)
    self.assertIs(d.attached_taxon_namespace, t)

    # Each entry: (path resolver, file name, schema). Paths are resolved just
    # before the corresponding read.
    attached_sources = (
        (pathmap.mixed_source_path,
         'reference_single_taxonset_dataset.nex', "nexus"),
        (pathmap.tree_source_path, 'pythonidae.mle.nex', "nexus"),
        (pathmap.tree_source_path,
         'pythonidae.reference-trees.newick', "newick"),
    )
    for resolve, fname, schema in attached_sources:
        d.read(path=resolve(fname), schema=schema)
        self.assertEqual(len(d.taxon_namespaces), 1)
        self.assertEqual(len(d.taxon_namespaces[0]), 33)

    # Once detached, the next read creates its own namespace.
    d.detach_taxon_namespace()
    fasta_src = pathmap.char_source_path('caenophidia_mos.chars.fasta')
    d.read(path=fasta_src, schema="fasta", data_type="protein")
    self.assertEqual(len(d.taxon_namespaces), 2)
    self.assertEqual(len(d.taxon_namespaces[0]), 33)
    self.assertEqual(len(d.taxon_namespaces[1]), 114)
def life_ott(fo):
    '''
    Makes a bogus ott for the taxonomy. Basically this gives all the taxa
    the parent 'life' for later use in the TAG algorithm.

    Reads Newick trees from the stream ``fo``, collects every leaf label
    across all trees in the first tree list, and writes to standard output
    one root row for 'life' followed by one 'species' row per distinct
    label. Each label must have the form ``name@ottoid``.

    Raises ValueError if the trees cannot be parsed (DataParseError), or,
    via tuple unpacking, if a label does not split into exactly two parts
    on '@'.
    '''
    output = sys.stdout
    dataset = dendropy.DataSet()
    try:
        dataset.read(stream=fo, schema="Newick")
    except DataParseError as dfe:
        raise ValueError(str(dfe))
    tree_list = dataset.tree_lists[0]
    parent_id = '805080'
    tree_labels = set()
    for tree in tree_list:
        # Splits are still encoded and checked as a sanity test on the tree.
        encode_splits(tree)
        tree_mask = tree.seed_node.edge.split_bitmask
        assert tree_mask is not None
        for node in tree.leaf_iter():
            tree_labels.add(node.taxon.label)
    # Root row: its id is the parent id referenced by every species row.
    output.write(parent_id + '\t|\t\t|\tlife\t|\tno rank\t|\tncbi:1,gbif:0\t|\t\t|\t\t|\t\t|\t\n')
    for label in tree_labels:
        name, ottoid = label.split('@')
        output.write(ottoid+'\t|\t'+parent_id+'\t|\t'+name+'\t|\tspecies\t|\tncbi:1\t|\t\t|\t\n')
def test_basic_add_taxon_namespace(self):
    """Namespaces added one at a time are stored by identity, in order."""
    ds = dendropy.DataSet()
    for tns in self.expected_taxon_namespaces:
        ds.add_taxon_namespace(tns)
    self.assertEqual(len(ds.taxon_namespaces),
                     len(self.expected_taxon_namespaces))
    for x1, x2 in zip(ds.taxon_namespaces, self.expected_taxon_namespaces):
        self.assertIs(x1, x2)
def test_match(self):
    """Matching tree and sequence labels yield no extra or missing taxa."""
    fasta_src = ('>a\n' 'ATCG\n'
                 '>b\n' 'ATCG\n'
                 '>c\n' 'ATCG\n')
    newick_src = '((a,b),c);'

    seq_ds = dendropy.DataSet()
    seq_ds.read_from_string(fasta_src, schema='fasta', data_type='dna')

    tree_ds = dendropy.DataSet()
    tree_ds.read_from_string(newick_src, schema='newick')

    tree_taxa = tree_ds.tree_lists[-1].taxon_set
    extra, missing = treeholder.check_taxon_labels(tree_taxa, seq_ds)
    self.assertEqual(len(missing), 0)
    self.assertEqual(len(extra), 0)
def test_attached_taxon_namespace_new_tree_list(self):
    """A tree list created on the data set uses the attached namespace."""
    ds = dendropy.DataSet()
    attached = dendropy.TaxonNamespace()
    ds.attach_taxon_namespace(attached)

    created = ds.new_tree_list(label="q")

    self.assertEqual(created.label, "q")
    self.assertIn(created, ds.tree_lists)
    self.assertIs(created.taxon_namespace, ds.attached_taxon_namespace)
    # No extra namespace may be created as a side effect.
    self.assertEqual(len(ds.taxon_namespaces), 1)
def test_attached_taxon_namespace_new_char_matrix(self):
    """A matrix created on the data set uses the attached namespace."""
    ds = dendropy.DataSet()
    attached = dendropy.TaxonNamespace()
    ds.attach_taxon_namespace(attached)

    created = ds.new_char_matrix(label="q", char_matrix_type="dna")

    self.assertEqual(created.label, "q")
    self.assertIn(created, ds.char_matrices)
    self.assertIs(created.taxon_namespace, ds.attached_taxon_namespace)
    # No extra namespace may be created as a side effect.
    self.assertEqual(len(ds.taxon_namespaces), 1)
def simulate_discrete_char_dataset(seq_len, tree_model, seq_model, mutation_rate=1.0, root_states=None, dataset=None, rng=None):
    """
    Wrapper to conveniently generate a DataSet simulated under
    the given tree and character model.

    Parameters
    ----------
    seq_len : int
        Length of sequence (number of characters).
    tree_model : |Tree|
        Tree on which to simulate.
    seq_model : dendropy.model.discrete.DiscreteCharacterEvolutionModel
        The character substitution model under which to evolve the
        characters.
    mutation_rate : float
        Mutation *modifier* rate (should be 1.0 if branch lengths on tree
        reflect true expected number of changes).
    root_states : list
        Vector of root states (length must equal ``seq_len``).
    dataset : |DataSet|
        If given, the new dendropy.CharacterMatrix object will be added to
        this (along with a new taxon_namespace if required). Otherwise, a
        new dendropy.DataSet object will be created.
    rng : random number generator
        Passed through to ``simulate_discrete_chars``. If not given,
        'GLOBAL_RNG' will be used.

    Returns
    -------
    d : |DataSet|
    """
    if dataset is None:
        dataset = dendropy.DataSet()
    # Register the tree's namespace with the data set if it is not there yet.
    if tree_model.taxon_namespace not in dataset.taxon_namespaces:
        dataset.add_taxon_namespace(tree_model.taxon_namespace)
    char_matrix = simulate_discrete_chars(
        seq_len=seq_len,
        tree_model=tree_model,
        seq_model=seq_model,
        mutation_rate=mutation_rate,
        root_states=root_states,
        char_matrix=None,
        # Forward the caller's generator; it was previously replaced by None.
        rng=rng)
    dataset.add_char_matrix(char_matrix=char_matrix)
    return dataset
def testAsStrReading(self):
    """FASTA labels with unusual characters are read back verbatim.

    The source file is opened in a ``with`` block so the handle is always
    closed, including when an assertion fails.
    """
    src_path = pathmap.char_source_path("bad_names.fasta")
    # NOTE(review): the "rU" mode is kept from the original; Python 3.11+
    # rejects it, so switch to "r" if Python 2 support is no longer needed.
    with open(src_path, "rU") as src:
        dataset = dendropy.DataSet(
            stream=src,
            schema='fasta',
            data_type='dna',
            row_type='str'
        )
        taxon_set = dataset.taxon_sets[0]
        label = [i.label for i in taxon_set]
        expected = ['a Bad name', 'another', 'a Badn,ame', 'a  nothe++-_=+r', 'an!@#$o^&*()}{_ther']
        self.assertEqual(label, expected)
def test_basic_read(self):
    """Read a mixed trees-and-characters NEXUS file into an empty DataSet."""
    src_path = pathmap.mixed_source_path("standard-test-mixed.1.basic.nexus")
    ds = dendropy.DataSet()
    read_kwargs = dict(
        path=src_path,
        schema="nexus",
        # Internal labels must be kept as taxa so they get translated.
        suppress_internal_node_taxa=False,
    )
    counts = ds.read(**read_kwargs)
    self.assertEqual(counts, (1, 7, 4))
    self.verify_dataset(ds)
def test_attached_taxon_namespace_default(self):
    """Attaching registers the namespace; detaching returns it and clears the slot."""
    ds = dendropy.DataSet()
    attached = dendropy.TaxonNamespace()
    ds.attach_taxon_namespace(attached)

    self.assertTrue(isinstance(attached, dendropy.TaxonNamespace))
    self.assertEqual(len(ds.taxon_namespaces), 1)
    self.assertIn(attached, ds.taxon_namespaces)
    self.assertIs(ds.taxon_namespaces[0], attached)
    self.assertIs(ds.attached_taxon_namespace, attached)

    detached = ds.detach_taxon_namespace()
    self.assertIs(ds.attached_taxon_namespace, None)
    self.assertIs(detached, attached)
def testFromNew(self):
    """Repeated NeXML reads into an attached taxon set never add taxa or sets."""
    dataset = dendropy.DataSet(attach_taxon_set=True)
    self.assertEqual(len(dataset.taxon_sets), 1)
    taxa = dataset.taxon_sets[0]
    self.assertEqual(len(taxa), 0)

    def read_and_check(src_path):
        # After every read: still one taxon set, still the expected size.
        dataset.read_from_path(src_path, "nexml")
        self.assertEqual(len(dataset.taxon_sets), 1)
        self.assertEqual(len(taxa), self.taxon_set1_len)

    read_and_check(self.taxon_set1_data_paths[0])
    for src_path in self.taxon_set1_data_paths:
        read_and_check(src_path)
def test_read_single(self):
    """Each source file read into a fresh DataSet yields one matrix and one namespace."""
    sources = self.__class__.srcs
    for src_filename, src_matrix_checker_type in sources:
        ds = dendropy.DataSet()
        counts = ds.read(
            path=pathmap.char_source_path(src_filename),
            schema="nexus")
        self.assertEqual(counts, (1, 0, 1))
        self.assertEqual(len(ds.char_matrices), 1)
        self.assertEqual(len(ds.taxon_namespaces), 1)
        matrix = ds.char_matrices[0]
        self.assertIs(matrix.taxon_namespace, ds.taxon_namespaces[0])
        self.verify_char_matrix(ds.char_matrices[0], src_matrix_checker_type)
def test_construction(self):
    """A DataSet built from a mixed item list stores each item by identity."""
    items = []
    for group in (self.standalone_taxon_namespaces,
                  self.expected_tree_lists,
                  self.expected_char_matrices):
        items.extend(group)
    ds = dendropy.DataSet(items)

    self.assertEqual(len(ds.taxon_namespaces),
                     len(self.expected_taxon_namespaces))
    # Compare each container pairwise, in order, by object identity.
    for actual, expected in zip(ds.taxon_namespaces,
                                self.expected_taxon_namespaces):
        self.assertIs(actual, expected)
    for actual, expected in zip(ds.tree_lists, self.expected_tree_lists):
        self.assertIs(actual, expected)
    for actual, expected in zip(ds.char_matrices,
                                self.expected_char_matrices):
        self.assertIs(actual, expected)
def test_basic_add_char_matrix(self):
    """Adding matrices registers them and their namespaces, in order."""
    ds = dendropy.DataSet()
    matrix_to_tns = self.expected_char_matrices
    # An ordered mapping is used as an insertion-ordered set of namespaces.
    seen_namespaces = collections.OrderedDict()
    for matrix in matrix_to_tns:
        ds.add_char_matrix(matrix)
        seen_namespaces[matrix_to_tns[matrix]] = True

    self.assertEqual(len(ds.taxon_namespaces), len(seen_namespaces))
    for actual, expected in zip(ds.taxon_namespaces, seen_namespaces):
        self.assertIs(actual, expected)

    self.assertEqual(len(ds.char_matrices), len(self.expected_char_matrices))
    for actual, expected in zip(ds.char_matrices, self.expected_char_matrices):
        self.assertIs(actual, expected)
        self.assertIs(actual.taxon_namespace,
                      self.expected_char_matrices[actual])
def test_basic_new_tree_list(self):
    """``new_tree_list`` returns labelled TreeLists that the data set registers."""
    ds = dendropy.DataSet()
    item_labels = ["a", "b", "c", "d", "e"]
    created = [ds.new_tree_list(label=lbl) for lbl in item_labels]

    self.assertEqual(len(ds.tree_lists), len(item_labels))
    self.assertEqual(len(ds.tree_lists), len(created))
    for stored, made, label in zip(ds.tree_lists, created, item_labels):
        self.assertTrue(isinstance(stored, dendropy.TreeList))
        self.assertIs(stored, made)
        self.assertEqual(stored.label, label)
def test_read_successive_unattached_taxon_namespace(self):
    """Without an attached namespace, every read adds a matrix and a namespace."""
    ds = dendropy.DataSet()
    sources = self.__class__.srcs
    for n_read, (src_filename, src_matrix_checker_type) in enumerate(sources, 1):
        latest = n_read - 1  # index of the matrix/namespace just added
        counts = ds.read(
            path=pathmap.char_source_path(src_filename),
            schema="nexus")
        self.assertEqual(counts, (1, 0, 1))
        self.assertEqual(len(ds.char_matrices), n_read)
        self.assertEqual(len(ds.taxon_namespaces), n_read)
        self.assertIs(ds.char_matrices[latest].taxon_namespace,
                      ds.taxon_namespaces[latest])
        self.verify_char_matrix(ds.char_matrices[latest],
                                src_matrix_checker_type)
def test_basic_new_taxon_namespace(self):
    """``new_taxon_namespace`` builds labelled namespaces holding the given taxa."""
    ds = dendropy.DataSet()
    tax_labels = ["a", "b", "c", "d", "e"]
    tns_labels = ["t1", "t2", "t3"]

    created = []
    for wanted_label in tns_labels:
        namespace = ds.new_taxon_namespace(tax_labels, label=wanted_label)
        self.assertTrue(isinstance(namespace, dendropy.TaxonNamespace))
        created.append(namespace)
    self.assertEqual(len(created), len(tns_labels))

    # Each namespace carries its own label and the taxa in the given order.
    for namespace, wanted_label in zip(created, tns_labels):
        self.assertEqual(namespace.label, wanted_label)
        self.assertEqual(len(namespace), len(tax_labels))
        for taxon, wanted_taxon_label in zip(namespace, tax_labels):
            self.assertEqual(taxon.label, wanted_taxon_label)