def test_names(self):
    """The names property reports the labels before and after drop_invalid()."""
    pairwise = {
        ("ABAYE2984", "Atu3667"): 0.25,
        ("ABAYE2984", "Avin_42730"): 0.638,
        ("ABAYE2984", "BAA10469"): None,
        ("Atu3667", "ABAYE2984"): 0.25,
        ("Atu3667", "Avin_42730"): 2.368,
        ("Atu3667", "BAA10469"): 0.25,
        ("Avin_42730", "ABAYE2984"): 0.638,
        ("Avin_42730", "Atu3667"): 2.368,
        ("Avin_42730", "BAA10469"): 1.85,
        ("BAA10469", "ABAYE2984"): 0.25,
        ("BAA10469", "Atu3667"): 0.25,
        ("BAA10469", "Avin_42730"): 1.85,
    }
    # every label appearing in any key pair
    expected = {label for pair in pairwise for label in pair}

    dmat = DistanceMatrix(pairwise)
    self.assertEqual(set(dmat.names), expected)

    # the test expects both members of the pair holding None to be dropped
    cleaned = dmat.drop_invalid()
    expected.remove("ABAYE2984")
    expected.remove("BAA10469")
    self.assertEqual(set(cleaned.names), expected)
def test_deserialise_tabular_distancematrix(self):
    """A DistanceMatrix round-trips through to_json() and deserialise_object()."""
    from cogent3.evolve.fast_distance import DistanceMatrix

    pairwise = {
        ("ABAYE2984", "Atu3667"): None,
        ("ABAYE2984", "Avin_42730"): 0.638,
        ("ABAYE2984", "BAA10469"): None,
        ("Atu3667", "ABAYE2984"): None,
        ("Atu3667", "Avin_42730"): 2.368,
        ("Atu3667", "BAA10469"): None,
        ("Avin_42730", "ABAYE2984"): 0.638,
        ("Avin_42730", "Atu3667"): 2.368,
        ("Avin_42730", "BAA10469"): 1.85,
        ("BAA10469", "ABAYE2984"): None,
        ("BAA10469", "Atu3667"): None,
        ("BAA10469", "Avin_42730"): 1.85,
    }
    original = DistanceMatrix(pairwise)
    serialised = original.to_json()
    restored = deserialise_object(serialised)

    original_dists = original.to_dict()
    restored_dists = restored.to_dict()
    for pair, dist in original_dists.items():
        if dist is not None:
            assert_allclose(dist, restored_dists[pair])
        else:
            # a missing distance must come back as NaN
            assert numpy.isnan(restored_dists[pair])
def test_build_phylogeny(self):
    """quick_tree() on a DistanceMatrix gives the expected NJ topology."""
    from cogent3 import make_tree

    pairwise = {
        ("DogFaced", "FlyingFox"): 0.05,
        ("DogFaced", "FreeTaile"): 0.14,
        ("DogFaced", "LittleBro"): 0.16,
        ("DogFaced", "TombBat"): 0.15,
        ("FlyingFox", "DogFaced"): 0.05,
        ("FlyingFox", "FreeTaile"): 0.12,
        ("FlyingFox", "LittleBro"): 0.13,
        ("FlyingFox", "TombBat"): 0.14,
        ("FreeTaile", "DogFaced"): 0.14,
        ("FreeTaile", "FlyingFox"): 0.12,
        ("FreeTaile", "LittleBro"): 0.09,
        ("FreeTaile", "TombBat"): 0.1,
        ("LittleBro", "DogFaced"): 0.16,
        ("LittleBro", "FlyingFox"): 0.13,
        ("LittleBro", "FreeTaile"): 0.09,
        ("LittleBro", "TombBat"): 0.12,
        ("TombBat", "DogFaced"): 0.15,
        ("TombBat", "FlyingFox"): 0.14,
        ("TombBat", "FreeTaile"): 0.1,
        ("TombBat", "LittleBro"): 0.12,
    }
    dmat = DistanceMatrix(pairwise)
    observed_tree = dmat.quick_tree(show_progress=False)
    expected_tree = make_tree(
        treestring="((TombBat,(DogFaced,FlyingFox)),LittleBro,FreeTaile)"
    )
    same = expected_tree.same_topology(observed_tree)
    self.assertTrue(same)
def test_load_tabular_distance_matrix(self):
    """A DistanceMatrix written as tsv loads back equal with as_type="distances"."""
    matrix = DistanceMatrix({(0, 0): 0, (0, 1): 4, (1, 0): 4, (1, 1): 0})
    loader = io_app.load_tabular(sep="\t", as_type="distances")
    with TemporaryDirectory(dir=".") as dirname:
        writer = io_app.write_tabular(data_path=dirname, format="tsv")
        outpath = join(dirname, "delme.tsv")
        writer.write(matrix, identifier=outpath)
        # read the file back while the temporary directory still exists
        reloaded = loader(outpath)
        self.assertEqual(matrix.to_dict(), reloaded.to_dict())
def test_matrix_dtype(self):
    """DistanceMatrix accepts None distances but rejects str distances."""
    with_missing = {
        ("ABAYE2984", "Atu3667"): None,
        ("ABAYE2984", "Avin_42730"): 0.638,
        ("ABAYE2984", "BAA10469"): None,
        ("Atu3667", "ABAYE2984"): None,
        ("Atu3667", "Avin_42730"): 2.368,
        ("Atu3667", "BAA10469"): None,
        ("Avin_42730", "ABAYE2984"): 0.638,
        ("Avin_42730", "Atu3667"): 2.368,
        ("Avin_42730", "BAA10469"): 1.85,
        ("BAA10469", "ABAYE2984"): None,
        ("BAA10469", "Atu3667"): None,
        ("BAA10469", "Avin_42730"): 1.85,
    }
    expected_names = {label for pair in with_missing for label in pair}

    # tests when data has None values and DistanceMatrix using dtype('float')
    dmat = DistanceMatrix(with_missing)
    self.assertEqual(dmat.shape, (4, 4))
    self.assertEqual(set(dmat.names), expected_names)
    for (first, second), dist in with_missing.items():
        if dist is not None:
            assert_allclose(dist, dmat[first, second])
        else:
            assert numpy.isnan(dmat[first, second])

    with_strings = {
        ("ABAYE2984", "Atu3667"): "None",
        ("ABAYE2984", "Avin_42730"): 0.638,
        ("ABAYE2984", "BAA10469"): None,
        ("Atu3667", "ABAYE2984"): None,
        ("Atu3667", "Avin_42730"): 2.368,
        ("Atu3667", "BAA10469"): "None",
        ("Avin_42730", "ABAYE2984"): 0.638,
        ("Avin_42730", "Atu3667"): 2.368,
        ("Avin_42730", "BAA10469"): 1.85,
        ("BAA10469", "ABAYE2984"): None,
        ("BAA10469", "Atu3667"): None,
        ("BAA10469", "Avin_42730"): 1.85,
    }
    # tests when data has str values and DistanceMatrix using dtype('float')
    with self.assertRaises(ValueError):
        DistanceMatrix(with_strings)
def load(self, path):
    """Parse tabular data and return it as the type named by ``self.as_type``.

    Parameters
    ----------
    path
        a str path, or any object ``self._parse`` accepts. A str is wrapped
        in a SingleReadDataStore and its first member is what gets parsed.

    Returns
    -------
    A NotCompleted instance if parsing raised an exception. Otherwise a
    Table for as_type "table", a DistanceMatrix for "distances", the result
    of the matching ``make_*_from_tabular`` function for "motif_counts",
    "motif_freqs" and "pssm", and None for any other as_type.

    Raises
    ------
    AssertionError
        if as_type is not "table" and the parsed data does not have
        exactly three columns.
    """
    if isinstance(path, str):
        # we use a data store as its read() handles compression
        path = SingleReadDataStore(path)[0]

    try:
        header, data, title = self._parse(path)
    except Exception as err:
        # Return the failure record; falling through would reference the
        # unbound header/data/title names below.
        message = err.args[0] if err.args else repr(err)
        return NotCompleted("ERROR", self, message, source=str(path))

    if self.as_type == "table":
        return Table(header, rows=data, title=title)

    assert data.shape[1] == 3, "Invalid tabular data"

    if self.as_type == "distances":
        # records is of the form [ [dim-1, dim-2, value] for entries in DistanceMatrix ]
        return DistanceMatrix({(e[0], e[1]): e[2] for e in data})

    if self.as_type == "motif_counts":
        return make_motif_counts_from_tabular(data)

    if self.as_type == "motif_freqs":
        return make_motif_freqs_from_tabular(data)

    if self.as_type == "pssm":
        return make_pssm_from_tabular(data)

    return None
def test_to_table(self):
    """to_table() output has the expected shape, names column and cell values."""
    entries = [
        ("A", "B", 2),
        ("A", "C", 3),
        ("B", "C", 1),
        ("B", "A", 2),
        ("C", "A", 3),
        ("C", "B", 1),
    ]
    pairwise = {(first, second): dist for first, second, dist in entries}
    dmat = DistanceMatrix(pairwise)
    as_table = dmat.to_table()

    self.assertEqual(as_table.shape, (3, 4))
    names_column = as_table.columns["names"].tolist()
    self.assertEqual(names_column, list(dmat.names))
    # an off-diagonal and a diagonal cell
    self.assertEqual(as_table["A", "B"], 2)
    self.assertEqual(as_table["A", "A"], 0)
def test_write_tabular_distance_matrix(self):
    """A DistanceMatrix written as tsv loads back as the expected table rows."""
    matrix = DistanceMatrix({(0, 0): 0, (0, 1): 4, (1, 0): 4, (1, 1): 0})
    loader = io_app.load_tabular(sep="\t")
    with TemporaryDirectory(dir=".") as dirname:
        writer = io_app.write_tabular(data_path=dirname, format="tsv")
        outpath = join(dirname, "delme.tsv")
        writer.write(matrix, identifier=outpath)
        reloaded = loader(outpath)
        # when written to file in tabular form
        # the loaded table will have dim-1 dim-2 as column labels
        # and the key-values pairs listed below; in dict form...
        columns = ("dim-1", "dim-2", "value")
        rows = [(0, 1, 4), (1, 0, 4)]
        expected = {
            index: dict(zip(columns, row)) for index, row in enumerate(rows)
        }
        self.assertEqual(expected, reloaded.to_dict())
def calc_distance(self, aln):
    """Return the pairwise distances between the sequences in aln.

    The alignment is converted to ``self._moltype`` when that is set and
    differs from ``aln.moltype``. Distances come from ``self.fast_calc``
    when it is truthy, otherwise from a zero-filled DistanceMatrix over all
    name pairs. When ``self._sm`` is truthy, each off-diagonal pair whose
    current distance is falsy is estimated with ``self._est_dist_pair_slow``
    and stored symmetrically.
    """
    target_moltype = self._moltype
    if target_moltype and target_moltype != aln.moltype:
        aln = aln.to_moltype(target_moltype)

    if not self.fast_calc:
        # no fast calculator, start from zero for every ordered pair
        all_pairs = itertools.product(aln.names, aln.names)
        dists = DistanceMatrix(dict.fromkeys(all_pairs, 0))
    else:
        self.fast_calc(aln, show_progress=False)
        dists = self.fast_calc.get_pairwise_distances()

    dists.source = aln.info.source
    if not self._sm:
        return dists

    row_names = dists.template.names[0]
    col_names = dists.template.names[1]
    for a, b in itertools.product(row_names, col_names):
        if dists[a, b] or a == b:
            # already has a distance, or is the diagonal
            continue
        pair_aln = aln.take_seqs([a, b])
        estimate = self._est_dist_pair_slow(pair_aln)
        dists[a, b] = dists[b, a] = estimate

    return dists
def test_take_dists(self):
    """take_dists() by inclusion and by negated exclusion give equal arrays."""
    pairwise = {
        ("ABAYE2984", "Atu3667"): 0.25,
        ("ABAYE2984", "Avin_42730"): 0.638,
        ("ABAYE2984", "BAA10469"): None,
        ("Atu3667", "ABAYE2984"): 0.25,
        ("Atu3667", "Avin_42730"): 2.368,
        ("Atu3667", "BAA10469"): 0.25,
        ("Avin_42730", "ABAYE2984"): 0.638,
        ("Avin_42730", "Atu3667"): 2.368,
        ("Avin_42730", "BAA10469"): 1.85,
        ("BAA10469", "ABAYE2984"): 0.25,
        ("BAA10469", "Atu3667"): 0.25,
        ("BAA10469", "Avin_42730"): 1.85,
    }
    dmat = DistanceMatrix(pairwise)
    # select three names explicitly ...
    included = dmat.take_dists(["ABAYE2984", "Atu3667", "Avin_42730"])
    # ... and the same three by excluding the fourth
    excluded = dmat.take_dists("BAA10469", negate=True)
    included_values = included.array.astype(float)
    excluded_values = excluded.array.astype(float)
    assert_allclose(included_values, excluded_values)
def get_pairwise_distances(self, summary_function="mean", **kwargs):
    """Return the pairwise "length" values as a DistanceMatrix.

    Convenience interface to get_pairwise_param.

    Parameters
    ----------
    summary_function
        a string naming the function used for estimating param from
        threeway distances. Valid values are 'mean' (default) and 'median'.
    kwargs
        passed through unchanged to get_pairwise_param.

    Returns
    -------
    A DistanceMatrix built from the get_pairwise_param result, or None if
    that result is falsy.
    """
    lengths = self.get_pairwise_param(
        "length", summary_function=summary_function, **kwargs
    )
    if lengths:
        return DistanceMatrix(lengths)
    return None
def test_slice_dmatrix(self):
    """Slicing a DistanceMatrix keeps the names of the selected rows."""
    pairwise = {
        ("ABAYE2984", "Atu3667"): 0.25,
        ("ABAYE2984", "Avin_42730"): 0.638,
        ("ABAYE2984", "BAA10469"): None,
        ("Atu3667", "ABAYE2984"): 0.25,
        ("Atu3667", "Avin_42730"): 2.368,
        ("Atu3667", "BAA10469"): 0.25,
        ("Avin_42730", "ABAYE2984"): 0.638,
        ("Avin_42730", "Atu3667"): 2.368,
        ("Avin_42730", "BAA10469"): 1.85,
        ("BAA10469", "ABAYE2984"): 0.25,
        ("BAA10469", "Atu3667"): 0.25,
        ("BAA10469", "Avin_42730"): 1.85,
    }
    dmat = DistanceMatrix(pairwise)
    first_three = dmat.template.names[0][:3]
    sliced = dmat[:3, :3]
    sliced_names = list(sliced.template.names[0])
    self.assertEqual(sliced_names, first_three)
def test_dropping_from_matrix(self):
    """drop_invalid() gives None or a reduced matrix, depending on the data."""
    mostly_missing = {
        ("ABAYE2984", "Atu3667"): None,
        ("ABAYE2984", "Avin_42730"): 0.638,
        ("ABAYE2984", "BAA10469"): None,
        ("Atu3667", "ABAYE2984"): None,
        ("Atu3667", "Avin_42730"): 2.368,
        ("Atu3667", "BAA10469"): None,
        ("Avin_42730", "ABAYE2984"): 0.638,
        ("Avin_42730", "Atu3667"): 2.368,
        ("Avin_42730", "BAA10469"): 1.85,
        ("BAA10469", "ABAYE2984"): None,
        ("BAA10469", "Atu3667"): None,
        ("BAA10469", "Avin_42730"): 1.85,
    }
    # the test expects nothing to survive for this data
    dropped = DistanceMatrix(mostly_missing).drop_invalid()
    self.assertEqual(dropped, None)

    one_missing = {
        ("ABAYE2984", "Atu3667"): 0.25,
        ("ABAYE2984", "Avin_42730"): 0.638,
        ("ABAYE2984", "BAA10469"): None,
        ("Atu3667", "ABAYE2984"): 0.25,
        ("Atu3667", "Avin_42730"): 2.368,
        ("Atu3667", "BAA10469"): 0.25,
        ("Avin_42730", "ABAYE2984"): 0.638,
        ("Avin_42730", "Atu3667"): 2.368,
        ("Avin_42730", "BAA10469"): 1.85,
        ("BAA10469", "ABAYE2984"): 0.25,
        ("BAA10469", "Atu3667"): 0.25,
        ("BAA10469", "Avin_42730"): 1.85,
    }
    # a single missing pair leaves a 2 x 2 matrix
    dropped = DistanceMatrix(one_missing).drop_invalid()
    self.assertEqual(dropped.shape, (2, 2))
def test_to_dict(self):
    """to_dict() returns the same pair-keyed dict the matrix was built from."""
    pairs = [("s1", "s2"), ("s2", "s1")]
    expected = dict.fromkeys(pairs, 0.25)
    observed = DistanceMatrix(expected).to_dict()
    self.assertEqual(observed, expected)
def test_quick_tree_taking_distance_matrix(self):
    """The quick_tree app builds a tree from a DistanceMatrix or raises on invalid data."""
    sparse = {
        ("ABAYE2984", "Avin_42730"): 0.638,
        ("Atu3667", "Avin_42730"): 2.368,
        ("Avin_42730", "ABAYE2984"): 0.638,
        ("Avin_42730", "Atu3667"): 2.368,
        ("Avin_42730", "BAA10469"): 1.85,
        ("BAA10469", "Avin_42730"): 1.85,
    }
    bats = {
        ("DogFaced", "FlyingFox"): 0.05,
        ("DogFaced", "FreeTaile"): 0.14,
        ("DogFaced", "LittleBro"): 0.16,
        ("DogFaced", "TombBat"): 0.15,
        ("FlyingFox", "DogFaced"): 0.05,
        ("FlyingFox", "FreeTaile"): 0.12,
        ("FlyingFox", "LittleBro"): 0.13,
        ("FlyingFox", "TombBat"): 0.14,
        ("FreeTaile", "DogFaced"): 0.14,
        ("FreeTaile", "FlyingFox"): 0.12,
        ("FreeTaile", "LittleBro"): 0.09,
        ("FreeTaile", "TombBat"): 0.1,
        ("LittleBro", "DogFaced"): 0.16,
        ("LittleBro", "FlyingFox"): 0.13,
        ("LittleBro", "FreeTaile"): 0.09,
        ("LittleBro", "TombBat"): 0.12,
        ("TombBat", "DogFaced"): 0.15,
        ("TombBat", "FlyingFox"): 0.14,
        ("TombBat", "FreeTaile"): 0.1,
        ("TombBat", "LittleBro"): 0.12,
    }
    one_missing = {
        ("ABAYE2984", "Atu3667"): 0.25,
        ("ABAYE2984", "Avin_42730"): 0.638,
        ("ABAYE2984", "BAA10469"): None,
        ("Atu3667", "ABAYE2984"): 0.25,
        ("Atu3667", "Avin_42730"): 2.368,
        ("Atu3667", "BAA10469"): 0.25,
        ("Avin_42730", "ABAYE2984"): 0.638,
        ("Avin_42730", "Atu3667"): 2.368,
        ("Avin_42730", "BAA10469"): 1.85,
        ("BAA10469", "ABAYE2984"): 0.25,
        ("BAA10469", "Atu3667"): 0.25,
        ("BAA10469", "Avin_42730"): 1.85,
    }
    mostly_missing = {
        ("ABAYE2984", "Atu3667"): None,
        ("ABAYE2984", "Avin_42730"): 0.638,
        ("ABAYE2984", "BAA10469"): None,
        ("Atu3667", "ABAYE2984"): None,
        ("Atu3667", "Avin_42730"): 2.368,
        ("Atu3667", "BAA10469"): None,
        ("Avin_42730", "ABAYE2984"): 0.638,
        ("Avin_42730", "Atu3667"): 2.368,
        ("Avin_42730", "BAA10469"): 1.85,
        ("BAA10469", "ABAYE2984"): None,
        ("BAA10469", "Atu3667"): None,
        ("BAA10469", "Avin_42730"): 1.85,
    }

    def check_tree(app, pairwise):
        # a tree must come back whose tips are exactly the labels in pairwise
        tree = app.quick_tree(DistanceMatrix(pairwise))
        self.assertIsInstance(tree, PhyloNode)
        self.assertIsNotNone(tree.children)
        labels = {label for pair in pairwise for label in pair}
        self.assertEqual(set(tree.get_tip_names()), labels)

    app = tree_app.quick_tree()
    for pairwise in (sparse, bats, one_missing):
        check_tree(app, pairwise)

    invalid = DistanceMatrix(mostly_missing)
    with self.assertRaises(KeyError):
        app.quick_tree(invalid)

    # when distance_matrix is None after dropping invalid
    with self.assertRaises(ValueError):
        app = tree_app.quick_tree(drop_invalid=True)
        app.quick_tree(invalid)

    # the drop_invalid=True app still handles complete data
    check_tree(app, bats)