def test_key_levels(self):
    """Keys are available at both the outer and inner level of a DictArray."""
    wrapped = DictArrayTemplate("abc", "ABC").wrap(self.a)
    # outer level is keyed by the first template dimension
    self.assertEqual(wrapped.keys(), ["a", "b", "c"])
    # selecting one outer key exposes the second dimension's keys
    first_row = wrapped["a"]
    self.assertEqual(first_row.keys(), ["A", "B", "C"])
    self.assertEqual(list(first_row), [1, 0, 0])
    self.assertEqual(sum(first_row), 1)
def test_slicing_combos(self):
    """Mixing key slices, index lists and single keys when indexing works."""
    matrix = [
        [0.7, 0.1, 0.2, 0.3],
        [0.1, 0.7, 0.1, 0.3],
        [0.3, 0.2, 0.6, 0.3],
        [0.4, 0.1, 0.1, 0.7],
    ]
    bases = list(DNA)
    darr = DictArrayTemplate(bases, bases).wrap(matrix)
    # each entry is (index passed to __getitem__, expected array)
    cases = [
        ((slice("C", "G"), slice("C", "G")), [[0.7, 0.1], [0.2, 0.6]]),
        (([1, 2], [1, 2]), [[0.7, 0.1], [0.2, 0.6]]),
        (([2, 3], "C"), [0.2, 0.1]),
        (("C", [2, 3]), [0.1, 0.3]),
        (([1, 2], slice("T", "A")), [[0.1, 0.7], [0.3, 0.2]]),
        ((slice("T", "A"), [1, 2]), [[0.1, 0.2], [0.7, 0.1]]),
    ]
    for index, expected in cases:
        got = darr[index]
        assert_allclose(got.array, numpy.array(expected))

    # make sure we cope with keys that are int's
    nums = list(range(1, 5))
    int_keyed = DictArrayTemplate(nums, nums).wrap(matrix)
    got = int_keyed[[1, 2], [1, 2]]
    assert_allclose(got.array, numpy.array([[0.7, 0.1], [0.2, 0.6]]))
def test_convert2DDict(self):
    """Output of convert2DDict can be fed straight into a DictArrayTemplate."""
    nested = {"a": {"b": 4, "c": 5}}
    vals, row_keys, col_keys = convert2DDict(nested)
    self.assertEqual(set(row_keys), {"a"})
    darr = DictArrayTemplate(row_keys, col_keys).wrap(vals)
    self.assertEqual(darr.array.tolist(), [[4, 5]])
    # row keys, then column
    self.assertEqual(darr.template.names, [["a"], ["b", "c"]])

    # a square dict should round-trip through to_dict()
    square = {
        "a": {"a": 0, "b": 1, "e": 0},
        "b": {"a": 1, "b": 0, "e": 4},
        "e": {"a": 0, "b": 4, "e": 0},
    }
    vals, row_keys, col_keys = convert2DDict(square)
    darr = DictArrayTemplate(row_keys, col_keys).wrap(vals)
    self.assertEqual(darr.to_dict(), square)
    self.assertEqual(darr.template.names, [["a", "b", "e"], ["a", "b", "e"]])

    # make_symmetric gives identical row / column keys and mirrored values
    vals, row_keys, col_keys = convert2DDict(
        {"a": {"b": 4, "c": 5}}, make_symmetric=True
    )
    self.assertEqual(row_keys, col_keys)
    self.assertEqual(vals, [[0, 4, 5], [4, 0, 0], [5, 0, 0]])
def test_get_repr_html(self):
    """Exercise the HTML repr used by parent classes for Jupyter display."""
    data = [[3, 7], [2, 8], [5, 5]]
    darr = DictArrayTemplate(list("ABC"), list("ab")).wrap(data)
    got = darr._repr_html_()
    self.assertIsInstance(got, str)
    # assertTrue(len(got), 100) used 100 as the failure message and only
    # checked for a non-empty string; compare the length explicitly instead
    self.assertGreater(len(got), 100)
def __init__(self, default=None, name=None, dimensions=None, dimension=None, size=None, **kw):
    """Set up the partition size, optional internal dimension and default.

    Parameters
    ----------
    default
        initial value; when omitted it is produced by
        ``self._make_default_value()``
    name
        required, asserted to be truthy
    dimensions
        forwarded unchanged to ``_InputDefn.__init__``
    dimension
        a ``(name, categories)`` pair; the categories become ``bin_names``
        and the keys of a 1D ``DictArrayTemplate``
    size
        number of elements; when omitted it is taken from ``default`` and
        otherwise from the categories in ``dimension``
    """
    assert name
    if size is None:
        # fall back to the length of default, then the dimension categories
        if default is not None:
            size = len(default)
        elif dimension is not None:
            size = len(dimension[1])
    self.size = size

    if dimension is not None:
        self.internal_dimension = dimension
        dim_name, dim_cats = dimension
        self.bin_names = dim_cats
        self.array_template = DictArrayTemplate(dim_cats)
        self.internal_dimensions = (dim_name,)

    # NOTE(review): self.array_template is read below even when dimension is
    # None -- presumably a class-level default exists; confirm
    if default is None:
        default = self._make_default_value()
    elif self.array_template is not None:
        default = self.array_template.unwrap(default)
    else:
        default = numpy.asarray(default)

    _InputDefn.__init__(
        self, name=name, default=default, dimensions=dimensions, **kw
    )
    self.check_value_is_valid(default, True)
def test_convert_1D_dict(self):
    """Output of convert_1D_dict can be fed straight into a template."""
    vals, keys = convert_1D_dict({"a": 0, "b": 35, "c": 45})
    wrapped = DictArrayTemplate(keys).wrap(vals)
    self.assertEqual(wrapped.array.tolist(), [0, 35, 45])
def to_dictarray(self):
    """Build a DictArray covering every combination of observed categories.

    Returns
    -------
    DictArray with dtype of int

    Notes
    -----
    Combinations that were never observed are zero. The result can be
    indexed like a numpy array using the key values.
    """
    from itertools import product

    from cogent3.util.dict_array import DictArrayTemplate

    # the first key decides whether this is a 1D or multi-dimensional count
    first_key = next(iter(self))
    if isinstance(first_key, str):
        ndim = 1
    else:
        try:
            ndim = len(first_key)
        except TypeError:
            # keys without a length are treated as single categories
            ndim = 1

    if ndim == 1:
        names = sorted(self)
        counts = [self[name] for name in names]
        return DictArrayTemplate(names).wrap(counts, dtype=int)

    # one sorted list of distinct labels per key position
    categories = [sorted(set(labels)) for labels in zip(*self)]
    shape = tuple(len(labels) for labels in categories)
    darr = DictArrayTemplate(*categories).wrap(numpy.zeros(shape, dtype=int))
    for comb in product(*categories):
        position = tuple(
            [labels.index(label)] for labels, label in zip(categories, comb)
        )
        darr.array[position] = self[comb]
    return darr
def test_interpret_index(self):
    """Explicit lists of keys or of positions resolve to the same indices."""
    template = DictArrayTemplate(["ab", "dna", "rna"])
    expected = ([0, 2],)
    # lookup by key
    by_key = template.interpret_index(["ab", "rna"])
    self.assertEqual(by_key[0], expected)
    # lookup by integer position
    by_position = template.interpret_index([0, 2])
    self.assertEqual(by_position[0], expected)
def test_convert_for_dictarray(self):
    """convert_for_dictarray delegates correctly when given a DictArray."""
    original = DictArrayTemplate("abc", "ABC").wrap(numpy.identity(3, int))
    vals, row_keys, col_keys = convert_for_dictarray(original)
    rebuilt = DictArrayTemplate(row_keys, col_keys).wrap(vals)
    self.assertEqual(rebuilt.array.tolist(), original.array.tolist())
    # the wrap method creates a new array
    self.assertIsNot(rebuilt.array, original.array)
def get_all_rate_matrices(self, calibrated=True):
    """returns all rate matrices (Q) as a dict, keyed by scope

    Parameters
    ----------
    calibrated : bool
        If True, each matrix is returned as a copy of the stored value.
        If False, each matrix is multiplied by the length of its edge and,
        for a rate heterogeneity model, also by the rate for its bin.

    Returns
    -------
    dict of DictArray, keyed by a tuple of the scope values for the
    dimensions in use. If a single rate matrix, the key is an empty tuple
    """
    defn = self.defn_for["Q"]
    rate_het = self.defn_for.get("rate", False)
    if rate_het:
        # map each bin name to its position in the rate defn values
        bin_index = rate_het.valid_dimensions.index("bin")
        bin_names = [k[bin_index] for k in rate_het.index]
        bin_names = {n: i for i, n in enumerate(bin_names)}
        # from here on bin_index refers to the position of "bin" in a Q scope
        bin_index = defn.valid_dimensions.index("bin")
    else:
        bin_names = None
        bin_index = None
    used_dims = defn.used_dimensions()
    edge_index = defn.valid_dimensions.index("edge")
    indices = {defn.valid_dimensions.index(k) for k in used_dims}
    # uncalibrated matrices differ per edge (and per bin), so those
    # dimensions must be part of the key
    if not calibrated:
        indices.add(edge_index)
    if not calibrated and rate_het:
        indices.add(bin_index)
    indices = list(sorted(indices))
    result = {}
    darr_template = DictArrayTemplate(self._motifs, self._motifs)
    for scope, index in defn.index.items():
        q = defn.values[index]  # this gives the appropriate Q
        # from scope we extract only the relevant dimensions
        key = tuple(numpy.take(scope, indices))
        # copy so the in-place scaling below does not modify the stored value
        q = q.copy()
        if not calibrated:
            length = self.get_param_value("length", edge=scope[edge_index])
            if rate_het:
                bdex = bin_names[scope[bin_index]]
                rate = rate_het.values[bdex]
                length *= rate
            q *= length
        result[key] = darr_template.wrap(q)
        if not indices and calibrated:
            break  # single rate matrix
    return result
def test_convert_series(self):
    """Output of convert_series can be fed straight into a template."""
    # 2D data with explicit row and column keys
    vals, row_keys, col_keys = convert_series([[4], [5]], ["A", "B"], ["a"])
    wrapped = DictArrayTemplate(row_keys, col_keys).wrap(vals)
    self.assertEqual(wrapped.array.tolist(), [[4], [5]])

    # 2D data without keys
    data = [[245, 599]]
    vals, row_keys, col_keys = convert_series(data)
    wrapped = DictArrayTemplate(row_keys, col_keys).wrap(vals)
    self.assertEqual(wrapped.array.tolist(), data)

    # 1D data without keys
    flat = data[0]
    vals, row_keys, col_keys = convert_series(flat)
    wrapped = DictArrayTemplate(row_keys, col_keys).wrap(vals)
    self.assertEqual(wrapped.array.tolist(), flat)
def test_convert_for_dictarray(self):
    """convert_for_dictarray copes with each supported kind of input."""
    wrapped = DictArrayTemplate("abc", "ABC").wrap(self.a)
    inputs = (
        [[245, 599]],
        {"a": {"b": 4, "c": 5}},
        {("a", "b"): 4, ("a", "c"): 5},
        {"a": 0, "b": 35, "c": 45},
        wrapped,
    )
    for data in inputs:
        vals, row_keys, col_keys = convert_for_dictarray(data)
        # the check is simply that wrapping completes without an exception
        DictArrayTemplate(row_keys, col_keys).wrap(vals)
def test_write(self):
    """Exercise the write method and compare the file with to_dict()."""
    darr = DictArrayTemplate(list("ABC"), list("ab")).wrap(
        [[3, 7], [2, 8], [5, 5]]
    )
    with TemporaryDirectory(dir=".") as dirname:
        outpath = os.path.join(dirname, "delme.tsv")
        darr.write(outpath)
        with open(outpath) as infile:
            rows = [line.strip().split() for line in infile]
        # the first line is the header, the remainder are (key, key, value)
        header = rows.pop(0)
        self.assertEqual(header, ["dim-1", "dim-2", "value"])
        got = {(row_key, col_key): int(val) for row_key, col_key, val in rows}
        self.assertEqual(got, darr.to_dict(flatten=True))
def reconstruct_ancestral_seqs(self, locus=None):
    """Compute, for every non-tip node, the likelihoods obtained with the
    node fixed to each motif in turn.

    Parameters
    ----------
    locus
        a named locus

    Returns
    -------
    {node_name: DictArray, ...}

    Notes
    -----
    Alignment columns are rows in the DictArray.
    """
    result = {}
    array_template = None
    for node in self._tree.get_edge_vector():
        if node.istip():
            continue
        # arguments shared by every "fixed_motif" rule set for this node
        rule_kwargs = dict(edge=node.name, locus=locus, is_constant=True)
        per_motif = []
        try:
            for motif in range(len(self._motifs)):
                self.set_param_rule("fixed_motif", value=motif, **rule_kwargs)
                likelihoods = self.get_full_length_likelihoods(locus=locus)
                per_motif.append(likelihoods)
                if array_template is None:
                    # built once and reused for all nodes
                    array_template = DictArrayTemplate(
                        likelihoods.shape[0], self._motifs
                    )
        finally:
            # always put the rule back to -1, even if an error occurred
            self.set_param_rule("fixed_motif", value=-1, **rule_kwargs)
        # dict of site x motif arrays
        site_by_motif = numpy.transpose(numpy.asarray(per_motif))
        result[node.name] = array_template.wrap(site_by_motif)
    return result
def get_rate_matrix_for_edge(self, name, calibrated=True, **kw):
    """returns the rate matrix (Q) for the named edge

    Parameters
    ----------
    name : str
        name of the edge
    calibrated : bool
        If True, a copy of the stored Q is returned. If False, that copy
        is multiplied by the length parameter of the edge.
    **kw
        passed through to ``self.get_param_value``

    Returns
    -------
    DictArray with the motifs as keys on both dimensions

    Raises
    ------
    RuntimeError
        if the model has no "Q" parameter
    KeyError
        any other failed parameter lookup is re-raised unchanged

    Notes
    -----
    If ``calibrated=False``, ``expm(Q)`` will give the same result as
    ``self.get_psub_for_edge(name)``

    NOTE(review): unlike ``get_all_rate_matrices``, no per-bin rate is
    applied here -- confirm whether rate heterogeneity models need it.
    """
    # todo handle case of multiple loci
    try:
        array = self.get_param_value("Q", edge=name, **kw)
        array = array.copy()
        if not calibrated:
            length = self.get_param_value("length", edge=name, **kw)
            array *= length
    except KeyError as err:
        # exceptions are not subscriptable in Python 3 (err[0] raised a
        # TypeError), so read the missing key from err.args
        missing = err.args[0] if err.args else None
        if missing == "Q" and name != "Q":
            raise RuntimeError("rate matrix not known by this model") from err
        raise
    return DictArrayTemplate(self._motifs, self._motifs).wrap(array)
def _get_motif_probs_by_node_tr(self, edges=None, bin=None, locus=None):
    """Return motif probs by node for time-reversible models.

    The same motif probabilities are reported for every node visited by
    ``self.tree.postorder()``.

    NOTE(review): the ``edges``, ``bin`` and ``locus`` arguments are not
    used by this method (``edges`` is rebuilt from the tree) -- confirm
    this is intended.

    Raises
    ------
    NotImplementedError
        if there is more than one mprob parameter rule, or the model's
        ``mprob_model`` is "monomers"
    """
    mprob_rules = [
        rule for rule in self.get_param_rules() if "mprob" in rule["par_name"]
    ]
    if len(mprob_rules) > 1 or self.model.mprob_model == "monomers":
        raise NotImplementedError

    mprobs = self.get_motif_probs()
    if len(mprobs) == len(self.motifs):
        mprobs = [mprobs[motif] for motif in self.motifs]
    else:
        # a Muse and Gaut model
        # each motif's value is the product of the probs of its elements
        expanded = numpy.zeros(len(self.motifs), dtype=float)
        for i, motif in enumerate(self.motifs):
            val = 1.0
            for b in motif:
                val *= mprobs[b]
            expanded[i] = val
        mprobs = expanded / expanded.sum()

    edges = [node.name for node in self.tree.postorder()]
    # every node shares the same motif probs object
    values = [mprobs] * len(edges)
    return DictArrayTemplate(edges, self.motifs).wrap(values)
def test_deserialise_tabular_dictarray(self):
    """A DictArray survives a round trip through to_json / deserialise_object."""
    from cogent3.util.dict_array import DictArrayTemplate

    rows = [
        [1, "abc", 11],
        [2, "bca", 22],
        [3, "cab", 33],
        [4, "abc", 44],
        [5, "bca", 55],
    ]
    darr = DictArrayTemplate(5, ["id", "foo", "bar"]).wrap(rows)
    serialised = darr.to_json()
    restored = deserialise_object(serialised)
    self.assertEqual(restored.to_dict(), darr.to_dict())
def get_bin_probs(self, locus=None):
    """Return the posterior probs from the "bindex" parameter as a DictArray.

    The "lh" parameter value of each bin in ``self.bin_names`` is passed
    to ``get_posterior_probs``; the result is keyed by bin name on the
    first dimension.
    """
    hmm = self.get_param_value("bindex", locus=locus)
    per_bin_lhs = []
    for bin_name in self.bin_names:
        per_bin_lhs.append(
            self.get_param_value("lh", locus=locus, bin=bin_name)
        )
    posteriors = hmm.get_posterior_probs(*per_bin_lhs)
    template = DictArrayTemplate(self.bin_names, posteriors.shape[1])
    return template.wrap(posteriors)
def test_to_string(self):
    """to_string writes one row per element using the requested separator."""
    base = 3.123456789
    darr = DictArrayTemplate(2, 2).wrap([[base, 2 * base], [3 * base, 4 * base]])
    rows = [
        ["dim-1", "dim-2", "value"],
        ["0", "0", "3.123456789"],
        ["0", "1", "6.246913578"],
        ["1", "0", "9.370370367"],
        ["1", "1", "12.493827156"],
    ]
    # (separator expected in the output, keyword arguments for to_string);
    # the tab case relies on the method's default separator
    cases = ((",", {"sep": ","}), ("\t", {}), (" ", {"sep": " "}))
    for sep, kwargs in cases:
        expected = "\n".join(sep.join(row) for row in rows)
        self.assertEqual(darr.to_string(**kwargs), expected)
def get_psub_for_edge(self, name, **kw):
    """Return the substitution probability matrix for the named edge.

    The "dpsubs" parameter is tried first; if that lookup raises a
    KeyError the "psubs" parameter is used instead.
    """
    lookup = dict(edge=name, **kw)
    try:
        # For PartialyDiscretePsubsDefn
        psub = self.get_param_value("dpsubs", **lookup)
    except KeyError:
        psub = self.get_param_value("psubs", **lookup)
    template = DictArrayTemplate(self._motifs, self._motifs)
    return template.wrap(psub)
def get_all_psubs(self):
    """Return every psub as a DictArray, in a dict keyed by used dimensions."""
    # NOTE(review): get_psub_for_edge looks up "dpsubs" whereas this method
    # looks up "dsubs" -- confirm which name is intended
    try:
        defn = self.defn_for["dsubs"]
    except KeyError:
        defn = self.defn_for["psubs"]

    valid_dims = defn.valid_dimensions
    # positions, within a scope, of the dimensions actually in use
    indices = [
        valid_dims.index(dim) for dim in defn.used_dimensions() if dim in valid_dims
    ]
    darr_template = DictArrayTemplate(self._motifs, self._motifs)
    return {
        tuple(numpy.take(scope, indices)): darr_template.wrap(defn.values[index])
        for scope, index in defn.index.items()
    }
def test_category_counts_from_non_int_arrays(self):
    """Object arrays of whole numbers work; fractional or negative values fail."""
    row_keys = ["syn", "nsyn"]
    col_keys = ["Ts", "Tv"]

    counts = numpy.array([[31, 36], [58, 138]], dtype=object)
    darr = DictArrayTemplate(row_keys, col_keys).wrap(counts)
    got = CategoryCounts(darr)
    assert_allclose(got.observed.array.tolist(), counts.tolist())

    # a fractional value is rejected for both object and float arrays
    for dtype in (object, float):
        with self.assertRaises(TypeError):
            counts = numpy.array([[31.3, 36], [58, 138]], dtype=dtype)
            darr = DictArrayTemplate(row_keys, col_keys).wrap(counts)
            CategoryCounts(darr)

    # negative values disallowed
    with self.assertRaises(ValueError):
        counts = numpy.array([[31, -36], [58, 138]], dtype=int)
        darr = DictArrayTemplate(row_keys, col_keys).wrap(counts)
        CategoryCounts(darr)
def __init__(self, data, motifs, row_indices=None, dtype=None):
    """
    data
        series of numbers, can be numpy array, CategoryCounter, dict
        instances
    motifs
        one label per data element (1D data) or per column (2D data)
    row_indices
        row_indices correspond to original indexes; when omitted for 2D
        data, the number of rows is used
    dtype
        the wrapped array must be safely castable to this type

    Raises
    ------
    ValueError
        if data is empty or all zero / False, if the number of elements
        does not match the number of motifs, or if the array cannot be
        safely cast to dtype
    """
    # todo validate that motifs are strings and row_indices are ints or
    # strings
    # todo change row_indices argument name to row_keys
    if isinstance(data, numpy.ndarray):
        some_data = data.any()
    else:
        some_data = any(data)

    if not some_data or len(data) == 0:
        raise ValueError("Must provide data")

    # data is treated as 2D when its first element has a length
    try:
        len(data[0])
    except TypeError:
        ndim = 1
    else:
        ndim = 2

    num_elements = len(data) if ndim == 1 else len(data[0])
    if num_elements != len(motifs):
        # report num_elements: len(data[0]) raises a TypeError for 1D data,
        # which hid this ValueError
        raise ValueError(
            f"number of data elements {num_elements} != {len(motifs)}"
        )
    motifs = tuple(motifs)

    # create template
    if row_indices is None and ndim == 2:
        row_indices = len(data)

    template = DictArrayTemplate(row_indices, motifs)
    darr = template.wrap(data)
    # the cast result is discarded, this only checks the cast is safe
    try:
        darr.array.astype(dtype, casting="safe")
    except TypeError as err:
        raise ValueError(err) from err
    self.__dict__.update(darr.__dict__)
    self.motifs = motifs
    self.motif_length = len(motifs[0])
def test_get_logo_missing(self):
    """get_logo copes with a position that has no values."""
    freqs = [
        [0.1, 0.3, 0.5, 0.1],
        [0.05, 0.8, 0.05, 0.1],
        [0, 0, 0, 0],  # the position with no values
        [0.7, 0.1, 0.1, 0.1],
        [0.6, 0.15, 0.05, 0.2],
    ]
    template = DictArrayTemplate(5, "ACGT")
    # the check is simply that the call completes without an exception
    _ = get_logo(template.wrap(freqs))
def test_get_logo(self):
    """get_logo runs on a fully populated 5 x 4 DictArray."""
    freqs = [
        [0.1, 0.3, 0.5, 0.1],
        [0.25, 0.25, 0.25, 0.25],
        [0.05, 0.8, 0.05, 0.1],
        [0.7, 0.1, 0.1, 0.1],
        [0.6, 0.15, 0.05, 0.2],
    ]
    template = DictArrayTemplate(5, "ACGT")
    # NOTE(review): the result is not inspected, only that the call succeeds
    _ = get_logo(template.wrap(freqs))
def test_numpy_ops(self):
    """A DictArray can be used as an operand in numpy dot products."""
    # 0.7 on the diagonal, 0.1 elsewhere
    psub = [[0.7 if row == col else 0.1 for col in range(4)] for row in range(4)]
    darr = DictArrayTemplate(list(DNA), list(DNA)).wrap(psub)
    mprobs = numpy.array([0.25] * 4)
    expected = [0.25, 0.25, 0.25, 0.25]
    # both the method and the function form of dot
    assert_allclose(mprobs.dot(darr), expected)
    assert_allclose(numpy.dot(mprobs, darr), expected)
def test_direct_construction(self):
    """A DictArray can be built directly from each supported kind of input."""
    wrapped = DictArrayTemplate("abc", "ABC").wrap(self.a)
    inputs = (
        [[245, 599]],
        {"a": {"b": 4, "c": 5}},
        {("a", "b"): 4, ("a", "c"): 5},
        {"a": 0, "b": 35, "c": 45},
        wrapped,
    )
    for data in inputs:
        # the check is simply that construction completes without an exception
        _ = DictArray(data)
def test_getitem(self):
    """Slicing by key range, alone or with a positional slice, is correct."""
    # 0.7 on the diagonal, 0.1 elsewhere
    psub = [[0.7 if row == col else 0.1 for col in range(4)] for row in range(4)]
    darr = DictArrayTemplate(list(DNA), list(DNA)).wrap(psub)

    # all rows, key slice on the columns
    all_rows = darr[:, "A":"G"]
    assert_allclose(all_rows.to_array(), [[0.1], [0.1], [0.7], [0.1]])

    # positional slice on the rows, key slice on the columns
    last_rows = darr[2:, "A":"G"]
    assert_allclose(last_rows.to_array(), [[0.7], [0.1]])
def test_get_repr_html(self):
    """Exercise the HTML repr used by parent classes for Jupyter display."""
    data = [[3, 7], [2, 8], [5, 5]]
    darr = DictArrayTemplate(list("ABC"), list("ab")).wrap(data)
    got = darr._repr_html_()
    self.assertIsInstance(got, str)
    # assertTrue(len(got), 100) used 100 as the failure message and only
    # checked for a non-empty string; compare the length explicitly instead
    self.assertGreater(len(got), 100)

    # case where 1D array
    a = [4, 6, 4, 2]
    darr = DictArrayTemplate(["A", "C", "G", "T"]).wrap(a)
    got = darr._repr_html_()
    self.assertNotIn('class="index"', got)

    # case of 3D array
    d3 = numpy.arange(8).reshape((2, 2, 2))
    darr = DictArrayTemplate(2, 2, 2).wrap(d3)
    got = darr._repr_html_()
    self.assertIn("3 dimensional", got)
def test_to_dict_nested(self):
    """to_dict() also converts DictArray instances nested inside a DictArray."""
    inner = DictArrayTemplate("abc", "ABC").wrap(numpy.identity(3, int))
    self.assertEqual(inner.array.tolist(), [[1, 0, 0], [0, 1, 0], [0, 0, 1]])
    # a 2 x 2 DictArray whose elements are themselves DictArrays
    outer = DictArrayTemplate("de", "DE").wrap([[inner, inner], [inner, inner]])
    self.assertIsInstance(outer.to_dict()["d"], dict)