Esempio n. 1
0
 def test_convert_1D_dict(self):
     """the output of convert_1D_dict can be fed to a DictArrayTemplate"""
     source = {"a": 0, "b": 35, "c": 45}
     values, names = convert_1D_dict(source)
     # the names define the template, the values are what it wraps
     wrapped = DictArrayTemplate(names).wrap(values)
     self.assertEqual(wrapped.array.tolist(), [0, 35, 45])
Esempio n. 2
0
 def test_to_dict_nested(self):
     """DictArray.to_dict() should also turn nested DictArray instances
     into dict's."""
     identity = numpy.identity(3, int)
     inner = DictArrayTemplate("abc", "ABC").wrap(identity)
     self.assertEqual(inner.array.tolist(), [[1, 0, 0], [0, 1, 0], [0, 0, 1]])
     # a 2 x 2 grid in which every cell is itself the wrapped 3 x 3 array
     grid = [[inner, inner], [inner, inner]]
     outer = DictArrayTemplate("de", "DE").wrap(grid)
     as_dict = outer.to_dict()
     self.assertTrue(isinstance(as_dict["d"], dict))
Esempio n. 3
0
    def get_all_rate_matrices(self, calibrated=True):
        """returns all rate matrices (Q) as a dict, keyed by scope

        Parameters
        ----------
        calibrated : bool
            If True, each matrix is returned as stored by the "Q"
            definition. If False, each matrix is multiplied by the length
            parameter of its edge and, if it is a rate heterogeneity model,
            also by the rate for its bin.

        Returns
        -------
        {scope: DictArray, ...}

        Notes
        -----
        If a single rate matrix, the key is an empty tuple
        """
        defn = self.defn_for["Q"]

        rate_het = self.defn_for.get("rate", False)
        if rate_het:
            # map each bin name to its position in the rate definition ...
            bin_index = rate_het.valid_dimensions.index("bin")
            bin_names = [k[bin_index] for k in rate_het.index]
            bin_names = {n: i for i, n in enumerate(bin_names)}
            # ... then switch to the position of "bin" within a Q scope
            bin_index = defn.valid_dimensions.index("bin")
        else:
            bin_names = None
            bin_index = None

        used_dims = defn.used_dimensions()
        edge_index = defn.valid_dimensions.index("edge")

        # positions within a scope tuple that make up the result key
        indices = {defn.valid_dimensions.index(k) for k in used_dims}
        if not calibrated:
            # scaling by length (and rate) makes the matrices edge (and bin)
            # specific, so those dimensions must be part of the key
            indices.add(edge_index)
            if rate_het:
                indices.add(bin_index)

        indices = sorted(indices)
        result = {}
        darr_template = DictArrayTemplate(self._motifs, self._motifs)
        for scope, index in defn.index.items():
            q = defn.values[index]  # this gives the appropriate Q
            # from scope we extract only the relevant dimensions
            key = tuple(numpy.take(scope, indices))
            # copy so the in-place scaling below never touches the stored Q
            q = q.copy()
            if not calibrated:
                length = self.get_param_value("length", edge=scope[edge_index])
                if rate_het:
                    bdex = bin_names[scope[bin_index]]
                    rate = rate_het.values[bdex]
                    length *= rate
                q *= length
            result[key] = darr_template.wrap(q)
            if not indices and calibrated:
                break  # single rate matrix

        return result
Esempio n. 4
0
    def reconstruct_ancestral_seqs(self, locus=None):
        """computes, for every non-tip node in the tree, the conditional
        probabilities of each state.

        Parameters
        ----------
        locus
            a named locus

        Returns
        -------
        {node_name: DictArray, ...}

        Notes
        -----
        Rows of each DictArray are alignment columns, columns are motifs.
        The "fixed_motif" rule of every visited node is set back to -1
        before moving on, even if an error is raised part way through.
        """
        template = None
        ancestral = {}
        for node in self._tree.get_edge_vector():
            if node.istip():
                continue

            # keyword arguments common to every set_param_rule call below
            shared = dict(edge=node.name, locus=locus, is_constant=True)
            per_motif = []
            try:
                for state in range(len(self._motifs)):
                    self.set_param_rule("fixed_motif", value=state, **shared)
                    site_likelihoods = self.get_full_length_likelihoods(locus=locus)
                    per_motif.append(site_likelihoods)
                    if template is None:
                        # built once, from the first likelihoods obtained
                        template = DictArrayTemplate(
                            site_likelihoods.shape[0], self._motifs
                        )
            finally:
                self.set_param_rule("fixed_motif", value=-1, **shared)

            # per_motif is motif x site, the result is site x motif
            ancestral[node.name] = template.wrap(numpy.asarray(per_motif).T)
        return ancestral
Esempio n. 5
0
    def test_deserialise_tabular_dictarray(self):
        """a DictArray survives a to_json / deserialise_object round trip"""
        from cogent3.util.dict_array import DictArrayTemplate

        header = ["id", "foo", "bar"]
        rows = [
            [1, "abc", 11],
            [2, "bca", 22],
            [3, "cab", 33],
            [4, "abc", 44],
            [5, "bca", 55],
        ]
        original = DictArrayTemplate(5, header).wrap(rows)
        serialised = original.to_json()
        restored = deserialise_object(serialised)
        self.assertEqual(restored.to_dict(), original.to_dict())
Esempio n. 6
0
    def get_all_psubs(self):
        """returns all psubs as a dict

        The "dsubs" definition is used when there is one, otherwise "psubs".
        Each key is the tuple of scope values for the used dimensions, each
        value is the matrix wrapped by a motif x motif DictArrayTemplate.
        """
        try:
            psub_defn = self.defn_for["dsubs"]
        except KeyError:
            psub_defn = self.defn_for["psubs"]

        used = psub_defn.used_dimensions()
        valid = psub_defn.valid_dimensions
        # positions, within a scope tuple, of the dimensions that vary
        positions = [valid.index(dim) for dim in used if dim in valid]

        psubs = {}
        template = DictArrayTemplate(self._motifs, self._motifs)
        for scope, value_index in psub_defn.index.items():
            matrix = psub_defn.values[value_index]
            scope_key = tuple(numpy.take(scope, positions))
            psubs[scope_key] = template.wrap(matrix)
        return psubs
Esempio n. 7
0
    def __init__(self, data, motifs, row_indices=None, dtype=None):
        """
        Parameters
        ----------
        data
            series of numbers (1D) or a series of such series (2D), can be
            numpy array, CategoryCounter, dict instances
        motifs
            series of column keys, one per data element (1D) or one per
            element of the first row (2D)
        row_indices
            row_indices correspond to original indexes, for 2D data defaults
            to the number of rows
        dtype
            the wrapped array must be safely castable to this numpy dtype

        Raises
        ------
        ValueError
            if data is empty or has no truthy element, if the number of data
            elements differs from the number of motifs, or if the data
            cannot be safely cast to dtype
        """
        # todo validate that motifs are strings and row_indices are ints or
        # strings
        # todo change row_indices argument name to row_keys
        # NOTE(review): the CategoryCounter / dict support listed above comes
        # from the earlier docstring; data[0] below looks positional, confirm
        if isinstance(data, numpy.ndarray):
            some_data = data.any()
        else:
            some_data = any(data)

        if not some_data or len(data) == 0:
            raise ValueError("Must provide data")

        # data is 1D when its first element has no length
        try:
            len(data[0])
        except TypeError:
            ndim = 1
        else:
            ndim = 2
        num_elements = len(data) if ndim == 1 else len(data[0])
        if num_elements != len(motifs):
            # report num_elements: len(data[0]) raises TypeError for 1D
            # data, which used to mask this ValueError
            raise ValueError(
                f"number of data elements {num_elements} != {len(motifs)}"
            )
        motifs = tuple(motifs)

        # create template
        if row_indices is None and ndim == 2:
            row_indices = len(data)

        template = DictArrayTemplate(row_indices, motifs)
        darr = template.wrap(data)
        try:
            # validation only, the converted array is deliberately discarded
            darr.array.astype(dtype, casting="safe")
        except TypeError as err:
            raise ValueError(err) from err
        self.__dict__.update(darr.__dict__)
        self.motifs = motifs
        self.motif_length = len(motifs[0])
Esempio n. 8
0
    def get_all_rate_matrices(self, calibrated=True):
        """returns all rate matrices (Q) as a dict, keyed by scope

        Parameters
        ----------
        calibrated : bool
            If True, the rate matrix is scaled such that
            ``sum(pi_i * Qii) == 1``. If False, that calibrated matrix is
            further multiplied by the length parameter of the edge (and, for
            a rate heterogeneity model, by the rate parameter of the bin).

        Returns
        -------
        {scope: DictArray, ...}

        Notes
        -----
        For a single rate matrix (e.g. a time-homogeneous model) the only
        key is an empty tuple.
        """
        q_defn = self.defn_for["Q"]
        rate_defn = self.defn_for.get("rate", False)

        bin_order = None
        bin_dim = None
        if rate_defn:
            # bin name -> position of that bin in the rate definition
            rate_bin_dim = rate_defn.valid_dimensions.index("bin")
            bin_order = {
                rate_scope[rate_bin_dim]: position
                for position, rate_scope in enumerate(rate_defn.index)
            }
            # where the bin name sits within a Q scope
            bin_dim = q_defn.valid_dimensions.index("bin")

        used = q_defn.used_dimensions()
        edge_dim = q_defn.valid_dimensions.index("edge")

        scale_by_length = not calibrated
        key_dims = {q_defn.valid_dimensions.index(name) for name in used}
        if scale_by_length:
            # length (and rate) scaling is edge (and bin) specific
            key_dims.add(edge_dim)
            if rate_defn:
                key_dims.add(bin_dim)
        key_dims = sorted(key_dims)

        # with nothing to key on, the first matrix is the only one needed
        single_matrix = calibrated and not key_dims

        wrap = DictArrayTemplate(self._motifs, self._motifs).wrap
        matrices = {}
        for scope, position in q_defn.index.items():
            stored_q = q_defn.values[position]
            # only the relevant dimensions of the scope form the key
            key = tuple(numpy.take(scope, key_dims))
            # scale a copy, never the stored matrix
            q = stored_q.copy()
            if scale_by_length:
                scale = self.get_param_value("length", edge=scope[edge_dim])
                if rate_defn:
                    scale *= rate_defn.values[bin_order[scope[bin_dim]]]
                q *= scale
            matrices[key] = wrap(q)
            if single_matrix:
                break

        return matrices