Esempio n. 1
0
    def __init__(self,
                 vars,
                 var_values,
                 data,
                 factors=None,
                 alpha=0.1):
        """
        Set up the node and grow a Chow-Liu tree on the given data slice.

        vars = the sequence of feature ids
        var_values = the sequence of feature values; its first entry is
                     the one handed to the cltree as n_feature_vals
        data = the data slice (2d ndarray) upon which to grow a cltree
        factors = the already computed factors (accepted for when the model
                  has already been computed, but not read in this method)
        alpha = smoothing parameter
        """
        Node.__init__(self, frozenset(vars))

        #
        # plain bookkeeping: smoothing parameter and training slice
        # (data is assumed to never be None)
        self._alpha = alpha
        self._data = data

        self.vars = numpy.array(vars)

        #
        # all variables are assumed to be homogeneous, hence only the
        # first entry is looked at
        # TODO: generalize this
        self._n_var_vals = var_values[0]
        self.var_values = numpy.array(var_values)

        #
        # the tree is grown right away on the stored slice
        tree_options = dict(features=self.vars,
                            n_feature_vals=self._n_var_vals,
                            feature_vals=self.var_values,
                            alpha=alpha,
                            sparse=True,
                            mem_free=True)
        self._cltree = CLTree(data, **tree_options)
Esempio n. 2
0
def test_compute_joint_probs():
    """
    Compare CLTree's pairwise joint frequencies and log joint probabilities
    with values counted directly from the synthetic data, for the sparse
    and the dense variants, first with alpha=0 and then with alpha=1.
    """
    def reference_logs(joint_probs):
        # move to the log domain, replace infinite entries with LOG_ZERO
        # and leave the diagonal (i, i) entries at zero so that the match
        # with the tree can be complete
        logs = numpy.log(joint_probs)
        logs[numpy.isinf(logs)] = LOG_ZERO
        for feature in range(n_features):
            logs[feature, feature] = 0
        return logs

    #
    # growing the (sparse) tree upon the synthetic data
    sparse_clt = CLTree(data=data, alpha=0.0, sparse=True, mem_free=False)

    #
    # counting the co-occurrences for each ordered pair of distinct features
    pair_counts = numpy.zeros((n_features, n_features, 2, 2))
    for first in range(n_features):
        for second in range(n_features):
            if first == second:
                continue
            for row in data:
                pair_counts[first, second, row[first], row[second]] += 1

    print('Computed co freqs')
    print(pair_counts)
    print(sparse_clt._joint_freqs)

    assert_almost_equal(sparse_clt._joint_freqs, pair_counts)

    #
    # no smoothing: the sparse tree first, then a dense one
    expected = reference_logs(pair_counts / n_instances)
    assert_almost_equal(expected, sparse_clt._log_joint_probs)

    dense_clt = CLTree(data=data, alpha=0.0, sparse=False, mem_free=False)
    assert_almost_equal(expected, dense_clt._log_joint_probs)

    #
    # changing alpha: again sparse first, then dense
    alpha = 1.0
    expected = reference_logs(
        (pair_counts + alpha) / (n_instances + 4.0 * alpha))

    for use_sparse in (True, False):
        smoothed_clt = CLTree(data=data,
                              alpha=alpha,
                              sparse=use_sparse,
                              mem_free=False)
        assert_almost_equal(expected, smoothed_clt._log_joint_probs)
Esempio n. 3
0
    def __init__(self,
                 vars,
                 var_values,
                 data,
                 factors=None,
                 alpha=0.1):
        """
        Set up the node and grow a Chow-Liu tree on the given data slice.

        vars = the sequence of feature ids
        var_values = the sequence of feature values; its first entry is
                     the one handed to the cltree as n_feature_vals
        data = the data slice (2d ndarray) upon which to grow a cltree
        factors = the already computed factors (accepted for when the model
                  has already been computed, but not read in this method)
        alpha = smoothing parameter
        """
        Node.__init__(self, frozenset(vars))

        #
        # plain bookkeeping: smoothing parameter and training slice
        # (data is assumed to never be None)
        self._alpha = alpha
        self._data = data

        self.vars = numpy.array(vars)

        #
        # all variables are assumed to be homogeneous, hence only the
        # first entry is looked at
        # TODO: generalize this
        self._n_var_vals = var_values[0]
        self.var_values = numpy.array(var_values)

        #
        # the tree is grown right away on the stored slice
        tree_options = dict(features=self.vars,
                            n_feature_vals=self._n_var_vals,
                            feature_vals=self.var_values,
                            alpha=alpha,
                            sparse=True,
                            mem_free=True)
        self._cltree = CLTree(data, **tree_options)
Esempio n. 4
0
    def smooth_probs(self, alpha, data=None):
        """
        Re-estimate the node with a new smoothing parameter.

        The only way to smooth here is to rebuild the whole tree: a new
        CLTree replaces the current one. When data is given it replaces the
        stored slice first, otherwise the stored slice is reused.
        """
        if data is not None:
            self._data = data
        # when no data is passed nothing is raised: the stored slice is used

        self._alpha = alpha

        rebuild_options = dict(data=self._data,
                               features=self.vars,
                               n_feature_vals=self._n_var_vals,
                               feature_vals=self.var_values,
                               alpha=alpha,
                               sparse=True,
                               mem_free=True)
        self._cltree = CLTree(**rebuild_options)
Esempio n. 5
0
def test_eval_instance():
    """
    Check that CLTree.eval and CLTree.eval_fact return the same value on
    every instance, for a tree grown on the whole data and for one grown
    on the columns 0, 2 and 3 only.

    The reference values taken from Nico's code are just printed beside
    the computed ones; the assertions against them are disabled.
    """
    #
    # reference structures and log-likelihoods taken from Nico's code
    # (the two tree arrays are only needed by the disabled assertions)
    nico_cltree_tree = numpy.array([-1, 2, 0, 2])
    nico_cltree_tree[0] = 0
    nico_cltree_lls = numpy.array([-2.01490302054,
                                   -1.20397280433,
                                   -1.20397280433,
                                   -1.79175946923,
                                   -1.60943791243,
                                   -1.60943791243])

    nico_cltree_subtree = numpy.array([-1, 0, 1])
    nico_cltree_subtree[0] = 0
    nico_cltree_sublls = numpy.array([-1.09861228867,
                                      -0.69314718056,
                                      -0.69314718056,
                                      -1.79175946923,
                                      -1.09861228867,
                                      -0.69314718056])

    def compare_evals(tree, reference_lls):
        # both evaluation routines must agree on each instance of data
        for idx, row in enumerate(data):
            plain_ll = tree.eval(row)
            fact_ll = tree.eval_fact(row)
            # assert_almost_equal(reference_lls[idx], plain_ll)
            assert_almost_equal(plain_ll, fact_ll)
            print(plain_ll, reference_lls[idx])

    #
    # growing the tree on data
    full_clt = CLTree(data, alpha=0.0, sparse=True, mem_free=False)
    print(full_clt)

    # assert_array_equal(nico_cltree_tree,
    #                    full_clt._tree)
    compare_evals(full_clt, nico_cltree_lls)

    #
    # now by obscuring one column
    kept_columns = [0, 2, 3]
    sub_clt = CLTree(data[:, kept_columns],
                     features=numpy.array([0, 2, 3]),
                     alpha=0.0,
                     sparse=True,
                     mem_free=False)
    print(sub_clt)

    # assert_array_equal(nico_cltree_subtree,
    #                    sub_clt._tree)
    compare_evals(sub_clt, nico_cltree_sublls)
Esempio n. 6
0
def test_compute_probs_dense():
    """
    Check the marginal frequencies and the log marginal probabilities of a
    dense CLTree against values computed directly from the data, first
    with alpha=0 and then with alpha=1.
    """
    def clamped_log(values):
        # log domain, with infinite entries replaced by LOG_ZERO
        logs = numpy.log(values)
        logs[numpy.isinf(logs)] = LOG_ZERO
        return logs

    #
    # creating the tree upon synthetic data
    clt = CLTree(data=data, alpha=0.0, sparse=False, mem_free=False)

    #
    # per-feature counts of ones (column sums) and of zeros (the rest)
    one_counts = data.sum(axis=0)
    zero_counts = n_instances - one_counts
    counts = numpy.column_stack([zero_counts, one_counts])
    probs = counts / n_instances
    print('Computed marg freqs and probs', counts, probs)

    expected_logs = clamped_log(probs)

    print('Computed marg logs for alpha=0', expected_logs)
    print('CLT marg logs for alpha=0', clt._log_marg_probs)

    assert_almost_equal(clt._log_marg_probs, expected_logs)
    assert_almost_equal(clt._marg_freqs, one_counts)

    #
    # now with another value for alpha
    alpha = 1.0
    clt = CLTree(data=data, alpha=alpha, sparse=False, mem_free=False)
    probs = (counts + 2 * alpha) / (n_instances + 4 * alpha)
    print('Computed probs with alpha=', alpha, probs)

    expected_logs = clamped_log(probs)

    print('Computed marg logs for alpha=', alpha, expected_logs)
    print('CLT marg logs for alpha=', alpha, clt._log_marg_probs)

    assert_almost_equal(clt._log_marg_probs, expected_logs)
    assert_almost_equal(clt._marg_freqs, one_counts)
Esempio n. 7
0
    def smooth_probs(self, alpha, data=None):
        """
        Re-estimate the node with a new smoothing parameter.

        The only way to smooth here is to rebuild the whole tree: a new
        CLTree replaces the current one. When data is given it replaces the
        stored slice first, otherwise the stored slice is reused.
        """
        if data is not None:
            self._data = data
        # when no data is passed nothing is raised: the stored slice is used

        self._alpha = alpha

        rebuild_options = dict(data=self._data,
                               features=self.vars,
                               n_feature_vals=self._n_var_vals,
                               feature_vals=self.var_values,
                               alpha=alpha,
                               sparse=True,
                               mem_free=True)
        self._cltree = CLTree(**rebuild_options)
Esempio n. 8
0
def test_eval_instance():
    """
    Check that CLTree.eval and CLTree.eval_fact return the same value on
    every instance, for a tree grown on the whole data and for one grown
    on the columns 0, 2 and 3 only.

    The reference values taken from Nico's code are just printed beside
    the computed ones; the assertions against them are disabled.
    """
    #
    # reference structures and log-likelihoods taken from Nico's code
    # (the two tree arrays are only needed by the disabled assertions)
    nico_cltree_tree = numpy.array([-1, 2, 0, 2])
    nico_cltree_tree[0] = 0
    nico_cltree_lls = numpy.array([
        -2.01490302054, -1.20397280433, -1.20397280433, -1.79175946923,
        -1.60943791243, -1.60943791243
    ])

    nico_cltree_subtree = numpy.array([-1, 0, 1])
    nico_cltree_subtree[0] = 0
    nico_cltree_sublls = numpy.array([
        -1.09861228867, -0.69314718056, -0.69314718056, -1.79175946923,
        -1.09861228867, -0.69314718056
    ])

    def compare_evals(tree, reference_lls):
        # both evaluation routines must agree on each instance of data
        for idx, row in enumerate(data):
            plain_ll = tree.eval(row)
            fact_ll = tree.eval_fact(row)
            # assert_almost_equal(reference_lls[idx], plain_ll)
            assert_almost_equal(plain_ll, fact_ll)
            print(plain_ll, reference_lls[idx])

    #
    # growing the tree on data
    full_clt = CLTree(data, alpha=0.0, sparse=True, mem_free=False)
    print(full_clt)

    # assert_array_equal(nico_cltree_tree,
    #                    full_clt._tree)
    compare_evals(full_clt, nico_cltree_lls)

    #
    # now by obscuring one column
    kept_columns = [0, 2, 3]
    sub_clt = CLTree(data[:, kept_columns],
                     features=numpy.array([0, 2, 3]),
                     alpha=0.0,
                     sparse=True,
                     mem_free=False)
    print(sub_clt)

    # assert_array_equal(nico_cltree_subtree,
    #                    sub_clt._tree)
    compare_evals(sub_clt, nico_cltree_sublls)
Esempio n. 9
0
def test_compute_mi():
    """
    Check CLTree's mutual information matrix against one computed by hand
    from the marginal and the pairwise joint counts, for the dense and the
    sparse variants, first with alpha=0 and then with alpha=0.5.
    """
    diagonal = numpy.arange(n_features)

    def clamped_log(values):
        # log domain, with infinite entries replaced by LOG_ZERO
        logs = numpy.log(values)
        logs[numpy.isinf(logs)] = LOG_ZERO
        return logs

    def reference_mi(marg_probs, pair_probs):
        log_marg = clamped_log(marg_probs)

        #
        # log_prods[i, j, k, h] = log_marg[i, k] + log_marg[j, h]
        # for i != j, while the (i, i) entries stay at zero
        log_prods = (log_marg[:, numpy.newaxis, :, numpy.newaxis] +
                     log_marg[numpy.newaxis, :, numpy.newaxis, :])
        log_prods[diagonal, diagonal] = 0

        log_pair = clamped_log(pair_probs)
        log_pair[diagonal, diagonal] = 0

        #
        # summing p(x, y) * (log p(x, y) - log p(x)p(y)) over the values
        terms = numpy.exp(log_pair) * (log_pair - log_prods)
        return terms.sum(axis=2).sum(axis=2)

    #
    # marginal counts: ones are the column sums, zeros the remaining rows
    one_counts = data.sum(axis=0)
    zero_counts = n_instances - one_counts
    counts = numpy.column_stack([zero_counts, one_counts])
    probs = counts / n_instances
    print('Computed marg freqs and probs', counts, probs)

    #
    # co-occurrence counts for each ordered pair of distinct features
    joint_counts = numpy.zeros((n_features, n_features, 2, 2))
    for first in range(n_features):
        for second in range(n_features):
            if first == second:
                continue
            for row in data:
                joint_counts[first, second, row[first], row[second]] += 1

    print('Computed co freqs')
    print(joint_counts)

    #
    # no smoothing
    mutual_info = reference_mi(probs, joint_counts / n_instances)

    print('Computed MI:', mutual_info, type(mutual_info))

    clt = CLTree(data, alpha=0.0, sparse=False, mem_free=False)
    print('CLTree', clt._mutual_info, type(clt._mutual_info))

    assert_almost_equal(mutual_info, clt._mutual_info)

    #
    # adding sparsity
    clt = CLTree(data, alpha=0.0, sparse=True, mem_free=False)
    assert_almost_equal(mutual_info, clt._mutual_info)

    #
    # now with alpha
    alpha = 0.5
    smoothed_denominator = n_instances + 4 * alpha
    mutual_info = reference_mi(
        (counts + 2 * alpha) / smoothed_denominator,
        (joint_counts + alpha) / smoothed_denominator)

    for use_sparse in (False, True):
        clt = CLTree(data, alpha=alpha, sparse=use_sparse, mem_free=False)
        assert_almost_equal(mutual_info, clt._mutual_info)
Esempio n. 10
0
def test_compute_cond_probs():
    """
    Check CLTree's joint frequencies, log conditional probabilities and
    factors against values computed by hand from the data.

    The reference log conditionals are obtained as the log joint
    probabilities minus the log marginals of the conditioning feature.
    They are compared with the tree's ones for alpha=0 (sparse tree) and
    for alpha=2 (dense and sparse trees). For the alpha=0 sparse tree each
    non-root factor is also compared with the log conditional of the
    feature given its parent in the tree.
    """
    diagonal = numpy.arange(n_features)

    def clamped_log(values):
        # log domain, with infinite entries replaced by LOG_ZERO
        logs = numpy.log(values)
        logs[numpy.isinf(logs)] = LOG_ZERO
        return logs

    def reference_log_cond(marg_probs, pair_probs):
        #
        # to have a complete match the diagonal entries are left to zero
        log_pair = clamped_log(pair_probs)
        log_pair[diagonal, diagonal] = 0

        log_marg = clamped_log(marg_probs)

        #
        # log_cond[i, j, k, h] = log_pair[i, j, k, h] - log_marg[j, h]
        # for i != j, while the (i, i) entries stay at zero
        log_cond = log_pair - \
            log_marg[numpy.newaxis, :, numpy.newaxis, :]
        log_cond[diagonal, diagonal] = 0
        return log_cond

    #
    # creating the tree upon synthetic data
    clt = CLTree(data=data, alpha=0.0, sparse=False, mem_free=False)

    #
    # co-occurrence counts for each ordered pair of distinct features
    joint_counts = numpy.zeros((n_features, n_features, 2, 2))
    for i in range(n_features):
        for j in range(n_features):
            if i != j:
                for instance in data:
                    joint_counts[i, j, instance[i], instance[j]] += 1

    print('Computed co freqs')
    print(joint_counts)
    print(clt._joint_freqs)

    assert_almost_equal(clt._joint_freqs, joint_counts)

    #
    # checking sparseness
    clt = CLTree(data=data, alpha=0.0, sparse=True, mem_free=False)
    assert_almost_equal(clt._joint_freqs, joint_counts)

    #
    # marginal counts: ones are the column sums, zeros the remaining rows
    counts_1 = data.sum(axis=0)
    counts_0 = n_instances - counts_1
    counts = numpy.column_stack([counts_0, counts_1])
    probs = counts / n_instances
    print('Computed marg freqs and probs', counts, probs)

    log_cond_probs = reference_log_cond(probs, joint_counts / n_instances)

    print('Computed log cond probs with alpha=0', log_cond_probs)
    assert_array_almost_equal(log_cond_probs, clt._log_cond_probs)

    #
    # testing factors: features that are their own parent are skipped
    print('\nTesting factors')
    for i in range(n_features):
        parent_id = clt._tree[i]
        if i == parent_id:
            continue
        for k in range(2):
            for h in range(2):
                print(clt._log_cond_probs[i, parent_id, k, h],
                      clt._factors[i, k, h])
                assert_array_almost_equal(
                    clt._log_cond_probs[i, parent_id, k, h],
                    clt._factors[i, k, h])

    #
    # now with smoothing
    alpha = 2.0
    clt = CLTree(data, alpha=alpha, sparse=False, mem_free=False)

    probs = (counts + 2 * alpha) / (n_instances + 4 * alpha)
    print('Computed marg freqs and probs', counts, probs)

    log_cond_probs = reference_log_cond(
        probs,
        (joint_counts + alpha) / (n_instances + 4 * alpha))

    # reporting the alpha actually in use (it is no longer 0 here)
    print('Computed log cond probs with alpha=', alpha, log_cond_probs)
    assert_array_almost_equal(log_cond_probs, clt._log_cond_probs)

    #
    # sparse version
    clt = CLTree(data, alpha=alpha, sparse=True, mem_free=False)
    assert_array_almost_equal(log_cond_probs, clt._log_cond_probs)
Esempio n. 11
0
class CLTreeNode(Node):

    """
    An input node representing a Chow-Liu Tree over a set of r.v.

    The node keeps the data slice it was trained on, since the only way
    to change its smoothing is to grow the tree again (see smooth_probs).
    """

    def __init__(self,
                 vars,
                 var_values,
                 data,
                 factors=None,
                 alpha=0.1):
        """
        Set up the node and grow a Chow-Liu tree on the given data slice.

        vars = the sequence of feature ids
        var_values = the sequence of feature values; its first entry is
                     the one handed to the cltree as n_feature_vals
        data = the data slice (2d ndarray) upon which to grow a cltree
        factors = the already computed factors (accepted for when the model
                  has already been computed, but not read in this method)
        alpha = smoothing parameter
        """
        Node.__init__(self, frozenset(vars))

        self.vars = numpy.array(vars)

        self._alpha = alpha
        #
        # assuming all variables to be homogeneous
        # TODO: generalize this
        self._n_var_vals = var_values[0]
        self.var_values = numpy.array(var_values)

        #
        # assuming data is never None
        self._data = data
        self._cltree = CLTree(data,
                              features=self.vars,
                              n_feature_vals=self._n_var_vals,
                              feature_vals=self.var_values,
                              alpha=alpha,
                              sparse=True,
                              mem_free=True)

    def smooth_probs(self, alpha, data=None):
        """
        Re-estimate the node with a new smoothing parameter.

        The only way to smooth here is to rebuild the whole tree. When data
        is given it replaces the stored slice first, otherwise the stored
        slice is reused (nothing is raised when no data is passed).
        """
        self._alpha = alpha

        if data is not None:
            self._data = data

        self._cltree = CLTree(data=self._data,
                              features=self.vars,
                              n_feature_vals=self._n_var_vals,
                              feature_vals=self.var_values,
                              alpha=alpha,
                              sparse=True,
                              mem_free=True)

    def eval(self, obs):
        """
        Dispatching inference to the cltree

        The value returned by the tree's eval_fact for obs is stored in
        self.log_val, self.log_der is reset to LOG_ZERO, and nothing is
        returned.
        """
        #
        # TODO: do something for the derivatives
        self.log_der = LOG_ZERO

        # self.log_val = self._cltree.eval(obs)
        self.log_val = self._cltree.eval_fact(obs)

    def mpe_eval(self, obs):
        """
        MPE inference is not available for this node: always raises
        NotImplementedError.
        """
        raise NotImplementedError('MPE inference not yet implemented')

    def n_children(self):
        """
        Return the number of variables the node is defined over.
        """
        return len(self.vars)

    def node_type_str(self):
        """
        Return the symbol identifying this kind of node.
        """
        return CHOW_LIU_TREE_NODE_SYM

    def node_short_str(self):
        """
        Return a one line description of the node in the form
        "<type> <id> <var,var,...> <tree> <factors>".
        """
        #
        # the feature ids are kept in a numpy array, so they have to be
        # turned into strings before joining them
        vars_str = ','.join(str(var) for var in self.vars)
        #
        # a single template: formatting only the last piece of a
        # concatenation would leave the first placeholders unsubstituted
        template = "{type} {id} <{vars}> {tree} {factors}"
        return template.format(type=self.node_type_str(),
                               id=self.id,
                               vars=vars_str,
                               tree=self._cltree.tree_repr(),
                               factors=self._cltree.factors_repr())

    def __repr__(self):
        """
        Return a multi-line description made of the base Node
        representation followed by the variables, the number of values and
        the tree representation.
        """
        base = Node.__repr__(self)

        return ("""CLTree Smoothed Node {line1}
            vars: {vars} vals: {vals} tree:{tree}""".
                format(line1=base,
                       vars=self.vars,
                       vals=self._n_var_vals,
                       tree=self._cltree.tree_repr()))
Esempio n. 12
0
class CLTreeNode(Node):

    """
    An input node representing a Chow-Liu Tree over a set of r.v.

    The node keeps the data slice it was trained on, since the only way
    to change its smoothing is to grow the tree again (see smooth_probs).
    """

    def __init__(self,
                 vars,
                 var_values,
                 data,
                 factors=None,
                 alpha=0.1):
        """
        Set up the node and grow a Chow-Liu tree on the given data slice.

        vars = the sequence of feature ids
        var_values = the sequence of feature values; its first entry is
                     the one handed to the cltree as n_feature_vals
        data = the data slice (2d ndarray) upon which to grow a cltree
        factors = the already computed factors (accepted for when the model
                  has already been computed, but not read in this method)
        alpha = smoothing parameter
        """
        Node.__init__(self, frozenset(vars))

        self.vars = numpy.array(vars)

        self._alpha = alpha
        #
        # assuming all variables to be homogeneous
        # TODO: generalize this
        self._n_var_vals = var_values[0]
        self.var_values = numpy.array(var_values)

        #
        # assuming data is never None
        self._data = data
        self._cltree = CLTree(data,
                              features=self.vars,
                              n_feature_vals=self._n_var_vals,
                              feature_vals=self.var_values,
                              alpha=alpha,
                              sparse=True,
                              mem_free=True)

    def smooth_probs(self, alpha, data=None):
        """
        Re-estimate the node with a new smoothing parameter.

        The only way to smooth here is to rebuild the whole tree. When data
        is given it replaces the stored slice first, otherwise the stored
        slice is reused (nothing is raised when no data is passed).
        """
        self._alpha = alpha

        if data is not None:
            self._data = data

        self._cltree = CLTree(data=self._data,
                              features=self.vars,
                              n_feature_vals=self._n_var_vals,
                              feature_vals=self.var_values,
                              alpha=alpha,
                              sparse=True,
                              mem_free=True)

    def eval(self, obs):
        """
        Dispatching inference to the cltree

        The value returned by the tree's eval_fact for obs is stored in
        self.log_val, self.log_der is reset to LOG_ZERO, and nothing is
        returned.
        """
        #
        # TODO: do something for the derivatives
        self.log_der = LOG_ZERO

        # self.log_val = self._cltree.eval(obs)
        self.log_val = self._cltree.eval_fact(obs)

    def mpe_eval(self, obs):
        """
        MPE inference is not available for this node: always raises
        NotImplementedError.
        """
        raise NotImplementedError('MPE inference not yet implemented')

    def n_children(self):
        """
        Return the number of variables the node is defined over.
        """
        return len(self.vars)

    def node_type_str(self):
        """
        Return the symbol identifying this kind of node.
        """
        return CHOW_LIU_TREE_NODE_SYM

    def node_short_str(self):
        """
        Return a one line description of the node in the form
        "<type> <id> <var,var,...> <tree> <factors>".
        """
        #
        # the feature ids are kept in a numpy array, so they have to be
        # turned into strings before joining them
        vars_str = ','.join(str(var) for var in self.vars)
        #
        # a single template: formatting only the last piece of a
        # concatenation would leave the first placeholders unsubstituted
        template = "{type} {id} <{vars}> {tree} {factors}"
        return template.format(type=self.node_type_str(),
                               id=self.id,
                               vars=vars_str,
                               tree=self._cltree.tree_repr(),
                               factors=self._cltree.factors_repr())

    def __repr__(self):
        """
        Return a multi-line description made of the base Node
        representation followed by the variables, the number of values and
        the tree representation.
        """
        base = Node.__repr__(self)

        return ("""CLTree Smoothed Node {line1}
            vars: {vars} vals: {vals} tree:{tree}""".
                format(line1=base,
                       vars=self.vars,
                       vals=self._n_var_vals,
                       tree=self._cltree.tree_repr()))