Example #1
def test_piecewise_logpdf():
    pw = PieceWise([0, 1], [2], sigma=1, flip=.8)
    # x,z
    pw.simulate(None, [0, 1], None, {2: 1})
    pw.logpdf(None, {0: 1.5, 1: 0}, None, {2: 1})

    # x
    pw.simulate(None, [0], None, {2: 1})
    pw.logpdf(None, {0: 1.5}, None, {2: 1})

    # z
    pw.simulate(None, [1], None, {2: 1})
    assert np.allclose(
        logsumexp([
            pw.logpdf(None, {1: 0}, None, {2: 1}),
            pw.logpdf(None, {1: 1}, None, {2: 1})
        ]), 0)

    # z|x
    pw.simulate(None, [1], {0: 1.5}, {2: 1})
    assert np.allclose(
        logsumexp([
            pw.logpdf(None, {1: 0}, {0: 1.5}, {2: 1}),
            pw.logpdf(None, {1: 1}, {0: 1.5}, {2: 1})
        ]), 0)

    # x|z
    pw.simulate(None, [0], {1: 0}, {2: 1})
    pw.logpdf(None, {0: 1.5}, {1: 0}, {2: 1})
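The test above exercises joint, marginal, and conditional queries against the PieceWise CGPM. As a follow-up, here is a minimal sketch (assuming the same PieceWise API and numpy as np) checking that those queries satisfy the chain rule logpdf(x, z) = logpdf(z | x) + logpdf(x):

pw = PieceWise([0, 1], [2], sigma=1, flip=.8)
# Joint density of (x=1.5, z=0) given the input y=1.
lp_joint = pw.logpdf(None, {0: 1.5, 1: 0}, None, {2: 1})
# Conditional of z=0 given x=1.5, and marginal of x=1.5.
lp_conditional = pw.logpdf(None, {1: 0}, {0: 1.5}, {2: 1})
lp_marginal = pw.logpdf(None, {0: 1.5}, None, {2: 1})
assert np.allclose(lp_joint, lp_conditional + lp_marginal)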
Example #2
def test_crp_posterior_logpdf():
    view = retrieve_view()
    fresh_row = {0: 2, 1: 3, 2: .5}
    logps = [
        view.logpdf(None, {view.outputs[0]: k}, fresh_row) for k in [0, 1, 2]
    ]
    assert np.allclose(gu.logsumexp(logps), 0)
Example #3
 def test_one(forest, c):
     D_sub = [(i, row) for (i, row) in enumerate(D) if row[0] not in c]
     for rowid, row in D_sub:
         inputs = {i: row[i] for i in forest.inputs}
         targets = [{0: x} for x in xrange(NUM_CLASSES)]
         lps = [forest.logpdf(rowid, q, None, inputs) for q in targets]
         assert np.allclose(gu.logsumexp(lps), 0)
Example #4
 def calc_predictive_logp(x, y, regressor, counts, alpha):
     logp_uniform = -np.log(len(counts))
     if not hasattr(regressor, 'classes_'):
         return logp_uniform
     elif x not in regressor.classes_:
         return np.log(alpha) + logp_uniform
     else:
         index = list(regressor.classes_).index(x)
         logp_rf = regressor.predict_log_proba([y])[0][index]
         return gu.logsumexp(
             [np.log(alpha) + logp_uniform,
              np.log(1 - alpha) + logp_rf])
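calc_predictive_logp mixes a uniform fallback with the random forest's class probability, weighted by alpha, entirely in log space. A small numeric sketch with made-up values (scipy.special.logsumexp standing in for gu.logsumexp) shows that the combination equals the log of the direct mixture alpha * p_uniform + (1 - alpha) * p_rf:

import numpy as np
from scipy.special import logsumexp

alpha, p_uniform, p_rf = .1, 1. / 3, .8
log_mix = logsumexp([np.log(alpha) + np.log(p_uniform),
                     np.log(1 - alpha) + np.log(p_rf)])
assert np.allclose(log_mix, np.log(alpha * p_uniform + (1 - alpha) * p_rf))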
Example #5
def view_logpdf(view, rowid, targets, constraints):
    if not view.hypothetical(rowid):
        return _logpdf_row(view, targets, view.Zr(rowid))
    Nk = view.Nk()
    N_rows = len(view.Zr())
    K = view.crp.clusters[0].gibbs_tables(-1)
    lp_crp = [Crp.calc_predictive_logp(k, N_rows, Nk, view.alpha()) for k in K]
    lp_constraints = [_logpdf_row(view, constraints, k) for k in K]
    if all(np.isinf(lp_constraints)):
        raise ValueError('Zero density constraints: %s' % (constraints, ))
    lp_cluster = log_normalize(np.add(lp_crp, lp_constraints))
    lp_targets = [_logpdf_row(view, targets, k) for k in K]
    return logsumexp(np.add(lp_cluster, lp_targets))
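view_logpdf marginalizes the targets over cluster assignments: the CRP prior terms are combined with the constraint likelihoods, normalized in log space, and mixed with the per-cluster target densities. A self-contained sketch of the same pattern with invented per-cluster values (scipy helpers standing in for log_normalize and logsumexp):

import numpy as np
from scipy.special import logsumexp

lp_crp = np.log([.5, .3, .2])                  # CRP prior over K clusters
lp_constraints = np.array([-1.0, -2.0, -0.5])  # log p(constraints | k)
lp_cluster = lp_crp + lp_constraints
lp_cluster -= logsumexp(lp_cluster)            # log_normalize
lp_targets = np.array([-0.7, -1.1, -0.9])      # log p(targets | k)
lp = logsumexp(lp_cluster + lp_targets)        # log p(targets | constraints)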
Example #6
 def logpdf(self, rowid, targets, constraints=None, inputs=None):
     constraints = self.populate_constraints(rowid, targets, constraints)
     # XXX Disable logpdf queries without constraints.
     if inputs:
         raise ValueError('Prohibited inputs: %s' % (inputs,))
     if not constraints:
         raise ValueError('Provide at least one constraint: %s'
             % (constraints,))
     self._validate_simulate_logpdf(rowid, targets, constraints)
     # Retrieve the dataset and neighborhoods.
     dataset, neighborhoods = self._find_neighborhoods(targets, constraints)
     models = [self._create_local_model_joint(targets, dataset[n])
         for n in neighborhoods]
     # Compute the logpdf in each neighborhood and take the simple average.
     lp = [m.logpdf(targets) for m in models]
     return gu.logsumexp(lp) - np.log(len(models))
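The last two lines take a simple average of the neighborhood densities in log space, i.e. a log-mean-exp. A quick sketch of that identity with arbitrary values (scipy.special.logsumexp standing in for gu.logsumexp):

import numpy as np
from scipy.special import logsumexp

lp = np.array([-2.3, -1.9, -2.8])
log_mean = logsumexp(lp) - np.log(len(lp))
assert np.allclose(log_mean, np.log(np.mean(np.exp(lp))))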
Example #7
def test_bernoulli():
    # Switch for multiprocess (0 is faster).
    multiprocess = 0

    # Create categorical data of DATA_NUM_0 zeros and DATA_NUM_1 ones.
    data = np.transpose(np.array([[0] * DATA_NUM_0 + [1] * DATA_NUM_1]))

    # Run a single chain for a few iterations.
    engine = Engine(data,
                    cctypes=['categorical'],
                    distargs=[{
                        'k': 2
                    }],
                    rng=gu.gen_rng(0),
                    multiprocess=0)
    engine.transition(NUM_ITER, multiprocess=multiprocess)

    # Simulate from hypothetical row and compute the proportion of ones.
    sample = engine.simulate(-1, [0], N=NUM_SIM, multiprocess=multiprocess)[0]
    sum_b = sum(s[0] for s in sample)
    observed_prob_of_1 = (float(sum_b) / float(NUM_SIM))
    true_prob_of_1 = float(DATA_NUM_1) / float(DATA_NUM_0 + DATA_NUM_1)
    # Check 10% relative match.
    assert np.allclose(true_prob_of_1, observed_prob_of_1, rtol=.1)

    # Simulate from observed row as a crash test.
    sample = engine.simulate(1, [0], N=1, multiprocess=multiprocess)

    # Ensure normalized unobserved probabilities.
    p0_uob = engine.logpdf(-1, {0: 0}, multiprocess=multiprocess)[0]
    p1_uob = engine.logpdf(-1, {0: 1}, multiprocess=multiprocess)[0]
    assert np.allclose(gu.logsumexp([p0_uob, p1_uob]), 0)

    # A logpdf query constraining an observed cell raises an error.
    with pytest.raises(ValueError):
        engine.logpdf(1, {0: 0}, multiprocess=multiprocess)
    with pytest.raises(ValueError):
        engine.logpdf(1, {0: 1}, multiprocess=multiprocess)
Example #8
 def logpdf(self, rowid, targets, constraints=None, inputs=None):
     assert targets
     assert inputs.keys() == self.inputs
     y = inputs[self.inputs[0]]
     # Case 1: No evidence on outputs.
     if not constraints:
         # Case 1.1: z in the targets and x in the targets.
         if self.outputs[0] in targets and self.outputs[1] in targets:
             z, x = targets[self.outputs[1]], targets[self.outputs[0]]
             # XXX Check if z in [0, 1]
             logp_z = np.log(self.flip) if z == 0 else np.log(1 - self.flip)
             logp_x = logpdf_normal(x, y + (2 * z - 1), self.sigma)
             logp = logp_x + logp_z
         # Case 1.2: z in the targets only.
         elif self.outputs[1] in targets:
             z = targets[self.outputs[1]]
             logp_z = np.log(self.flip) if z == 0 else np.log(1 - self.flip)
             logp = logp_z
         # Case 1.3: x in the targets only.
         elif self.outputs[0] in targets:
             x = targets[self.outputs[0]]
             logp_xz0 = self.logpdf(rowid, {
                 self.outputs[0]: x,
                 self.outputs[1]: 0
             }, constraints, inputs)
             logp_xz1 = self.logpdf(
                 rowid,
                 {
                     self.outputs[0]: x,
                     self.outputs[1]: 1
                 },
                 constraints,
                 inputs,
             )
             logp = gu.logsumexp([logp_xz0, logp_xz1])
         else:
             raise ValueError('Invalid query pattern: %s %s %s' %
                              (targets, constraints, inputs))
     # Case 2: logpdf of x given z.
     elif constraints.keys() == [self.outputs[1]]:
         assert targets.keys() == [self.outputs[0]]
         z = constraints[self.outputs[1]]
         x = targets[self.outputs[0]]
         logp_xz = self.logpdf(rowid, {
             self.outputs[0]: x,
             self.outputs[1]: z
         }, None, {self.inputs[0]: y})
         logp_z = self.logpdf(rowid, {self.outputs[1]: z}, None,
                              {self.inputs[0]: y})
         logp = logp_xz - logp_z
     # Case 3: logpdf of z given x.
     elif constraints.keys() == [self.outputs[0]]:
         assert targets.keys() == [self.outputs[1]]
         z = targets[self.outputs[1]]
         x = constraints[self.outputs[0]]
         logp_xz = self.logpdf(rowid, {
             self.outputs[0]: x,
             self.outputs[1]: z
         }, None, {self.inputs[0]: y})
         logp_x = self.logpdf(rowid, {self.outputs[0]: x}, None,
                              {self.inputs[0]: y})
         logp = logp_xz - logp_x
     else:
         raise ValueError('Invalid query pattern: %s %s %s' %
                          (targets, constraints, inputs))
     return logp
Example #9
def relevance_probability(view, rowid_target, rowid_query):
    """Compute probability of customers in same table.

    Given a single target rowid T and list of query rowids Q, compute the
    posterior probability that T and all rowids in Q are assigned to the same
    table, conditioned on all rowids in Q being assigned to the same table as
    well as the row data values xT and xQ.

    Let S be the event that all rowids in Q are assigned to the same table:
        S = [zQ[0] = zQ[1] = ... = zQ[-1]]

    The first quantity of interest is:

        Pr[zT = zQ | xT, xQ, S] = Pr[zT = zQ, xT, xQ, S] / Pr[xT, xQ, S]

        The numerator is:

            Pr[zT = zQ, xT, xQ, S]
              = \sum_k Pr[zT=k, zQ=k, xT, xQ]
              = \sum_k Pr[xT, xQ | zT=k, zQ=k] * Pr[zT=k, zQ=k]

        where k ranges over the tables in the CRP plus a fresh singleton.

    The second quantity of interest is:
        Pr[zT \ne zQ | xT, xQ, S] = Pr[zT \ne zQ, xT, xQ, S] / Pr[xT, xQ, S]

        The numerator is:

            Pr[zT \ne zQ, xT, xQ, S]
              = \sum_kT \sum_kQ|kT Pr[zT=kT, zQ=kQ, xT, xQ]
              = \sum_kT \sum_kQ|kT Pr[xT, xQ | zT=kT, zQ=kQ] * Pr[zT=kT, zQ=kQ]

        where kT ranges over the tables in the CRP plus a fresh singleton, and
        kQ|kT in the inner sum is all tables in the CRP other than kT (plus a
        fresh singleton when kT is itself a singleton).

        For example, if the tables are [0, 1] then:
            kT = [0, 1, 2]
            kQ|kT = [[1, 2], [0, 2], [0, 1, 3]]

    If the computation is correct then the first and second numerators sum
    to the normalizer, which is given by:

            Pr[xT, xQ, S]
              = \sum_kQ Pr[zQ[0]=kQ, ..., zQ[-1]=kQ, xT, xQ]
              = \sum_kQ Pr[xT, xQ|zQ] * Pr[zQ[0]=kQ, ..., zQ[-1]=kQ]
              = \sum_kQ (\sum_kT Pr[xT, zT=kT])
                  Pr[xQ|zQ] * Pr[zQ[0]=kQ, ..., zQ[-1]=kQ]
              = \sum_kQ (\sum_kT Pr[xT|zT=kT] * Pr[zT=kT| zQ=kQ])
                  Pr[xQ|zQ] * Pr[zQ[0]=kQ, ..., zQ[-1]=kQ]

        where kQ ranges over the tables in the CRP plus a fresh singleton.

        The inner sum over kT computes the predictive density of xT when all the
        rows in Q are in table kQ marginalizing over all assignments.

    Parameters
    ----------
    view : cgpm.mixtures.View
        View CGPM representing the DP mixture.
    rowid_target : int
        The target rowid, must be incorporated in the view.
    rowid_query : list<int>
        The query rowids, must be incorporated in the view.

    Returns
    -------
    relevance_probability : float
        The posterior probability that the target is in the same cluster as the query.
    """

    if len(rowid_query) < 1:
        raise ValueError('No query rows: %s' % (rowid_query,))
    if rowid_target in rowid_query:
        return 1.

    # Retrieve target crp assignments and data to restore later.
    assignments_target = view.Zr(rowid_target)
    values_target = row_values(view, rowid_target)

    # Retrieve query crp assignments and data to restore later.
    values_query = [row_values(view, r) for r in rowid_query]
    assignments_query = [view.Zr(r) for r in rowid_query]

    # Retrieve view logpdf to verify no mutation afterwards.
    if check_env_debug():
        logpdf_score_full = view.logpdf_score()

    # Unincorporate target and query rows.
    view.unincorporate(rowid_target)
    for rowid_q in rowid_query:
        view.unincorporate(rowid_q)

    # Retrieve current tables.
    tables_crp = sorted(view.crp.clusters[0].counts)

    # Retrieve cluster-wise marginal likelihoods.
    tables_same = get_tables_same(tables_crp)
    logps_clusters = [get_view_logpdf_score(view, t, t) for t in tables_same]

    # Compute Pr[xT, xQ, S]
    #   = \sum_kT \sum_kQ Pr[zT=kT, zQ=kQ, xT, xQ]
    #   = \sum_kT \sum_kQ Pr[xT, xQ | zT=kT, zQ=kQ] * Pr[zT=kT, zQ=kQ]
    logps_condition = [
        logpdf_assignments_marginalize_target(view, rowid_target, rowid_query,
                                              values_target, values_query,
                                              table_query)
        for table_query in tables_same
    ]
    logp_condition = logsumexp(np.subtract(logps_condition, logps_clusters))

    # Compute Pr[zT = zQ, xT, xQ, S]
    #   = \sum_k Pr[zT=k, zQ=k, xT, xQ]
    #   = \sum_k Pr[xT, xQ | zT=k, zQ=k] * Pr[zT=k, zQ=k]
    logps_same_table = [
        logpdf_assignments(
            view,
            rowid_target,
            rowid_query,
            values_target,
            values_query,
            table,
            table,
        ) for table in tables_same
    ]
    logp_same_table = logsumexp(np.subtract(logps_same_table, logps_clusters))

    # ----------------------------------------------------------------------
    # The following computation is not necessary and introduces O(K^2)
    # overhead due to the nested sum, but serves as a vital check for
    # correct implementation (as noted in the docstring).
    # ----------------------------------------------------------------------
    # Compute Pr[zT \ne zQ, xT, xQ, S]
    #   = \sum_kT \sum_kQ|kT Pr[zT=kT, zQ=kQ, xT, xQ]
    #   = \sum_kT \sum_kQ|kT Pr[xT, xQ | zT=kT, zQ=kQ] * Pr[zT=kT, zQ=kQ]
    if check_env_debug():
        tables_target, tables_query = get_tables_different(tables_crp)
        # Compute the base logps.
        logps_clusters_diff = [[
            get_view_logpdf_score(
                view,
                table_target,
                table_q,
            ) for table_q in table_query
        ] for table_target, table_query in zip(tables_target, tables_query)]
        # Compute the new logps.
        logps_diff_table = [[
            logpdf_assignments(
                view,
                rowid_target,
                rowid_query,
                values_target,
                values_query,
                table_target,
                table_q,
            ) for table_q in table_query
        ] for table_target, table_query in zip(tables_target, tables_query)]
        # Compute the deltas.
        logps_delta = [
            np.subtract(a, b)
            for (a, b) in zip(logps_diff_table, logps_clusters_diff)
        ]
        # Sum the deltas.
        logp_diff_table = logsumexp([logsumexp(l) for l in logps_delta])

        # Confirm logp_same_table + logp_diff_table equal normalizing constant.
        assert np.allclose(logsumexp([logp_same_table, logp_diff_table]),
                           logp_condition)

    # Restore the target row.
    values_target[view.outputs[0]] = assignments_target
    view.incorporate(rowid_target, values_target)

    # Restore the query rows.
    for rowid, values, z in zip(rowid_query, values_query, assignments_query):
        values[view.outputs[0]] = z
        view.incorporate(rowid, values)

    # Confirm no mutation has occurred.
    if check_env_debug():
        assert np.allclose(view.logpdf_score(), logpdf_score_full)

    # Return the relevance probability.
    return np.exp(logp_same_table - logp_condition)
Example #10
 def logpdf(self, rowid, targets, constraints=None, inputs=None):
     # As discussed in https://github.com/probcomp/cgpm/issues/116 for an
     # observed rowid, we synthesize a new hypothetical row which is
     # identical (in terms of observed and latent values) to the observed
     # rowid. In this version of the implementation, the user may not
     # override any non-null values in the observed rowid
     # (_populate_constraints returns an error in this case). A user should
     # either (i) use another rowid, since overriding existing values in the
     # observed rowid no longer specifies that rowid, or (ii) use some
     # sequence of incorporate/unincorporate depending on their query.
     constraints = self._populate_constraints(rowid, targets, constraints)
     if not self.hypothetical(rowid):
         rowid = None
     # Prepare the importance network.
     network = self.build_network()
     if self.outputs[0] in constraints:
         # Condition on the cluster assignment.
         # p(xT|xC,z=k)                      computed directly by network.
         return network.logpdf(rowid, targets, constraints, inputs)
     elif self.outputs[0] in targets:
         # Query the cluster assignment.
         # p(z=k,xT|xC)
         # = p(z=k,xT,xC) / p(xC)            Bayes rule
         # = p(z=k)p(xT,xC|z=k) / p(xC)      chain rule on numerator
         # The terms are then:
         # p(z=k)                            lp_cluster
         # p(xT,xC|z=k)                      lp_numer
         # p(xC)                             lp_denom
         k = targets[self.outputs[0]]
         constraints_z = {self.outputs[0]: k}
         targets_nz = {
             c: targets[c]
             for c in targets if c != self.outputs[0]
         }
         targets_numer = merged(targets_nz, constraints)
         lp_cluster = network.logpdf(rowid, constraints_z, inputs)
         lp_numer = \
             network.logpdf(rowid, targets_numer, constraints_z, inputs) \
             if targets_numer else 0
         lp_denom = self.logpdf(rowid, constraints) if constraints else 0
         return (lp_cluster + lp_numer) - lp_denom
     else:
         # Marginalize over cluster assignment by enumeration.
         # Let K be a list of values for the support of z:
         # P(xT|xC)
         # = \sum_k p(xT|z=k,xC)p(z=k|xC)            marginalization
         # Now consider p(z=k|xC) \propto p(z=k,xC)  Bayes rule
         # p(z=K[i],xC)                              lp_constraints_unorm[i]
         # p(z=K[i]|xC)                              lp_constraints[i]
         # p(xT|z=K[i],xC)                           lp_targets[i]
         K = self.crp.clusters[0].gibbs_tables(-1)
         constraints = [
             merged(constraints, {self.outputs[0]: k}) for k in K
         ]
         lp_constraints_unorm = [
             network.logpdf(rowid, const, None, inputs)
             for const in constraints
         ]
         lp_constraints = gu.log_normalize(lp_constraints_unorm)
         lp_targets = [
             network.logpdf(rowid, targets, const, inputs)
             for const in constraints
         ]
         return gu.logsumexp(np.add(lp_constraints, lp_targets))
Example #11
def test_logsumexp():
    inf = float('inf')
    nan = float('nan')
    with pytest.raises(OverflowError):
        math.log(sum(map(math.exp, range(1000))))
    assert relerr(999.4586751453871, gu.logsumexp(range(1000))) < 1e-15
    assert gu.logsumexp([]) == -inf
    assert gu.logsumexp([-1000.]) == -1000.
    assert gu.logsumexp([-1000., -1000.]) == -1000. + math.log(2.)
    assert relerr(math.log(2.), gu.logsumexp([0., 0.])) < 1e-15
    assert gu.logsumexp([-inf, 1]) == 1
    assert gu.logsumexp([-inf, -inf]) == -inf
    assert gu.logsumexp([+inf, +inf]) == +inf
    assert math.isnan(gu.logsumexp([-inf, +inf]))
    assert math.isnan(gu.logsumexp([nan, inf]))
    assert math.isnan(gu.logsumexp([nan, -3]))
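For reference, one standard way to implement a numerically stable log-sum-exp is to shift by the maximum before exponentiating, which avoids the OverflowError triggered by the naive sum at the top of the test. This is only a sketch, not gu.logsumexp itself, and it does not handle the inf/nan corner cases asserted above:

import math

def logsumexp_ref(values):
    # Empty sum is 0, whose log is -inf.
    if not values:
        return -float('inf')
    m = max(values)
    # Every term is exp(-inf) == 0.
    if m == -float('inf'):
        return m
    # exp(x - m) <= 1, so the inner sum cannot overflow.
    return m + math.log(sum(math.exp(x - m) for x in values))

assert abs(logsumexp_ref(range(1000)) - 999.4586751453871) < 1e-9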
Example #12
 def logpdf(self, rowid, targets, constraints=None, inputs=None):
     if rowid in self.rowid_to_component:
         # Condition on the cluster assignment directly.
         # p(xT|xC,z=k)
         assert not constraints or self.indexer not in constraints
         z = self.rowid_to_component[rowid]
         return self._logpdf_one(rowid, targets, constraints, inputs, z)
     elif self.indexer in targets:
         # Query the cluster assignment.
         # p(z=k,xT|xC)
         # = p(z=k,xT,xC) / p(xC)            Bayes rule
         # = p(z=k)p(xT,xC|z=k) / p(xC)      chain rule on numerator
         # The terms are then:
         # p(z=k)                            lp_z
         # p(xT,xC|z=k)                      lp_x_joint
         # p(xC) = \sum_z P(xC,z)            lp_x_constraints (recursively)
         z = targets[self.indexer]
         inputs_z = get_intersection(self.inputs_z, inputs)
         lp_z = self.cgpm_row_divide.logpdf(rowid=rowid,
                                            targets={self.indexer: z},
                                            constraints=None,
                                            inputs=inputs_z)
         targets_joint = merged(targets, constraints or {})
         lp_x_joint = self._logpdf_one(rowid=rowid,
                                       targets=targets_joint,
                                       constraints=None,
                                       inputs=inputs,
                                       component=z)
         lp_x_constraints = self.logpdf(rowid=rowid,
                                        targets=constraints,
                                        constraints=None,
                                        inputs=inputs) if constraints else 0
         return (lp_z + lp_x_joint) - lp_x_constraints
     elif constraints and self.indexer in constraints:
         # Condition on the cluster assignment
         # P(xT|xC,z=k)
         # = P(xT,xC,z=k) / P(xC,z=k)
         # = P(xT,xC|z=k)P(z=k) / P(xC|z=k)
         # = P(xT,xC|z=k) / P(xC|z=k)
         # The terms are then:
         # P(xT,xC|z=k)                  lp_x_joint
         # P(xC|z=k)                     lp_x_constraints
         z = constraints[self.indexer]
         if z not in self.cgpm_row_divide.support():
             raise ValueError('Constrained cluster has 0 density: %s' %
                              (z, ))
         targets_joint = merged(targets, constraints)
         lp_x_joint = self._logpdf_one(rowid=rowid,
                                       targets=targets_joint,
                                       constraints=None,
                                       inputs=inputs,
                                       component=z)
         lp_x_constraints = self._logpdf_one(rowid=rowid,
                                             targets=constraints,
                                             constraints=None,
                                             inputs=inputs,
                                             component=z)
         return lp_x_joint - lp_x_constraints
     else:
         # Marginalize over cluster assignment by enumeration.
         # Let K be a list of values for the support of z:
         # P(xT|xC)
         # = \sum_i P(xT,z=K[i]|xC)
         # = \sum_i P(xT|xC,z=K[i])P(z=K[i]|xC)  chain rule
         #
         # The posterior is given by:
         # P(z=K[i]|xC) = P(xC|z=K[i])P(z=K[i]) / \sum_i P(xC,z=K[i])
         #
         # The terms are therefore
         # P(z=K[i])                            lp_z_prior[i]
         # P(xC|z=K[i])                         lp_constraints_likelihood[i]
         # P(xC,z=K[i])                         lp_z_constraints[i]
         # P(z=K[i]|xC)                         lp_z_posterior[i]
         # P(xT|xC,z=K[i])                      lp_targets_likelihood[i]
         # P(xT|xC,z=K[i])P(z=K[i]|xC)          lp_joint[i]
         inputs_z = get_intersection(self.inputs_z, inputs)
         z_support = self.cgpm_row_divide.support()
         lp_z_prior = [
             self.cgpm_row_divide.logpdf(rowid, {self.indexer: z}, None,
                                         inputs_z) for z in z_support
         ]
         lp_constraints_likelihood = [
             self._logpdf_one(rowid, constraints, None, inputs, z)
             for z in z_support
         ]
         lp_z_constraints = np.add(lp_z_prior, lp_constraints_likelihood)
         lp_z_posterior = log_normalize(lp_z_constraints)
         lp_targets_likelihood = [
             self._logpdf_one(rowid, targets, constraints, inputs, z)
             for z in z_support
         ]
         lp_joint = np.add(lp_targets_likelihood, lp_z_posterior)
         return logsumexp(lp_joint)
Example #13
# Joint equals chain rule for state 1.
joint = state.logpdf(-1, {0: 1, 1: 2})
chain = state.logpdf(-1, {0: 1}, {1: 2}) + state.logpdf(-1, {1: 2})
assert np.allclose(joint, chain)

if False:
    state2 = State(T.T, cctypes=cctypes, distargs=distargs, rng=gu.gen_rng(12))
    state2.transition(N=10, progress=1)

    # Joint equals chain rule for state 2.
    state2.logpdf(-1, {0: 1, 1: 2})
    state2.logpdf(-1, {0: 1}, {1: 2}) + state2.logpdf(-1, {1: 2})

    # Take the Monte Carlo average of the conditional.
    mc_conditional = np.log(.5) + gu.logsumexp(
        [state.logpdf(-1, {0: 1}, {1: 2}),
         state2.logpdf(-1, {0: 1}, {1: 2})])

    # Take the Monte Carlo average of the joint.
    mc_joint = np.log(.5) + gu.logsumexp(
        [state.logpdf(-1, {
            0: 1,
            1: 2
        }), state2.logpdf(-1, {
            0: 1,
            1: 2
        })])

    # Take the Monte Carlo average of the marginal.
    mc_marginal = np.log(.5) + gu.logsumexp(
        [state.logpdf(-1, {1: 2}),
Example #14
def test_crp_same_table_probability():
    """Compute probability of customers in same table.

    Given a single target rowid T and list of query rowids Q, compute the
    posterior probability that T and all rowids in Q are assigned to the same
    table, conditioned on all rowids in Q being assigned to the same table.

    Let S be the event that all rowids in Q are assigned to the same table:

        S = [zQ[0] = zQ[1] = ... = zQ[-1]]

    The first quantity of interest is:

        Pr[zT = zQ | S] = Pr[zT = zQ, S] / Pr[S]

        The numerator is:

            Pr[zT = zQ, S] = \sum_k Pr[zT=k, zQ=k]

        where k ranges over the tables in the CRP plus a fresh singleton.

    The second quantity of interest is:
        Pr[zT \ne zQ | S] = Pr[zT \ne zQ, S] / Pr[S]

        The numerator is:

            Pr[zT \ne zQ, S] = \sum_kT \sum_kQ|kT Pr[zT=kT, zQ=kQ]

        where kT ranges over the tables in the CRP plus a fresh singleton, and
        kQ|kT in the inner sum is all tables in the CRP other than kT (plus a
        fresh singleton when kT is itself a singleton).

        For example, if the tables are [0, 1] then:
            kT = [0, 1, 2]
            kQ|kT = [[1, 2], [0, 2], [0, 1, 3]]

    If the computation is correct then the first and second numerators sum
    to the normalizer, which is given by:

            Pr[S] = \sum_k Pr[zQ[0]=k, ..., zQ[-1]=k]

        where k ranges over the tables in the CRP plus a fresh singleton.
    """
    crp = Crp(outputs=[0],
              inputs=None,
              hypers={'alpha': 1.5},
              rng=gu.gen_rng(1))

    assignments = [
        (0, {
            0: 0
        }),
        (1, {
            0: 0
        }),
        (2, {
            0: 2
        }),
        (3, {
            0: 2
        }),
        (4, {
            0: 2
        }),
        (5, {
            0: 2
        }),
        (6, {
            0: 6
        }),
        (7, {
            0: 6
        }),
        (8, {
            0: 7
        }),
    ]

    for rowid, query in assignments:
        crp.incorporate(rowid, query)

    # Compute the probability that (rowid=1) has the same assignment as (rowid=4,6,7),
    # given that (rowid=4,6,7) have the same assignment.

    rowid_target = [1]
    rowid_query = [4, 6, 7]

    # Retrieve current assignment to restore later.
    assignment_target = [crp.data[r] for r in rowid_target]
    assignment_query = [crp.data[r] for r in rowid_query]

    # Retrieve CRP statistics to verify no mutation afterwards.
    logpdf_score_full = crp.logpdf_score()
    crp_data_full = crp.data

    for rowid in rowid_target + rowid_query:
        crp.unincorporate(rowid)

    # Final marginal likelihood after unincorporating.
    logpdf_score_truncated = crp.logpdf_score()

    # Retrieve current tables plus a singleton.
    tables_crp = sorted(crp.counts)

    # Exactly 1 target rowid required.
    assert len(rowid_target) == 1

    def retrieve_logpdf_assignments(rowid_target, rowid_query, t_target,
                                    t_query):
        for rowid in rowid_target:
            crp.incorporate(rowid, {crp.outputs[0]: t_target})
        for rowid in rowid_query:
            crp.incorporate(rowid, {crp.outputs[0]: t_query})
        lp_predictive = crp.logpdf_score() - logpdf_score_truncated
        for rowid in rowid_target + rowid_query:
            crp.unincorporate(rowid)
        return lp_predictive

    # Return list of tables to iterate over when query, target in same table.
    def get_tables_same(tables):
        singleton = max(tables) + 1
        return tables + [singleton]

    # Return list of tables to iterate over when query, target in diff table.
    def get_tables_different(tables):
        singleton = max(tables) + 1
        tables_query = tables + [singleton]
        auxiliary_table = lambda t: [] if t < singleton else [singleton + 1]
        tables_target = [
            filter(lambda x: x != t, tables_query) + auxiliary_table(t)
            for t in tables_query
        ]
        return tables_query, tables_target

    # Some quick tests for get_tables_different.
    assert get_tables_different([0, 1]) == ([0, 1, 2], [[1, 2], [0, 2],
                                                        [0, 1, 3]])
    assert get_tables_different([1, 2]) == ([1, 2, 3], [[2, 3], [1, 3],
                                                        [1, 2, 4]])

    # Compute Pr[zT = zQ, zQ[0] = ... zQ[-1]].
    tables_same = get_tables_same(tables_crp)
    logp_same_table = gu.logsumexp([
        retrieve_logpdf_assignments(rowid_target, rowid_query, t, t)
        for t in tables_same
    ])

    # Compute Pr[zT \ne zQ, zQ[0] = ... zQ[-1]].
    tables_target, tables_query = get_tables_different(tables_crp)
    logp_diff_table = gu.logsumexp([
        gu.logsumexp([
            retrieve_logpdf_assignments(rowid_target, rowid_query, t_target,
                                        t_q) for t_q in t_query
        ]) for t_target, t_query in zip(tables_target, tables_query)
    ])

    # Compute Pr[zT \ne zQ, zQ[0] = ... zQ[-1]] by switching order of sum.
    tables_query, tables_target = get_tables_different(tables_crp)
    logp_diff_table2 = gu.logsumexp([
        gu.logsumexp([
            retrieve_logpdf_assignments(rowid_query, rowid_target, t_query,
                                        t_t) for t_t in t_target
        ]) for t_query, t_target in zip(tables_query, tables_target)
    ])

    # Confirm logp_diff_table is the same regardless of sum order.
    assert np.allclose(logp_diff_table, logp_diff_table2)

    # Compute Pr[zQ[0] = ... = zQ[-1]].
    tables_condition = get_tables_same(tables_crp)
    logp_condition = gu.logsumexp([
        retrieve_logpdf_assignments([], rowid_query, t, t)
        for t in tables_condition
    ])

    # Confirm logp_same_table + logp_diff_table equal normalizing constant.
    assert np.allclose(gu.logsumexp([logp_same_table, logp_diff_table]),
                       logp_condition)

    # Confirm direct-space probabilities sum to one.
    p_same_table = np.exp(logp_same_table - logp_condition)
    p_diff_table = np.exp(logp_diff_table - logp_condition)
    assert np.allclose(p_same_table + p_diff_table, 1.0)

    # Restore assignments.
    for rowid, assignment in zip(rowid_target, assignment_target):
        crp.incorporate(rowid, {crp.outputs[0]: assignment})

    for rowid, assignment in zip(rowid_query, assignment_query):
        crp.incorporate(rowid, {crp.outputs[0]: assignment})

    # Confirm no mutation has occurred.
    assert crp.data == crp_data_full
    assert crp.logpdf_score() == logpdf_score_full
Example #15
 def logpdf_marginal(self, z):
     return gu.logsumexp([
         np.log(.5) + norm.logpdf(z, loc=mx, scale=self.noise)
         for mx in set(self.mx)
     ])
Example #16
 def logpdf_joint(self, x, y):
     return gu.logsumexp([
         np.log(.25) + norm.logpdf(x, loc=mx, scale=self.noise) +
         norm.logpdf(y, loc=my, scale=self.noise)
         for (mx, my) in zip(self.mx, self.my)
     ])
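Both helpers above evaluate mixture densities directly in log space. A toy check with invented component means (scipy for the normal and logsumexp calls) confirming that the log-space mixture agrees with the direct-space sum of component densities:

import numpy as np
from scipy.special import logsumexp
from scipy.stats import norm

mx, my, noise = [0, 0, 5, 5], [0, 5, 0, 5], 1.
x, y = 1.2, 4.7
lp_joint = logsumexp([
    np.log(.25) + norm.logpdf(x, loc=a, scale=noise)
    + norm.logpdf(y, loc=b, scale=noise)
    for (a, b) in zip(mx, my)
])
p_direct = sum(.25 * norm.pdf(x, loc=a, scale=noise) * norm.pdf(y, loc=b, scale=noise)
               for (a, b) in zip(mx, my))
assert np.allclose(np.exp(lp_joint), p_direct)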