Example #1
 def unobserve(self, rowid):
     obs_z, inputs_z = self.cgpm_row_divide.unobserve(rowid)
     obs_x, inputs_x = self.cgpm_components_array.unobserve(rowid)
     del self.rowid_to_component[rowid]
     observation = merged(obs_z, obs_x)
     inputs = merged(inputs_z, inputs_x)
     return observation, inputs
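
These examples lean on a small dict-merging helper. A minimal sketch of the merged semantics assumed throughout (arguments are usually key-disjoint, so precedence rarely matters; here later arguments win):

def merged(*dicts):
    # Combine dictionaries into one; later entries win on key collisions.
    result = {}
    for d in dicts:
        if d is not None:
            result.update(d)
    return result
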
Example #2
 def _check_partitions(self):
     if not cu.check_env_debug():
         return
     # For debugging only.
     assert self.alpha() > 0.
     # Check that the rows assigned to the view match the
     # rowids and the cluster counts in Nk.
     Zr = self.Zr()
     Nk = self.Nk()
     rowids = range(self.n_rows())
     assert set(Zr.keys()) == set(rowids)
     assert set(Zr.values()) == set(Nk)
     for i, dim in self.dims.iteritems():
         # Assert first output is first input of the Dim.
         assert self.outputs[0] == dim.inputs[0]
         # Assert length of dataset is the same as rowids.
         assert len(self.X[i]) == len(rowids)
         # Ensure the cluster assignments in each dim match
         # those recorded in the view (Zr, Nk).
         assignments = merged(dim.Zr, dim.Zi)
         assert assignments == Zr
         assert set(assignments.values()) == set(Nk.keys())
         all_ks = dim.clusters.keys() + dim.Zi.values()
         assert set(all_ks) == set(Nk.keys())
         for k in dim.clusters:
             # Law of conservation of rowids.
             rowids_k = [r for r in rowids if Zr[r] == k]
             cols = [dim.index]
             if dim.is_conditional():
                 cols.extend(dim.inputs[1:])
             data = [[self.X[c][r] for c in cols] for r in rowids_k]
             rowids_nan = np.any(np.isnan(data), axis=1) if data else []
             assert (dim.clusters[k].N + np.sum(rowids_nan) == Nk[k])
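
A toy check of the partition invariants asserted above, with hypothetical assignments (Zr maps rowids to clusters, Nk counts rows per cluster):

Zr = {0: 0, 1: 0, 2: 1}    # rowid -> cluster
Nk = {0: 2, 1: 1}          # cluster -> row count
assert set(Zr.keys()) == set(range(3))
assert set(Zr.values()) == set(Nk)
assert all(sum(1 for r in Zr if Zr[r] == k) == Nk[k] for k in Nk)
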
Example #3
def observe_one(cgpm, rowid, observation, inputs):
    observation_cgpm = get_intersection(cgpm.outputs, observation)
    if observation_cgpm:
        inputs_cgpm_parents = get_intersection(cgpm.inputs, observation)
        inputs_cgpm_exog = get_intersection(cgpm.inputs, inputs)
        inputs_cgpm = merged(inputs_cgpm_parents, inputs_cgpm_exog)
        cgpm.observe(rowid, observation_cgpm, inputs_cgpm)
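
observe_one routes an observation to the variables a CGPM actually owns. A plausible sketch of get_intersection, assuming it restricts a dict (or list) query to a set of keys and tolerates a None query:

def get_intersection(keys, query):
    # Restrict a query to the given keys; treat a missing query as empty.
    if query is None:
        return {}
    if isinstance(query, dict):
        return {k: v for k, v in query.items() if k in keys}
    return [k for k in query if k in keys]
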
Example #4
def test_incorporate_session():
    rng = gu.gen_rng(4)
    state = State(X,
                  cctypes=['normal'] * 5,
                  Zv={
                      0: 0,
                      1: 0,
                      2: 1,
                      3: 1,
                      4: 2
                  },
                  rng=rng)
    # Incorporate row into a singleton cluster for all views.
    previous = [len(state.views[v].Nk()) for v in [0, 1, 2]]
    data = {i: rng.normal() for i in xrange(5)}
    clusters = {
        state.views[0].outputs[0]: previous[0],
        state.views[1].outputs[0]: previous[1],
        state.views[2].outputs[0]: previous[2],
    }
    state.incorporate(state.n_rows(), gu.merged(data, clusters))
    assert [len(state.views[v].Nk()) for v in [0,1,2]] == \
        [p+1 for p in previous]
    # Incorporate a row with some missing values, without specifying clusters.
    data = {i: rng.normal() for i in xrange(2)}
    state.incorporate(state.n_rows(), data)
    state.transition(N=3)
    # Remove the incorporated rowid.
    state.unincorporate(state.n_rows() - 1)
    state.transition(N=3)
Example #5
def test_simulate(seed):
    rng = gu.gen_rng(bytearray(seed))

    iris = load_iris()
    indices = rng.uniform(0, 1, size=len(iris.data)) <= .75

    Y_train = iris.data[indices]
    X_train = iris.target[indices]

    Y_test = iris.data[~indices]
    X_test = iris.target[~indices]

    forest = Dim(outputs=[5],
                 inputs=[-1] + range(4),
                 cctype='random_forest',
                 distargs={
                     'inputs': {
                         'stattypes': ['normal'] * 4
                     },
                     'k': len(iris.target_names)
                 },
                 rng=rng)

    forest.transition_hyper_grids(X_test)

    # Incorporate data into 1 cluster.
    for rowid, (x, y) in enumerate(zip(X_train, Y_train)):
        observation = {5: x}
        inputs = gu.merged({-1: 0}, {i: t for (i, t) in zip(range(4), y)})
        forest.incorporate(rowid, observation, inputs)

    # Transitions.
    for _i in xrange(2):
        forest.transition_hypers()
        forest.transition_params()

    correct, total = 0, 0.
    for rowid, (x, y) in enumerate(zip(X_test, Y_test)):
        inputs = gu.merged({-1: 0}, {i: t for (i, t) in zip(range(4), y)})
        samples = forest.simulate(None, [5], None, inputs, 10)
        prediction = np.argmax(np.bincount([s[5] for s in samples]))
        correct += (prediction == x)
        total += 1.

    # Classification should be better than random.
    assert correct / total > 1. / forest.distargs['k']
Example #6
def state_simulate(state, rowid, targets, constraints=None, N=None):
    targets_lookup, constraints_lookup = partition_query_evidence(
        state.Zv(), targets, constraints)
    N_sim = N if N is not None else 1
    draws = (view_simulate(view=state.views[v],
                           rowid=rowid,
                           targets=targets_lookup[v],
                           constraints=constraints_lookup.get(v, {}),
                           N=N_sim) for v in targets_lookup)
    samples = [merged(*l) for l in zip(*draws)]
    return samples if N is not None else samples[0]
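
The zip(*draws) / merged(*l) pattern above stitches the per-view samples into joint rows. A toy illustration with plain dicts standing in for view draws (using the merged sketch after Example #1):

draws = (
    [{0: 1.2}, {0: -0.4}],    # two samples from one view
    [{3: 0.5}, {3: 0.7}],     # two samples from another view
)
samples = [merged(*l) for l in zip(*draws)]
assert samples == [{0: 1.2, 3: 0.5}, {0: -0.4, 3: 0.7}]
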
Example #7
def convert_view_to_rowmixture(view, rng):
    cgpms = [convert_dim_to_base_cgpm(d, rng) for d in view.dims.itervalues()]
    component_base_cgpms = Product(cgpms, rng=rng)
    cgpm_row_divide = convert_dim_to_base_cgpm(view.crp, rng)
    cgpm_row_mixture = FlexibleRowMixture(cgpm_row_divide, component_base_cgpms,
        rng=rng)
    for rowid, assignment in rebase_cgpm_row_assignments(view.Zr()):
        obs_z = {cgpm_row_divide.outputs[0]: assignment}
        obs_x = {c: view.X[c][rowid] for c in component_base_cgpms.outputs}
        observation = merged(obs_z, obs_x)
        cgpm_row_mixture.observe(rowid, observation)
    return cgpm_row_mixture
Example #8
 def _bulk_incorporate(self, dim):
     # XXX Major hack! We should really be creating new Dim objects.
     dim.clusters = {}  # Mapping of cluster k to the object.
     dim.Zr = {}  # Mapping of non-nan rowids to cluster k.
     dim.Zi = {}  # Mapping of nan rowids to cluster k.
     dim.aux_model = dim.create_aux_model()
     for rowid, k in self.Zr().iteritems():
         observation = {dim.index: self.X[dim.index][rowid]}
         inputs = self._get_input_values(rowid, dim, k)
         dim.incorporate(rowid, observation, inputs)
     assert merged(dim.Zr, dim.Zi) == self.Zr()
     dim.transition_params()
Example #9
 def simulate(self, rowid, targets, constraints=None, inputs=None, N=None):
     # Refer to comment in logpdf.
     constraints = self._populate_constraints(rowid, targets, constraints)
     if not self.hypothetical(rowid):
         rowid = None
     network = self.build_network()
     # Condition on the cluster assignment.
     if self.outputs[0] in constraints:
         return network.simulate(rowid, targets, constraints, inputs, N)
     # Determine how many samples to return.
     unwrap_result = N is None
     if unwrap_result:
         N = 1
     # Expose cluster assignments to the samples?
     exposed = self.outputs[0] in targets
     if exposed:
         targets = [q for q in targets if q != self.outputs[0]]
     # Weight clusters by probability of constraints in each cluster.
     K = self.crp.clusters[0].gibbs_tables(-1)
     constr2 = [merged(constraints, {self.outputs[0]: k}) for k in K]
     lp_constraints_unorm = [
         network.logpdf(rowid, ev, None, inputs) for ev in constr2
     ]
     # Find number of samples in each cluster.
     Ks = gu.log_pflip(lp_constraints_unorm, array=K, size=N, rng=self.rng)
     counts = {k: n for k, n in enumerate(np.bincount(Ks)) if n > 0}
     # Add the cluster assignment to the constraints and sample the rest.
     constr3 = {
         k: merged(constraints, {self.outputs[0]: k})
         for k in counts
     }
     samples = [
         network.simulate(rowid, targets, constr3[k], inputs, counts[k])
         for k in counts
     ]
     # If cluster assignments are exposed, append them to the samples.
     if exposed:
         samples = [[merged(l, {self.outputs[0]: k}) for l in s]
                    for s, k in zip(samples, counts)]
     # Return 1 sample if N is None, otherwise a list.
     result = list(itertools.chain.from_iterable(samples))
     return result[0] if unwrap_result else result
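
The cluster weighting step relies on gu.log_pflip. Assuming it draws categorical samples from unnormalized log weights, a self-contained version might look like this:

import numpy as np

def log_pflip(logp, array, size, rng):
    # Draw `size` items from `array` with probabilities proportional to exp(logp).
    logp = np.asarray(logp, dtype=float)
    p = np.exp(logp - logp.max())    # subtract the max for numerical stability
    p /= p.sum()
    return rng.choice(array, size=size, p=p)
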
Example #10
 def _logpdf_one(self, rowid, targets, constraints, inputs, component):
     """Assess logpdf in fixed mixture component."""
     targets_x = get_intersection(self.outputs_x, targets)
     if not targets_x:
         return 0
     constraints_x = get_intersection(self.outputs_x, constraints)
     inputs_x = get_intersection(self.outputs_x, inputs)
     inputs_arr = merged(inputs_x, {self.indexer: component})
     return self.cgpm_components_array.logpdf(
         rowid=rowid,
         targets=targets_x,
         constraints=constraints_x,
         inputs=inputs_arr,
     )
Example #11
 def populate_constraints(self, rowid, targets, constraints):
     if constraints is None:
         constraints = {}
     if rowid in self.data:
         values = self.data[rowid]
         assert len(values) == len(self.outputs)
         observations = {
             output: value
             for output, value in zip(self.outputs, values)
             if not np.isnan(value) and output not in targets
             and output not in constraints
         }
         constraints = gu.merged(constraints, observations)
     return constraints
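
A worked illustration of the filtering above: observed values enter the constraints only if they are not nan and not already queried. Hypothetical outputs and stored values:

import numpy as np

outputs = (1, 2, 3)
values = (0.5, float('nan'), 2.0)
targets = [1]
constraints = {}
observations = {
    output: value
    for output, value in zip(outputs, values)
    if not np.isnan(value) and output not in targets
    and output not in constraints
}
assert observations == {3: 2.0}
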
Example #12
def test_transition_hypers():
    forest = Dim(outputs=RF_OUTPUTS,
                 inputs=[-1] + RF_INPUTS,
                 cctype='random_forest',
                 distargs=RF_DISTARGS,
                 rng=gu.gen_rng(0))
    forest.transition_hyper_grids(D[:, 0])

    # Create two clusters.
    Zr = np.zeros(len(D), dtype=int)
    Zr[len(D) / 2:] = 1
    for rowid, row in enumerate(D[:25]):
        observation = {0: row[0]}
        inputs = gu.merged({i: row[i] for i in forest.inputs}, {-1: Zr[rowid]})
        forest.incorporate(rowid, observation, inputs)
Example #13
def mutual_information(cgpm, targets0, targets1, constraints=None,
        marginalize=None, T=None, N=None):
    _validate_query(cgpm.outputs, targets0, targets1, constraints, marginalize)
    N = N or DEFAULT_SAMPLES_MONTE_CARLO
    T = T or DEFAULT_SAMPLES_MARGINALIZE
    estimator = _get_estimator(targets0, targets1)
    if not marginalize:
        samples_mi = estimator(cgpm, targets0, targets1, constraints, N)
    else:
        samples_marginalize = cgpm.simulate(None, marginalize, N=T)
        constraints_cm = [merged(constraints, m) for m in samples_marginalize]
        estimates = [estimator(cgpm, targets0, targets1, constraint_cm, N)
            for constraint_cm in constraints_cm]
        samples_mi = itertools.chain.from_iterable(estimates)
    return get_estimate(samples_mi)
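
The source does not show _get_estimator; one plausible form is the standard direct Monte Carlo estimator of mutual information, sketched below with a hypothetical name (mi_estimator_sketch is not part of the library, and its estimators may differ):

def mi_estimator_sketch(cgpm, targets0, targets1, constraints, N):
    # Hypothetical estimator: samples of log p(x0,x1) - log p(x0) - log p(x1),
    # whose average estimates the mutual information.
    samples = cgpm.simulate(None, targets0 + targets1, constraints, N=N)
    estimates = []
    for sample in samples:
        x0 = {q: sample[q] for q in targets0}
        x1 = {q: sample[q] for q in targets1}
        lp_joint = cgpm.logpdf(None, merged(x0, x1), constraints)
        lp_marginals = cgpm.logpdf(None, x0, constraints) \
            + cgpm.logpdf(None, x1, constraints)
        estimates.append(lp_joint - lp_marginals)
    return estimates
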
Example #14
 def observe(self, rowid, observation, inputs=None):
     if rowid in self.rowid_to_component:
         component = {self.indexer: self.rowid_to_component[rowid]}
     else:
         inputs_z = get_intersection(self.inputs_z, inputs)
         if self.indexer in observation:
             component = {self.indexer: observation[self.indexer]}
         else:
             component = self.cgpm_row_divide.simulate(
                 rowid, [self.indexer], None, inputs_z)
         self.cgpm_row_divide.observe(rowid, component, inputs_z)
         self.rowid_to_component[rowid] = component[self.indexer]
     inputs_x = get_intersection(self.inputs_x, inputs)
     observation_x = get_intersection(self.outputs_x, observation)
     inputs_arr = merged(inputs_x, component)
     self.cgpm_components_array.observe(rowid, observation_x, inputs_arr)
Example #15
 def _simulate_fallback(self, rowid, targets, N):
     # Fallback: if there are no variables left to use as constraints,
     # resample the first target variable instead.
     simulate_all = len(targets) == len(self.outputs)
     targets_dummy = [o for o in self.outputs if o not in targets]
     if simulate_all:
         assert not targets_dummy
         targets_dummy = [targets[0]]
         targets = targets[1:]
     dataset = self._dataset(targets_dummy)
     indices = self.rng.choice(len(dataset), size=N)
     constraints = [zip(targets_dummy, dataset[i]) for i in indices]
     results = [self.simulate(rowid, targets, dict(e)) for e in constraints]
     # Make sure to add back the resampled first target variable to results.
     if simulate_all:
         results = [gu.merged(s, e) for s, e in zip(results, constraints)]
     return results
Example #16
 def logpdf(self, rowid, targets, constraints=None, inputs=None):
     constraints = constraints or {}
     inputs = inputs or {}
     # Compute joint probability.
     _samples_joint, weights_joint = zip(*[
         self.weighted_sample(rowid, [], merged(targets, constraints),
                              inputs) for _i in xrange(self.accuracy)
     ])
     logp_joint = logmeanexp(weights_joint)
     # Compute marginal probability.
     _samples_marginal, weights_marginal = zip(*[
         self.weighted_sample(rowid, [], constraints, inputs)
         for _i in xrange(self.accuracy)
     ]) if constraints else ({}, [0.])
     if all(isinf(l) for l in weights_marginal):
         raise ValueError('Zero density constraints: %s' % (constraints, ))
     logp_constraints = logmeanexp(weights_marginal)
     # Return log ratio.
     return logp_joint - logp_constraints
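
logmeanexp here presumably computes the numerically stable log of an averaged importance weight, i.e. log(mean(exp(w))). A minimal sketch:

import numpy as np
from scipy.special import logsumexp

def logmeanexp(weights):
    # log of the average of exp(weights), computed stably.
    return logsumexp(weights) - np.log(len(weights))
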
Example #17
    def _populate_constraints(self, rowid, targets, constraints):
        """Loads constraints from the dataset."""
        if constraints is None:
            constraints = {}
        self._validate_cgpm_query(rowid, targets, constraints)
        # If the rowid is hypothetical, just return.
        if self.hypothetical(rowid):
            return constraints
        # Retrieve all values for this rowid not in targets or constraints.
        data = {
            c: self.X[c][rowid]
            for c in self.outputs[1:]
            if \
                c not in targets \
                and c not in constraints \
                and not isnan(self.X[c][rowid])
        }
        # Add the cluster assignment.
        data[self.outputs[0]] = self.Zr(rowid)

        return merged(constraints, data)
Example #18
def get_view_observes(view):
    rowids = get_rowids(view)
    # Handle observe for component assignment cgpm.
    cgpm_crp = view.cgpm_row_divide
    observe_crp = OrderedDict([(rowid, get_primitive_observes(cgpm_crp, rowid))
                               for rowid in rowids])
    observe_crp_reindex = reindex_crp_observes(observe_crp.values())
    sorted_rowids = get_sorted_rowids(rowids, observe_crp_reindex)
    rowid_to_index = {rowid: i for i, rowid in enumerate(sorted_rowids)}
    observe_crp_sorted = [
        observe_crp_reindex[rowid_to_index[rowid]] for rowid in sorted_rowids
    ]
    # Handle observe for component data cgpm.
    cgpm_components = view.cgpm_components_array
    observe_components_sorted = [
        get_components_observes(cgpm_components, rowid)
        for rowid in sorted_rowids
    ]
    # Return overall row-wise observation.
    return OrderedDict([(rowid, merged(i0, i1)) for rowid, i0, i1 in zip(
        sorted_rowids, observe_crp_sorted, observe_components_sorted)])
Example #19
 def _simulate_one(self, rowid, targets, constraints, inputs, N, component):
     """Simulate from a fixed mixture component."""
     targets_x = get_intersection(self.outputs_x, targets)
     if targets_x:
         constraints_x = get_intersection(self.outputs_x, constraints)
         inputs_x = get_intersection(self.outputs_x, inputs)
         inputs_arr = merged(inputs_x, {self.indexer: component})
         samples = self.cgpm_components_array.simulate(
             rowid=rowid,
             targets=targets_x,
             constraints=constraints_x,
             inputs=inputs_arr,
             N=N,
         )
     else:
         samples = {} if N is None else [{}] * N
     if N is None and self.indexer in targets:
         samples[self.indexer] = component
     elif N is not None and self.indexer in targets:
         for sample in samples:
             sample[self.indexer] = component
     return samples
Example #20
 def logpdf(self, rowid, targets, constraints=None, inputs=None):
     if rowid in self.rowid_to_component:
         # Condition on the cluster assignment directly.
         # p(xT|xC,z=k)
         assert not constraints or self.indexer not in constraints
         z = self.rowid_to_component[rowid]
         return self._logpdf_one(rowid, targets, constraints, inputs, z)
     elif self.indexer in targets:
         # Query the cluster assignment.
         # p(z=k,xT|xC)
         # = p(z=k,xT,xC) / p(xC)            Bayes rule
         # = p(z=k)p(xT,xC|z=k) / p(xC)      chain rule on numerator
         # The terms are then:
         # p(z=k)                            lp_z
         # p(xT,xC|z=k)                      lp_x_joint
         # p(xC) = \sum_z P(xC,z)            lp_x_constraints (recursively)
         z = targets[self.indexer]
         inputs_z = get_intersection(self.inputs_z, inputs)
         lp_z = self.cgpm_row_divide.logpdf(rowid=rowid,
                                            targets={self.indexer: z},
                                            constraints=None,
                                            inputs=inputs_z)
         targets_joint = merged(targets, constraints or {})
         lp_x_joint = self._logpdf_one(rowid=rowid,
                                       targets=targets_joint,
                                       constraints=None,
                                       inputs=inputs,
                                       component=z)
         lp_x_constraints = self.logpdf(rowid=rowid,
                                        targets=constraints,
                                        constraints=None,
                                        inputs=inputs) if constraints else 0
         return (lp_z + lp_x_joint) - lp_x_constraints
     elif constraints and self.indexer in constraints:
         # Condition on the cluster assignment
         # P(xT|xC,z=k)
         # = P(xT,xC,z=k) / P(xC,z=k)
         # = P(xT,xC|z=k)P(z=k) / P(xC|z=k)
         # = P(xT,xC|z=k) / P(xC|z=k)
         # The terms are then:
         # P(xT,xC|z=k)                  lp_x_joint
         # P(xC|z=k)                     lp_x_constraints
         z = constraints[self.indexer]
         if z not in self.cgpm_row_divide.support():
             raise ValueError('Constrained cluster has 0 density: %s' %
                              (z, ))
         targets_joint = merged(targets, constraints)
         lp_x_joint = self._logpdf_one(rowid=rowid,
                                       targets=targets_joint,
                                       constraints=None,
                                       inputs=inputs,
                                       component=z)
         lp_x_constraints = self._logpdf_one(rowid=rowid,
                                             targets=constraints,
                                             constraints=None,
                                             inputs=inputs,
                                             component=z)
         return lp_x_joint - lp_x_constraints
     else:
         # Marginalize over cluster assignment by enumeration.
         # Let K be a list of values for the support of z:
         # P(xT|xC)
         # = \sum_i P(xT,z=K[i]|xC)
         # = \sum_i P(xT|xC,z=K[i])P(z=K[i]|xC)  chain rule
         #
         # The posterior is given by:
         # P(z=K[i]|xC) = P(xC|z=K[i])P(z=K[i]) / \sum_i P(xC,z=K[i])
         #
         # The terms are therefore
         # P(z=K[i])                            lp_z_prior[i]
         # P(xC|z=K[i])                         lp_constraints_likelihood[i]
         # P(xC,z=K[i])                         lp_z_constraints[i]
         # P(z=K[i]|xC)                         lp_z_posterior[i]
         # P(xT|xC,z=K[i])                      lp_targets_likelihood[i]
         # P(xT|xC,z=K[i])P(z=K[i]|xC)          lp_joint[i]
         inputs_z = get_intersection(self.inputs_z, inputs)
         z_support = self.cgpm_row_divide.support()
         lp_z_prior = [
             self.cgpm_row_divide.logpdf(rowid, {self.indexer: z}, None,
                                         inputs_z) for z in z_support
         ]
         lp_constraints_likelihood = [
             self._logpdf_one(rowid, constraints, None, inputs, z)
             for z in z_support
         ]
         lp_z_constraints = np.add(lp_z_prior, lp_constraints_likelihood)
         lp_z_posterior = log_normalize(lp_z_constraints)
         lp_targets_likelihood = [
             self._logpdf_one(rowid, targets, constraints, inputs, z)
             for z in z_support
         ]
         lp_joint = np.add(lp_targets_likelihood, lp_z_posterior)
         return logsumexp(lp_joint)
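
log_normalize in the final branch presumably shifts unnormalized log probabilities so they sum to one after exponentiation. A standard sketch:

import numpy as np
from scipy.special import logsumexp

def log_normalize(logp):
    # Normalize log probabilities: exp of the result sums to 1.
    logp = np.asarray(logp, dtype=float)
    return logp - logsumexp(logp)
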
Example #21
 def _get_input_values(self, rowid, dim, k):
     """Prepare the inputs for a Dim logpdf or simulate query."""
     inputs = {i: self.X[i][rowid] for i in dim.inputs[1:]}
     cluster = {self.outputs[0]: k}
     return merged(inputs, cluster)
Example #22
 def _migrate_row(self, rowid, k):
     self.unincorporate(rowid)
     observation = merged({d: self.X[d][rowid]
                           for d in self.dims}, {self.outputs[0]: k})
     self.incorporate(rowid, observation)
Example #23
 def logpdf(self, rowid, targets, constraints=None, inputs=None):
     # As discussed in https://github.com/probcomp/cgpm/issues/116 for an
     # observed rowid, we synthesize a new hypothetical row which is
     # identical (in terms of observed and latent values) to the observed
     # rowid. In this version of the implementation, the user may not
     # override any non-null values in the observed rowid
     # (_populate_constraints returns an error in this case). A user should
     # either (i) use another rowid, since overriding existing values in the
     # observed rowid no longer specifies that rowid, or (ii) use some
     # sequence of incorporate/unincorporate depending on their query.
     constraints = self._populate_constraints(rowid, targets, constraints)
     if not self.hypothetical(rowid):
         rowid = None
     # Prepare the importance network.
     network = self.build_network()
     if self.outputs[0] in constraints:
         # Condition on the cluster assignment.
         # p(xT|xC,z=k)                      computed directly by network.
         return network.logpdf(rowid, targets, constraints, inputs)
     elif self.outputs[0] in targets:
         # Query the cluster assignment.
         # p(z=k,xT|xC)
         # = p(z=k,xT,xC) / p(xC)            Bayes rule
         # = p(z=k)p(xT,xC|z=k) / p(xC)      chain rule on numerator
         # The terms are then:
         # p(z=k)                            lp_cluster
         # p(xT,xC|z=k)                      lp_numer
         # p(xC)                             lp_denom
         k = targets[self.outputs[0]]
         constraints_z = {self.outputs[0]: k}
         targets_nz = {
             c: targets[c]
             for c in targets if c != self.outputs[0]
         }
         targets_numer = merged(targets_nz, constraints)
         lp_cluster = network.logpdf(rowid, constraints_z, None, inputs)
         lp_numer = \
             network.logpdf(rowid, targets_numer, constraints_z, inputs) \
             if targets_numer else 0
         lp_denom = self.logpdf(rowid, constraints) if constraints else 0
         return (lp_cluster + lp_numer) - lp_denom
     else:
         # Marginalize over cluster assignment by enumeration.
         # Let K be a list of values for the support of z:
         # P(xT|xC)
         # = \sum_k p(xT|z=k,xC)p(z=k|xC)            marginalization
         # Now consider p(z=k|xC) \propto p(z=k,xC)  Bayes rule
         # p(z=K[i],xC)                              lp_constraints_unorm[i]
         # p(z=K[i]|xC)                              lp_constraints[i]
         # p(xT|z=K[i],xC)                           lp_targets[i]
         K = self.crp.clusters[0].gibbs_tables(-1)
         constraints = [
             merged(constraints, {self.outputs[0]: k}) for k in K
         ]
         lp_constraints_unorm = [
             network.logpdf(rowid, const, None, inputs)
             for const in constraints
         ]
         lp_constraints = gu.log_normalize(lp_constraints_unorm)
         lp_targets = [
             network.logpdf(rowid, targets, const, inputs)
             for const in constraints
         ]
         return gu.logsumexp(np.add(lp_constraints, lp_targets))
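
The mixture identity used in the enumeration branch, p(xT|xC) = sum_k p(xT|xC,z=k) p(z=k|xC), can be sanity-checked numerically on a toy two-cluster case:

import numpy as np
from scipy.special import logsumexp

lp_constraints = np.log([0.25, 0.75])    # p(z=k|xC), already normalized
lp_targets = np.log([0.10, 0.40])        # p(xT|xC,z=k) per cluster
lp_marginal = logsumexp(np.add(lp_constraints, lp_targets))
assert np.isclose(np.exp(lp_marginal), 0.25 * 0.10 + 0.75 * 0.40)
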
Example #24
def _simulate_row(view, targets, cluster, N):
    """Return sample of the targets in a fixed cluster."""
    samples = (view.dims[c].simulate(None, [c], None,
                                     {view.outputs[0]: cluster}, N)
               for c in targets)
    return (merged(*l) for l in zip(*samples))