Code example #1
class FBGMM(object):
    """
    A finite Bayesian Gaussian mixture model (FBGMM).

    See `GaussianComponents` or `GaussianComponentsDiag` for an overview of the
    parameters not mentioned below.

    Parameters
    ----------
    alpha : float
        Concentration parameter for the symmetric Dirichlet prior over the
        mixture weights.
    K : int
        The number of mixture components. This is actually a maximum number,
        and it is possible to empty out some of these components.
    assignments : vector of int or str
        If a vector of int, this gives the initial component assignments. The
        vector should therefore have N entries between 0 and `K`. A value of
        -1 is also allowed, indicating that the data vector does not belong to
        any component. Alternatively, `assignments` can take one of the
        following values:
        - "rand": Vectors are assigned randomly to one of `K` components.
        - "each-in-own": Each vector is assigned to a component of its own.
    covariance_type : str
        String describing the type of covariance parameters to use. Must be
        one of "full", "diag" or "fixed".
    lms : float
        Language model scaling factor.
    """

    def __init__(self, X, prior, alpha, K, assignments="rand",
            covariance_type="full", lms=1.0):
        self.alpha = alpha
        self.prior = prior
        self.covariance_type = covariance_type
        self.lms = lms

        self.setup_components(K, assignments, X)

    def setup_components(self, K, assignments="rand", X=None):
        """
        Set up the `components` attribute.

        See parameters of `FBGMM` for parameters not described below. This
        function is also useful for resetting the `components`, e.g. if you
        want to change the maximum number of possible components.

        Parameters
        ----------
        X : NxD matrix or None
            The data matrix. If None, then it is assumed that the `components`
            attribute has already been initialized and that this function is
            called to reset the `components`; in this case the data is taken
            from the previous initialization.
        """
        if X is None:
            assert hasattr(self, "components")
            X = self.components.X

        N, D = X.shape

        # Initial component assignments
        if isinstance(assignments, basestring) and assignments == "rand":
            assignments = np.random.randint(0, K, N)
        elif isinstance(assignments, basestring) and assignments == "each-in-own":
            assignments = np.arange(N)
        else:
            # `assignments` is a vector
            pass
        # Make sure we have consecutive values
        for k in xrange(assignments.max()):
            while len(np.nonzero(assignments == k)[0]) == 0:
                assignments[np.where(assignments > k)] -= 1
            if assignments.max() == k:
                break

        if self.covariance_type == "full":
            self.components = GaussianComponents(X, self.prior, assignments, K_max=K)
        elif self.covariance_type == "diag":
            self.components = GaussianComponentsDiag(X, self.prior, assignments, K_max=K)
        elif self.covariance_type == "fixed":
            self.components = GaussianComponentsFixedVar(X, self.prior, assignments, K_max=K)
        else:
            assert False, "Invalid covariance type."

    def set_K(self, K, reassign=True):
        """
        Set the number of components `K`.

        The `K` largest existing components are kept, and the rest of the data
        vectors are re-assigned to one of these (if `reassign` is True).
        """

        if self.components.K <= K:
            # There are already at most `K` active components
            self.components.K_max = K
            return

        sizes = self.components.counts
        old_assignments = self.components.assignments

        # Keep only the `K` biggest assignments
        assignments_to_keep = list(np.argsort(sizes)[-K:])
        new_assignments = [
            i if i in assignments_to_keep else -1 for i in old_assignments
            ]
        mapping = dict([(assignments_to_keep[i], i) for i in range(K)])
        mapping[-1] = -1
        new_assignments = np.array([mapping[i] for i in new_assignments])

        # Make sure we have consecutive assignment values
        for k in xrange(new_assignments.max()):
            while len(np.nonzero(new_assignments == k)[0]) == 0:
                new_assignments[np.where(new_assignments > k)] -= 1
            if new_assignments.max() == k:
                break

        # Create new `components` attribute
        self.setup_components(K, list(new_assignments))

        # Now add back those vectors which were assigned before but are unassigned now
        if reassign:
            for i, old_assignment in enumerate(old_assignments):
                new_assignment = new_assignments[i]
                if old_assignment == -1 or new_assignment != -1:
                    continue
                self.gibbs_sample_inside_loop_i(i)

    def log_prob_z(self):
        """
        Return the log marginal probability of component assignment P(z).

        See (24.24) in Murphy, p. 842.
        """
        log_prob_z = (
            gammaln(self.alpha)
            - gammaln(self.alpha + np.sum(self.components.counts))
            + np.sum(
                gammaln(
                    self.components.counts
                    + float(self.alpha)/self.components.K_max
                    )
                - gammaln(self.alpha/self.components.K_max)
                )
            )
        return log_prob_z

    def log_prob_X_given_z(self):
        """Return the log probability of data in each component p(X|z)."""
        return self.components.log_marg()

    def log_marg(self):
        """Return log marginal of data and component assignments: p(X, z)"""

        log_prob_z = self.log_prob_z()
        log_prob_X_given_z = self.log_prob_X_given_z()

        return log_prob_z + log_prob_X_given_z

    # @profile
    def log_marg_i(self, i, log_prob_z=[], log_prob_z_given_y=[], scale=False):
        """
        Return the log marginal of the i'th data vector: p(x_i)

        Here it is assumed that x_i is not currently in the acoustic model,
        so the -1 term used in the denominator in (24.26) in Murphy, p. 843
        is dropped (since x_i is already not included in the counts).
        """
        assert i != -1

        if not len(log_prob_z):
            # Compute log probability of `X[i]` belonging to each component
            # (24.26) in Murphy, p. 843
            log_prob_z = self.lms * (
                np.log(float(self.alpha)/self.components.K_max + self.components.counts)
                - np.log(_cython_utils.sum_ints(self.components.counts) + self.alpha)
                )
            if len(log_prob_z_given_y):
                log_prob_z += log_prob_z_given_y
                log_prob_z -= logsumexp(log_prob_z)

        if scale:
            log_likelihood_z = np.nan * np.ones(log_prob_z.shape)
            log_likelihood_z[:self.components.K] = self.components.log_post_pred(i)
            log_likelihood_z[self.components.K:] = self.components.log_prior(i)
            log_prob_z += log_likelihood_z - _cython_utils.logsumexp(log_likelihood_z)
        else:
            # (24.23) in Murphy, p. 842
            log_prob_z[:self.components.K] += self.components.log_post_pred(i)
            # Empty (inactive) components
            log_prob_z[self.components.K:] += self.components.log_prior(i)
        return _cython_utils.logsumexp(log_prob_z)

    def gibbs_sample(self, n_iter, consider_unassigned=True,
            anneal_schedule=None, anneal_start_temp_inv=0.1,
            anneal_end_temp_inv=1, n_anneal_steps=-1, log_prob_zs=[], log_prob_z_given_ys=[]):
        """
        Perform `n_iter` iterations of Gibbs sampling on the FBGMM.

        Parameters
        ----------
        consider_unassigned : bool
            Whether unassigned vectors (-1 in `assignments`) should be
            considered during sampling.
        anneal_schedule : str
            Can be one of the following:
            - None: A constant temperature of `anneal_end_temp_inv` is used
              throughout; if `anneal_end_temp_inv` is left at default (1), then
              this is equivalent to not performing annealing.
            - "linear": Linearly take the inverse temperature from
              `anneal_start_temp_inv` to `anneal_end_temp_inv` in
              `n_anneal_steps`. If `n_anneal_steps` is -1 for this schedule,
              annealing is performed over all `n_iter` iterations.
            - "step": Piecewise schedule in which the inverse temperature is
              taken from `anneal_start_temp_inv` to `anneal_end_temp_inv` in
              `n_anneal_steps` steps (annealing will be performed over all
              `n_iter` iterations; it might be worth adding an additional
              variable for this case to allow the step schedule to stop early).

        Returns
        -------
        record_dict : dict
            Contains several fields describing the sampling process. Each field
            is described by its key and statistics are given in a list which
            covers the Gibbs sampling iterations.
        """
        
        # Setup record dictionary
        record_dict = {}
        record_dict["sample_time"] = []
        start_time = time.time()
        record_dict["log_marg"] = []
        record_dict["log_prob_z"] = []
        record_dict["log_prob_X_given_z"] = []
        record_dict["anneal_temp"] = []
        record_dict["components"] = []

        # Setup annealing iterator
        if anneal_schedule is None:
            get_anneal_temp = iter([])
        elif anneal_schedule == "linear":
            if n_anneal_steps == -1:
                n_anneal_steps = n_iter
            anneal_list = 1./np.linspace(anneal_start_temp_inv, anneal_end_temp_inv, n_anneal_steps)
            get_anneal_temp = iter(anneal_list)
        elif anneal_schedule == "step":
            assert n_anneal_steps != -1, (
                "`n_anneal_steps` of -1 not allowed for step annealing schedule"
                )
            n_iter_per_step = int(round(float(n_iter)/n_anneal_steps))
            anneal_list = np.linspace(anneal_start_temp_inv, anneal_end_temp_inv, n_anneal_steps)
            anneal_list = 1./anneal_list
            anneal_list = np.repeat(anneal_list, n_iter_per_step)
            get_anneal_temp = iter(anneal_list)

        if len(log_prob_zs):
            log_prob_zs = np.concatenate(log_prob_zs, axis=0)
        if len(log_prob_z_given_ys):
            log_prob_z_given_ys = np.concatenate(log_prob_z_given_ys, axis=0)

        # Loop over iterations
        for i_iter in range(n_iter):

            # Get anneal temperature
            anneal_temp = next(get_anneal_temp, anneal_end_temp_inv)

            # Loop over data items
            count = 0
            for i in xrange(self.components.N):

                # Cache some old values for possible future use
                k_old = self.components.assignments[i]
                if not consider_unassigned and k_old == -1:
                    continue
                K_old = self.components.K
                stats_old = self.components.cache_component_stats(k_old)

                # Remove data vector `X[i]` from its current component
                # TODO Handle this
                self.components.del_item(i)

                # Compute log probability of `X[i]` belonging to each component
                # (24.26) in Murphy, p. 843
                if not len(log_prob_zs):
                    log_prob_z = self.lms * (
                        np.ones(self.components.K_max)*np.log(
                            float(self.alpha)/self.components.K_max + self.components.counts
                            )
                        )
                    if len(log_prob_z_given_ys):
                        log_prob_z += log_prob_z_given_ys[count]
                        log_prob_z -= logsumexp(log_prob_z)
                else:
                    log_prob_z = deepcopy(log_prob_zs[count])
                count += 1
                # (24.23) in Murphy, p. 842
                log_prob_z[:self.components.K] += self.components.log_post_pred(i)
                # Empty (inactive) components
                log_prob_z[self.components.K:] += self.components.log_prior(i)
                if anneal_temp != 1:
                    log_prob_z = log_prob_z - logsumexp(log_prob_z)
                    log_prob_z_anneal = 1./anneal_temp * log_prob_z - logsumexp(1./anneal_temp * log_prob_z)
                    prob_z = np.exp(log_prob_z_anneal)
                else:
                    prob_z = np.exp(log_prob_z - logsumexp(log_prob_z))

                # Sample the new component assignment for `X[i]`
                k = utils.draw(prob_z)

                # There could be several empty, inactive components at the end
                if k > self.components.K:
                    k = self.components.K

                # Add data item X[i] into its component `k`
                if k == k_old and self.components.K == K_old:
                    # Assignment same and no components have been removed
                    self.components.restore_component_from_stats(k_old, *stats_old)
                    self.components.assignments[i] = k_old
                else:
                    # Add data item X[i] into its new component `k`
                    self.components.add_item(i, k)

            # Update record
            record_dict["sample_time"].append(time.time() - start_time)
            start_time = time.time()
            record_dict["log_marg"].append(self.log_marg())
            record_dict["log_prob_z"].append(self.log_prob_z())
            record_dict["log_prob_X_given_z"].append(self.log_prob_X_given_z())
            record_dict["anneal_temp"].append(anneal_temp)
            record_dict["components"].append(self.components.K)

            # Log info
            info = "iteration: " + str(i_iter)
            for key in sorted(record_dict):
                info += ", " + key + ": " + str(record_dict[key][-1])
            logger.info(info)

        return record_dict

    def gibbs_sample_inside_loop_i(self, i, anneal_temp=1, log_prob_z=[], log_prob_z_given_y=[]):
        """
        Perform the inside loop of Gibbs sampling for data vector `i`.

        This is the inside of `gibbs_sample` and can be used by outside objects
        to perform only the inside loop part of the Gibbs sampling operation.
        The step in the loop is to sample a new assignment for data vector
        `i`. The actual inner part of `gibbs_sample` is not replaced by a call
        to this function because that would not allow the old component stats
        to be cached.
        """

        if not len(log_prob_z):
            # Compute log probability of `X[i]` belonging to each component
            # (24.26) in Murphy, p. 843
            log_prob_z = self.lms * (
                np.ones(self.components.K_max)*np.log(
                    float(self.alpha)/self.components.K_max + self.components.counts
                    )
                )
            if len(log_prob_z_given_y):
                log_prob_z += log_prob_z_given_y
                log_prob_z -= logsumexp(log_prob_z)

        # (24.23) in Murphy, p. 842
        log_prob_z[:self.components.K] += self.components.log_post_pred(i)
        # Empty (inactive) components
        log_prob_z[self.components.K:] += self.components.log_prior(i)

        if anneal_temp != 1:
            log_prob_z = log_prob_z - logsumexp(log_prob_z)
            log_prob_z_anneal = 1./anneal_temp * log_prob_z - logsumexp(1./anneal_temp * log_prob_z)
            prob_z = np.exp(log_prob_z_anneal)
        else:
            prob_z = np.exp(log_prob_z - logsumexp(log_prob_z))
        assert not np.isnan(np.sum(prob_z))

        # TODO Try using the Viterbi path
        # Sample the new component assignment for `X[i]`
        k = utils.draw(prob_z)

        # There could be several empty, inactive components at the end
        if k > self.components.K:
            k = self.components.K

        logger.debug("Adding item " + str(i) + " to acoustic model component " + str(k))
        self.components.add_item(i, k)

    def map_assign_i(self, i):
        """
        Assign data vector `i` to the component giving the maximum posterior.

        This function is very similar to `gibbs_sample_inside_loop_i`, but
        instead of sampling the assignment, the MAP estimate is used.
        """

        # Compute log probability of `X[i]` belonging to each component
        # (24.26) in Murphy, p. 843
        log_prob_z = (
            np.ones(self.components.K_max)*np.log(
                float(self.alpha)/self.components.K_max + self.components.counts
                )
            )
        # (24.23) in Murphy, p. 842
        log_prob_z[:self.components.K] += self.components.log_post_pred(i)
        # Empty (inactive) components
        log_prob_z[self.components.K:] += self.components.log_prior(i)
        prob_z = np.exp(log_prob_z - logsumexp(log_prob_z))

        # Take the MAP assignment for `X[i]`
        k = np.argmax(prob_z)

        # There could be several empty, inactive components at the end
        if k > self.components.K:
            k = self.components.K

        logger.debug("Adding item " + str(i) + " to acoustic model component " + str(k))
        self.components.add_item(i, k)

    def get_n_assigned(self):
        """Return the number of assigned data vectors."""
        return len(np.where(self.components.assignments != -1)[0])
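
Usage sketch for code example #1. This is a minimal, hypothetical driver for the class above: it assumes an NIW-style `prior` object with hyperparameters `m_0`, `k_0`, `v_0`, `S_0` (as in kamperh's related bayes_gmm code) and the module-level imports the class itself relies on (numpy as `np`, the `GaussianComponents*` classes, `utils`). All names and hyperparameter values here are illustrative, not part of the original example.

import numpy as np

# Two well-separated 2-D clusters of synthetic data
np.random.seed(1)
D = 2
X = np.vstack([
    np.random.randn(50, D) + np.array([3.0, 0.0]),
    np.random.randn(50, D) - np.array([3.0, 0.0]),
    ])

# Vague NIW hyperparameters centred on the data mean (assumed prior class)
prior = NIW(m_0=X.mean(axis=0), k_0=0.05, v_0=D + 3, S_0=0.5*np.eye(D))

fbgmm = FBGMM(X, prior, alpha=1.0, K=10, assignments="rand")
record = fbgmm.gibbs_sample(20)
print("Active components: " + str(fbgmm.components.K))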
Code example #2
File: fbgmm.py Project: kamperh/segmentalist
class FBGMM(object):
    """
    A finite Bayesian Gaussian mixture model (FBGMM).

    See `GaussianComponents` or `GaussianComponentsDiag` for an overview of the
    parameters not mentioned below.

    Parameters
    ----------
    alpha : float
        Concentration parameter for the symmetric Dirichlet prior over the
        mixture weights.
    K : int
        The number of mixture components. This is actually a maximum number,
        and it is possible to empty out some of these components.
    assignments : vector of int or str
        If a vector of int, this gives the initial component assignments. The
        vector should therefore have N entries between 0 and `K`. A value of
        -1 is also allowed, indicating that the data vector does not belong to
        any component. Alternatively, `assignments` can take one of the
        following values:
        - "rand": Vectors are assigned randomly to one of `K` components.
        - "each-in-own": Each vector is assigned to a component of its own.
    covariance_type : str
        String describing the type of covariance parameters to use. Must be
        one of "full", "diag" or "fixed".
    lms : float
        Language model scaling factor.
    """

    def __init__(self, X, prior, alpha, K, assignments="rand",
            covariance_type="full", lms=1.0):
        self.alpha = alpha
        self.prior = prior
        self.covariance_type = covariance_type
        self.lms = lms

        self.setup_components(K, assignments, X)

    def setup_components(self, K, assignments="rand", X=None):
        """
        Set up the `components` attribute.

        See parameters of `FBGMM` for parameters not described below. This
        function is also useful for resetting the `components`, e.g. if you
        want to change the maximum number of possible components.

        Parameters
        ----------
        X : NxD matrix or None
            The data matrix. If None, then it is assumed that the `components`
            attribute has already been initialized and that this function is
            called to reset the `components`; in this case the data is taken
            from the previous initialization.
        """
        if X is None:
            assert hasattr(self, "components")
            X = self.components.X

        N, D = X.shape

        # Initial component assignments
        if isinstance(assignments, basestring) and assignments == "rand":
            assignments = np.random.randint(0, K, N)
        elif isinstance(assignments, basestring) and assignments == "each-in-own":
            assignments = np.arange(N)
        else:
            # `assignments` is a vector
            pass
        # Make sure we have consecutive values
        for k in xrange(assignments.max()):
            while len(np.nonzero(assignments == k)[0]) == 0:
                assignments[np.where(assignments > k)] -= 1
            if assignments.max() == k:
                break

        if self.covariance_type == "full":
            self.components = GaussianComponents(X, self.prior, assignments, K_max=K)
        elif self.covariance_type == "diag":
            self.components = GaussianComponentsDiag(X, self.prior, assignments, K_max=K)
        elif self.covariance_type == "fixed":
            self.components = GaussianComponentsFixedVar(X, self.prior, assignments, K_max=K)
        else:
            assert False, "Invalid covariance type."

    def set_K(self, K, reassign=True):
        """
        Set the number of components `K`.

        The `K` largest existing components are kept, and the rest of the data
        vectors are re-assigned to one of these (if `reassign` is True).
        """

        if self.components.K <= K:
            # There are already at most `K` active components
            self.components.K_max = K
            return

        sizes = self.components.counts
        old_assignments = self.components.assignments

        # Keep only the `K` biggest assignments
        assignments_to_keep = list(np.argsort(sizes)[-K:])
        new_assignments = [
            i if i in assignments_to_keep else -1 for i in old_assignments
            ]
        mapping = dict([(assignments_to_keep[i], i) for i in range(K)])
        mapping[-1] = -1
        new_assignments = np.array([mapping[i] for i in new_assignments])

        # Make sure we have consecutive assignment values
        for k in xrange(new_assignments.max()):
            while len(np.nonzero(new_assignments == k)[0]) == 0:
                new_assignments[np.where(new_assignments > k)] -= 1
            if new_assignments.max() == k:
                break

        # Create new `components` attribute
        self.setup_components(K, list(new_assignments))

        # Now add back those vectors which were assigned before but are unassigned now
        if reassign:
            for i, old_assignment in enumerate(old_assignments):
                new_assignment = new_assignments[i]
                if old_assignment == -1 or new_assignment != -1:
                    continue
                self.gibbs_sample_inside_loop_i(i)

    def log_prob_z(self):
        """
        Return the log marginal probability of component assignment P(z).

        See (24.24) in Murphy, p. 842.
        """
        log_prob_z = (
            gammaln(self.alpha)
            - gammaln(self.alpha + np.sum(self.components.counts))
            + np.sum(
                gammaln(
                    self.components.counts
                    + float(self.alpha)/self.components.K_max
                    )
                - gammaln(self.alpha/self.components.K_max)
                )
            )
        return log_prob_z

    def log_prob_X_given_z(self):
        """Return the log probability of data in each component p(X|z)."""
        return self.components.log_marg()

    def log_marg(self):
        """Return log marginal of data and component assignments: p(X, z)"""

        log_prob_z = self.log_prob_z()
        log_prob_X_given_z = self.log_prob_X_given_z()

        return log_prob_z + log_prob_X_given_z

    # @profile
    def log_marg_i(self, i):
        """
        Return the log marginal of the i'th data vector: p(x_i)

        Here it is assumed that x_i is not currently in the acoustic model,
        so the -1 term used in the denominator in (24.26) in Murphy, p. 843
        is dropped (since x_i is already not included in the counts).
        """
        assert i != -1

        # Compute log probability of `X[i]` belonging to each component
        # (24.26) in Murphy, p. 843
        log_prob_z = self.lms * (
            np.log(float(self.alpha)/self.components.K_max + self.components.counts)
            - np.log(_cython_utils.sum_ints(self.components.counts) + self.alpha)
            )

        # (24.23) in Murphy, p. 842
        log_prob_z[:self.components.K] += self.components.log_post_pred(i)
        # Empty (inactive) components
        log_prob_z[self.components.K:] += self.components.log_prior(i)
        return _cython_utils.logsumexp(log_prob_z)

    def gibbs_sample(self, n_iter, consider_unassigned=True,
            anneal_schedule=None, anneal_start_temp_inv=0.1,
            anneal_end_temp_inv=1, n_anneal_steps=-1):
        """
        Perform `n_iter` iterations of Gibbs sampling on the FBGMM.

        Parameters
        ----------
        consider_unassigned : bool
            Whether unassigned vectors (-1 in `assignments`) should be
            considered during sampling.
        anneal_schedule : str
            Can be one of the following:
            - None: A constant temperature of `anneal_end_temp_inv` is used
              throughout; if `anneal_end_temp_inv` is left at default (1), then
              this is equivalent to not performing annealing.
            - "linear": Linearly take the inverse temperature from
              `anneal_start_temp_inv` to `anneal_end_temp_inv` in
              `n_anneal_steps`. If `n_anneal_steps` is -1 for this schedule,
              annealing is performed over all `n_iter` iterations.
            - "step": Piecewise schedule in which the inverse temperature is
              taken from `anneal_start_temp_inv` to `anneal_end_temp_inv` in
              `n_anneal_steps` steps (annealing will be performed over all
              `n_iter` iterations; it might be worth adding an additional
              variable for this case to allow the step schedule to stop early).

        Returns
        -------
        record_dict : dict
            Contains several fields describing the sampling process. Each field
            is described by its key and statistics are given in a list which
            covers the Gibbs sampling iterations.
        """

        # Setup record dictionary
        record_dict = {}
        record_dict["sample_time"] = []
        start_time = time.time()
        record_dict["log_marg"] = []
        record_dict["log_prob_z"] = []
        record_dict["log_prob_X_given_z"] = []
        record_dict["anneal_temp"] = []
        record_dict["components"] = []

        # Setup annealing iterator
        if anneal_schedule is None:
            get_anneal_temp = iter([])
        elif anneal_schedule == "linear":
            if n_anneal_steps == -1:
                n_anneal_steps = n_iter
            anneal_list = 1./np.linspace(anneal_start_temp_inv, anneal_end_temp_inv, n_anneal_steps)
            get_anneal_temp = iter(anneal_list)
        elif anneal_schedule == "step":
            assert n_anneal_steps != -1, (
                "`n_anneal_steps` of -1 not allowed for step annealing schedule"
                )
            n_iter_per_step = int(round(float(n_iter)/n_anneal_steps))
            anneal_list = np.linspace(anneal_start_temp_inv, anneal_end_temp_inv, n_anneal_steps)
            anneal_list = 1./anneal_list
            anneal_list = np.repeat(anneal_list, n_iter_per_step)
            get_anneal_temp = iter(anneal_list)

        # Loop over iterations
        for i_iter in range(n_iter):

            # Get anneal temperature
            anneal_temp = next(get_anneal_temp, anneal_end_temp_inv)

            # Loop over data items
            for i in xrange(self.components.N):

                # Cache some old values for possible future use
                k_old = self.components.assignments[i]
                if not consider_unassigned and k_old == -1:
                    continue
                K_old = self.components.K
                stats_old = self.components.cache_component_stats(k_old)

                # Remove data vector `X[i]` from its current component
                self.components.del_item(i)

                # Compute log probability of `X[i]` belonging to each component
                # (24.26) in Murphy, p. 843
                log_prob_z = self.lms * (
                    np.ones(self.components.K_max)*np.log(
                        float(self.alpha)/self.components.K_max + self.components.counts
                        )
                    )
                # (24.23) in Murphy, p. 842
                log_prob_z[:self.components.K] += self.components.log_post_pred(i)
                # Empty (inactive) components
                log_prob_z[self.components.K:] += self.components.log_prior(i)
                if anneal_temp != 1:
                    log_prob_z = log_prob_z - logsumexp(log_prob_z)
                    log_prob_z_anneal = 1./anneal_temp * log_prob_z - logsumexp(1./anneal_temp * log_prob_z)
                    prob_z = np.exp(log_prob_z_anneal)
                else:
                    prob_z = np.exp(log_prob_z - logsumexp(log_prob_z))

                # Sample the new component assignment for `X[i]`
                k = utils.draw(prob_z)

                # There could be several empty, inactive components at the end
                if k > self.components.K:
                    k = self.components.K

                # Add data item X[i] into its component `k`
                if k == k_old and self.components.K == K_old:
                    # Assignment same and no components have been removed
                    self.components.restore_component_from_stats(k_old, *stats_old)
                    self.components.assignments[i] = k_old
                else:
                    # Add data item X[i] into its new component `k`
                    self.components.add_item(i, k)

            # Update record
            record_dict["sample_time"].append(time.time() - start_time)
            start_time = time.time()
            record_dict["log_marg"].append(self.log_marg())
            record_dict["log_prob_z"].append(self.log_prob_z())
            record_dict["log_prob_X_given_z"].append(self.log_prob_X_given_z())
            record_dict["anneal_temp"].append(anneal_temp)
            record_dict["components"].append(self.components.K)

            # Log info
            info = "iteration: " + str(i_iter)
            for key in sorted(record_dict):
                info += ", " + key + ": " + str(record_dict[key][-1])
            logger.info(info)

        return record_dict

    def gibbs_sample_inside_loop_i(self, i, anneal_temp=1):
        """
        Perform the inside loop of Gibbs sampling for data vector `i`.

        This is the inside of `gibbs_sample` and can be used by outside objects
        to perform only the inside loop part of the Gibbs sampling operation.
        The step in the loop is to sample a new assignment for data vector
        `i`. The actual inner part of `gibbs_sample` is not replaced by a call
        to this function because that would not allow the old component stats
        to be cached.
        """

        # Compute log probability of `X[i]` belonging to each component
        # (24.26) in Murphy, p. 843
        log_prob_z = self.lms * (
            np.ones(self.components.K_max)*np.log(
                float(self.alpha)/self.components.K_max + self.components.counts
                )
            )

        # (24.23) in Murphy, p. 842
        log_prob_z[:self.components.K] += self.components.log_post_pred(i)
        # Empty (inactive) components
        log_prob_z[self.components.K:] += self.components.log_prior(i)
        if anneal_temp != 1:
            log_prob_z = log_prob_z - logsumexp(log_prob_z)
            log_prob_z_anneal = 1./anneal_temp * log_prob_z - logsumexp(1./anneal_temp * log_prob_z)
            prob_z = np.exp(log_prob_z_anneal)
        else:
            prob_z = np.exp(log_prob_z - logsumexp(log_prob_z))
        assert not np.isnan(np.sum(prob_z))

        # Sample the new component assignment for `X[i]`
        k = utils.draw(prob_z)

        # There could be several empty, inactive components at the end
        if k > self.components.K:
            k = self.components.K

        logger.debug("Adding item " + str(i) + " to acoustic model component " + str(k))
        self.components.add_item(i, k)

    def map_assign_i(self, i):
        """
        Assign data vector `i` to the component giving the maximum posterior.

        This function is very similar to `gibbs_sample_inside_loop_i`, but
        instead of sampling the assignment, the MAP estimate is used.
        """

        # Compute log probability of `X[i]` belonging to each component
        # (24.26) in Murphy, p. 843
        log_prob_z = (
            np.ones(self.components.K_max)*np.log(
                float(self.alpha)/self.components.K_max + self.components.counts
                )
            )
        # (24.23) in Murphy, p. 842
        log_prob_z[:self.components.K] += self.components.log_post_pred(i)
        # Empty (inactive) components
        log_prob_z[self.components.K:] += self.components.log_prior(i)
        prob_z = np.exp(log_prob_z - logsumexp(log_prob_z))

        # Take the MAP assignment for `X[i]`
        k = np.argmax(prob_z)

        # There could be several empty, inactive components at the end
        if k > self.components.K:
            k = self.components.K

        logger.debug("Adding item " + str(i) + " to acoustic model component " + str(k))
        self.components.add_item(i, k)

    def get_n_assigned(self):
        """Return the number of assigned data vectors."""
        return len(np.where(self.components.assignments != -1)[0])
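
The annealing schedules described in the `gibbs_sample` docstring reduce to precomputing a list of temperatures before the main loop. The sketch below reproduces that arithmetic on its own (numpy only, outside the class) so the two schedules can be inspected directly; the variable names are illustrative.

import numpy as np

n_iter = 10
start_temp_inv, end_temp_inv = 0.1, 1.0

# "linear": the inverse temperature rises linearly from 0.1 to 1.0, so the
# temperature itself falls from 10 to 1 over the iterations
linear_temps = 1./np.linspace(start_temp_inv, end_temp_inv, n_iter)
print(linear_temps.round(2))
# [10.    5.    3.33  2.5   2.    1.67  1.43  1.25  1.11  1.  ]

# "step": the same range in `n_anneal_steps` discrete steps, each temperature
# held for round(n_iter/n_anneal_steps) iterations
n_anneal_steps = 5
n_iter_per_step = int(round(float(n_iter)/n_anneal_steps))
step_temps = np.repeat(
    1./np.linspace(start_temp_inv, end_temp_inv, n_anneal_steps),
    n_iter_per_step
    )
print(step_temps.round(2))
# [10.   10.    3.08  3.08  1.82  1.82  1.29  1.29  1.    1.  ]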
Code example #3
class FBGMM(object):
    """
    A finite Bayesian Gaussian mixture model (FBGMM).

    See `GaussianComponents` or `GaussianComponentsDiag` for an overview of the
    parameters not mentioned below.

    Parameters
    ----------
    alpha : float
        Concentration parameter for the symmetric Dirichlet prior over the
        mixture weights.
    K : int
        The number of mixture components. This is actually a maximum number,
        and it is possible to empty out some of these components.
    assignments : vector of int or str
        If a vector of int, this gives the initial component assignments. The
        vector should therefore have N entries between 0 and `K`. A value of
        -1 is also allowed, indicating that the data vector does not belong to
        any component. Alternatively, `assignments` can take one of the
        following values:
        - "rand": Vectors are assigned randomly to one of `K` components.
        - "each-in-own": Each vector is assigned to a component of its own.
    covariance_type : str
        String describing the type of covariance parameters to use. Must be
        one of "full", "diag" or "fixed".
    """

    def __init__(
            self, X, prior, alpha, K, assignments="rand",
            covariance_type="full"
            ):

        self.alpha = alpha
        N, D = X.shape

        # Initial component assignments
        if assignments == "rand":
            assignments = np.random.randint(0, K, N)

            # Make sure we have consecutive values
            for k in xrange(assignments.max()):
                while len(np.nonzero(assignments == k)[0]) == 0:
                    assignments[np.where(assignments > k)] -= 1
                if assignments.max() == k:
                    break
        elif assignments == "each-in-own":
            assignments = np.arange(N)
        else:
            # assignments is a vector
            pass

        if covariance_type == "full":
            self.components = GaussianComponents(X, prior, assignments, K_max=K)
        elif covariance_type == "diag":
            self.components = GaussianComponentsDiag(X, prior, assignments, K_max=K)
        elif covariance_type == "fixed":
            self.components = GaussianComponentsFixedVar(X, prior, assignments, K_max=K)
        else:
            assert False, "Invalid covariance type."

    def log_marg(self):
        """Return log marginal of data and component assignments: p(X, z)"""

        # Log probability of component assignment, (24.24) in Murphy, p. 842
        log_prob_z = (
            gammaln(self.alpha)
            - gammaln(self.alpha + np.sum(self.components.counts))
            + np.sum(
                gammaln(
                    self.components.counts
                    + float(self.alpha)/self.components.K_max
                    )
                - gammaln(self.alpha/self.components.K_max)
                )
            )

        # Log probability of data in each component
        log_prob_X_given_z = self.components.log_marg()

        return log_prob_z + log_prob_X_given_z

    def gibbs_sample(self, n_iter):
        """
        Perform `n_iter` iterations of Gibbs sampling on the FBGMM.

        A record dict is constructed over the iterations, which contains
        several fields describing the sampling process. Each field is described
        by its key and statistics are given in a list which covers the Gibbs
        sampling iterations. This dict is returned.
        """

        # Setup record dictionary
        record_dict = {}
        record_dict["sample_time"] = []
        start_time = time.time()
        record_dict["log_marg"] = []
        record_dict["components"] = []

        # Loop over iterations
        for i_iter in range(n_iter):

            # Loop over data items
            for i in xrange(self.components.N):

                # Cache some old values for possible future use
                k_old = self.components.assignments[i]
                K_old = self.components.K
                stats_old = self.components.cache_component_stats(k_old)

                # Remove data vector `X[i]` from its current component
                self.components.del_item(i)

                # Compute log probability of `X[i]` belonging to each component
                # (24.26) in Murphy, p. 843
                log_prob_z = (
                    np.ones(self.components.K_max)*np.log(
                        float(self.alpha)/self.components.K_max + self.components.counts
                        )
                    )
                # (24.23) in Murphy, p. 842
                log_prob_z[:self.components.K] += self.components.log_post_pred(i)
                # Empty (inactive) components
                log_prob_z[self.components.K:] += self.components.log_prior(i)
                prob_z = np.exp(log_prob_z - logsumexp(log_prob_z))

                # Sample the new component assignment for `X[i]`
                k = utils.draw(prob_z)

                # There could be several empty, inactive components at the end
                if k > self.components.K:
                    k = self.components.K

                # Add data item X[i] into its component `k`
                if k == k_old and self.components.K == K_old:
                    # Assignment same and no components have been removed
                    self.components.restore_component_from_stats(k_old, *stats_old)
                    self.components.assignments[i] = k_old
                else:
                    # Add data item X[i] into its new component `k`
                    self.components.add_item(i, k)

            # Update record
            record_dict["sample_time"].append(time.time() - start_time)
            start_time = time.time()
            record_dict["log_marg"].append(self.log_marg())
            record_dict["components"].append(self.components.K - 1)

            # Log info
            info = "iteration: " + str(i_iter)
            for key in sorted(record_dict):
                info += ", " + key + ": " + str(record_dict[key][-1])
            info += "."
            logger.info(info)

        return record_dict
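
The two lines at the heart of the sampling loop above are the numerically stable softmax, np.exp(log_prob_z - logsumexp(log_prob_z)), and the categorical draw, utils.draw(prob_z). A standalone sketch of the same computation follows (numpy only); this `draw` is an illustrative reimplementation, not the project's own `utils.draw`.

import numpy as np

def draw(prob_z):
    # Draw an index with probabilities given by `prob_z`
    return int(np.searchsorted(np.cumsum(prob_z), np.random.uniform()))

log_prob_z = np.array([-1200.0, -1201.0, -1205.0])  # unnormalized log scores
log_norm = np.logaddexp.reduce(log_prob_z)          # logsumexp, overflow-safe
prob_z = np.exp(log_prob_z - log_norm)              # normalized, sums to 1
print(prob_z.round(4))  # [0.7275 0.2676 0.0049]
k = draw(prob_z)        # sampled component index: 0, 1 or 2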
Code example #4
class IGMM(object):
    """
    An infinite Gaussian mixture model (IGMM).

    See `GaussianComponents` for an overview of the parameters not mentioned
    below.

    Parameters
    ----------
    alpha : float
        Concentration parameter for the Dirichlet process.
    assignments : vector of int or str
        If a vector of int, this gives the initial component assignments. The
        vector should therefore have N entries between 0 and `K`. A value of
        -1 is also allowed, indicating that the data vector does not belong to
        any component. Alternatively, `assignments` can take one of the
        following values:
        - "rand": Vectors are assigned randomly to one of `K` components.
        - "one-by-one": Vectors are assigned one at a time; the value of
          `K` becomes irrelevant.
        - "each-in-own": Each vector is assigned to a component of its own.
    K : int
        The initial number of mixture components: this is only used when
        `assignments` is "rand".
    covariance_type : str
        String describing the type of covariance parameters to use. Must be
        one of "full", "diag" or "fixed".
    """

    def __init__(
            self, X, prior, alpha, assignments="rand", K=1, K_max=None,
            covariance_type="full"
            ):

        self.alpha = alpha
        N, D = X.shape

        # Initial component assignments
        if assignments == "rand":
            assignments = np.random.randint(0, K, N)

            # Make sure we have consecutive values
            for k in xrange(assignments.max()):
                while len(np.nonzero(assignments == k)[0]) == 0:
                    assignments[np.where(assignments > k)] -= 1
                if assignments.max() == k:
                    break
        elif assignments == "one-by-one":
            assignments = -1*np.ones(N, dtype="int")
            assignments[0] = 0  # first data vector belongs to first component
        elif assignments == "each-in-own":
            assignments = np.arange(N)
        else:
            # assignments is a vector
            pass

        if covariance_type == "full":
            self.components = GaussianComponents(X, prior, assignments, K_max)
        elif covariance_type == "diag":
            self.components = GaussianComponentsDiag(X, prior, assignments, K_max)
        elif covariance_type == "fixed":
            self.components = GaussianComponentsFixedVar(X, prior, assignments, K_max)
        else:
            assert False, "Invalid covariance type."

    def log_marg(self):
        """Return log marginal of data and component assignments: p(X, z)"""

        # Log probability of component assignment P(z|alpha)
        # Equation (10) in Wood and Black, 2008
        # Use \Gamma(n) = (n - 1)!
        facts_ = gammaln(self.components.counts[:self.components.K])
        facts_[self.components.counts[:self.components.K] == 0] = 0  # definition of log(0!)
        log_prob_z = (
            (self.components.K - 1)*math.log(self.alpha) + gammaln(self.alpha)
            - gammaln(np.sum(self.components.counts[:self.components.K]) + self.alpha)
            + np.sum(facts_)
            )

        log_prob_X_given_z = self.components.log_marg()

        return log_prob_z + log_prob_X_given_z

    # @profile
    def gibbs_sample(self, n_iter):
        """
        Perform `n_iter` iterations of Gibbs sampling on the IGMM.

        A record dict is constructed over the iterations, which contains
        several fields describing the sampling process. Each field is described
        by its key and statistics are given in a list which covers the Gibbs
        sampling iterations. This dict is returned.
        """

        # Setup record dictionary
        record_dict = {}
        record_dict["sample_time"] = []
        start_time = time.time()
        record_dict["log_marg"] = []
        record_dict["components"] = []

        # Loop over iterations
        for i_iter in range(n_iter):

            # Loop over data items
            for i in xrange(self.components.N):

                # Cache some old values for possible future use
                k_old = self.components.assignments[i]
                K_old = self.components.K
                stats_old = self.components.cache_component_stats(k_old)

                # Remove data vector `X[i]` from its current component
                self.components.del_item(i)

                # Compute log probability of `X[i]` belonging to each component
                log_prob_z = np.zeros(self.components.K + 1, np.float)
                # (25.35) in Murphy, p. 886
                log_prob_z[:self.components.K] = np.log(self.components.counts[:self.components.K])
                # (25.33) in Murphy, p. 886
                log_prob_z[:self.components.K] += self.components.log_post_pred(i)
                # Add one component to which nothing has been assigned
                log_prob_z[-1] = math.log(self.alpha) + self.components.cached_log_prior[i]
                prob_z = np.exp(log_prob_z - logsumexp(log_prob_z))

                # Sample the new component assignment for `X[i]`
                k = utils.draw(prob_z)

                # Add data item X[i] into its component `k`
                if k == k_old and self.components.K == K_old:
                    # Assignment same and no components have been removed
                    self.components.restore_component_from_stats(k_old, *stats_old)
                    self.components.assignments[i] = k_old
                else:
                    # Add data item X[i] into its new component `k`
                    self.components.add_item(i, k)

            # Update record
            record_dict["sample_time"].append(time.time() - start_time)
            start_time = time.time()
            record_dict["log_marg"].append(self.log_marg())
            record_dict["components"].append(self.components.K - 1)

            # Log info
            info = "iteration: " + str(i_iter)
            for key in sorted(record_dict):
                info += ", " + key + ": " + str(record_dict[key][-1])
            info += "."
            logger.info(info)

        return record_dict
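
As a sanity check on the assignment prior computed in `IGMM.log_marg` (equation (10) in Wood & Black, 2008, using the identity Gamma(n) = (n - 1)!), the same quantity can be written as a standalone function. This mirrors the expression above for illustration; `crp_log_prob_z` is not part of the original module.

import numpy as np
from scipy.special import gammaln

def crp_log_prob_z(counts, alpha):
    # Log P(z | alpha) for K components with sizes `counts` (all nonzero)
    counts = np.asarray(counts, dtype=float)
    K = len(counts)
    return (
        (K - 1)*np.log(alpha) + gammaln(alpha)
        - gammaln(np.sum(counts) + alpha)
        + np.sum(gammaln(counts))  # log((n_k - 1)!) = gammaln(n_k)
        )

print(crp_log_prob_z([3, 2, 1], alpha=1.0))  # approx -5.886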