Example 1
    def test_exp_m_step(self):
        """_exp_m_step is symmetric."""
        a = numpy.random.random(size=(self.n_labellers, 1))
        b = numpy.random.random(size=(self.n_labellers, 1))
        w = numpy.random.random(size=(self.n_features + 1))
        x = numpy.hstack([self.x, numpy.ones((self.n_examples, 1))])
        y_mask = self.y.mask
        m = self.rc._exp_m_step(a, b, w, x, self.y.filled(0), y_mask)
        w_flip = scipy.optimize.fmin_bfgs(
            lambda k: ((logistic_regression(w, x) - 1 +
                        logistic_regression(k, x)) ** 2 +
                       0.000001 * numpy.linalg.norm(k)).sum(), w, disp=False)
        m_ = self.rc._exp_m_step(b, a, w_flip, x, 1 - self.y.filled(1), y_mask)
        self.assertTrue(numpy.allclose(m, 1 - m_, atol=1e-3))
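Every example in this listing calls a logistic_regression helper that is never shown. A minimal sketch of what it presumably computes, a sigmoid of the linear score with the bias carried by the appended column of ones, written so the shapes match the call sites below; the project's actual helper may differ:

import numpy
from scipy.special import expit


def logistic_regression(w, x):
    """Hypothetical stand-in: sigmoid of the linear score x . w.

    For 1-D w this gives one probability per row of x; for 2-D w (one row of
    weights per annotator, as in _annotator_model below) it gives an
    (n_samples, n_annotators) grid.
    """
    return expit(numpy.dot(x, numpy.transpose(w)))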
Example 2
    def _em_step(self, n_samples, n_annotators, n_dim, a, w, x, y):
        # Expectation step.
        # Posterior for each i. p(z_i = 1 | x_i, y_i).
        lr = logistic_regression(a, x)
        posteriors = lr.copy()
        posteriors *= self._annotator_model(w, x, y, 1).prod(axis=0)

        # Repeat for p(z_i = 0 | x_i, y_i).
        posteriors_0 = 1 - lr
        posteriors_0 *= self._annotator_model(w, x, y, 0).prod(axis=0)

        # We want to normalise. We want p(z = 1) + p(z = 0) == 1.
        # Currently, p(z = 1) + p(z = 0) == q.
        # :. Divide p(z = 1) and p(z = 0) by q.
        total = posteriors + posteriors_0
        # It's apparently possible for both of these to be 0. That's really
        # strange, but if that happens we'll set them both to 0.5.
        posteriors[total == 0] = 0.5
        posteriors_0[total == 0] = 0.5
        total[total == 0] = 1
        posteriors /= total
        posteriors_0 /= total
        assert numpy.allclose(posteriors, 1 - posteriors_0), (posteriors, posteriors_0)

        # Maximisation step.
        theta = self._pack(a, w)
        theta_, fv, inf = scipy.optimize.fmin_l_bfgs_b(
            self._Q, x0=theta, approx_grad=False, args=(n_dim, n_annotators, n_samples, posteriors, posteriors_0, x, y)
        )
        logging.debug("Terminated with Q = %4f", fv)
        logging.debug(inf["task"].decode("ascii"))
        a_, w_ = self._unpack(theta_, n_dim, n_annotators)

        return a_, w_
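The normalisation block in the expectation step is easy to misread, so here is a tiny standalone sketch of the same logic on made-up unnormalised posteriors, including the both-zero edge case handled above:

import numpy

# Unnormalised p(z_i = 1 | ...) and p(z_i = 0 | ...) for three points; the
# third point is the all-zero edge case that falls back to 0.5 / 0.5.
posteriors = numpy.array([0.2, 0.03, 0.0])
posteriors_0 = numpy.array([0.1, 0.01, 0.0])
total = posteriors + posteriors_0
posteriors[total == 0] = 0.5
posteriors_0[total == 0] = 0.5
total[total == 0] = 1
posteriors /= total
posteriors_0 /= total
print(posteriors)    # approximately [0.667, 0.75, 0.5]
assert numpy.allclose(posteriors + posteriors_0, 1)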
Example 3
    def predict_proba(self, x):
        """Predict probabilities of data points using logistic regression.

        x: Data points. (n_samples, n_dim) NumPy array.
        """
        x = numpy.hstack([x, numpy.ones((x.shape[0], 1))])
        return logistic_regression(self.a_, x)
Example 4
    def _Q(self, params, n_dim, n_annotators, n_samples, posteriors, posteriors_0, x, y):
        """Maximisation step minimisation target."""
        a, w = self._unpack(params, n_dim, n_annotators)

        expectation = (
            posteriors.dot(
                (numpy.log(self._annotator_model(w, x, y, 1) + EPS) + numpy.log(logistic_regression(a, x) + EPS)).T
            )
            + posteriors_0.dot(
                (numpy.log(self._annotator_model(w, x, y, 0) + EPS) + numpy.log(1 - logistic_regression(a, x) + EPS)).T
            )
        ).sum()

        # Also need the gradients.
        dQ_da = n_annotators * (
            numpy.dot(posteriors * logistic_regression(-a, x) + posteriors_0 * (logistic_regression(-a, x) - 1), x)
        )

        dQ_dw = numpy.zeros(w.shape)
        # Inefficient, but unrolled for clarity.
        # TODO(MatthewJA): Speed this up. (Numba?)
        for t in range(n_annotators):
            dQ_dw[t] += sum(
                x[i] * posteriors[i] * (logistic_regression(-w[t], x[i]) - abs(y[t, i] - 1))
                + x[i] * posteriors_0[i] * (logistic_regression(-w[t], x[i]) - abs(y[t, i] - 0))
                for i in range(n_samples)
            )
        grad = self._pack(dQ_da, dQ_dw)

        return -expectation, -grad
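Because _Q hands fmin_l_bfgs_b an analytic gradient (approx_grad=False), it is worth checking that gradient against finite differences. A hypothetical helper for doing so with scipy.optimize.check_grad; q stands for the bound _Q method and args for its extra arguments:

import scipy.optimize


def check_q_gradient(q, theta0, args, epsilon=1e-6):
    """Finite-difference check for an objective that returns (value, gradient)."""
    def value(theta):
        return q(theta, *args)[0]

    def gradient(theta):
        return q(theta, *args)[1]

    # check_grad returns the 2-norm of the difference between the numerical
    # and the analytic gradient; it should be small relative to the gradient.
    return scipy.optimize.check_grad(value, gradient, theta0, epsilon=epsilon)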
Example 5
    def _annotator_model(self, w, x, y, z):
        """Yan et al. (2010) Bernoulli annotator model.

        w: Annotator weights w_t. (n_dim,) NumPy array
        x: Data point x_i. (n_dim,) NumPy array
        y: Label y_i^(t). int
        z: "True" label z_i. int
        -> float in [0, 1]
        """
        eta = logistic_regression(w, x)
        label_difference = numpy.abs(y - z)
        anno = (numpy.power(1 - eta, label_difference.T) * numpy.power(eta, 1 - label_difference.T)).T
        assert (anno >= 0).all()
        return anno
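A quick standalone check that the model above is Bernoulli in the annotator's correctness: it returns eta when y == z and 1 - eta otherwise. The sketch inlines a sigmoid instead of the project's logistic_regression, and the weights and data point are made up:

import numpy
from scipy.special import expit

def annotator_model(w, x, y, z):
    eta = expit(x.dot(w))            # annotator reliability at this data point
    d = numpy.abs(y - z)             # 0 when the annotator agrees with z, else 1
    return (1 - eta) ** d * eta ** (1 - d)

w = numpy.array([0.5, -0.2])
x = numpy.array([1.0, 2.0])
eta = expit(x.dot(w))
assert numpy.isclose(annotator_model(w, x, y=1, z=1), eta)        # agreement
assert numpy.isclose(annotator_model(w, x, y=0, z=1), 1 - eta)    # disagreement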
Example 6
    def _likelihood(self, w, a, b, X, Y_0, Y_1):
        """Computes the likelihood of labels and data under a model.

        X: (n_samples, n_features) NumPy array of data.
        Y_0, Y_1: (n_labellers, n_samples) NumPy arrays of crowd labels.
        """
        n_examples = X.shape[0]
        exp_p = logistic_regression(w, X)
        exp_a = numpy.ones((n_examples,))
        exp_b = numpy.ones((n_examples,))
        exp_a = numpy.power(a, Y_0).prod(axis=0)
        exp_a *= numpy.power(1 - a, 1 - Y_1).prod(axis=0)
        exp_b *= numpy.power(b, 1 - Y_1).prod(axis=0)
        exp_b *= numpy.power(1 - b, Y_0).prod(axis=0)

        return (exp_a * exp_p.T + exp_b * (1 - exp_p).T).prod()
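Y_0 and Y_1 are presumably the masked crowd-label matrix with masked entries filled with 0 and 1 respectively, which makes an annotator who skipped an example contribute a factor of 1 to each product above. A small sketch of that assumption:

import numpy

# Two annotators, three examples; annotator 1 did not label example 1.
Y = numpy.ma.masked_array([[1, 0, 1],
                           [0, 1, 0]],
                          mask=[[False, False, False],
                                [False, True, False]])
Y_0 = Y.filled(0)    # masked entries -> 0, so a ** Y_0 and (1 - b) ** Y_0 give 1
Y_1 = Y.filled(1)    # masked entries -> 1, so (1 - a) ** (1 - Y_1) and b ** (1 - Y_1) give 1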
Example 7
    def _likelihood(self, a, w, X, Y):
        """Computes the likelihood of labels and data under a model.

        X: (n_samples, n_features) NumPy array of data.
        Y: (n_labellers, n_samples) NumPy masked array of crowd labels.
        """
        lh = 1
        for i in range(X.shape[0]):
            for t in range(Y.shape[0]):
                if Y.mask[t, i]:
                    continue

                lr = logistic_regression(a, X[i])
                p1 = self._annotator_model(w[t], X[i], Y[t, i], 1) * lr
                p0 = self._annotator_model(w[t], X[i], Y[t, i], 0) * (1 - lr)
                lh *= p1 + p0

        return lh
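The product over every unmasked (i, t) pair above underflows to zero once the dataset grows, so likelihoods like this are usually monitored in log space. A hypothetical log-space variant under the same assumptions (it reuses the logistic_regression sketch from earlier and an object exposing the _annotator_model method above):

import numpy

def log_likelihood(model, a, w, X, Y):
    llh = 0.0
    for i in range(X.shape[0]):
        for t in range(Y.shape[0]):
            if Y.mask[t, i]:
                continue
            lr = logistic_regression(a, X[i])
            p1 = model._annotator_model(w[t], X[i], Y[t, i], 1) * lr
            p0 = model._annotator_model(w[t], X[i], Y[t, i], 0) * (1 - lr)
            llh += numpy.log(p1 + p0)
    return llh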
Example 8
    def _exp_m_step(self, a, b, w, x, y, y_mask):
        """Computes expectation value of μ."""
        lr = logistic_regression(w, x)
        exp_a = numpy.ones((x.shape[0], ))
        exp_b = numpy.ones((x.shape[0], ))
        for t in range(a.shape[0]):
            for i in range(x.shape[0]):
                if y_mask[t, i]:
                    continue

                exp_a[i] *= a[t]**y[t, i] * (1 - a[t])**(1 - y[t, i])
                exp_b[i] *= b[t]**(1 - y[t, i]) * (1 - b[t])**y[t, i]

        logging.debug('Average a_i: {:.02}'.format(exp_a.mean()))
        logging.debug('Average alpha_t: {:.02}'.format(a.mean()))
        logging.debug('Max alpha_t: {}'.format(a.max()))
        logging.debug('Min alpha_t: {}'.format(a.min()))

        return exp_a * lr / (exp_a * lr + exp_b * (1 - lr) + EPS)
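The return value above is the usual Raykar-style posterior mu_i = exp_a_i * p_i / (exp_a_i * p_i + exp_b_i * (1 - p_i)), with EPS only guarding against a zero denominator. A worked single-point instance with made-up numbers:

# p_i: logistic-regression prior p(z_i = 1 | x_i, w).
p_i = 0.7
# Products over the annotators who labelled x_i (two annotators here).
exp_a_i = 0.9 * 0.8
exp_b_i = 0.1 * 0.3
mu_i = exp_a_i * p_i / (exp_a_i * p_i + exp_b_i * (1 - p_i))
print(round(mu_i, 3))    # 0.982: the crowd labels push the posterior above the 0.7 prior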
Example 9
    def predict_proba(self, X):
        """Predict probabilities of data points using logistic regression."""
        X = numpy.hstack([X, numpy.ones((X.shape[0], 1))])
        return logistic_regression(self.w_, X)
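A hypothetical usage sketch: callers pass raw (n_samples, n_dim) features and the bias column is appended internally. The fitted model, its w_ weights, and the inlined sigmoid below are all stand-ins:

import numpy
from types import SimpleNamespace
from scipy.special import expit

# Stand-in for a fitted estimator: two feature weights plus a bias weight.
model = SimpleNamespace(w_=numpy.array([0.4, -1.2, 0.05]))

def predict_proba(model, x):
    x = numpy.hstack([x, numpy.ones((x.shape[0], 1))])
    return expit(x.dot(model.w_))

x_test = numpy.random.random(size=(5, 2))
probs = predict_proba(model, x_test)        # shape (5,): p(z_i = 1 | x_i)
labels = (probs > 0.5).astype(int)          # hard labels at a 0.5 threshold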