Example #1
    def init_bfgs(self, X, y):
        """
        Initializes the BFGS algorithm

        Parameters
        ----------
        X : N x D matrix composed of numerical features, where each observation is 1 x D
        y : N x 1 vector, where y is a bit vector indicating whether each of the
        N observations is in class 1 or 0

        Returns
        -------
        W : initial weights
        B : 2 x D x D "matrix", where the first is the initial pseudo-Hessian matrix and
        the second is empty
        G : Gradient matrix containing the initial gradient vector and an empty slot for the next
        gradient vector

        """
        W = np.zeros(shape=(2, X.shape[1]))  # initializing weights
        B = np.zeros(shape=(2, X.shape[1], X.shape[1]))

        B[0] = np.eye(X.shape[1])  # Initializing pseudo-Hessian to the identity matrix
        pi = expit(safedot(X, W[0]))

        G = np.empty(shape=(2, X.shape[1]))
        G[0] = safedot(X.T, (pi - y))
        return W, B, G
    def l2_distance(self, X, x, gen_dist, kernel, **kwargs):
        """
        Computes Euclidean distance between a single feature observation from testing
        data and each observation from training data

        Parameters
        ----------
        X : N x D matrix consisting of N observations of data
        x : D x 1 vector consisting of a single observation
        gen_dist : precomputed squared norm of each row vector in X
        kernel : function that implicitly projects the features into a space of
        different dimensionality

        Returns
        -------
        N x 1 vector of distances between x and each observation in X
        """

        if kernel is None:
            t_1 = gen_dist
            t_2 = safedot(x, x)
            t_3 = 2 * safedot(X, x)
            return t_1 + t_2 - t_3
        else:
            distances = np.zeros(X.shape[0])
            t_2 = kernel(x, x, **kwargs)  # constant across observations
            for i in range(X.shape[0]):
                t_3 = kernel(X[i], x, **kwargs)
                distances[i] = gen_dist[i] + t_2 - (2 * t_3)
            return distances
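
The branch without a kernel relies on the expansion ||X_i - x||^2 = ||X_i||^2 + ||x||^2 - 2 <X_i, x>. A minimal NumPy-only sketch (with made-up data) checking that identity against the direct computation:

import numpy as np

X = np.random.rand(5, 3)             # 5 made-up training observations
x = np.random.rand(3)                # a single made-up test observation
gen_dist = np.sum(X ** 2, axis=1)    # precomputed squared norms of the rows of X

expanded = gen_dist + x @ x - 2 * (X @ x)   # the three terms used above
direct = np.sum((X - x) ** 2, axis=1)       # distances computed directly
assert np.allclose(expanded, direct)
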
    def gaussian(self, x, x_, **kwargs):
        """
        Computes the Gaussian(RBF) kernel of the two given vectors

        Parameters
        ----------
        x : D x 1 feature vector
        x_ : D x 1 feature vector
        sigma : parameter controlling the bandwidth of the kernel

        Returns
        -------
        kernelized inner product of the two given vectors
        """
        if kwargs:
            try:
                sigma = kwargs['sigma']
            except KeyError:
                raise ValueError(
                    'Must use proper parameters of Gaussian(RBF) kernel')
        else:
            sigma = 30

        gamma = 1 / (2 * np.square(sigma))
        sq_norm = safedot(x, x) + safedot(x_, x_) - (2 * safedot(x, x_))
        n = gamma * sq_norm
        return np.exp(-n)
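
As a quick sanity check, the value computed above matches the usual RBF form exp(-||x - x_||^2 / (2 * sigma^2)); a plain-NumPy sketch with made-up vectors:

import numpy as np

x, x_ = np.random.rand(4), np.random.rand(4)   # made-up feature vectors
sigma = 30                                     # same default bandwidth as above
gamma = 1 / (2 * np.square(sigma))

k = np.exp(-gamma * (x @ x + x_ @ x_ - 2 * (x @ x_)))
assert np.isclose(k, np.exp(-np.sum((x - x_) ** 2) / (2 * sigma ** 2)))
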
Example #4
    def conjugate_gradient(self, A, b, epsilon, x=None):
        """
        Solves the linear equation Ax = b using conjugate gradients. This algorithm
        can be found on p. 111 of "Numerical Analysis"

        Parameters
        ----------
        A : In the context of Newton methods, A is the Hessian matrix
        b : In the context of Newton methods, b is the gradient vector
        epsilon : Since convergence to exactly 0 would be a waste of computational resources,
        epsilon is used to determine whether the residual is "close enough" to 0
        x : optional starting point for the iteration

        Returns
        -------
        x : approximate solution to the linear equation of the form Ax = b
        """
        if x is None:
            x = np.ones(b.shape[0])  # Initialize x to a vector of ones
        r_0 = safedot(A, x) - b  # Calculating residual
        p = -r_0  # initial search direction

        while la.norm(r_0) > epsilon:
            alpha = safedot(r_0, r_0) / safedot(safedot(p, A), p)  # step length minimizing along direction p

            x = x + (alpha * p)  # Updating x
            r_1 = r_0 + (alpha * safedot(A, p))  # Updating residual
            beta = safedot(r_1, r_1) / safedot(r_0, r_0)  # scalar ensuring the new direction p is conjugate with respect to A

            p = -r_1 + safedot(beta, p)  # updating direction
            r_0 = r_1
        return x
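
A self-contained sketch of the same iteration, checked against numpy.linalg.solve on a small symmetric positive-definite system (A and b are made up for illustration):

import numpy as np

rng = np.random.default_rng(0)
M = rng.normal(size=(5, 5))
A = M @ M.T + 5 * np.eye(5)        # symmetric positive definite
b = rng.normal(size=5)

x = np.ones(5)                     # starting point, as in the method above
r = A @ x - b                      # residual
p = -r                             # initial search direction
while np.linalg.norm(r) > 1e-8:
    alpha = (r @ r) / (p @ A @ p)  # step length along p
    x = x + alpha * p
    r_new = r + alpha * (A @ p)
    beta = (r_new @ r_new) / (r @ r)
    p = -r_new + beta * p
    r = r_new
assert np.allclose(x, np.linalg.solve(A, b))
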
Example #5
    def get_bernoulli_likelihood(self, X):
        """
        Calculates the log-likelihood of each observation in X belonging to each
        class, assuming the features in X are binary (0 or 1), and stores the most
        likely class for each observation in self.predictions

        Parameters
        ----------
        X : N x D matrix of 1 x D feature vectors
        """
        log_p = np.log(self.p_matrix)
        log_p_not = np.log(1 - self.p_matrix)

        a = safedot(1 - X, log_p_not.T)
        b = safedot(X, log_p.T)
        self.predictions = np.argmax(np.log(self.priors) + a + b, axis=1)
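
An illustrative stand-alone version of the same computation, with made-up p_matrix, priors and X: the class score is sum_d [x_d * log(p_cd) + (1 - x_d) * log(1 - p_cd)] plus the log prior.

import numpy as np

p_matrix = np.array([[0.8, 0.1],     # P(feature d = 1 | class 0)
                     [0.2, 0.9]])    # P(feature d = 1 | class 1)
priors = np.array([0.5, 0.5])
X = np.array([[1, 0],
              [0, 1]])

a = (1 - X) @ np.log(1 - p_matrix).T   # contribution of the zero-valued features
b = X @ np.log(p_matrix).T             # contribution of the one-valued features
predictions = np.argmax(np.log(priors) + a + b, axis=1)
print(predictions)                     # -> [0 1]
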
Example #6
    def fit(self, X, y, max_iters=10000, save_weights=False, epsilon=1e-5):
        """
        Finds optimal weights by adjusting them only when incorrect predictions are made

        Parameters
        ----------
        X : N x D matrix composed of numerical features, where each observation is 1 x D
        y : N x 1 vector, where y is a bit vector indicating whether each of the
        N observations is in class 1 or 0
        max_iters : maximum number of iterations before the algorithm will terminate
        save_weights : whether or not the weights obtained from each iteration should be saved
        epsilon : small number used to test for approximate convergence of the weights
        """
        if not np.allclose(np.unique(y), np.array([-1, 1])):
            y[y == 0] = -1

        w = -np.ones(shape=X.shape[1]) + np.finfo('float').resolution
        W = np.zeros(shape=(max_iters, w.shape[0]))
        s = np.zeros(w.shape[0])
        for i in range(max_iters):
            idx = i % X.shape[0]  # cycle through all observations
            x = X[idx]
            y_hat = np.sign(safedot(x, w))

            # Only updates when an incorrect prediction is made
            if not np.allclose(y_hat, y[idx]):
                w += y[idx] * x

            W[i] = w
            if i > 50 and la.norm(W[i] - W[i - 50]) < epsilon:
                break
        self.w = w
        self.W = W
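
A stand-alone sketch of the same update rule on a tiny, made-up, linearly separable data set; the weights change only when an observation is misclassified:

import numpy as np

X = np.array([[2.0, 1.0], [1.0, 3.0], [-1.0, -2.0], [-2.0, -1.0]])
y = np.array([1, 1, -1, -1])          # labels already in {-1, 1}
w = np.zeros(2)

for _ in range(10):                   # a few passes over the data
    for x_i, y_i in zip(X, y):
        if np.sign(x_i @ w) != y_i:   # update only on an incorrect prediction
            w += y_i * x_i

print(np.sign(X @ w))                 # -> [ 1.  1. -1. -1.]
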
    def polynomial(self, x, x_, **kwargs):
        """
        Computes the polynomial kernel of the two given vectors

        Parameters
        ----------
        x : D x 1 feature vector
        x_ : D x 1 feature vector
        c : c >= 0 is a free parameter that controls the influence of higher-order
        terms versus lower-order terms in the polynomial
        d : degree of the polynomial

        Returns
        -------
        Kernelized inner product of the two given vectors
        """
        if kwargs:
            try:
                c = kwargs['c']
                d = kwargs['d']
            except KeyError:
                raise ValueError(
                    'Must use proper arguments for polynomial kernel')
        else:
            c = 1
            d = 2

        n = safedot(x, x_) + c
        return np.power(n, d)
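
For d = 2 and c = 1, the kernel value (x . x_ + 1) ** 2 equals an ordinary dot product under an explicit quadratic feature map; a small sketch with made-up two-dimensional vectors:

import numpy as np

def phi(v):
    # explicit feature map corresponding to d = 2, c = 1 in two dimensions
    return np.array([v[0] ** 2, v[1] ** 2,
                     np.sqrt(2) * v[0] * v[1],
                     np.sqrt(2) * v[0], np.sqrt(2) * v[1],
                     1.0])

x, x_ = np.array([1.0, 2.0]), np.array([3.0, 0.5])
assert np.isclose((x @ x_ + 1) ** 2, phi(x) @ phi(x_))
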
Example #8
    def irls(self, X, y, iterations=50, l2_reg=0, save_weights=False):
        """
        Solves the likelihood optimization using the Iteratively Reweighted Least Squares (IRLS) method

        Parameters
        ----------
        X : N x D matrix composed of numerical features, where each observation is 1 x D
        y : N x 1 vector, where y is a bit vector indicating whether each of the
        N observations is in class 1 or 0
        iterations : maximum number of iterations before the algorithm will terminate
        l2_reg : amount of l2 regularization to be applied
        save_weights : whether or not the weights obtained from each iteration should be saved

        """

        w = np.zeros(X.shape[1])
        prediction = np.ones(X.shape[0])
        if save_weights:
            W = np.empty(shape=((iterations), w.shape[0]))
        i = 0
        while i < iterations:
            n = safedot(X, w)
            prediction = expit(n)  # predicting new y values based on current weights
            s = prediction * (1 - prediction)
            s[s == 0] = np.finfo('float').resolution  # Ensuring that matrix S will be invertible
            S = np.diag(s)

            z = n + self.score_irls(y, prediction, S)  # response variable
            if np.allclose(z, n):
                if (save_weights):
                    W = W[:i]
                break
            w_0 = la.inv(la.multi_dot((X.T, S, X)))
            w_1 = la.multi_dot((X.T, S, z))

            w_n = safedot(w_0, w_1)
            w = w_n + (l2_reg * w)
            if save_weights:
                W[i] = w
            i += 1
        self.w = w
        if save_weights:
            self.W = W
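
A minimal single-step sketch of the update the loop performs, w_new = (X^T S X)^-1 X^T S z with working response z = Xw + S^-1 (y - pi), on a tiny made-up data set:

import numpy as np
from scipy.special import expit

X = np.array([[1.0, 0.5], [1.0, -1.0], [1.0, 2.0]])
y = np.array([1.0, 0.0, 1.0])
w = np.zeros(2)

eta = X @ w
pi = expit(eta)                        # current predicted probabilities
s = pi * (1 - pi)
S = np.diag(s)
z = eta + (y - pi) / s                 # working response
w = np.linalg.inv(X.T @ S @ X) @ (X.T @ S @ z)
print(w)
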
Example #9
    def update_B(self, B, W, G):
        """
        Performs the update of the pseudo-Hessian Matrix

        Parameters
        ----------
        B : 2 x D x D "matrix" containing the pseudo-Hessian matrix and an empty slot for the next one
        W : 2 x D matrix containing the two weights calculated from the past two iterations of
        the BFGS algorithm
        G : 2 x D matrix containing the two gradients calculated from the past two iterations
        of the BFGS algorithm

        Returns
        -------
        B[1] : D x D updated pseudo-Hessian matrix
        """

        dW = W[1] - W[0]
        dG = G[1] - G[0]
        # First rank-one correction: (dG dG^T) / (dG . dW)
        t_1b = safedot(dG, dW)
        if not t_1b:
            t_1 = 0
        else:
            t_1 = np.outer(dG, dG) / t_1b
        # Second rank-one correction: (B dW)(B dW)^T / (dW^T B dW)
        BdW = safedot(B[0], dW)
        t_2a = np.outer(BdW, BdW)
        t_2b = safedot(safedot(dW, B[0]), dW)
        B[1] = B[0] + t_1 - (t_2a / t_2b)
        return B[1]
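
A stand-alone check (with a random symmetric positive-definite B and made-up dW, dG) that the BFGS update above satisfies the secant condition B_new @ dW == dG:

import numpy as np

rng = np.random.default_rng(1)
M = rng.normal(size=(4, 4))
B = M @ M.T + np.eye(4)               # symmetric positive-definite approximation
dW = rng.normal(size=4)               # change in weights
dG = rng.normal(size=4)               # change in gradients

B_new = (B
         + np.outer(dG, dG) / (dG @ dW)
         - np.outer(B @ dW, B @ dW) / (dW @ B @ dW))
assert np.allclose(B_new @ dW, dG)    # secant condition
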
Example #10
    def classify(self, X):
        """
        Uses the weights calculated during fitting to classify new observations

        Parameters
        ----------
        X : M x D matrix composed of numerical features, where each observation is 1 x D
        """
        preds = np.sign(safedot(X, self.w))
        preds[preds < 1] = 0
        self.predictions = preds
Example #11
    def bfgs(self, X, y, iterations=20, save_weights=False, t=0, k=.12):
        """
        Implementation of the Broyden–Fletcher–Goldfarb–Shanno algorithm

        Parameters
        ----------
        X : N x D matrix composed of numerical features, where each observation is 1 x D
        y : N x 1 vector, where y is a bit vector indicating whether each of the
        N observations is in class 1 or 0
        iterations : maximum number of iterations before the algorithm will terminate
        save_weights : whether or not the weights obtained from each iteration should be saved
        k : must be in the range (0, 1); larger values decrease the step size more
        t : larger values decrease the step size taken

        Note
        ----
        Significantly faster convergence with normalized X
        """
        W, B, G = self.init_bfgs(X, y)
        weights = np.empty(shape=(iterations, W.shape[1]))

        for i in range(iterations):
            n = safedot(X, W[0])  # linear predictor with the current weights
            G[1] = (safedot(X.T, expit(n) - y)) / G.shape[1]  # new gradient (scaled by D)
            d = safedot(la.pinv(B[0]), G[1])  # search direction from the pseudo-Hessian

            a = self.step_size(iter_num=(i + 1) * 10, t=t, k=k)

            W[1] = W[0] - (a * d)

            B[0] = self.update_B(B, W, G)

            G[0] = G[1]
            W[0] = W[1]

            if save_weights:
                weights[i] = W[0]

        self.w = W[0]
        if save_weights:
            self.W = weights
Example #12
    def binary_newton_cg(self,
                         X,
                         y,
                         max_iters=10,
                         l2_reg=0,
                         save_weights=False,
                         epsilon=1e-4):
        """
        Solves the optimization of the likelihood function using Newton's
        method with conjugate gradients

        Parameters
        ----------
        X : N x D matrix composed of numerical features, where each observation is 1 x D
        y : N x 1 vector, where y is a bit vector indicating whether each of the
        N observations is in class 1 or 0
        max_iters : maximum number of iterations before the algorithm will terminate
        l2_reg : determines the amount of regularization that will be applied at each iteration
        save_weights : whether or not the weights obtained from each iteration should be saved
        epsilon : small number to be used to test for approximate convergence when determining
        the direction to be descended
        """
        w = np.zeros(X.shape[1])
        if save_weights:
            W = np.zeros(shape=(max_iters, X.shape[1]))
        for i in range(max_iters):

            mu = expit(safedot(X, w))  # Calculating predicted probabilities

            g = safedot(X.T, (mu - y))  # gradient vector
            H = safedot(safedot(X.T, np.diag(mu * (1 - mu))), X)  # Hessian of the negative log-likelihood

            n = self.conjugate_gradient(H, g, epsilon)  # Newton step obtained with conjugate gradients

            w = (w - n) + (l2_reg * w)  # updating weights with l2 regularization
            if save_weights:
                W[i] = w
        if save_weights:
            self.W = W  # weights of each iteration
        self.w = w  # final weights
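
One Newton step written out directly on tiny made-up data, with np.linalg.solve standing in for the conjugate-gradient solve and no regularization:

import numpy as np
from scipy.special import expit

X = np.array([[1.0, 0.5], [1.0, -1.0], [1.0, 2.0]])
y = np.array([1.0, 0.0, 1.0])
w = np.zeros(2)

mu = expit(X @ w)                          # predicted probabilities
g = X.T @ (mu - y)                         # gradient of the negative log-likelihood
H = X.T @ np.diag(mu * (1 - mu)) @ X       # Hessian
n = np.linalg.solve(H, g)                  # the step that conjugate_gradient approximates
w = w - n
print(w)
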
Example #13
    def create_variables(self, X):
        """
        Defines variables to make the softmax computation more convenient later

        Parameters
        ----------
        X : N x D matrix of 1 x D feature vectors
        """
        try:
            inv_sigma = la.inv(self.pooled_sigma)
        except la.LinAlgError:
            inv_sigma = la.pinv(self.pooled_sigma)  # fall back to the pseudo-inverse if sigma is singular

        self.gamma = np.zeros(shape=self.priors.shape[0])
        self.beta = np.zeros(shape=(self.gamma.shape[0], X.shape[1]))

        for i in range(self.priors.shape[0]):
            a = safedot(self.means[i], inv_sigma)
            b = safedot(a, self.means[i]) / (-2)
            self.gamma[i] = b + np.log(self.priors[i])
            self.beta[i] = safedot(inv_sigma, self.means[i])
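
gamma and beta encode the linear discriminant delta_c(x) = x . beta_c + gamma_c, where beta_c = Sigma^-1 mu_c and gamma_c = -0.5 * mu_c . Sigma^-1 mu_c + log(prior_c). An illustrative sketch with made-up means, pooled covariance and priors:

import numpy as np

means = np.array([[0.0, 0.0], [2.0, 2.0]])
pooled_sigma = np.array([[1.0, 0.2], [0.2, 1.0]])
priors = np.array([0.5, 0.5])
inv_sigma = np.linalg.inv(pooled_sigma)

beta = means @ inv_sigma                                   # one row per class
gamma = -0.5 * np.einsum('ij,ij->i', beta, means) + np.log(priors)

x = np.array([1.8, 1.9])
print(np.argmax(x @ beta.T + gamma))                       # -> 1 (closer to the second class mean)
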
Example #14
    def get_multinomial_likelihood(self, X):
        """
        Calculates the likelihood of each observation in X being in a
        class given that the features in X are of a multinomial distribution

        Parameters
        ----------
        X : N x D matrix of 1 x D feature vectors
        """
        ll = safedot(X, self.l_p_matrix.T)
        l_prior = np.log(self.priors)
        self.predictions = np.argmax(ll + l_prior, axis=1)
Example #15
    def classify(self, X):
        """
        Performs classification of observations depending on selected model

        Parameters
        ----------
        X : N x D matrix of 1 x D feature vectors 
        """
        if (self.model == 'Linear'):
            n = safedot(X, self.beta.T) + self.gamma
            self.predictions = np.argmax(self.softmax(n), axis=1)

        elif (self.model == 'Quadratic' or self.model == 'Regularized'):
            t_1 = (-0.5) * self.determinants
            t_2 = (-0.5) * self.compute_mahalanobis(X)
            self.predictions = np.argmax(t_1 + t_2, axis=1)
Example #16
    def score_irls(self, y, pi, S):
        """
        Gives the working response after each update of the IRLS algorithm

        Parameters
        ----------
        y : Actual label for each feature vector
        pi : prediction in [0, 1], where proximity to 0 or 1 means the feature vector
        is more likely to be labeled 0 or 1 respectively
        S : diagonal matrix calculated from the decomposition of the Hessian

        Returns
        -------
        error : response of predictions compared to actual values
        """
        error = safedot(la.pinv(S), (y - pi))
        return error
Example #17
    def convert_labels_binary(self, X, w, threshold=0.5):
        """
        Converts values to 0 or 1 based on thresholding of expit function

        Parameters
        ----------
        X : N x D matrix composed of numerical features
        w : weights determined by the chosen solver
        threshold : value in the range (0, 1) that sets the decision boundary
        determining what class each observation belongs to

        Returns
        -------
        y: bit vector containing 0 or 1 depending on which class each belongs to

        """
        n_pred = expit(safedot(X, w))
        y = self.threshold(n_pred, threshold)
        return y
    def sigmoid(self, x, x_, **kwargs):
        """
        Computes the sigmoid kernel of the two given vectors

        Parameters
        ----------
        x : D x 1 feature vector
        x_ : D x 1 feature vector
        alpha : scaling parameter
        c : shifting parameter

        Returns
        -------
        Kernelized inner product of the two given vectors
        """
        if kwargs:
            try:
                alpha = kwargs['alpha']
                c = kwargs['c']
            except KeyError:
                raise ValueError(
                    'Must use proper arguments for sigmoid kernel function.')
        else:
            alpha = 1e-10
            c = 0.5

        n = alpha * safedot(x, x_) + c
        return np.tanh(n)