Ejemplo n.º 1
0
    def fade(self, dat, h):
        r"""
        Blends this dataset with the dataset *dat* using the mask *h*.

        For every entry the convex combination

        :math:`z_{ij} = h_j \cdot x_{ij} + (1-h_j)\cdot y_{ij}`

        is computed, where *x* comes from this dataset and *y* from
        *dat*. The result replaces *X* of this dataset and the operation
        is recorded in the history. The entries of *h* should therefore
        lie between zero and one (this is not checked here).

        A mask with fewer than two array dimensions is reshaped into a
        column with one entry per row of *X*.

        :param dat: other dataset which is faded into this one. Its size() must equal the size() of this dataset.
        :type dat: natter.DataModule.Data
        :param h: mask; all entries should be between zero and one.
        :type h: numpy.array
        :raises: Errors.DimensionalityError if the sizes of both datasets differ.
        """
        sizes_differ = dat.size() != self.size()
        if sizes_differ:
            raise Errors.DimensionalityError(
                'Dimensionalities of two datasets do not match!')

        mask = h
        if len(mask.shape) < 2:
            # turn a flat mask into a column so it broadcasts over the columns of X
            mask = reshape(mask, (self.X.shape[0], 1))

        complement = 1 - mask
        self.X = self.X * mask + complement * dat.X

        entry = [
            'Faded with dataset %s with history' % (dat.name),
            list(dat.history)
        ]
        self.addToHistory(entry)
Ejemplo n.º 2
0
    def size(self, dim=(0, 1)):
        """
        Returns the size of the data matrix *X*. Works just like numpy.size.

        A one-dimensional *X* is treated as a single row, i.e. as if its
        shape was ``(1, len(X))``.

        :param dim: Axis of *X* whose length is requested (0 --> first axis, 1 --> second axis), or a pair of axes to obtain the full shape.
        :type dim: int or tuple of int
        :returns: The length of the requested axis, or the full shape if *dim* is a pair.
        :rtype: int or tuple of int
        :raises: Errors.DimensionalityError if *dim* is a sequence that does not have exactly two entries.
        """
        # Local import keeps this method self-contained; Integral also
        # covers numpy integer scalars, which are not instances of int.
        from numbers import Integral

        sh = shape(self.X)
        flat = len(sh) < 2

        if isinstance(dim, Integral):
            if not flat:
                return sh[dim]
            # A flat X counts as one row. Return an int (not 1.0) so the
            # result agrees with the tuple form below.
            return 1 if dim == 0 else sh[0]

        if len(dim) == 2:
            return (1, sh[0]) if flat else sh

        raise Errors.DimensionalityError(
            'Data matrices cannot have more than two dimensions!')
Ejemplo n.º 3
0
    def scale(self, s):
        """
        Scales *X* with the array *s* and records the operation in the
        history.

        * If *s* is a vector with as many entries as *X* has rows, it is
          used as a column: every row of *X* is multiplied by its own
          factor (history entry 'each dimension').
        * If *s* is a vector with as many entries as *X* has columns, it
          is used as a row: every column of *X* is multiplied by its own
          factor (history entry 'each example').
        * If *s* has the same shape as *X*, *X* and *s* are simply
          multiplied elementwise (history entry 'whole data').

        A vector is a one-dimensional array or a two-dimensional array
        with a singleton axis. When both vector interpretations are
        possible, the row-count match is tried first for 1-D arrays and
        columns, while a ``(1, k)`` row is only matched against the
        columns of *X*.

        *s* can also be stored in a Data object; anything that is not a
        numpy array is assumed to provide ``name`` and ``X``.

        :param s: The scale factor
        :type s: numpy.array or natter.DataModule.Data
        :returns: This object (after scaling)
        :raises: Errors.DimensionalityError if the shape of *s* fits none of the cases above.
        """
        if isinstance(s, ndarray):
            name = 'array'
        else:
            # then we assume that s is a data object
            name = s.name
            s = s.X

        xsh = self.X.shape
        sh = s.shape

        if len(sh) == 1 or sh[0] == 1 or sh[1] == 1:
            # The size checks make sure that the reshape below cannot
            # fail, e.g. for a (1, k) row when X has a single row.
            if sh[0] == xsh[0] and s.size == xsh[0]:
                s = reshape(s, (xsh[0], 1))
                scaledwhat = 'each dimension'
            elif sh[0] == xsh[1] and s.size == xsh[1]:
                s = reshape(s, (1, xsh[1]))
                scaledwhat = 'each example'
            elif len(sh) > 1 and sh[0] == 1 and sh[1] == xsh[1]:
                # already a row that broadcasts over the rows of X
                scaledwhat = 'each example'
            else:
                raise Errors.DimensionalityError(
                    'Dimensionality of s must either be equal to the number of examples or the number of dimensions'
                )
        elif sh[0] != xsh[0] or sh[1] != xsh[1]:
            raise Errors.DimensionalityError('Dimensions of s do not match!')
        else:
            scaledwhat = 'whole data'

        self.history.append('Scaled ' + scaledwhat + ' with ' + name)

        self.X = self.X * s
        return self
Ejemplo n.º 4
0
def MarginalHistogramEqualization(psource, ptarget=None):
    """
    Creates a non-linear filter that changes the marginal distribution
    of each single data dimension independently. For that sake it
    takes two ISA models and performs a histogram equalization on each
    of the marginal distributions: dimension *k* is passed through the
    c.d.f. of the k-th source marginal and then through the p.p.f. of
    the k-th target marginal.

    *Important*: The ISA models must have one-dimensional subspaces!
    (This is only checked for *psource*.)

    If ptarget is omitted, it is built from one ``Gaussian(n=1)`` per
    dimension of *psource*. Both models are copied, so later changes to
    the passed objects do not affect the returned filter.

    :param psource: Source distribution which must be a natter.Distributions.ISA model with one-dimensional subspaces
    :type psource: natter.Distributions.ISA
    :param ptarget: Target distribution which must be a natter.Distributions.ISA model with one-dimensional subspaces
    :type ptarget: natter.Distributions.ISA
    :returns: A non-linear filter that changes for marginal distributions of the data from the respective psource into the respective ptarget
    :rtype: natter.Transforms.NonlinearTransform
    :raises: TypeError if psource or ptarget is not an ISA model; Errors.DimensionalityError if psource has a subspace that is not one-dimensional.

    """
    # reduce is no builtin on newer Python versions; functools has it everywhere
    from functools import reduce
    from natter.Distributions import ISA, Gaussian

    if not isinstance(psource, ISA):
        raise TypeError(
            'Transform.TransformFactory.MarginalHistogramEqualization: psource must be an ISA model'
        )
    psource = psource.copy()

    # identity tests: "== None" would invoke a possibly overloaded __eq__
    if ptarget is not None and not isinstance(ptarget, ISA):
        raise TypeError(
            'Transform.TransformFactory.MarginalHistogramEqualization: ptarget must be an ISA model'
        )

    for ss in psource['S']:
        if len(ss) != 1:
            raise Errors.DimensionalityError(
                'Transform.TransformFactory.MarginalHistogramEqualization: psource must have one-dimensional subspaces'
            )

    if ptarget is None:
        ptarget = ISA(S=[(k, ) for k in range(psource['n'])],
                      P=[Gaussian(n=1) for k in range(psource['n'])])
    else:
        ptarget = ptarget.copy()

    def g(dat):
        # equalize every dimension on its own, then stack the results again
        marginals = [
            ptarget['P'][k].ppf(psource['P'][k].cdf(dat[k, :]))
            for k in range(psource['n'])
        ]
        return reduce(lambda x, y: x.stack(y), marginals)

    def gdet(y):
        # log-det-Jacobian expressed through the two model log-likelihoods
        return psource.loglik(y) - ptarget.loglik(g(y))

    name = 'Marginal Histogram Equalization Transform: %s --> %s' % (
        psource['P'][0].name, ptarget['P'][0].name)
    return NonlinearTransform(g, name, logdetJ=gdet)
Ejemplo n.º 5
0
    def loglik(self, dat):
        """
        Abstract method that has to be overridden by the children. An
        implementation must return the log-likelihood of the data
        points in dat; this base version always raises.

        :param dat: data
        :type dat: natter.DataModule.Data
        :raises: natter.Errors.AbstractError
        """
        message = 'Abstract method loglik not implemented in ' + self.name
        raise Errors.AbstractError(message)
Ejemplo n.º 6
0
    def primaryBounds(self):
        """
        Abstract method that should be overridden by the children of
        the Distribution object. An implementation should provide
        bounds on the primary parameters of the distribution object and
        use None where a parameter is unbounded in that direction. This
        base version always raises.

        :returns: bounds on the primary parameters
        :rtype: list of tuples containing the single lower and upper bounds
        :raises: Errors.AbstractError
        """
        message = 'Abstract method primaryBounds not implemented in ' + self.name
        raise Errors.AbstractError(message)
Ejemplo n.º 7
0
    def sample(self, m):
        """
        Abstract method for drawing m samples from the current
        distribution. Children are expected to override it; this base
        version always raises.

        :param m: Number of samples to draw.
        :type m: int.
        :returns:  A Data object containing the samples
        :rtype:    natter.DataModule.Data
        :raises: Errors.AbstractError

        """
        message = 'Abstract method sample not implemented in ' + self.name
        raise Errors.AbstractError(message)
Ejemplo n.º 8
0
    def dldx(self, dat):
        '''
        Abstract method that must be overridden by the children of
        Distribution. An implementation should provide the derivative
        of the log-likelihood w.r.t. the data points in dat; this base
        version always raises.

        :param dat: Data points at which the derivative will be computed.
        :type dat: natter.DataModule.Data
        :raises: natter.Auxiliary.Errors.AbstractError
        :returns:  An array containing the derivatives
        :rtype:    numpy.array

        '''
        message = 'Abstract method dldx not implemented in ' + self.name
        raise Errors.AbstractError(message)
Ejemplo n.º 9
0
    def cdf(self, dat):
        '''
        Abstract method that must be overridden by the children of
        Distribution. An implementation should provide the cumulative
        distribution function of the Distribution at the data points in
        dat; this base version always raises.

        :param dat: Data points for which the c.d.f. will be computed.
        :type dat: natter.DataModule.Data
        :raises: natter.Auxiliary.Errors.AbstractError
        :returns:  An array containing the values of the c.d.f.
        :rtype:    numpy.array

        '''
        message = 'Abstract method cdf not implemented in ' + self.name
        raise Errors.AbstractError(message)
Ejemplo n.º 10
0
    def plot(self, ax=None, plottype='scatter', **kwargs):
        """
        Plots the data points, either as a scatter plot or as contours
        of the log of a 2D histogram. This method works only if *X* has
        exactly two rows.

        :param ax: If specified the data is plotted to this axes object; otherwise a new figure with one axes is created.
        :type ax: matplotlib.pyplot.Axis
        :param plottype: plot type; 'scatter' (default) gives a scatter plot, any other value the log-histogram contour plot ('loghist').
        :type plottype: string
        :param kwargs: arguments passed directly to the scatter (or contour) function of the axes
        :type kwargs: dict
        :raises: natter.Auxiliary.Errors.DimensionalityError
        :returns: The axes object.
        :rtype: matplotlib.pyplot.Axis
        """
        if len(self.X) != 2:
            raise Errors.DimensionalityError(
                'Data object must have dimension 2 for plotting!')

        if ax is None:
            fig = figure()
            ax = fig.add_axes([.1, .1, .8, .8])

        # compare by value: identity of two equal strings is not guaranteed
        if plottype == 'scatter':
            ax.scatter(self.X[0], self.X[1], s=.1, **kwargs)
            return ax

        # columns without NaN/inf determine the symmetric plotting range
        ind = ~(any(isnan(self.X), axis=0)
                | any(isinf(self.X), axis=0))
        mx = amax(abs(self.X[:, ind].ravel()))

        # linspace needs an integer count, hence floor division.
        # NOTE: fewer than 8000 columns yield fewer than two bin edges.
        nedges = self.X.shape[1] // 4000
        ex = linspace(-mx, mx, nedges)
        ey = linspace(-mx, mx, nedges)

        H, ex, ey = histogram2d(self.X[0, :],
                                self.X[1, :],
                                bins=(ex, ey))
        # histogram2d puts the first variable along axis 0, whereas
        # contour expects Z indexed as [y, x]; hence the transpose.
        ax.contour(.5 * (ex[1:] + ex[:-1]), .5 * (ey[1:] + ey[:-1]),
                   log(H.T), **kwargs)

        if ('colors' in kwargs) and (type(kwargs['colors'])
                                     == str) and ('label' in kwargs):
            # Plot one point just outside the current view in the contour
            # color and with the label, then restore the view.
            # NOTE(review): presumably done to obtain a legend entry - confirm.
            ra = ax.axis()
            ax.plot(ra[1] + 1,
                    ra[3] + 1,
                    color=kwargs['colors'],
                    label=kwargs['label'])
            ax.axis(ra)

        return ax
Ejemplo n.º 11
0
    def __invert__(self):
        """
        Overloads the ~ operator. Builds a new LinearTransform from the
        inverse of the matrix W, the name of this transform and a copy
        of its history extended by the entry 'inverted'.

        :returns: A new LinearTransform object representing the inverted matrix W.
        :rtype: natter.Transforms.LinearTransform
        :raises: Errors.DimensionalityError if W is not square.
        """
        dims = shape(self.W)
        if dims[0] != dims[1]:
            raise Errors.DimensionalityError(
                'Transform.__invert__(): Transform must be square!')

        new_history = list(self.history)
        new_history.append('inverted')
        inverse = inv(self.W)
        return LinearTransform(inverse, self.name, new_history)
Ejemplo n.º 12
0
    def histogram(self, dat, cdf=False, ax=None, plotlegend=True, bins=None):
        """
        Plots a normalized histogram of the data points in dat together
        with the pdf of the distribution evaluated at the bin centers.
        This works only for 1-dimensional distributions.

        :param dat: data points that enter the histogram
        :type dat: natter.DataModule.Data
        :param cdf: boolean that indicates whether the cdf should be plotted or not (default: False)
        :param ax: axes object the histogram is plotted into if it is not None; otherwise a new figure is created.
        :param plotlegend: boolean indicating whether a legend should be plotted (default: True)
        :param bins: number of bins to be used. If None (default), one bin per 200 data points is used (at least one).
        :raises: Errors.DimensionalityError if dat.X has more than one row.
        """

        sh = shape(dat.X)
        if len(sh) > 1 and sh[0] > 1:
            raise Errors.DimensionalityError(
                'Cannont plot data with more than one dimension!')

        if ax is None:
            fig = plt.figure()
            ax = fig.add_axes([.1, .1, .8, .8])
        x = squeeze(dat.X)
        if bins is None:
            # integer bin count; never request zero bins for small datasets
            bins = max(sh) // 200
            if bins < 1:
                bins = 1
        # density=True is the replacement for the removed normed=1 keyword
        _, bins, _ = ax.hist(x,
                             bins=bins,
                             density=True,
                             facecolor='blue',
                             alpha=0.8,
                             lw=0.0)

        bincenters = 0.5 * (bins[1:] + bins[:-1])
        y = squeeze(self.pdf(Data(bincenters)))
        ax.plot(bincenters, y, 'k--', linewidth=2)

        # The legend is attached to ax itself: the axes passed in by the
        # caller need not be the current pyplot axes.
        if hasattr(self, 'cdf') and cdf:
            z = squeeze(self.cdf(Data(bincenters)))
            ax.plot(bincenters, z, 'k.-', linewidth=2)
            if plotlegend:
                ax.legend(('p.d.f.', 'c.d.f.', 'Histogram'), frameon=False)
        elif plotlegend:
            ax.legend(('p.d.f.', 'Histogram'), frameon=False)

        ax.set_xlabel('x')
        ax.set_ylabel('Probability')
        ax.set_xlim(min(x), max(x))
        ax.grid(True)
Ejemplo n.º 13
0
    def pdf(self, dat):
        '''
        Evaluates the probability density function on the data points
        in dat by exponentiating the result of the function loglik.

        :param dat: Data points for which the p.d.f. will be computed.
        :type dat: natter.DataModule.Data
        :raises: natter.Auxiliary.Errors.AbstractError if this object has no loglik attribute (or if loglik itself raises it).
        :returns:  An array containing the values of the density.
        :rtype:    numpy.array

        '''
        if not hasattr(self, 'loglik'):
            raise Errors.AbstractError(
                'Abstract method pdf not implemented in ' + self.name)

        log_density = self.loglik(dat)
        return exp(log_density)
Ejemplo n.º 14
0
    def logDetJacobian(self, dat):
        """
        Computes the logarithm of the Jacobian determinant of the
        nonlinear transformation at each data point in *dat* by
        delegating to the stored function ``logdetJ``.

        :param dat: Data for which the log-det-Jacobian is to be computed.
        :type dat: natter.DataModule.Data
        :returns: The log-det-Jacobian
        :rtype: numpy.array
        :raises: Errors.AbstractError if no logdetJ function has been specified.
        """
        # identity test: "== None" would call a possibly overloaded __eq__
        if self.logdetJ is None:
            raise Errors.AbstractError('logdetJ has not been specified!')
        return self.logdetJ(dat)
Ejemplo n.º 15
0
    def score(self, param, dat, compute_derivative=False):
        """
        score(param,dat, compute_derivative=False)

        Abstract method; this base version always raises. An
        implementation must exhibit the following behaviour:

        1) if compute_derivative ==False, it returns the value of the score function at param and dat
        2) if compute_derivative ==True, it returns the derivative w.r.t the primary parameters of the score function at param and dat

        :param param: parameters at which the score is computed
        :type param: numpy.ndarray
        :param dat: data points at which the score is computed
        :type dat: natter.DataModule.Data
        :param compute_derivative: See above
        :returns: the score for score matching
        :raises: Errors.AbstractError
        """
        message = 'Abstract method score not implemented in ' + self.name
        raise Errors.AbstractError(message)
Ejemplo n.º 16
0
    def stack(self, dat):
        """
        Appends the rows of a copy of the dataset dat below the rows of
        this dataset (vertical stacking) and records the operation in
        the history. Both datasets must have the same number of
        examples.

        :param dat: Other data object with the same number of examples
        :type dat: natter.DataModule.Data
        :returns: This object (after stacking)
        :raises: Errors.DimensionalityError if the numbers of examples differ.
        """
        counts_differ = dat.numex() != self.numex()
        if counts_differ:
            raise Errors.DimensionalityError(
                'Number of examples of two datasets do not match!')

        other_rows = dat.copy().X
        self.X = vstack((self.X, other_rows))

        note = 'Stacked with dataset %s with history' % (dat.name)
        self.addToHistory([note, list(dat.history)])

        return self
Ejemplo n.º 17
0
def SSA2D(linearfilter=None, data=None, *args, **kwargs):
    """
    Creates a nonlinear filter from a linear SSA filter. The linear
    stage is either the given *linearfilter* or, if only *data* is
    given, learned on that data with SSA() (extra positional and
    keyword arguments are forwarded to it). The responses of the linear
    stage are squared elementwise and then summed over the 1st and 2nd,
    3rd and 4th, ... component, so the filter returns n/2 dimensions.

    :param linearfilter: the linear filter stage of the nonlinear filter; takes precedence over data
    :type linearfilter: natter.Transforms.LinearTransform
    :param data: Alternatively data on which the linear filter is learned
    :type data: natter.DataModule.Data

    :returns: the composed filter, named '2D SSA filter'
    :rtype: natter.Transforms.NonlinearTransform
    :raises: ValueError if neither linearfilter nor data is given; Errors.DimensionalityError if the linear filter has an odd number of rows.

    """
    if linearfilter is not None:
        U = linearfilter
    elif data is not None:
        U = SSA(data, *args, **kwargs)
    else:
        raise ValueError(
            'in NonlinearTransformFactory.SSA2D both linearfilter and data cannot be None'
        )

    n = U.W.shape[0]
    if mod(n, 2) == 1:
        raise Errors.DimensionalityError(
            'Transform must have even dimension number')

    squaring = ElementWise(lambda x: x**2)
    squaring.name = 'Elementwise squaring'

    # Rows 2k and 2k+1 of the identity are added up, which gives an
    # (n/2 x n) matrix that sums neighbouring pairs of components.
    pair_sum = eye(n).reshape(n // 2, 2, n).sum(1)
    M = LinearTransform(pair_sum, name='Summing over 2D subspaces')

    nonlinearfilter = M * squaring * U
    nonlinearfilter.name = '2D SSA filter'

    return nonlinearfilter
Ejemplo n.º 18
0
    def logDetJacobian(self, dat=None):
        """
        Computes the logarithm of the absolute Jacobian determinant of
        the linear transformation, which is log|det W|. If *dat* is
        specified, an array with ``dat.size(1)`` copies of that value is
        returned.

        :param dat: Data for which the log-det-Jacobian is to be computed.
        :type dat: natter.DataModule.Data
        :returns: The log-det-Jacobian
        :rtype: float (if dat=None) or numpy.array (if dat!=None)
        :raises: Errors.DimensionalityError if W is not square.

        """
        sh = shape(self.W)
        if sh[0] != sh[1]:
            raise Errors.DimensionalityError(
                'Can only compute log det of square filter matrix')

        logdet = log(abs(det(self.W)))
        # Identity test: "dat == None" would call a possibly overloaded
        # __eq__ of the data object.
        if dat is None:
            return logdet
        return array(dat.size(1) * [logdet])
0
def libsvm(path, n=1):
    """
    Loads data from a file in libsvm sparse format. Every line yields
    one example: the first whitespace separated token is skipped and
    the remaining ``index:value`` tokens (one-based indices) fill the
    entries of that example; all other entries are zero.

    *n* is the initial dimensionality. If a line refers to a larger
    index, the data matrix is grown accordingly. Lines without any
    ``index:value`` token produce an all-zero example.

    :param path: Path to the libsvm file.
    :type path: string
    :param n: Dimensionality of the data.
    :type n: int
    :returns: Data object with the data from the specified file (one column per line of the file).
    :rtype: natter.DataModule.Data
    :raises: Errors.DataLoadingError if an index smaller than one occurs.

    """
    with open(path, 'r') as f:
        lines = f.readlines()

    m = len(lines)
    X = zeros((m, n))
    for i, line in enumerate(lines):
        pairs = [e.split(':') for e in line.split()[1:]]
        if not pairs:
            # nothing to fill in; the row stays zero
            continue
        ind = [int(e[0]) - 1 for e in pairs]
        val = [float(e[1]) for e in pairs]

        if min(ind) < 0:
            raise Errors.DataLoadingError('Index negative!')

        needed = max(ind) + 1
        if needed > n:
            # widen X with zero columns and remember the new width
            X = concatenate((X, zeros((m, needed - n))), 1)
            n = needed
        X[i, ind] = val

    dat = Data(X.transpose(), 'Data from ' + path)
    dat.history.append('loaded from ' + path)
    dat.history.append('converted from libsvm format')
    return dat
Ejemplo n.º 20
0
def dirichlet_fit_s(dat, a, bar_p=None, maxiter=100, tol=1e-6):
    """
    DIRICHLET_FIT_S   Maximum-likelihood Dirichlet precision.

    A is decomposed into S*M, where M is a vector such that sum(M)=1,
    and only S is changed by this function.  In other words, A/sum(A)
    is unchanged by this function.

    The algorithm is a generalized Newton iteration, described in
    "Estimating a Dirichlet distribution" by T. Minka. In every
    iteration up to four different update rules for S are tried in
    turn until one of them yields a positive value.

    Written for Matlab by Tom Minka, ported to Python by Fabian Sinz

    :param dat: data points for the fit (only used if bar_p is None)
    :type dat: natter.DataModule.Data
    :param a:  vector providing the initial guess for the parameters.
    :type a: numpy.ndarray
    :param bar_p: mean log of the data points; computed as mean(log(dat.X), 1) if None
    :type bar_p: numpy.ndarray
    :param maxiter: maximum number of iterations
    :type maxiter: int
    :param tol: convergence tolerance on the change of S
    :type tol: float
    :returns: estimate of the parameter array
    :rtype: numpy.ndarray
    :raises: Errors.UpdateError if none of the update rules yields a positive S.
    """

    s = sum(a)
    m = a / s

    # sufficient statistics
    # ("is None": comparing an array with == is elementwise and cannot
    # be used as a condition)
    if bar_p is None:
        bar_p = mean(log(dat.X), 1)

    bar_p = sum(m * bar_p)

    for _ in range(maxiter):
        old_s = s
        g = digamma(s) - sum(m * digamma(s * m)) + bar_p
        h = trigamma(s) - sum((m**2) * trigamma(s * m))

        success = False
        if (g + s * h) < 0:
            s = 1 / (1 / s + g / h / s**2.0)
            if s > 0:
                success = True
            else:
                s = old_s

        if not success:
            # Newton on log(s)
            s = s * exp(-g / (s * h + g))
            if s > 0:
                success = True
            else:
                s = old_s

        if not success:
            # Newton on 1/s
            s = 1 / (1 / s + g / (s**2.0 * h + 2.0 * s * g))
            if s > 0:
                success = True
            else:
                s = old_s

        if not success:
            # Newton
            s = s - g / h
            if s > 0:
                success = True
            else:
                s = old_s

        if not success:
            raise Errors.UpdateError("All updates failed")
        a = s * m

        if max(abs(s - old_s)) < tol:
            break
    return a
Ejemplo n.º 21
0
def fminboundnD(f, x0, LB, UB, tol=1e-3, *args):
    """

    Multidimensional gradient free optimization with box constraints on the variables.

    The bounded problem is mapped to an unconstrained one: every
    variable is replaced by a surrogate according to its bound class
    (see below), fmin is run on the surrogates and the result is mapped
    back with _xtransform. Variables with identical finite lower and
    upper bound are fixed and not optimized.

    I ported this function from a Matlab function someone else posted
    on the internet. Unfortunately, I cannot find the source anymore,
    so I do not know who the author was and cannot reference him. If
    you were the author, please contact me and you get acknowledged
    ([email protected]).

    :param f: function to be minimized (takes a vector x and args)
    :type f: python function
    :param x0: starting value
    :type x0: numpy.ndarray
    :param LB: lower bounds
    :type LB: numpy.ndarray
    :param UB: upper bounds
    :type UB: numpy.ndarray
    :param tol: convergence tolerance (passed to fmin as xtol)
    :type tol: float
    :param args:  additional parameters for the function f
    :type args: list

    :returns: optimized x, in the shape of x0
    :rtype: numpy.ndarray
    :raises: Errors.DimensionalityError if LB or UB does not have as many entries as x0.
    """
    xsize = shape(x0)
    x0 = x0.flatten()
    n = len(x0)

    if (n != len(LB)) or (n != len(UB)):
        raise Errors.DimensionalityError(
            'x0 is incompatible in size with either LB or UB.')

    # 0 --> unconstrained variable
    # 1 --> lower bound only
    # 2 --> upper bound only
    # 3 --> dual finite bounds
    # 4 --> fixed variable
    BoundClass = [0] * n

    for i in range(n):
        k = isfinite(LB[i]) + 2 * isfinite(UB[i])
        BoundClass[i] = k
        if (k == 3) and (LB[i] == UB[i]):
            BoundClass[i] = 4

    # transform starting values into their unconstrained
    # surrogates. Check for infeasible starting guesses.
    # k counts the non-fixed variables seen so far.
    x0u = x0.copy()
    k = 0

    for i in range(n):
        if BoundClass[i] == 1:
            # lower bound only
            if x0[i] <= LB[i]:
                # infeasible starting value. Use bound.
                x0u[k] = 0.0
            else:
                x0u[k] = abs(x0[i] - LB[i])
            k += 1
        elif BoundClass[i] == 2:
            # upper bound only
            if x0[i] >= UB[i]:
                # infeasible starting value. use bound.
                x0u[k] = 0.0
            else:
                x0u[k] = abs(UB[i] - x0[i])
            k += 1
        elif BoundClass[i] == 3:
            # lower and upper bounds
            if x0[i] <= LB[i]:
                # infeasible starting value
                x0u[k] = -pi / 2.0
            elif x0[i] >= UB[i]:
                # infeasible starting value
                x0u[k] = pi / 2
            else:
                # rescale to [-1, 1], clip, and take the arcsin
                # NOTE(review): the 2*pi offset presumably keeps the
                # start value away from zero - confirm.
                x0u[k] = 2 * (x0[i] - LB[i]) / (UB[i] - LB[i]) - 1.0
                x0u[k] = 2.0 * pi + arcsin(
                    max(array([-1.0, min(array([1.0, x0u[k]]))])))
            k += 1
        elif BoundClass[i] == 0:
            x0u[k] = x0[i]
            k += 1

    # drop the unused tail that belongs to fixed variables
    if k <= n:
        x0u = x0u[:k]

    # were all the variables fixed?
    if len(x0u) == 0:
        # All variables were fixed. quit immediately, setting the
        # appropriate parameters, then return.

        # undo the variable transformations into the original space
        x = _xtransform(x0u, LB, UB, BoundClass, n)

        # final reshape
        x = reshape(x, xsize)
        return x

    # now we can call fmin
    f2 = lambda t: f(_xtransform(t, LB, UB, BoundClass, n), *args)

    # f2 already forwards *args to f. They must not be handed to fmin
    # again: f2 accepts a single argument, and positional extras would
    # be taken for other fmin parameters.
    xu = fmin(f2, x0u, xtol=tol)
    # undo the variable transformations into the original space
    x = _xtransform(xu, LB, UB, BoundClass, n)

    # final reshape
    return reshape(x, xsize)