def dist2(x, y, x2sum=None):
    """
        Calculate the square of the distances between the vectors in y and in
        x.  Vectors are stored along the rows.
        
        x2sum is (optionally) x**2.sum(1)
        
        returns D2
        
        D2[i,j] = ((x[i] - y[j])**2).sum()
        """
    if len(x) == x.size:
        x = utils.columnVector(x)
    if len(y) == y.size:
        y = utils.columnVector(y)
    if x2sum is None:
        x2sum = (x**2).sum(1)[:, None]
    else:
        x2sum = utils.columnVector(x2sum)
    y2sum = (y**2).sum(1)[:, None]
    D2 = -2 * np.dot(x, y.T)
    D2 = (D2.T + y2sum).T
    D2 += x2sum
    return D2
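
# A minimal usage sketch (assumed, not from the original source): dist2 should
# match an explicit loop over all (i, j) pairs.
import numpy as np

xs = np.random.rand(5, 3)
ys = np.random.rand(4, 3)
D2 = dist2(xs, ys)
naive = np.array([[((xi - yj)**2).sum() for yj in ys] for xi in xs])
assert np.allclose(D2, naive)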
Example n. 2
def multivariateNormalPDF(covar, mean, regScale=0.0):
    """
    Generates a function f(x) which is a multivariate normal distribution.
    x is assumed to store the vectors along the rows (ie x is a row vector or
    a matrix composed of row vectors)
    """
    mean = utils.columnVector(mean).T  # store the mean as a row vector
    # Regularize the covariance (on a copy, so the caller's array is untouched)
    if regScale != 0:
        covar = covar.copy()
        ind = np.arange(len(covar))
        regularization = abs(covar[ind, ind]).mean() * regScale
        covar[ind, ind] += regularization
    # Calculate terms
    covarInv = np.linalg.inv(covar)
    det = np.linalg.det(2 * np.pi * covar)
    norm = 1. / np.sqrt(det)

    def PDF(x):
        """
        x is assumed to store the vectors along the rows (i.e. x is a row
        vector or a matrix composed of row vectors)
        """
        if len(x) > 1:
            return np.array([PDF(x1[None, :]) for x1 in x])
        diff = x - mean
        quad = np.dot(diff, np.dot(covarInv, diff.T)).item()
        return float(norm * np.exp(-0.5 * quad))

    return PDF
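
# A hedged sanity check (assumed, not from the original source): the generated
# PDF should agree with scipy.stats.multivariate_normal on the same inputs.
import numpy as np
from scipy.stats import multivariate_normal

mu = np.array([1.0, -2.0])
cov = np.array([[2.0, 0.3], [0.3, 1.0]])
pdf = multivariateNormalPDF(cov, mu)
pts = np.random.rand(3, 2)
assert np.allclose(pdf(pts), multivariate_normal(mu, cov).pdf(pts))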
Example n. 3
    def pred(self, x):
        """
        Predict the class of x
        """
        probs = self.prob(x)
        classInd = probs.argmax(1)
        return utils.columnVector(self.classNames[classInd])

    def initFit(self, centroids=None, dataWeights=None):
        """
        Initialize a fit
        """
        if centroids is None:
            # Initialize centroid guesses
            np.random.seed(self.seed)
            ind = np.random.choice(len(self.x), self.nCluster)
            self.centroids = self.x[ind]
        else:
            self.centroids = centroids

        if dataWeights is None:
            dataWeights = np.ones([len(self.x), 1])
        else:
            dataWeights = utils.columnVector(dataWeights)
        self.dataWeights = dataWeights

        self.error = []
        self.oldcentroids = self.centroids.copy()
        self.centroidMaxChange = []
        self._assignCentroid()
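
# A standalone sketch (assumed names, not from the original class) of what
# initFit does by default: seed the RNG, draw nCluster random rows as the
# initial centroids, and weight every data point equally.
import numpy as np

data = np.random.rand(100, 2)
nCluster = 3
np.random.seed(0)
ind = np.random.choice(len(data), nCluster)
centroids = data[ind]                  # initial centroid guesses
dataWeights = np.ones([len(data), 1])  # default uniform weights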
Example n. 5
def multi2binary(x):
    """
    x should be ints 0, 1, 2, ... nClass-1 
    -1 in the input is treated as a missing value and is replaced with -1
    
    Returns a numpy array shape (n x nClass) if nClass != 2
    
    For nClass == 2, just return the data unaltered
    """
    x = np.asarray(x)
    nClass = x.max() + 1

    if nClass == 2:
        return utils.columnVector(x)

    xbin = np.zeros([len(x), nClass])
    for i in range(nClass):
        mask = (x == i)
        xbin[mask, i] = 1

    mask = (x == -1)
    xbin[mask, :] = -1
    return xbin
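
# A worked example (assumed input, not from the original source):
# >>> multi2binary([0, 2, 1, -1])
# array([[ 1.,  0.,  0.],
#        [ 0.,  0.,  1.],
#        [ 0.,  1.,  0.],
#        [-1., -1., -1.]])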
    def confidence(self, x):
        """
        Confidence is given by the cluster confidence (what fraction of
        elements in the cluster have the cluster name)
        """
        clusterInd = self.predCluster(x)
        conf = self.clusterConfidence[clusterInd]
        return utils.columnVector(conf)

    def pred(self, x):
        """
        Predict the class x belongs to (only works if labelClusters has
        been performed)
        """
        clusterInd = self.predCluster(x)
        ypred = self.clusterNames[clusterInd]
        return utils.columnVector(ypred)

    def pred(self, x):
        """
        Predict the class of x from the linear weights
        """
        # Condition the data, using the same conditioning as for the training
        # set
        x, dummy1, dummy2 = utils.condition(x, self._center, self._scale)
        classInd = np.dot(x, self.w).argmax(1)
        return utils.columnVector(self.classNames[classInd])

def linearBinaryPred(x, w):
    """
    Predict binary labels: x.dot(w) thresholded at 0.5 gives 0 or 1
    """
    ypred = np.dot(x, w)
    mask = ypred > 0.5
    ypred[mask] = 1
    ypred[~mask] = 0
    return utils.columnVector(ypred)
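
# A minimal usage sketch (assumed inputs, not from the original source):
import numpy as np

x = np.array([[0.2], [0.9]])
w = np.array([[1.0]])
print(linearBinaryPred(x, w))  # [[0.], [1.]] -- x.dot(w) thresholded at 0.5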
Example n. 10
    def confidence(self, x):
        """
        Confidence is defined as (maximum probability)/(sum of probabilities)
        for a given data point, which works under the assumption that every
        data point belongs to one of the classes.
        """
        P = self.prob(x)
        psum = P.sum(1)
        pnorm = P.max(1) / psum
        pnorm[psum == 0] = 0
        return utils.columnVector(pnorm)
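
# A worked example (assumed numbers, not from the original source): for class
# probabilities P = [0.5, 0.3, 0.2] the confidence is 0.5 / 1.0 = 0.5, while a
# uniform row [1/3, 1/3, 1/3] gives the minimum possible confidence of 1/3.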
Example n. 11
def softmaxProb(x, w):
    """
    P[i, j] is the probability of belonging to class j given x[i], w[:, j]
    """
    d, m = w.shape  # m = nClass - 1; the last class is the implicit reference
    k = m + 1
    E = np.exp(np.dot(x, w))
    A = 1. / (1 + E.sum(1))
    A = utils.columnVector(A)
    P = np.dot(A, np.ones([1, k]))
    P[:, 0:-1] *= E

    return P
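
# A hedged sanity check (assumed shapes, not from the original source): each
# row of P covers all k classes and should sum to 1.
import numpy as np

x = np.random.rand(6, 4)  # 6 samples, 4 features
w = np.random.rand(4, 2)  # nClass - 1 = 2 weight columns
P = softmaxProb(x, w)
assert P.shape == (6, 3)
assert np.allclose(P.sum(1), 1)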
Example n. 12
def _xTc(x, y):
    """
    A term used in updating the weights for the softmax gradient descents
    
    Parameters
    ----------
    x : array
        Feature vectors.  shape (n x d)
    y : array
        Binary labels.  shape (n x nClass).  y[i,j] = 1 for data[i] belonging
        to class[j], else it equals 0.
    """
    A = utils.columnVector(1 - y[:, -1])
    B = y[:, 0:-1]
    C = A * B
    return np.dot(x.T, C)
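
# A shape sketch (assumed inputs, not from the original source): with n
# samples, d features and nClass classes, _xTc returns a (d x (nClass-1))
# array, matching the softmax weight matrix w.
import numpy as np

x = np.random.rand(10, 4)                                   # n=10, d=4
y = multi2binary(np.array([0, 1, 2, 0, 1, 2, 0, 1, 2, 0]))  # shape (10, 3)
print(_xTc(x, y).shape)                                     # (4, 2)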
Example n. 13
    def pred(self, x):
        """
        Predict the class of x
        """
        if (self.arrayBatchSize is not None) and (len(x) > self.arrayBatchSize):
            # Calculate y_predicted in batches
            ypred = np.zeros(len(x))
            for slicer in utils.arrayBatch(x, self.arrayBatchSize):
                ypred[slicer] = self.pred(x[slicer]).flatten()
        else:
            h = self.featureMap(x)
            ypred = self.sgd.classifier(h, self.results['w'])
        return utils.columnVector(ypred)
Example n. 14
def linBinSquaredLoss(x, y, w):
    """
    Mean squared loss of the linear prediction x.dot(w) against y
    """
    ypred = utils.columnVector(np.dot(x, w))
    L = ((ypred - y)**2).mean()
    return L
    def fit(self, x, y, regScale=None, threshold=None, dataWeights=None):
        """
        Perform a linear (ridge) regression
        
        Parameters
        ----------
        x : array
            Training features, shape (n, d)
        y : array
            Labels.  shape (n, 1)
        regScale : float
            Amount to scale regularization by
        dataWeights : array, optional
            Per-data-point weights, shape (n, 1)

        Saves
        -----
        w : weights
            Approximate solution to the equation y = x.dot(w)
        """
        # Initialize
        y = utils.columnVector(y)
        self._xraw = x
        if regScale is not None:
            self.regScale = regScale

        x, center, scale = utils.condition(x)
        self.x = x
        self._center = center
        self._scale = scale
        self.y = y

        # Apply weights to data points
        if dataWeights is not None:
            dataWeights = utils.columnVector(dataWeights)
            sqrtWeights = np.sqrt(dataWeights)
            x = x * sqrtWeights
            y = y * sqrtWeights
        self.dataWeights = dataWeights
        self.regularization = self.regScale / (x**2).mean()

        # Perform the regression
        M = np.dot(x.T, x)
        ind = np.arange(len(M))
        M[ind, ind] += self.regularization
        xTy = np.dot(x.T, y)
        # Solve M w = x.T y (more stable than forming the explicit inverse)
        self.w = np.linalg.solve(M, xTy)
        if np.any(np.isnan(self.w)):
            raise RuntimeError("NaN encountered in w in fit")

        # Get scalar offset
        if self.scalarOffset:
            self.w0 = (y - np.dot(x, self.w)).mean()
        else:
            self.w0 = 0.
        if self.classify:
            # Set up the classifier (using the un-weighted data)
            self.classifier = LinearBinaryClassifier(self.y,
                                                     self.ypred(self.x))
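
# A minimal end-to-end sketch (assumed data, not from the original class) of
# the closed-form ridge solve performed in fit(): w = (x.T x + reg*I)^-1 x.T y
import numpy as np

x = np.random.rand(50, 3)
wTrue = np.array([[1.0], [-2.0], [0.5]])
y = np.dot(x, wTrue) + 0.01 * np.random.randn(50, 1)

reg = 1e-6
M = np.dot(x.T, x) + reg * np.eye(3)
w = np.linalg.solve(M, np.dot(x.T, y))
print(w)  # close to wTrue for small noise and regularization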