def dist2(x, y, x2sum=None):
    """
    Calculate the squared distances between the vectors in x and the vectors
    in y. Vectors are stored along the rows.

    x2sum is (optionally) (x**2).sum(1)

    Returns D2 where D2[i, j] = ((x[i] - y[j])**2).sum()
    """
    # Promote 1-D inputs to column vectors
    if len(x) == x.size:
        x = utils.columnVector(x)
    if len(y) == y.size:
        y = utils.columnVector(y)
    if x2sum is None:
        x2sum = (x**2).sum(1)[:, None]
    else:
        x2sum = utils.columnVector(x2sum)
    y2sum = (y**2).sum(1)[:, None]
    # ||x_i - y_j||^2 = ||x_i||^2 + ||y_j||^2 - 2 x_i . y_j
    D2 = -2 * np.dot(x, y.T)
    D2 = (D2.T + y2sum).T
    D2 += x2sum
    return D2
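# Illustrative sketch (not part of the original API): a brute-force check of
# the expansion used by dist2, ||x_i - y_j||^2 = ||x_i||^2 + ||y_j||^2
# - 2 x_i . y_j. The function name is hypothetical; numpy is assumed imported
# as np, as elsewhere in this module.
def _dist2_example():
    np.random.seed(0)
    x = np.random.randn(5, 3)  # 5 row vectors in R^3
    y = np.random.randn(4, 3)  # 4 row vectors in R^3
    # Vectorized form, as in dist2
    D2 = (x**2).sum(1)[:, None] + (y**2).sum(1)[None, :] - 2 * np.dot(x, y.T)
    # Brute-force reference
    ref = np.array([[((xi - yj)**2).sum() for yj in y] for xi in x])
    assert np.allclose(D2, ref)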
def multivariateNormalPDF(covar, mean, regScale=0.0):
    """
    Generates a function f(x) which evaluates a multivariate normal
    distribution with the given covariance and mean.

    x is assumed to store the vectors along the rows (i.e. x is a row vector
    or a matrix composed of row vectors)
    """
    mean = utils.columnVector(mean).T
    # Regularize the diagonal (note: modifies covar in place)
    if regScale != 0:
        ind = np.arange(len(covar))
        regularization = abs(covar[ind, ind]).mean() * regScale
        covar[ind, ind] += regularization
    # Calculate terms
    covarInv = np.linalg.inv(covar)
    det = np.linalg.det(2 * np.pi * covar)
    norm = 1. / np.sqrt(det)

    def PDF(x):
        """
        x is assumed to store the vectors along the rows (i.e. x is a row
        vector or a matrix composed of row vectors)
        """
        if len(x) > 1:
            return np.array([PDF(x1[None, :]) for x1 in x])
        return float(norm * np.exp(-0.5 * np.dot((x - mean),
                                                 np.dot(covarInv, (x - mean).T))))

    return PDF
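# Illustrative sketch (not part of the original API): evaluating the density
# formula used by multivariateNormalPDF at a single point,
# N(x; mu, Sigma) = exp(-0.5 (x - mu) Sigma^{-1} (x - mu)^T) / sqrt(det(2 pi Sigma)).
# The function name is hypothetical; numpy is assumed imported as np.
def _mvn_pdf_example():
    mu = np.array([1.0, -2.0])
    covar = np.array([[2.0, 0.3],
                      [0.3, 1.0]])
    covarInv = np.linalg.inv(covar)
    norm = 1. / np.sqrt(np.linalg.det(2 * np.pi * covar))
    x = np.array([0.5, -1.5])
    d = x - mu
    p = norm * np.exp(-0.5 * np.dot(d, np.dot(covarInv, d)))
    return p  # scalar density value at x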
def pred(self, x):
    """
    Predict the class of x
    """
    probs = self.prob(x)
    classInd = probs.argmax(1)
    return utils.columnVector(self.classNames[classInd])
def initFit(self, centroids=None, dataWeights=None):
    """
    Initialize a fit
    """
    if centroids is None:
        # Initialize centroid guesses
        np.random.seed(self.seed)
        ind = np.random.choice(len(self.x), self.nCluster)
        self.centroids = self.x[ind]
    else:
        self.centroids = centroids

    if dataWeights is None:
        dataWeights = np.ones([len(self.x), 1])
    else:
        dataWeights = utils.columnVector(dataWeights)
    self.dataWeights = dataWeights

    self.error = []
    self.oldcentroids = self.centroids.copy()
    self.centroidMaxChange = []
    self._assignCentroid()
def multi2binary(x):
    """
    One-hot encode integer class labels.

    x should be ints 0, 1, 2, ..., nClass-1. A label of -1 in the input is
    treated as a missing value; the corresponding row of the output is set
    to -1 in every column.

    Returns a numpy array of shape (n x nClass) if nClass != 2.
    For nClass == 2, the data is returned unaltered (as a column vector).
    """
    x = np.asarray(x)
    nClass = x.max() + 1
    if nClass == 2:
        return utils.columnVector(x)
    xbin = np.zeros([len(x), nClass])
    for i in range(nClass):
        mask = (x == i)
        xbin[mask, i] = 1
    # Flag missing labels with a full row of -1s
    mask = (x == -1)
    xbin[mask, :] = -1
    return xbin
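# Illustrative sketch (not part of the original API): what multi2binary
# produces for a small label vector containing a missing (-1) entry. The
# function name is hypothetical; numpy is assumed imported as np.
def _multi2binary_example():
    x = np.array([0, 2, 1, -1, 2])   # 3 classes plus one missing label
    xbin = multi2binary(x)
    # xbin is:
    # [[ 1.,  0.,  0.],
    #  [ 0.,  0.,  1.],
    #  [ 0.,  1.,  0.],
    #  [-1., -1., -1.],
    #  [ 0.,  0.,  1.]]
    return xbin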
def confidence(self, x):
    """
    Confidence is given by the cluster confidence (the fraction of elements
    in the assigned cluster that share the cluster's label)
    """
    clusterInd = self.predCluster(x)
    conf = self.clusterConfidence[clusterInd]
    return utils.columnVector(conf)
def pred(self, x):
    """
    Predict the class x belongs to (only works if labelClusters has been
    performed)
    """
    clusterInd = self.predCluster(x)
    ypred = self.clusterNames[clusterInd]
    return utils.columnVector(ypred)
def pred(self, x):
    """
    Predict the class of x
    """
    # Condition the data, using the same conditioning as for the training set
    x, dummy1, dummy2 = utils.condition(x, self._center, self._scale)
    classInd = np.dot(x, self.w).argmax(1)
    return utils.columnVector(self.classNames[classInd])
def linearBinaryPred(x, w):
    """
    Predict binary labels from a linear model: ypred = x.dot(w), thresholded
    at 0.5 (1 if ypred > 0.5, else 0)
    """
    ypred = np.dot(x, w)
    mask = ypred > 0.5
    ypred[mask] = 1
    ypred[~mask] = 0
    return utils.columnVector(ypred)
def confidence(self, x):
    """
    Confidence is defined as (maximum probability)/(sum of probabilities)
    for a given data point, which works under the assumption that every
    data point belongs to one of the classes.
    """
    P = self.prob(x)
    psum = P.sum(1)
    # Avoid dividing by zero for points where all probabilities are zero
    pnorm = np.zeros_like(psum)
    nonzero = psum != 0
    pnorm[nonzero] = P.max(1)[nonzero] / psum[nonzero]
    return utils.columnVector(pnorm)
def softmaxProb(x, w):
    """
    P[i, j] is the probability of belonging to class j given x[i], w[:, j]

    The last class (j = k-1) is treated as the reference class with implicit
    weights of zero, so w has k-1 columns and P has k columns.
    """
    N, m = w.shape   # w has one weight column per non-reference class
    k = m + 1
    E = np.exp(np.dot(x, w))
    A = 1. / (1 + E.sum(1))
    A = utils.columnVector(A)
    P = np.dot(A, np.ones([1, k]))
    P[:, 0:-1] *= E
    return P
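# Illustrative sketch (not part of the original API): softmaxProb computes a
# softmax in which the last class is a reference class with implicit weights
# of zero, so each row of P sums to 1. The function name is hypothetical;
# numpy is assumed imported as np.
def _softmaxProb_example():
    np.random.seed(0)
    x = np.random.randn(6, 3)   # 6 points, 3 features
    w = np.random.randn(3, 2)   # weights for the k-1 = 2 non-reference classes
    P = softmaxProb(x, w)       # shape (6, 3)
    # Equivalent to an ordinary softmax with a zero-weight column appended
    scores = np.hstack([np.dot(x, w), np.zeros([len(x), 1])])
    ref = np.exp(scores) / np.exp(scores).sum(1)[:, None]
    assert np.allclose(P, ref)
    assert np.allclose(P.sum(1), 1.0)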
def _xTc(x, y):
    """
    A term used in updating the weights for the softmax gradient descents

    Parameters
    ----------
    x : array
        Feature vectors. shape (n x d)
    y : array
        Binary labels. shape (n x nClass). y[i,j] = 1 for data[i] belonging
        to class[j], else it equals 0.
    """
    A = utils.columnVector(1 - y[:, -1])
    B = y[:, 0:-1]
    C = A * B
    return np.dot(x.T, C)
def pred(self, x):
    """
    Predict the class of x
    """
    if (self.arrayBatchSize is not None) and (len(x) > self.arrayBatchSize):
        # Calculate y_predicted in batches
        ypred = np.zeros(len(x))
        for slicer in utils.arrayBatch(x, self.arrayBatchSize):
            ypred[slicer] = self.pred(x[slicer]).flatten()
    else:
        h = self.featureMap(x)
        ypred = self.sgd.classifier(h, self.results['w'])
    return utils.columnVector(ypred)
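# Illustrative sketch (not part of the original API): the batching pattern
# used in pred above, with a hypothetical stand-in for utils.arrayBatch
# (assumed, from its use here, to yield slice objects covering the rows of x;
# the real helper may differ). numpy is assumed imported as np.
def _batched_apply_example():
    def array_batch(x, batchSize):
        for start in range(0, len(x), batchSize):
            yield slice(start, start + batchSize)

    x = np.arange(20).reshape(10, 2)
    out = np.zeros(len(x))
    for slicer in array_batch(x, 3):
        out[slicer] = x[slicer].sum(1)   # stand-in for the per-batch prediction
    assert np.allclose(out, x.sum(1))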
def linBinSquaredLoss(x, y, w):
    """
    Mean squared loss for a linear model: mean((x.dot(w) - y)**2)
    """
    ypred = utils.columnVector(np.dot(x, w))
    L = ((ypred - y)**2).mean()
    return L
def fit(self, x, y, regScale=None, threshold=None, dataWeights=None):
    """
    Perform a linear (ridge) regression

    Parameters
    ----------
    x : array
        Training features, shape (n, d)
    y : array
        Labels. shape (n, 1)
    regScale : float
        Amount to scale regularization by
    dataWeights : array
        Optional per-data-point weights, shape (n, 1)

    Saves
    -----
    w : weights
        Approximate solution to the equation y = x.dot(w)
    """
    # Initialize
    y = utils.columnVector(y)
    self._xraw = x
    if regScale is not None:
        self.regScale = regScale
    x, center, scale = utils.condition(x)
    self.x = x
    self._center = center
    self._scale = scale
    self.y = y

    # Apply weights to data points
    if dataWeights is not None:
        dataWeights = utils.columnVector(dataWeights)
        sqrtWeights = np.sqrt(dataWeights)
        x = x * sqrtWeights
        y = y * sqrtWeights
    self.dataWeights = dataWeights

    self.regularization = self.regScale / (x**2).mean()

    # Perform the regression: w = (x.T x + reg*I)^{-1} x.T y
    M = np.dot(x.T, x)
    ind = np.arange(len(M))
    M[ind, ind] += self.regularization
    xTy = np.dot(x.T, y)
    self.w = np.dot(np.linalg.inv(M), xTy)
    if np.any(np.isnan(self.w)):
        raise RuntimeError("NaN encountered in w in fit")

    # Get scalar offset
    if self.scalarOffset:
        self.w0 = (y - np.dot(x, self.w)).mean()
    else:
        self.w0 = 0.

    if self.classify:
        # Set up the classifier (using the un-weighted data)
        self.classifier = LinearBinaryClassifier(self.y, self.ypred(self.x))
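# Illustrative sketch (not part of the original API): the closed-form ridge
# solution used in fit, w = (x.T x + reg * I)^{-1} x.T y, on synthetic data.
# The function name is hypothetical; numpy is assumed imported as np. Here
# np.linalg.solve is used instead of an explicit inverse, which is generally
# the more numerically stable choice.
def _ridge_example():
    np.random.seed(0)
    x = np.random.randn(50, 4)
    trueW = np.array([[1.0], [-2.0], [0.5], [3.0]])
    y = np.dot(x, trueW) + 0.1 * np.random.randn(50, 1)
    reg = 1e-3
    M = np.dot(x.T, x) + reg * np.eye(x.shape[1])
    w = np.linalg.solve(M, np.dot(x.T, y))
    return w  # close to trueW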