def set_activation(self, method, sig=None, d_sig=None, sig_0=None, d_sig_0=None):
    """
    This method sets the activation functions.

    Parameters
    ----------
    method : str
        'logistic', 'htangent', or 'custom'
    sig, d_sig, sig_0, and d_sig_0 : function objects
        Optional arguments intended for use with the 'custom' method option.
        They should be functions. sig_0 and d_sig_0 are the output layer
        activation function and its derivative.
    """
    method = method.lower()

    if method == 'logistic':
        self.sig = lambda z: twod(1 / (1 + np.exp(-z)))
        self.d_sig = lambda z: twod(np.multiply(self.sig(z), (1 - self.sig(z))))
        self.sig_0 = self.sig
        self.d_sig_0 = self.d_sig
    elif method == 'htangent':
        self.sig = lambda z: twod(np.tanh(z))
        self.d_sig = lambda z: twod(1 - np.power(np.tanh(z), 2))
        self.sig_0 = self.sig
        self.d_sig_0 = self.d_sig
    elif method == 'custom':
        self.sig = sig
        self.d_sig = d_sig
        self.sig_0 = sig_0
        self.d_sig_0 = d_sig_0
    else:
        raise ValueError('NNetClassify.set_activation: ' + str(method) + ' is not a valid option for method')

    self.activation = method
def train(self, base, n, X, Y, *args, **kargs): """ Learn n new instances of base class. Refer to constructor docstring for descriptions of arguments. """ self.base = base N,D = twod(X).shape n_init = self.n_use step = 1 if n_init == 0: # skip training, use constant predictor; set to local var self.const = np.mean(Y) # (specialized to quadratic loss) y_hat = np.zeros(N) + self.const # figure out current prediction value for i in range(n_init): # if we already have learners... yi = self[i].predict(X).flatten() # ...figure out prediction for the y_hat += (self.alpha[i] * yi) # training data for i in range(n_init, n_init + n): Ri = (Y - y_hat) + 1e-64 # compute residuals (specialized to quadratic loss) self.ensemble.append(base(X, Ri, *args, **kargs)) # fit a model to the gradient residual yi = self[-1].predict(X) # minimize loss over alpha (specialized to quadratic loss) min_loss = step * np.divide((Ri.dot(yi)), (twod(yi).T.dot(yi))) self.alpha.append(min_loss.flatten()[0]) y_hat = (twod(y_hat).T + self.alpha[-1] * yi).flatten() self.n_use += 1
def fsvd(X, K, T=None):
    """
    Reduce the dimensionality of X to K features using singular value decomposition.

    Parameters
    ----------
    X : numpy array
        M x N array of data.
    K : int
        Number of desired output features.
    T : numpy array (optional)
        Transform matrix. If T is given, it is used instead of computing a new SVD.

    Returns
    -------
    Xsvd : numpy array
        M x K array of data coefficients.
    T : numpy array (optional)
        K x N transform matrix.
    """
    n, m = twod(X).shape

    if type(T) is type(None):
        U, S, V = np.linalg.svd(X, full_matrices=False)   # compute SVD (Ihler uses svds here)
        U = U[:, :K]
        S = np.diag(S[:K])
        V = V.T[:, :K]
        Xsvd = U.dot(np.sqrt(S))                          # new data coefficients
        T = np.sqrt(S[0:K, 0:K]).dot(twod(V).T)           # new bases for data
        return (Xsvd, T)

    Xsvd = X.dot(np.linalg.pinv(T))                       # or, project onto the given bases (cf. Matlab's X/T)
    return Xsvd, T
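# Illustrative usage (a sketch, not part of the library): the transform T learned
# on training data can be reused to map new data into the same K-dimensional space.
# The names Xtr and Xte below are placeholders:
#
#   import numpy as np
#   Xtr = np.random.randn(100, 10)        # 100 points, 10 features
#   Xte = np.random.randn(20, 10)         # new data with the same features
#   Xtr2, T = fsvd(Xtr, K=2)              # learn a rank-2 representation
#   Xte2, _ = fsvd(Xte, 2, T=T)           # reuse the same transform on test data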
def setActivation(self, method, sig=None, sig0=None):
    """
    This method sets the activation functions.

    Parameters
    ----------
    method : string, {'logistic', 'htangent', 'custom'} -- which activation type
    Optional arguments for "custom" activation:
    sig : function object F(z) returns activation function & its derivative at z (as a tuple)
    sig0 : activation function object F(z) for final layer of the nnet
    """
    raise NotImplementedError  # unfinished / untested

    method = method.lower()

    if method == 'logistic':
        self.sig = lambda z: twod(1 / (1 + np.exp(-z)))
        self.d_sig = lambda z: twod(np.multiply(self.sig(z), (1 - self.sig(z))))
        self.sig_0 = self.sig
        self.d_sig_0 = self.d_sig
    elif method == 'htangent':
        self.sig = lambda z: twod(np.tanh(z))
        self.d_sig = lambda z: twod(1 - np.power(np.tanh(z), 2))
        self.sig_0 = self.sig
        self.d_sig_0 = self.d_sig
    elif method == 'custom':
        # use the provided function objects; per the docstring, sig(z) returns both the
        # activation and its derivative, so splitting them into d_sig remains unfinished
        self.sig = sig
        self.sig_0 = sig0
    else:
        raise ValueError('nnetRegress.setActivation: ' + str(method) + ' is not a valid option for method')

    self.activation = method
def fkitchensink(X, K, typ, W=None): """ Random kitchen sink features from data. Selects K random "kitchen sink" features of X. Parameters ---------- X : numpy array M x N numpy array containing data. K : int Number of features to select. typ : str One of: 'stump', 'sigmoid', 'sinuoid', or 'linear'. W : numpy array (optional) N x K array of parameters. If provided, W uses fixed params. Returns ------- Z : numpy array M x K array of features selected from X. W : numpy array (optional) N x K array of random parameters. Only returned if the argument W isn't provided. """ to_return = () N, M = twod(X).shape typ = typ.lower() if type(W) is type(None): # numpy complains about truth value of arrays if typ == "stump": W = np.zeros((2, K)) s = np.sqrt(np.var(X, axis=0)) # random feature index 1..M W[0, :] = np.floor(np.random.rand(K) * M) W = W.astype(int) W[0, :] = W[0, :].astype(int) W[1, :] = np.random.randn(K) * s[W[0, :]] # random threshold (w/ same variance as that feature) elif typ in ["sigmoid", "sinusoid", "linear"]: # random direction for sigmodal ridge, random freq for sinusoids, random linear projections W = np.random.randn(M, K) to_return = (W,) Z = np.zeros((N, K)) if typ == "stump": # decision stump w/ random threshold for i in range(K): Z[:, i] = X[:, W[0, i]] >= W[1, i] elif typ == "sigmoid": # sigmoidal ridge w/ random direction Z = twod(X).dot(W) Z = 1 / (1 + np.exp(Z)) elif typ == "sinusoid": # sinusoid w/ random frequency Z = np.sin(twod(X).dot(W)) elif typ == "linear": # straight linear projection Z = twod(X).dot(W) return Z if len(to_return) == 0 else (Z,) + to_return
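# Illustrative usage (a sketch, not part of the library): draw K random features on
# training data, then reuse the same parameters W on test data so both sets share
# the same feature space (Xtr and Xte below are placeholders):
#
#   import numpy as np
#   Xtr, Xte = np.random.randn(100, 5), np.random.randn(20, 5)
#   Ztr, W = fkitchensink(Xtr, 50, 'sigmoid')       # 50 random sigmoidal features
#   Zte = fkitchensink(Xte, 50, 'sigmoid', W=W)     # same features on new data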
def log_likelihood(self, X, Y):
    """
    Compute the empirical average log-likelihood of 'obj' on test data (X,Y).
    See constructor doc string for argument descriptions.
    """
    r, c = twod(Y).shape
    if r == 1 and c != 1:
        Y = twod(Y).T

    soft = self.predict_soft(X)
    return np.mean(np.sum(np.log(np.power(soft, Y)), 1), 0)
def __logistic(self, X): """ This is a helper method that evaluates the logistic function for weights self.wts (1 x d + 1) on data X (n x d). Used in: __gradient_descent predict """ n,d = twod(X).shape X_train = cols((np.ones((n,1)), twod(X))) f = twod(X_train).dot(twod(self.wts).T) return 1 / (1 + np.exp(-f))
def plot_classify_2D(learner, X, Y, pre=lambda x: x): """ Plot data and classifier outputs on two-dimensional data. This function plot data (X,Y) and learner.predict(X, Y) together. The learner is is predicted on a dense grid covering data X, to show its decision boundary. Parameters ---------- learner : learner object A trained learner object that inherits from one of the 'Classify' or 'Regressor' base classes. X : numpy array N x M array of data; N = number of data, M = dimension (number of features) of data. Y : numpy array 1 x N array containing labels corresponding to data points in X. pre : function object (optional) Function that is applied to X before prediction. """ if twod(X).shape[1] != 2: raise ValueError('plot_classify_2d: function can only be called using two-dimensional data (features)') plt.plot(X[:,0], X[:,1], 'k.') ax = plt.xlim() + plt.ylim() # get current axis limits N = 256 # density of evaluation # evaluate each point of feature space and predict the class X1 = np.linspace(ax[0], ax[1], N) X1sp = twod(X1).T * np.ones(N) X2 = np.linspace(ax[2], ax[3], N) X2sp = np.ones((N,1)) * X2 Xfeat = cols((twod(X1sp.flatten()).T, twod(X2sp.flatten()).T)) # preprocess/create feature vector if necessary Xfeat = pre(Xfeat) # predict using learner pred = learner.predict(Xfeat) # plot decision values for space in 'faded' color clim = np.unique(Y) clim = [clim[0], clim[0] + 1] if len(clim) == 1 else list(clim) plt.imshow(np.reshape(pred, (N,N)).T, extent=[X1[0], X1[-1], X2[0], X2[-1]], cmap=plt.cm.Pastel2) plt.clim(*clim) plt.show()
def flda(X, Y, K, T=None): """ Reduce the dimension of X to K features using (multiclass) discriminant analysis. Parameters ---------- X : numpy array M x N array of data. Y : numpy array M x 1 array of labels corresponding to data in X. K : int New dimension (number of features) of X. T : numpy array (optional) The transform matrix. If this argument is provided, function uses T instead of computing the LDA. Returns ------- Xlda : numpy array T : numpy array (optional) TODO: Test; check/test Matlab version? """ if type(T) is not type(None): return np.divide(X, T) n, m = twod(X).shape c = np.unique(Y) nc = np.zeros(len(c)) mu = np.zeros((len(c), n)) sig = np.zeros((len(c), n, n)) for i in range(len(c)): idx = np.where(Y == c[i])[0] nc[i] = len(idx) mu[i, :] = np.mean(X[:, idx], axis=0) sig[i, :, :] = np.cov(X[:, idx]) S = (nc / n).dot(np.reshape(sig, (len(c), n * n))) S = np.reshape(S, (n, n)) U, S, V = np.linalg.svd(X, K) # compute SVD (Ihler uses svds here) Xlda = U.dot(np.sqrt(S)) # new data coefficients T = np.sqrt(S[0:K, 0:K]).dot(twod(V).T) # new bases for data return Xlda, T
def bootstrapData(X, Y=None, n_boot=None):
    """
    Bootstrap resample a data set (with replacement):
    draw data points (x_i,y_i) from (X,Y) n_boot times.

    Parameters
    ----------
    X : MxN numpy array of data points to be resampled.
    Y : Mx1 numpy array of labels associated with each datum (optional)
    n_boot : int, number of samples to draw (default: M)

    Returns
    -------
    Xboot, Yboot : (tuple of) numpy arrays for the resampled data set
    If Y is not present or None, returns only Xboot (non-tuple)
    """
    nx, dx = twod(X).shape
    if n_boot is None:
        n_boot = nx
    idx = np.floor(np.random.rand(n_boot) * nx).astype(int)
    X = X[idx, :]

    if Y is None:                 # no labels given: return resampled data only
        return X

    Y = Y.flatten()
    ny = len(Y)
    assert ny > 0, 'bootstrapData: Y must contain data'
    assert nx == ny, 'bootstrapData: X and Y should have the same length'
    Y = Y[idx]

    return (X, Y)
def shuffle_data(X, Y): """ Shuffle data in X and Y. Parameters ---------- X : numpy array N x M array of data to shuffle. Y : numpy arra 1 x N array of labels that correspond to data in X. Returns ------- X or (X,Y) : numpy array or tuple of arrays Shuffled data (only returns X and Y if Y contains data). TODO: test more """ nx,dx = twod(X).shape Y = arr(Y).flatten() ny = len(Y) pi = np.random.permutation(nx) X = X[pi,:] if ny > 0: assert ny == nx, 'shuffle_data: X and Y must have the same length' Y = Y[pi] return X,Y return X
def bootstrap_data(X, Y, n_boot): """ Function that resamples (bootstrap) data set: it resamples data points (x_i,y_i) with replacement n_boot times. Parameters ---------- X : numpy array N x M numpy array that contains data points to be sampled. Y : numpy array 1 x N numpy arra that contains labels that map to data points in X. n_boot : int The number of samples to take. Returns ------- (array,array) Tuple containing samples from X and Y. TODO: test more """ Y = Y.flatten() nx,dx = twod(X).shape idx = np.floor(np.random.rand(n_boot) * nx).astype(int) X = X[idx,:] ny = len(Y) assert ny > 0, 'bootstrap_data: Y must contain data' assert nx == ny, 'bootstrap_data: X and Y should have the same length' Y = Y[idx] return (X,Y)
def splitData(X, Y=None, train_fraction=0.80): """ Split data into training and test data. Parameters ---------- X : MxN numpy array of data to split Y : Mx1 numpy array of associated target values train_fraction : float, fraction of data used for training (default 80%) Returns ------- to_return : (Xtr,Xte,Ytr,Yte) or (Xtr,Xte) A tuple containing the following arrays (in order): training data from X, testing data from X, training labels from Y (if Y contains data), and testing labels from Y (if Y contains data). """ nx,dx = twod(X).shape ne = round(train_fraction * nx) Xtr,Xte = X[:ne,:], X[ne:,:] to_return = (Xtr,Xte) if Y is not None: Y = arr(Y).flatten() ny = len(Y) if ny > 0: assert ny == nx, 'splitData: X and Y must have the same length' Ytr,Yte = Y[:ne], Y[ne:] to_return += (Ytr,Yte) return to_return
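# Illustrative usage (a sketch, not part of the library): splitData does not shuffle,
# so randomize the order first if the data are sorted (X and Y below are placeholders):
#
#   X, Y = shuffleData(X, Y)                      # optional: randomize order first
#   Xtr, Xte, Ytr, Yte = splitData(X, Y, 0.75)    # 75% train / 25% test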
def fsubset(X, K, feat=None): """ Select subset of features from data. Selects a fixed or random subset of K features from X. Parameters ---------- X : numpy array M x N array of data. K : int Number of features in output. feat : array like (optional) Flat array of indices specifying which features to select. Returns ------- X_sub : numpy array M x K numpy array of data. feat : numpy array (optional) 1 x N array of indices of selected features. Only returned if feat argument isn't provided. """ n, m = twod(X).shape to_return = () if type(feat) is type(None): feat = np.random.permutation(m) feat = feat[0:K] to_return = (feat,) X_sub = X[:, feat] return X_sub if len(to_return) == 0 else (X_sub,) + to_return
def shuffleData(X, Y=None):
    """
    Shuffle (randomly reorder) data in X and Y.

    Parameters
    ----------
    X : MxN numpy array: N feature values for each of M data points
    Y : Mx1 numpy array (optional): target values associated with each data point

    Returns
    -------
    X,Y : (tuple of) numpy arrays of shuffled features and targets
    only returns X (not a tuple) if Y is not present or None

    Ex:
    X2    = shuffleData(X)   : shuffles the rows of the data matrix X
    X2,Y2 = shuffleData(X,Y) : shuffles rows of X,Y, preserving correspondence
    """
    nx, dx = twod(X).shape
    pi = np.random.permutation(nx)
    X = X[pi, :]

    if Y is None:                 # no targets given: return shuffled data only
        return X

    Y = arr(Y)
    ny = Y.shape[0]
    if ny > 0:
        assert ny == nx, 'shuffleData: X and Y must have the same length'
        Y = Y[pi] if Y.ndim <= 1 else Y[pi, :]
        return X, Y

    return X
def fproject(X, K, proj=None): """ Random projection of features from data. Selects a fixed or random linear projection of K features from X. Parameters ---------- X : numpy array M x N array of data. K : int Number of features in output. proj : numpy array (optional) The projection matrix. If this argument is provided, function uses proj instead of random matrix. Returns ------- X : numpy array M x K array of projecjtion of data in X. proj : numpy array (optional) N x K numpy array that is the project matrix. Only returned if proj argument isn't provided. """ n, m = twod(X).shape to_return = () if type(proj) is type(None): proj = np.random.randn(m, K) to_return = (proj,) X2 = X.dot(proj) return X2 if len(to_return) == 0 else (X2,) + to_return
def fpoly_mono(X, degree, bias=True): """ Create polynomial features of each individual feature (no cross products). Parameters ---------- X : MxN numpy array of data (each row one data point) degree : int, the polynomial degree bias : bool, include constant feature if true (default) Returns ------- Xext : MxN' numpy array with each data point's higher order features """ m, n = twod(X).shape if bias: Xext = np.zeros((m, n * degree + 1)) Xext[:, 0] = 1 k = 1 else: Xext = np.zeros((m, n * degree)) k = 0 for p in range(degree): for j in range(n): Xext[:, k] = np.power(X[:, j], p + 1) k += 1 return Xext
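# Illustrative sketch (not part of the library): for n features and degree d, the
# output has n*d columns, plus one constant column when bias=True.  For example,
# two features expanded to degree 3 give 2*3 + 1 = 7 columns:
#
#   import numpy as np
#   X = np.array([[1., 2.], [3., 4.]])
#   Xext = fpoly_mono(X, 3)     # columns: [1, x1, x2, x1^2, x2^2, x1^3, x2^3]
#   # Xext.shape == (2, 7)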
def kmeans(X, K, init='random', max_iter=100, do_plot=False, to_return=[1, 0, 0]): """ Perform K-means clustering on data X. Parameters ---------- X : numpy array N x M array containing data to be clustered. K : int Number of clusters. init : str or array (optional) Either a K x N numpy array containing initial clusters, or one of the following strings that specifies a cluster init method: 'random' (K random data points (uniformly) as clusters), 'farthest' (choose cluster 1 uniformly, then the point farthest from all cluster so far, etc.), or 'k++' (choose cluster 1 uniformly, then points randomly proportional to distance from current clusters). max_iter : int (optional) Maximum number of optimization iterations. do_plot : bool (optional) Plot 2D data? to_return : [bool] (optional) Array of bools that specifies which values to return. The bool at to_return[0] indicates whether z should be returned; the bool at to_return[1] indicates whether c should be returned, etc. Returns ------- z : numpy array N x 1 array containing cluster numbers of data at indices in X. c : numpy array (optional) K x M array of cluster centers. sumd : scalar (optional) Sum of squared euclidean distances. TODO: test more """ n,d = twod(X).shape # get data size if type(init) is str: init = init.lower() if init == 'random': pi = np.random.permutation(n) c = X[pi[0:K],:] elif init == 'farthest': c = k_init(X, K, True) elif init == 'k++': c = k_init(X, K, False) else: raise ValueError('kmeans: value for "init" ( ' + init + ') is invalid') else: c = init z,c,sumd = __optimize(X, n, K, c, max_iter) return optional_return(to_return, z - 1, c, sumd)
def checkDataShape(X,Y): """ Simple helper function to convert vectors to matrices and check the shape of the data matrices X,Y """ X = twod(X).T if X.ndim < 2 else X #Y = twod(Y).T if Y.ndim < 2 else Y if X.shape[0] != Y.shape[0]: raise ValueError("X and Y do not have the same number of data points!") return X,Y
def data_gauss(N0, N1=None, mu0=arr([0, 0]), mu1=arr([1, 1]), sig0=np.eye(2), sig1=np.eye(2)): """ Sample data from a Gaussian model. Parameters ---------- N0 : int Number of data to sample for class -1. N1 : int Number of data to sample for class 1. mu0 : numpy array mu1 : numpy array sig0 : numpy array sig1 : numpy array Returns ------- X : numpy array Array of sampled data. Y : numpy array Array of class values that correspond to the data points in X. TODO: test more """ if not N1: N1 = N0 d1,d2 = twod(mu0).shape[1],twod(mu1).shape[1] if d1 != d2 or np.any(twod(sig0).shape != arr([d1, d1])) or np.any(twod(sig1).shape != arr([d1, d1])): raise ValueError('data_gauss: dimensions should agree') X0 = np.dot(np.random.randn(N0, d1), sqrtm(sig0)) X0 += np.ones((N0,1)) * mu0 Y0 = -np.ones(N0) X1 = np.dot(np.random.randn(N1, d1), sqrtm(sig1)) X1 += np.ones((N1,1)) * mu1 Y1 = np.ones(N1) X = np.row_stack((X0,X1)) Y = np.concatenate((Y0,Y1)) return X,Y
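# Illustrative usage (a sketch, not part of the library): draw a simple two-class
# problem (labels -1 / +1) from two Gaussians with the default identity covariances:
#
#   import numpy as np
#   X, Y = data_gauss(100)                                               # 100 points per class
#   X, Y = data_gauss(100, 50, mu0=np.array([0, 0]), mu1=np.array([2, 2]))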
def plotGauss2D(mu,cov,*args,**kwargs): """ Plot an ellipsoid indicating (one std deviation of) a 2D Gaussian distribution All additional arguments are passed into plot(.) """ from scipy.linalg import sqrtm theta = np.linspace(0,2*np.pi,50) circle = np.array([np.sin(theta),np.cos(theta)]) ell = sqrtm(cov).dot(circle) ell += twod(mu).T plt.plot( mu[0],mu[1], 'x', ell[0,:],ell[1,:], **kwargs)
def predict(self, X): """ Predict on X. Refer to constructor docstring for description of X. """ N,D = twod(X).shape Y_te = np.zeros((N,1)) + self.const for i,l in enumerate(self): yi = l.predict(X) # figure out current prediction value # if we already have learners, figure out the prediction on the training data Y_te = Y_te + self.alpha[i] * yi return Y_te
def roc(self, X, Y): """ This method computes the "receiver operating characteristic" curve on test data. This method is only defined for binary classifiers. Refer to the auc doc string for descriptions of X and Y. Method returns (fpr, tpr, tnr), where fpr = false positive rate (1xN numpy vector) tpr = true positive rate (1xN numpy vector) tnr = true negative rate (1xN numpy vector) Plot fpr vs. tpr to see the ROC curve. Plot tpr vs. tnr to see the sensitivity/specificity curve. """ if len(self.classes) > 2: raise ValueError('This method can only supports binary classification ') try: # compute 'response' (soft binary classification score) soft = self.predictSoft(X)[:,1] # p(class = 2nd) except (AttributeError, IndexError): soft = self.predict(X) # or we can use 'hard' binary prediction if soft is unavailable n,d = twod(soft).shape if n == 1: soft = soft.flatten() else: soft = soft.T.flatten() # number of true negatives and positives n0 = np.sum(Y == self.classes[0]) n1 = np.sum(Y == self.classes[1]) if n0 == 0 or n1 == 0: raise ValueError('Data of both class values not found') # sort data by score value sorted_soft = np.sort(soft) indices = np.argsort(soft) Y = Y[indices] # compute false positives and true positive rates tpr = np.divide(np.cumsum(Y[::-1] == self.classes[1]), n1) fpr = np.divide(np.cumsum(Y[::-1] == self.classes[0]), n0) tnr = np.divide(np.cumsum(Y == self.classes[0]), n0)[::-1] # find ties in the sorting score same = np.append(np.asarray(sorted_soft[0:-1] == sorted_soft[1:]), 0) tpr = np.append([0], tpr[np.logical_not(same)]) fpr = np.append([0], fpr[np.logical_not(same)]) tnr = np.append([1], tnr[np.logical_not(same)]) return [tpr, fpr, tnr]
def plotClassify2D(learner, X, Y, pre=lambda x: x, axis=None, nGrid=128, **kwargs): """ Plot data and classifier outputs on two-dimensional data. This function plot data (X,Y) and learner.predict(X, Y) together. The learner is is predicted on a dense grid covering data X, to show its decision boundary. Parameters ---------- learner : learner object A trained learner object that inherits from one of the 'Classify' or 'Regressor' base classes. X : numpy array N x M array of data; N = number of data, M = dimension (number of features) of data. Y : numpy array 1 x N arra containing labels corresponding to data points in X. pre : function object (optional) Function that is applied to X before prediction. axis : a matplotlib axis / plottable object (optional) nGrid : density of 2D grid points (default 128) """ if twod(X).shape[1] != 2: raise ValueError('plotClassify2D: function can only be called using two-dimensional data (features)') # TODO: Clean up code if axis == None: axis = plt axis.plot( X[:,0],X[:,1], 'k.', visible=False ) # TODO: can probably replace with final dot plot and use transparency for image (?) ax = axis.axis() xticks = np.linspace(ax[0],ax[1],nGrid) yticks = np.linspace(ax[2],ax[3],nGrid) grid = np.meshgrid( xticks, yticks ) XGrid = np.column_stack( (grid[0].flatten(), grid[1].flatten()) ) if learner is not None: YGrid = learner.predict( pre(XGrid) ) #axis.contourf( xticks,yticks,YGrid.reshape( (len(xticks),len(yticks)) ), nClasses ) axis.imshow( YGrid.reshape( (len(xticks),len(yticks)) ), extent=axis.axis(), interpolation='nearest',origin='lower',alpha=0.5 ) cmap = plt.cm.get_cmap() # TODO: if Soft: predictSoft; get colors for each class from cmap; blend pred with colors & show # classes = np.unique(Y) cvals = (classes - min(classes))/(max(classes)-min(classes)+1e-100) for i,c in enumerate(classes): axis.plot( X[Y==c,0],X[Y==c,1], 'ko', color=cmap(cvals[i]), **kwargs )
def auc(self, X, Y): """ This method computes the area under the roc curve on the given test data. This method only works on binary classifiers. Paramters --------- X : N x M numpy array N = number of data points; M = number of features. Y : 1 x N numpy array Array of classes that refer to the data points in X. """ if len(self.classes) > 2: raise ValueError('This method can only supports binary classification ') try: # compute 'response' (soft binary classification score) soft = self.predict_soft(X)[:,1] # p(class = 2nd) except (AttributeError, IndexError): # or we can use 'hard' binary prediction if soft is unavailable soft = self.predict(X) n,d = twod(soft).shape if n == 1: soft = soft.flatten() else: soft = soft.T.flatten() sorted_soft = np.sort(soft) # sort data by score value indices = np.argsort(soft) Y = Y[indices] # find ties in the sorting score same = np.append(np.asarray(sorted_soft[0:-1] == sorted_soft[1:]), 0) n = len(soft) rnk = self.__compute_ties(n, same) # compute tied rank values # number of true negatives and positives n0 = sum(Y == self.classes[0]) n1 = sum(Y == self.classes[1]) if n0 == 0 or n1 == 0: raise ValueError('Data of both class values not found') # compute AUC using Mann-Whitney U statistic result = (np.sum(rnk[Y == self.classes[1]]) - n1 * (n1 + 1) / 2) / n1 / n0 return result
def from1ofK(Y, values=None): """ Function that converts Y from 1-of-K ("1-hot") rep back to single col/row form. Parameters ---------- Y : arraylike Matrix to convert from 1-of-k rep. values : list (optional) List that specifies which values to use for which index. Returns ------- array Y in single row/col form. """ return Y.argmax(1) if not values else twod([values[i] for i in Y.argmax(1)]).T
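# Illustrative sketch (not part of the library): from1ofK simply takes the argmax of
# each row, optionally mapping the index back through `values`:
#
#   import numpy as np
#   Y1hot = np.array([[0, 1, 0], [1, 0, 0], [0, 0, 1]])
#   from1ofK(Y1hot)                      # -> array([1, 0, 2])
#   from1ofK(Y1hot, values=[2, 5, 9])    # -> column vector [[5], [2], [9]]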
def train(self, X, Y, initStep=1.0, stopTol=1e-4, stopIter=5000, plot=None): """ Train the logistic regression using stochastic gradient descent """ ## First do some bookkeeping and setup: self.theta,X,Y = twod(self.theta), arr(X), arr(Y) # convert to numpy arrays M,N = X.shape if Y.shape[0] != M: raise ValueError("Y must have the same number of data (rows) as X") self.classes = np.unique(Y) if len(self.classes) != 2: raise ValueError("Y should have exactly two classes (binary problem expected)") if self.theta.shape[1] != N+1: # if self.theta is empty, initialize it! self.theta = np.random.randn(1,N+1) # Some useful modifications of the data matrices: X1 = np.hstack((np.ones((M,1)),X)) # make data array with constant feature Y01 = toIndex(Y, self.classes) # convert Y to canonical "0 vs 1" classes it = 0 done = False Jsur = [] J01 = [] while not done: step = (2.0 * initStep) / (2.0 + it) # common 1/iter step size change for i in range(M): # for each data point i: ## TODO: compute zi = linear response of X[i,:] ## TODO: compute prediction yi ## TODO: compute soft response si = logistic( zi ) ## TODO: compute gradient of logistic loss wrt data point i: # Take a step down the gradient self.theta = self.theta - step * gradi # each pass, compute surrogate loss & error rates: J01.append( self.err(X,Y) ) ## TODO: compute surrogate loss (logistic negative log-likelihood) ## Jsur = sum_i [ (si log si) if yi==1 else ((1-si)log(1-si)) ] Jsur.append( NotImplemented ) ## TODO ... ## For debugging: print current parameters & losses # print self.theta, ' => ', Jsur[-1], ' / ', J01[-1] # raw_input() # pause for keystroke # check stopping criteria: it += 1 done = (it > stopIter) or ( (it>1) and (abs(Jsur[-1]-Jsur[-2])<stopTol) )
def from1ofK(Y, values=None): """ Function that converts Y from 1-of-K ("1-hot") rep back to single col/row form. Parameters ---------- Y : arraylike Matrix to convert from 1-of-k rep. values : list (optional) List that specifies which values to use for which index. Returns ------- array Y in single row/col form. """ return Y.argmax(1) if not values else twod( [values[i] for i in Y.argmax(1)]).T
def cross_validate(X, Y, n_folds, i_fold): """ Function that splits data for n-fold cross validation. Parameters ---------- X : numpy array N x M numpy array that contains data points. Y : numpy array 1 x N numpy array that contains labels that correspond to data points in X. n_folds : int Total number of data folds. i_fold : int The fold for which the current call of cross_validate will partition. Returns ------- to_return : (Xtr,Xte,Ytr,Yte) Tuple that contains (in this order) training data from X, testing data from X, training labels from Y, and testing labels from Y. """ Y = arr(Y).flatten() nx, dx = twod(X).shape ny = len(Y) idx = range(nx) if ny > 0: assert nx == ny, 'cross_validate: X and Y must have the same length' n = np.fix(nx / n_folds) te_start = int((i_fold - 1) * n) te_end = int((i_fold * n)) if (i_fold * n) <= nx else int(nx) test_range = list(range(te_start, te_end)) train_range = sorted(set(idx) - set(test_range)) to_return = (X[train_range, :], X[test_range, :]) if ny > 0: to_return += (Y[train_range], Y[test_range]) return to_return
def k_init(X, K, determ): """ Distance based initialization. Randomly choose a start point, then: if determ == True: choose point farthest from the clusters chosen so far, otherwise: randomly choose new points proportionally to their distance. Parameters ---------- X : numpy array See kmeans docstring. K : int See kmeans docstring. determ : bool See description. Returns ------- c : numpy array K x M array of cluster centers. """ m,n = twod(X).shape clusters = np.zeros((K,n)) clusters[0,:] = X[np.floor(np.random.rand() * m),:] # take random point as first cluster dist = np.sum(np.power((X - np.ones((m,1)) * clusters[0,:]), 2), axis=1).ravel() #print 'dist:',dist for i in range(1,K): #print dist #print np.argmax(dist) if determ: j = np.argmax(dist) # choose farthest point... else: pr = np.cumsum(np.array(dist)); # ...or choose a random point by distance pr = pr / pr[-1] j = np.where(np.random.rand() < pr)[0][0] clusters[i,:] = X[j,:] # update that cluster # update min distances new_dist = np.sum(np.power((X - np.ones((m,1)) * clusters[i,:]), 2), axis=1).ravel() dist = np.minimum(dist, new_dist) #print "dist",dist return clusters
def cross_validate(X, Y, n_folds, i_fold): """ Function that splits data for n-fold cross validation. Parameters ---------- X : numpy array N x M numpy array that contains data points. Y : numpy array 1 x N numpy array that contains labels that correspond to data points in X. n_folds : int Total number of data folds. i_fold : int The fold for which the current call of cross_validate will partition. Returns ------- to_return : (Xtr,Xte,Ytr,Yte) Tuple that contains (in this order) training data from X, testing data from X, training labels from Y, and testing labels from Y. """ Y = arr(Y).flatten() nx, dx = twod(X).shape ny = len(Y) idx = range(nx) if ny > 0: assert nx == ny, "cross_validate: X and Y must have the same length" n = np.fix(nx / n_folds) te_start = int((i_fold - 1) * n) te_end = int((i_fold * n)) if (i_fold * n) <= nx else int(nx) test_range = list(range(te_start, te_end)) train_range = sorted(set(idx) - set(test_range)) to_return = (X[train_range, :], X[test_range, :]) if ny > 0: to_return += (Y[train_range], Y[test_range]) return to_return
def k_init(X, K, determ):
    """
    Distance based initialization. Randomly choose a start point, then:
    if determ == True: choose point farthest from the clusters chosen so far,
    otherwise: randomly choose new points proportionally to their distance.

    Parameters
    ----------
    X : numpy array
        See kmeans docstring.
    K : int
        See kmeans docstring.
    determ : bool
        See description.

    Returns
    -------
    c : numpy array
        K x M array of cluster centers.
    """
    m, n = twod(X).shape
    clusters = np.zeros((K, n))
    clusters[0, :] = X[int(np.floor(np.random.rand() * m)), :]    # take a random point as the first cluster
    dist = np.sum(np.power((X - np.ones((m, 1)) * clusters[0, :]), 2), axis=1).ravel()

    for i in range(1, K):
        if determ:
            j = np.argmax(dist)                   # choose farthest point...
        else:
            pr = np.cumsum(np.array(dist))        # ...or choose a random point by distance
            pr = pr / pr[-1]
            j = np.where(np.random.rand() < pr)[0][0]

        clusters[i, :] = X[j, :]                  # update that cluster

        # update min distances
        new_dist = np.sum(np.power((X - np.ones((m, 1)) * clusters[i, :]), 2), axis=1).ravel()
        dist = np.minimum(dist, new_dist)

    return clusters
def auc(self, X, Y):
    """
    This method computes the area under the ROC curve on the given test data.
    This method only works on binary classifiers.

    Parameters
    ----------
    X : M x N numpy array
        M = number of data points; N = number of features.
    Y : M x 1 numpy array
        Array of classes (targets) corresponding to the data points in X.
    """
    if len(self.classes) != 2:
        raise ValueError('This method only supports binary classification ')

    try:                                  # compute 'response' (soft binary classification score)
        soft = self.predictSoft(X)[:, 1]  # p(class = 2nd)
    except (AttributeError, IndexError):  # or we can use 'hard' binary prediction if soft is unavailable
        soft = self.predict(X)

    n, d = twod(soft).shape               # ensure soft is the correct shape
    soft = soft.flatten() if n == 1 else soft.T.flatten()

    indices = np.argsort(soft)            # sort data by score value
    Y = Y[indices]
    sorted_soft = soft[indices]

    # compute rank (averaged for ties) of sorted data
    dif = np.hstack(([True], np.diff(sorted_soft) != 0, [True]))
    r1 = np.argwhere(dif).flatten()
    r2 = r1[0:-1] + 0.5 * (r1[1:] - r1[0:-1]) + 0.5
    rnk = r2[np.cumsum(dif[:-1]) - 1]

    # number of true negatives and positives
    n0, n1 = sum(Y == self.classes[0]), sum(Y == self.classes[1])

    if n0 == 0 or n1 == 0:
        raise ValueError('Data of both class values not found')

    # compute AUC using Mann-Whitney U statistic
    result = (np.sum(rnk[Y == self.classes[1]]) - n1 * (n1 + 1.0) / 2.0) / n1 / n0
    return result
def auc(self, X, Y): """Compute the area under the roc curve on the given test data. Args: X (arr): M,N array of M data points with N features each Y (arr): M, or M,1 array of target class values for each data point Returns: float: Area under the ROC curve This method only works on binary classifiers. """ if len(self.classes) != 2: raise ValueError('This method can only supports binary classification ') try: # compute 'response' (soft binary classification score) soft = self.predictSoft(X)[:,1] # p(class = 2nd) except (AttributeError, IndexError): # or we can use 'hard' binary prediction if soft is unavailable soft = self.predict(X) n,d = twod(soft).shape # ensure soft is the correct shape soft = soft.flatten() if n==1 else soft.T.flatten() indices = np.argsort(soft) # sort data by score value Y = Y[indices] sorted_soft = soft[indices] # compute rank (averaged for ties) of sorted data dif = np.hstack( ([True],np.diff(sorted_soft)!=0,[True]) ) r1 = np.argwhere(dif).flatten() r2 = r1[0:-1] + 0.5*(r1[1:]-r1[0:-1]) + 0.5 rnk = r2[np.cumsum(dif[:-1])-1] # number of true negatives and positives n0,n1 = sum(Y == self.classes[0]), sum(Y == self.classes[1]) if n0 == 0 or n1 == 0: raise ValueError('Data of both class values not found') # compute AUC using Mann-Whitney U statistic result = (np.sum(rnk[Y == self.classes[1]]) - n1 * (n1 + 1.0) / 2.0) / n1 / n0 return result
def predictSoft(self, X):
    """
    This method makes a "soft" linear classification prediction on the data.
    Uses a (multi)-logistic function to convert linear response to [0,1] confidence.

    Parameters
    ----------
    X : M x N numpy array
        M = number of testing instances; N = number of features.
    """
    theta, X = twod(self.theta), arr(X)              # convert to numpy if needed
    resp = theta[:, 0].T + X.dot(theta[:, 1:].T)     # linear response (MxC)
    prob = np.exp(resp)

    if resp.shape[1] == 1:                           # binary classification (C=1)
        prob /= prob + 1.0                           # logistic transform (binary classification; C=1)
        prob = np.hstack((1 - prob, prob))           # make a column for each class
    else:
        prob /= np.sum(prob, axis=1, keepdims=True)  # normalize each row (for multi-class)

    return prob
def crossValidate(X, Y=None, K=5, i=0): """ Create a K-fold cross-validation split of a data set: crossValidate(X,Y, 5, i) : return the ith of 5 80/20 train/test splits Parameters ---------- X : MxN numpy array of data points to be resampled. Y : Mx1 numpy array of labels associated with each datum (optional) K : number of folds of cross-validation i : current fold to return (0...K-1) Returns ------- Xtr,Xva,Ytr,Yva : (tuple of) numpy arrays for the split data set If Y is not present or None, returns only Xtr,Xva """ nx, dx = twod(X).shape start = round(float(nx) * i / K) end = round(float(nx) * (i + 1) / K) Xte = X[start:end, :] Xtr = np.vstack((X[0:start, :], X[end:, :])) to_return = (Xtr, Xte) Y = arr(Y).flatten() ny = len(Y) if ny > 0: assert ny == nx, 'crossValidate: X and Y must have the same length' if Y.ndim <= 1: Yte = Y[start:end] Ytr = np.hstack((Y[0:start], Y[end:])) else: # in case targets are multivariate Yte = Y[start:end, :] Ytr = np.vstack((Y[0:start, :], Y[end:, :])) to_return += (Ytr, Yte) return to_return
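# Illustrative usage (a sketch, not part of the library): loop over all K folds,
# training and evaluating on each split (the fold index i is 0-based here; X, Y,
# and the model-fitting step are placeholders):
#
#   K = 5
#   for i in range(K):
#       Xtr, Xva, Ytr, Yva = crossValidate(X, Y, K, i)
#       # fit a model on (Xtr, Ytr) and evaluate it on (Xva, Yva) here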
def fpoly(X, degree, bias=True): """ Create expanded polynomial features of up to a given degree. Parameters ---------- X : MxN numpy array of data (each row one data point) degree : int, the polynomial degree bias : bool, include constant feature if true (default) Returns ------- Xext : MxN' numpy array with each data point's higher order features """ n, m = twod(X).shape if (degree + 1)**(m) > 1e7: err_string = 'fpoly: {}**{} = too many potential output features'.format( degree + 1, m) raise ValueError(err_string) if m == 1: # faster shortcut for scalar data p = arr(range(0, degree + 1)) Xext = np.power(np.tile(X, (1, len(p))), np.tile(p, (n, 1))) else: K = 0 for i in range((degree + 1)**(m)): powers = np.unravel_index(i, (degree + 1, ) * m) if sum(powers) > degree: continue K += 1 Xext = np.zeros((n, K)) k = 0 for i in range((degree + 1)**(m)): powers = np.unravel_index(i, (degree + 1, ) * m) if sum(powers) > degree: continue Xext[:, k] = np.prod(X**list(powers), axis=1) k += 1 return Xext if bias else Xext[:, 1:]
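# Illustrative sketch (not part of the library): fpoly keeps every monomial of total
# degree <= `degree`, so the number of columns grows combinatorially with the number
# of features.  With two features and degree 2 the kept monomials are 1, x1, x2,
# x1^2, x2^2, and x1*x2 (in the function's internal unravel_index ordering):
#
#   import numpy as np
#   X = np.array([[1., 2.], [3., 4.]])
#   Xext = fpoly(X, 2)               # Xext.shape == (2, 6)
#   Xext = fpoly(X, 2, bias=False)   # shape (2, 5); constant column dropped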
def bootstrapData(X, Y=None, n_boot=None): """ Bootstrap resample a data set (with replacement): draw data points (x_i,y_i) from (X,Y) n_boot times. Parameters ---------- X : MxN numpy array of data points to be resampled. Y : Mx1 numpy array of labels associated with each datum (optional) n_boot : int, number of samples to draw (default: M) Returns ------- Xboot, Yboot : (tuple of) numpy arrays for the resampled data set If Y is not present or None, returns only Xboot (non-tuple) """ nx, dx = twod(X).shape if n_boot is None: n_boot = nx idx = np.floor(np.random.rand(n_boot) * nx).astype(int) if Y is None: return X[idx, :] Y = Y.flatten() assert nx == len(Y), 'bootstrapData: X and Y should have the same length' return (X[idx, :], Y[idx])
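# Illustrative usage (a sketch, not part of the library): draw bootstrap replicates,
# e.g. to train each member of a bagged ensemble (X, Y, and the fitting step are
# placeholders):
#
#   n_bags = 25
#   for b in range(n_bags):
#       Xb, Yb = bootstrapData(X, Y)    # M samples drawn with replacement
#       # fit the b-th ensemble member on (Xb, Yb) here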
def fpoly_mono(X, degree, use_constant=True): """ Create polynomial features of each individual feature (no cross products). Parameters ---------- X : numpy array N x M array of data. degree : int The degree. use_constant : bool (optional) If True (default), include a constant feature. Returns ------- Xext : numpy array N x M * degree (+ 1 if use_constant) array of polynomial features from X. TODO: test more """ m, n = twod(X).shape if use_constant: Xext = np.zeros((m, n * degree + 1)) Xext[:, 0] = 1 k = 1 else: Xext = np.zeros((m, n * degree)) k = 0 for p in range(degree): for j in range(n): Xext[:, k] = np.power(X[:, j], p + 1) k += 1 return Xext
def gmm_draw_params(m, c, n=2, scale=.05): """Create a random Gaussian mixture model. Builds a random GMM with C components and draws M data x^{(i)} from a mixture of Gaussians in D dimensions Args: m (int): Number of data to be drawn from a mixture of Gaussians. c (int): Number of clusters. n (int): Number of dimensions. scale (float): relative scale of the inter- to intra-cluster variance (small = clumpy) Returns: tuple of tuples, (pi,mu,sig) = mixture weight, mean, and covariance of each component """ pi = np.zeros(c) for cc in range(c): pi[cc] = gamrand(10, 0.5) pi = pi / np.sum(pi) rho = np.random.rand(n, n) rho = rho + twod(rho).T rho = rho + n * np.eye(n) rho = sqrtm(rho) mu = np.random.randn(c, n).dot(rho) ccov = [] for i in range(c): tmp = np.random.rand(n, n) tmp = tmp + tmp.T tmp = scale * (tmp + n * np.eye(n)) ccov.append(tmp) #print(pi,mu,ccov) return tuple((pi[cc], mu[cc], ccov[cc]) for cc in range(c))
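# Illustrative sketch (not part of the library): the returned (pi, mu, sig) tuples
# can be used to sample data from the mixture, e.g. with numpy's multivariate normal:
#
#   import numpy as np
#   params = gmm_draw_params(m=500, c=3)                   # 3 components in 2-D
#   pis = np.array([p for p, _, _ in params])
#   zs = np.random.choice(len(params), size=500, p=pis)    # pick a component per point
#   X = np.array([np.random.multivariate_normal(params[z][1], params[z][2]) for z in zs])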
def split_data(X, Y, train_fraction): """ Split data into training and test data. Parameters ---------- X : numpy array N x M array of data to split. Y : numpy arra 1 x N array of labels that correspond to data in X. train_fraction : float Fraction of data to use for training. Returns ------- to_return : (Xtr,Xte,Ytr,Yte) or (Xtr,Xte) A tuple containing the following arrays (in order): training data from X, testing data from X, training labels from Y (if Y contains data), and testing labels from Y (if Y contains data). """ nx, dx = twod(X).shape ne = round(train_fraction * nx) Xtr, Xte = X[:ne, :], X[ne:, :] to_return = (Xtr, Xte) Y = arr(Y).flatten() ny = len(Y) if ny > 0: assert ny == nx, 'split_data: X and Y must have the same length' Ytr, Yte = Y[:ne], Y[ne:] to_return += (Ytr, Yte) return to_return
def fhash(X, K, hash=None):
    """
    Random hash of features from data. Selects a fixed or random hash of features
    from X.

    Parameters
    ----------
    X : numpy array
        M x N numpy array containing data.
    K : int
        Number of features to select.
    hash : function object (optional)
        Hash function to use. If provided, 'hash' uses fixed hash.

    Returns
    -------
    Z : numpy array
        M x K array of hashed features of X.
    hash : hash function (optional)
        Hash function used to hash features. Only returned if 'hash' argument
        isn't provided.
    """
    to_return = ()

    n, m = twod(X).shape

    if hash is None:
        # draw a fixed random bucket for each feature once, so repeated calls of
        # hash(i) are consistent and return valid integer column indices
        buckets = np.floor(np.random.rand(m) * K).astype(int)
        hash = lambda i: buckets[i]
        to_return = (hash, )

    # do the hashing
    Z = np.zeros((n, K))
    for i in range(m):
        Z[:, hash(i)] = Z[:, hash(i)] + X[:, i]

    return Z if len(to_return) == 0 else (Z, ) + to_return
def agglomerative(X, K, method='means', join=None): """ Perform hierarchical agglomerative clustering. Parameters ---------- X : numpy array N x M array of Data to be clustered. K : int The number of clusters into which data should be grouped. method : str (optional) str that specifies the method to use for calculating distance between clusters. Can be one of: 'min', 'max', 'means', or 'average'. join : numpy array (optional) N - 1 x 3 that contains a sequence of joining operations. Pass to avoid reclustering for new X. to_return : [bool] (optional) Array of bools that specifies which values to return. The bool at to_return[0] indicates whether z should be returned; the bool at to_return[1] indicates whether join should be returned. Returns (tuple) ------- z : N x 1 array of cluster assignments. join : N - 1 x 3 array that contains the sequence of joining operations peformed by the clustering algorithm. """ m, n = twod(X).shape # get data size D = np.zeros( (m, m) ) + np.inf # store pairwise distances b/w clusters (D is an upper triangular matrix) z = arr(range(m)) # assignments of data num = np.ones(m) # number of data in each cluster mu = arr(X) # centroid of each cluster method = method.lower() if type(join) == type(None): # if join not precomputed join = np.zeros((m - 1, 3)) # keep track of join sequence # use standard Euclidean distance dist = lambda a, b: np.sum(np.power(a - b, 2)) for i in range(m): # compute initial distances for j in range(i + 1, m): D[i][j] = dist(X[i, :], X[j, :]) opn = np.ones(m) # store list of clusters still in consideration val, k = np.min(D), np.argmin( D) # find first join (closest cluster pair) for c in range(m - 1): i, j = np.unravel_index(k, D.shape) join[c, :] = arr([i, j, val]) # centroid of new cluster mu_new = (num[i] * mu[i, :] + num[j] * mu[j, :]) / (num[i] + num[j]) # compute new distances to cluster i for jj in np.where(opn)[0]: if jj in [i, j]: continue # sort indices because D is an upper triangluar matrix idxi = tuple(sorted((i, jj))) idxj = tuple(sorted((j, jj))) if method == 'min': D[idxi] = min(D[idxi], D[idxj]) # single linkage (min dist) elif method == 'max': D[idxi] = max(D[idxi], D[idxj]) # complete linkage (max dist) elif method == 'means': D[idxi] = dist( mu_new, mu[jj, :]) # mean linkage (dist b/w centroids) elif method == 'average': # average linkage D[idxi] = (num[i] * D[idxi] + num[j] * D[idxj]) / (num[i] + num[j]) opn[j] = 0 # close cluster j (fold into i) num[i] = num[i] + num[ j] # update total membership in cluster i to include j mu[i, :] = mu_new # update centroid list # remove cluster j from consideration as min for ii in range(m): if ii != j: # sort indices because D is an upper triangular matrix idx = tuple(sorted((ii, j))) D[idx] = np.inf val, k = np.min(D), np.argmin(D) # find next smallext pair # compute cluster assignments given sequence of joins for c in range(m - K): z[z == join[c, 1]] = join[c, 0] uniq = np.unique(z) for c in range(len(uniq)): z[z == uniq[c]] = c return z, join
def kmeans(X, K, init='random', max_iter=100): """ Perform K-means clustering on data X. Parameters ---------- X : numpy array N x M array containing data to be clustered. K : int Number of clusters. init : str or array (optional) Either a K x N numpy array containing initial clusters, or one of the following strings that specifies a cluster init method: 'random' (K random data points (uniformly) as clusters), 'farthest' (choose cluster 1 uniformly, then the point farthest from all cluster so far, etc.), or 'k++' (choose cluster 1 uniformly, then points randomly proportional to distance from current clusters). max_iter : int (optional) Maximum number of optimization iterations. Returns (as tuple) ------- z : N x 1 array containing cluster numbers of data at indices in X. c : K x M array of cluster centers. sumd : (scalar) sum of squared euclidean distances. """ n, d = twod(X).shape # First, initialize the clusters to something: if type(init) is str: init = init.lower() if init == 'random': pi = np.random.permutation(n) c = X[pi[0:K], :] elif init == 'farthest': c = k_init(X, K, True) elif init == 'k++': c = k_init(X, K, False) else: raise ValueError('kmeans: value for "init" ( ' + init + ') is invalid') else: c = init # Now, optimize the objective using coordinate descent: iter = 1 done = (iter > max_iter) sumd = np.inf sum_old = np.inf z = np.zeros((n, )) #print c while not done: sumd = 0 for i in range(n): # compute distances for each cluster center dists = np.sum((c - twod(X[i, :]))**2, axis=1) #dists = np.sum(np.power((c - np.tile(X[i,:], (K,1))), 2), axis=1) val = np.min(dists, axis=0) # assign datum i to nearest cluster z[i] = np.argmin(dists, axis=0) sumd = sumd + val #print z for j in range(K): # now update each cluster center j... if np.any(z == j): c[j, :] = np.mean( X[(z == j).flatten(), :], 0) # ...to be the mean of the assigned data... else: c[j, :] = X[int(np.floor(np.random.rand( ))), :] # ...or random restart if no assigned data done = (iter > max_iter) or (sumd == sum_old) sum_old = sumd iter += 1 return z, c, sumd
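# Illustrative usage (a sketch, not part of the library): cluster synthetic data and
# inspect the assignments and total within-cluster distance:
#
#   import numpy as np
#   X = np.vstack((np.random.randn(50, 2), np.random.randn(50, 2) + 5.0))
#   z, c, sumd = kmeans(X, 2, init='k++')
#   # z: cluster index per point, c: 2 x 2 array of centers, sumd: sum of squared distances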
def fpoly_pair(X, degree, use_constant=True): """ Create polynomial features of each individual feature (too many cross products). Parameters ---------- X : numpy array N x M array of data. degree : int The degree. use_constant : bool (optional) If True (default), include a constant feature. Returns ------- Xext : numpy array TODO: test more """ m, n = twod(X).shape npoly = np.ceil( (n**(degree + 1) - 1) / (n - 1)) # ceil to fix possible roundoff error if use_constant: Xext = np.zeros((m, npoly)) Xext[:, 0] = 1 Xcur = 1 k = 1 else: Xext = np.zeros((m, npoly - 1)) Xcur = 1 k = 0 # hard coded to be a shorter length if degree == 2: Xext[:, k:k + n] = X k += n Z = np.reshape(X, (m, 1, n)) X2 = np.zeros((m, 1)) for i in range(twod(Z).shape[2]): X2 = cols((X2, X * Z[:, :, i])) X2 = X2[:, 1:] idx = np.where((twod(arr(range(1, n + 1))).T >= arr(range( 1, n + 1))).T.ravel())[0] K = len(idx) Xext[:, k:k + K] = X2[:, idx] return Xext[:, 0:k + K] for p in range(degree): # workaround to make up for numpy's lack of bsxfun if type(Xcur) is int: Xcur = X * Xcur else: new_Xcur = np.zeros((m, 1)) for i in range(Xcur.shape[2]): new_Xcur = cols((new_Xcur, X * Xcur[:, :, i])) Xcur = new_Xcur[:, 1:] Xcur = Xcur.reshape((m, np.size(Xcur) / m)) K = Xcur.shape[1] Xext[:, k:k + K] = Xcur k = k + K Xcur = Xcur.reshape((m, 1, K)) return Xext
def train(self, X, Y, init='zeros', stepsize=.01, stopTol=1e-4, stopIter=5000): """Train the neural network. Args: X : MxN numpy array containing M data points with N features each Y : Mx1 numpy array of targets (class labels) for each data point in X sizes : [Nin, Nh1, ... , Nout] Nin is the number of features, Nout is the number of outputs, which is the number of classes. Member weights are {W1, ... , WL-1}, where W1 is Nh1 x Nin, etc. init : str 'none', 'zeros', or 'random'. inits the neural net weights. stepsize : scalar The stepsize for gradient descent (decreases as 1 / iter). stopTol : scalar Tolerance for stopping criterion. stopIter : int The maximum number of steps before stopping. activation : str 'logistic', 'htangent', or 'custom'. Sets the activation functions. """ if self.wts[0].shape[1] - 1 != len(X[0]): raise ValueError( 'layer[0] must equal the number of columns of X (number of features)' ) self.classes = self.classes if len(self.classes) else np.unique(Y) if len(self.classes) != self.wts[-1].shape[ 0]: # and (self.wts[-1].shape[0]!=1 or len(self.classes)!=2): raise ValueError( 'layers[-1] must equal the number of classes in Y, or 1 for binary Y' ) M, N = mat(X).shape # d = dim of data, n = number of data points C = len(self.classes) # number of classes L = len(self.wts) # get number of layers Y_tr_k = to1ofK(Y, self.classes) # convert Y to 1-of-K format # outer loop of stochastic gradient descent it = 1 # iteration number nextPrint = 1 # next time to print info done = 0 # end of loop flag J01, Jsur = [], [] # misclassification rate & surrogate loss values while not done: step_i = float( stepsize) / it # step size evolution; classic 1/t decrease # stochastic gradient update (one pass) for j in range(M): A, Z = self.__responses(twod( X[j, :])) # compute all layers' responses, then backdrop delta = (Z[L] - Y_tr_k[j, :]) * arr(self.dSig0( Z[L])) # take derivative of output layer for l in range(L - 1, -1, -1): grad = delta.T.dot( Z[l]) # compute gradient on current layer wts delta = delta.dot(self.wts[l]) * arr(self.dSig( Z[l])) # propagate gradient down delta = delta[:, 1:] # discard constant feature self.wts[ l] -= step_i * grad # take gradient step on current layer wts J01.append(self.err_k(X, Y_tr_k)) # error rate (classification) Jsur.append(self.mse_k(X, Y_tr_k)) # surrogate (mse on output) if it >= nextPrint: print('it {} : Jsur = {}, J01 = {}'.format( it, Jsur[-1], J01[-1])) nextPrint *= 2 # check if finished done = (it > 1) and (np.abs(Jsur[-1] - Jsur[-2]) < stopTol) or it >= stopIter it += 1
    if Y is not None:
        Y = arr(Y).flatten()
        ny = len(Y)
        if ny > 0:
            assert ny == nx, 'splitData: X and Y must have the same length'
            Ytr, Yte = Y[:ne], Y[ne:]
            to_return += (Ytr, Yte)

    return to_return


if __name__ == "__main__":
    amazon = np.genfromtxt("amazon_cells_labelled.txt", delimiter="\t", dtype=None, encoding='utf-8')
    # amazon = [(string,0),(string,1)....]
    tupleX, tupleY = zip(*amazon)   # tupleX = (string1, string2, ...)
                                    # tupleY = (1,0,1,1,0,...)
    X = np.array(tupleX)
    Y = np.array(tupleY)
    X = twod(X).T
    Xtr, Xte, Ytr, Yte = splitData(X, Y, .75)
    """
    model = Sequential()
    model.add(Dense(256, activation='relu', input_shape=(999,), W_regularizer=l2(0.001)))
    model.add(Dense(256, activation='relu'))
    model.add(Dense(10, activation='softmax'))
    model.summary()
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    """
def plotClassify2D(learner, X, Y, pre=lambda x: x, axis=None, nGrid=128, **kwargs): """ Plot data and classifier outputs on two-dimensional data. This function plot data (X,Y) and learner.predict(X, Y) together. The learner is is predicted on a dense grid covering data X, to show its decision boundary. Parameters ---------- learner : learner object A trained learner object that inherits from one of the 'Classify' or 'Regressor' base classes. X : numpy array N x M array of data; N = number of data, M = dimension (number of features) of data. Y : numpy array 1 x N arra containing labels corresponding to data points in X. pre : function object (optional) Function that is applied to X before prediction. axis : a matplotlib axis / plottable object (optional) nGrid : density of 2D grid points (default 128) """ if twod(X).shape[1] != 2: raise ValueError( 'plotClassify2D: function can only be called using two-dimensional data (features)' ) # TODO: Clean up code if axis == None: axis = plt # hld = axis.ishold(); # axis.hold(True); axis.plot(X[:, 0], X[:, 1], 'k.', visible=False) # TODO: can probably replace with final dot plot and use transparency for image (?) ax = axis.axis() xticks = np.linspace(ax[0], ax[1], nGrid) yticks = np.linspace(ax[2], ax[3], nGrid) grid = np.meshgrid(xticks, yticks) XGrid = np.column_stack((grid[0].flatten(), grid[1].flatten())) if learner is not None: YGrid = learner.predict(pre(XGrid)) #axis.contourf( xticks,yticks,YGrid.reshape( (len(xticks),len(yticks)) ), nClasses ) axis.imshow(YGrid.reshape((len(xticks), len(yticks))), extent=ax, interpolation='nearest', origin='lower', alpha=0.5, aspect='auto') cmap = plt.cm.get_cmap() # TODO: if Soft: predictSoft; get colors for each class from cmap; blend pred with colors & show # try: classes = np.array(learner.classes) except Exception: classes = np.unique(Y) cvals = (classes - min(classes)) / (max(classes) - min(classes) + 1e-100) for i, c in enumerate(classes): axis.plot(X[Y == c, 0], X[Y == c, 1], 'ko', color=cmap(cvals[i]), **kwargs) axis.axis(ax)
def gmmEM(X, K, init='random', max_iter=100, tol=1e-6): """ Perform Gaussian mixture EM (expectation-maximization) clustering on data X. Parameters ---------- X : numpy array N x M array containing data to be clustered. K : int Number of clusters. init : str or array (optional) Either a K x N numpy array containing initial clusters, or one of the following strings that specifies a cluster init method: 'random' (K random data points (uniformly) as clusters) 'farthest' (choose cluster 1 uniformly, then the point farthest from all cluster so far, etc.) 'k++' (choose cluster 1 uniformly, then points randomly proportional to distance from current clusters). max_iter : int (optional) Maximum number of iterations. tol : scalar (optional) Stopping tolerance. Returns ------- z : 1 x N numpy array of cluster assignments (int indices). T : {'pi': np.array, 'mu': np.array, 'sig': np.array} : Gaussian component parameters soft : numpy array; soft assignment probabilities (rounded for assign) ll : float; Log-likelihood under the returned model. """ # init N, D = twod(X).shape # get data size if type(init) is str: init = init.lower() if init == 'random': pi = np.random.permutation(N) mu = X[pi[0:K], :] elif init == 'farthest': mu = k_init(X, K, True) elif init == 'k++': mu = k_init(X, K, False) else: raise ValueError('gmmEM: value for "init" ( ' + init + ') is invalid') else: mu = init sig = np.zeros((D, D, K)) for c in range(K): sig[:, :, c] = np.eye(D) alpha = np.ones(K) / K R = np.zeros((N, K)) iter, ll, ll_old = 1, np.inf, np.inf done = iter > max_iter C = np.log(2 * np.pi) * D / 2 while not done: ll = 0 for c in range(K): # compute log prob of all data under model c V = X - np.tile(mu[c, :], (N, 1)) R[:, c] = -0.5 * np.sum( (V.dot(np.linalg.inv(sig[:, :, c]))) * V, axis=1) - 0.5 * np.log(np.linalg.det(sig[:, :, c])) + np.log( alpha[c]) - C # avoid numberical issued by removing constant 1st mx = R.max(1) R -= np.tile(twod(mx).T, (1, K)) # exponentiate and compute sum over components R = np.exp(R) nm = R.sum(1) # update log-likelihood of data ll = np.sum(np.log(nm) + mx) R /= np.tile(twod(nm).T, (1, K)) # normalize to give membership probabilities alpha = R.sum(0) # total weight for each component for c in range(K): # weighted mean estimate mu[c, :] = (R[:, c] / alpha[c]).T.dot(X) tmp = X - np.tile(mu[c, :], (N, 1)) # weighted covar estimate sig[:, :, c] = tmp.T.dot( tmp * np.tile(twod(R[:, c]).T / alpha[c], (1, D))) + 1e-32 * np.eye(D) alpha /= N # stopping criteria done = (iter >= max_iter) or np.abs(ll - ll_old) < tol ll_old = ll iter += 1 z = from1ofK(R) soft = R T = {'pi': alpha, 'mu': mu, 'sig': sig} return z, T, soft, ll
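# Illustrative usage (a sketch, not part of the library): fit a 3-component mixture
# and read off the learned parameters and soft memberships (X is a placeholder):
#
#   z, T, soft, ll = gmmEM(X, 3, init='k++', max_iter=200)
#   # T['pi'], T['mu'], T['sig'] hold the mixture weights, means, and covariances;
#   # soft[i, c] is the posterior probability that point i came from component c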
def data_GMM(N, C, D=2, get_Z=False): """ Sample data from a Gaussian mixture model. Draws N data x_i from a mixture of Gaussians, with C clusters in D dimensions. Parameters ---------- N : int Number of data to be drawn from a mixture of Gaussians. C : int Number of clusters. D : int Number of dimensions. get_Z : bool If True, returns a an array indicating the cluster from which each data point was drawn. Returns ------- X : numpy array N x D array of data. Z : numpy array (optional) 1 x N array of cluster ids. TODO: test more """ C += 1 pi = np.zeros(C) for c in range(C): pi[c] = gamrand(10, 0.5) pi = pi / np.sum(pi) cpi = np.cumsum(pi) rho = np.random.rand(D, D) rho = rho + twod(rho).T rho = rho + D * np.eye(D) rho = sqrtm(rho) mu = mat(np.random.randn(c, D)) * mat(rho) ccov = [] for i in range(C): tmp = np.random.rand(D, D) tmp = tmp + tmp.T tmp = 0.5 * (tmp + D * np.eye(D)) ccov.append(sqrtm(tmp)) p = np.random.rand(N) Z = np.ones(N) for c in range(C - 1): Z[p > cpi[c]] = c Z = Z.astype(int) X = mu[Z, :] for c in range(C): X[Z == c, :] = X[Z == c, :] + mat(np.random.randn(np.sum(Z == c), D)) * mat(ccov[c]) if get_Z: return (arr(X), Z) else: return arr(X)
def train(self, X, Y, initStep=1.0, stopTol=1e-4, stopIter=5000, plot=None):
    """ Train the logistic regression using stochastic gradient descent """

    ## First do some bookkeeping and setup:
    self.theta, X, Y = twod(self.theta), arr(X), arr(Y)   # convert to numpy arrays
    M, N = X.shape
    if Y.shape[0] != M:
        raise ValueError("Y must have the same number of data (rows) as X")
    self.classes = np.unique(Y)
    if len(self.classes) != 2:
        raise ValueError("Y should have exactly two classes (binary problem expected)")
    if self.theta.shape[1] != N + 1:          # if self.theta is empty, initialize it!
        self.theta = np.random.randn(1, N + 1)

    # Some useful modifications of the data matrices:
    X1 = np.hstack((np.ones((M, 1)), X))      # make data array with constant feature
    Y01 = toIndex(Y, self.classes)            # convert Y to canonical "0 vs 1" classes

    it = 0
    done = False
    Jsur = []
    J01 = []

    while not done:
        step = (2.0 * initStep) / (2.0 + it)  # common 1/iter step size change
        si = []

        for i in range(M):                    # for each data point i:
            ## Compute the linear response
            zi = X1[i, :].dot(self.theta.T)
            ## The target value for this point (canonical 0/1 class)
            yi = Y01[i]
            ## Compute the soft (logistic) response
            si.append(self.logistic(zi))
            ## Gradient of the logistic negative log-likelihood at point i
            gradi = (si[i] - yi) * X1[i, :]
            # Take a step down the gradient
            self.theta = self.theta - step * gradi

        # each pass, compute surrogate loss & error rates:
        J01.append(self.err(X, Y))

        ## Surrogate loss: average logistic negative log-likelihood,
        ## Jsur = -(1/M) sum_i [ yi log(si) + (1 - yi) log(1 - si) ]
        sum_i = 0
        for i in range(M):
            sum_i -= Y01[i] * np.log(si[i]) + (1 - Y01[i]) * np.log(1 - si[i])
        Jsur.append(sum_i / M)

        ## For debugging: print current parameters & losses
        # print(self.theta, ' => ', Jsur[-1], ' / ', J01[-1])

        # check stopping criteria:
        it += 1
        done = (it > stopIter) or ((it > 1) and (abs(Jsur[-1] - Jsur[-2]) < stopTol))

    self.numberOfIterations = it

    if self.plotFlag == True:
        plt.semilogx(range(it), np.abs(Jsur), label='Surrogate Loss')
        plt.semilogx(range(it), np.abs(J01), label='Error Rate')
        plt.legend(loc='upper right')
        plt.xlabel('# of iterations')
        plt.ylabel('Losses')
        plt.show()
def plotClassify2D(learner, X, Y, pre=lambda x: x, ax=None, nGrid=128, cm=None,
                   bgalpha=0.3, soft=False, **kwargs):
    """
    Plot data and classifier outputs on two-dimensional data.
    This function plots the data (X,Y) and the learner's predictions on a dense
    grid covering the data X, to show its decision boundary.

    Parameters
    ----------
    learner : a classifier with a "predict" function and optionally a "classes" list
    X       : (m,n) numpy array of data (m points in n=2 dimensions)
    Y       : (m,) or (m,1) int array of class values, OR (m,c) array of class
              probabilities (see predictSoft)
    pre     : function object (optional) applied to X before learner.predict()
    ax      : a matplotlib axis / plottable object (optional)
    nGrid   : density of the 2D grid points (default 128)
    soft    : use predictSoft & blend colors (default: False => use predict()
              and show decision regions)
    bgalpha : alpha transparency (1=opaque, 0=transparent) for the decision function image
    cm      : pyplot colormap (default: None = use the default colormap)
    [other keyword arguments are passed through to the pyplot scatter function
     on the data points]
    """
    if twod(X).shape[1] != 2:
        raise ValueError('plotClassify2D: function can only be called using two-dimensional data (features)')

    # make robust to differing arguments in scatter vs plot, e.g. "s"/"ms" (marker size)
    if "s" not in kwargs and "ms" in kwargs:
        kwargs["s"] = kwargs.pop("ms")

    try:
        classes = np.array(learner.classes)             # learner has an explicit list of classes; use those
    except Exception:
        if len(Y.shape) == 1 or Y.shape[1] == 1:
            classes = np.unique(Y)                      # or, use the data points' class values to guess
        else:
            classes = np.arange(Y.shape[1], dtype=int)  # or, get the number of classes from soft predictions

    vmin, vmax = classes.min() - .1, classes.max() + .1 # get (slightly expanded) value range for class values
    if ax is None:
        ax = plt.gca()                                  # default: use current axes
    if cm is None:
        cm = plt.cm.get_cmap()                          # get the colormap
    classvals = (classes - vmin) / (vmax - vmin + 1e-100)   # map class values to [0,1] for the colormap
    classcolor = cm(classvals)                          # and get the RGB values for each class

    ax.plot(X[:, 0], X[:, 1], 'k.', visible=False, ms=0)    # invisible plot to set the axis range if required
    axrng = ax.axis()

    if learner is not None:                             # if we were given a learner to predict with:
        xticks = np.linspace(axrng[0], axrng[1], nGrid)
        yticks = np.linspace(axrng[2], axrng[3], nGrid)
        grid = np.meshgrid(xticks, yticks)              # apply it to a dense grid of points
        XGrid = np.column_stack((grid[0].flatten(), grid[1].flatten()))
        if soft:
            YGrid = learner.predictSoft(pre(XGrid)).dot(classcolor)   # soft prediction: blend class colors
            YGrid[YGrid < 0] = 0
            YGrid[YGrid > 1] = 1
            YGrid = YGrid.reshape((nGrid, nGrid, classcolor.shape[1]))
            # axis.contourf( xticks,yticks,YGrid[:,0].reshape( (len(xticks),len(yticks)) ), nContours )
            ax.imshow(YGrid, extent=axrng, interpolation='nearest', origin='lower',
                      alpha=bgalpha, aspect='auto', vmin=vmin, vmax=vmax, cmap=cm)
        else:
            YGrid = learner.predict(pre(XGrid)).reshape((nGrid, nGrid))   # hard prediction: use class colors
            vmin = min(YGrid.min() - .1, vmin)          # check outputs for new classes,
            vmax = max(YGrid.max() + .1, vmax)
            classvals = (classes - vmin) / (vmax - vmin + 1e-100)
            classcolor = cm(classvals)                  # and if so, recalculate the colors
            # axis.contourf( xticks,yticks,YGrid.reshape( (len(xticks),len(yticks)) ), nClasses )
            ax.imshow(YGrid, extent=axrng, interpolation='nearest', origin='lower',
                      alpha=bgalpha, aspect='auto', vmin=vmin, vmax=vmax, cmap=cm)

    if len(Y.shape) == 1 or Y.shape[1] == 1:
        data_colors = classcolor[np.searchsorted(classes, Y)]   # use class colors if Y is a discrete class
    else:
        data_colors = Y.dot(classcolor)                         # blend colors if Y is a soft confidence
        data_colors[data_colors > 1] = 1

    ax.scatter(X[:, 0], X[:, 1], c=data_colors, **kwargs)
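# A minimal usage sketch (an assumption: not part of the original library).  It defines a
# trivial threshold "learner" exposing the predict interface plotClassify2D expects, then
# plots it on random 2-D data; the names ThresholdLearner and demo_plotClassify2D are hypothetical.
class ThresholdLearner:
    classes = [0, 1]
    def predict(self, X):
        return (X[:, 0] + X[:, 1] > 0).astype(int)      # class 1 above the line x1 + x2 = 0

def demo_plotClassify2D(seed=0):
    np.random.seed(seed)
    X = np.random.randn(100, 2)
    Y = (X[:, 0] + X[:, 1] > 0).astype(int)             # labels consistent with the learner
    plotClassify2D(ThresholdLearner(), X, Y, nGrid=64)  # shaded decision regions + data points
    plt.show()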
def train(self, X, Y, init='zeros', stepsize=.01, stopTol=1e-4, stopIter=5000):
    """Train the neural network.

    Args:
      X : MxN numpy array containing M data points with N features each
      Y : Mx1 numpy array of targets for each data point in X
      sizes (list of int): [Nin, Nh1, ..., Nout]
          Nin is the number of features, Nout is the number of outputs, which is
          the number of target dimensions (usually 1).  Weights are {W1, ..., WL-1},
          where W1 is Nh1 x Nin, etc.
      init (str): 'none', 'zeros', or 'random'; initializes the neural net weights.
      stepsize (float): The stepsize for gradient descent (decreases as 1 / iter).
      stopTol (float): Tolerance for the stopping criterion.
      stopIter (int): The maximum number of steps before stopping.
      activation (str): 'logistic', 'htangent', or 'custom'. Sets the activation functions.
    """
    if self.wts[0].shape[1] - 1 != len(X[0]):
        raise ValueError('layer[0] must equal the number of columns of X (number of features)')
    if self.wts[-1].shape[0] > 1 and self.wts[-1].shape[0] != Y.shape[1]:
        raise ValueError('layers[-1] must equal the number of classes in Y, or 1 for binary Y')

    M, N = arr(X).shape          # M = number of data points, N = number of features
    L = len(self.wts)            # get number of layers

    Y = arr(Y)
    Y2d = Y if len(Y.shape) > 1 else Y[:, np.newaxis]

    # outer loop of stochastic gradient descent
    it = 1                       # iteration number
    nextPrint = 1                # next time to print info
    done = 0                     # end of loop flag
    Jsur = []                    # surrogate loss values

    while not done:
        step_i = (2.0 * stepsize) / (2.0 + it)    # step size evolution; classic 1/t decrease

        # stochastic gradient update (one pass)
        for j in range(M):
            A, Z = self.__responses(twod(X[j, :]))                  # compute all layers' responses, then backprop
            delta = (Z[L] - Y2d[j, :]) * arr(self.dSig0(Z[L]))      # take derivative of the output layer

            for l in range(L - 1, -1, -1):
                grad = delta.T.dot(Z[l])                            # compute gradient on current layer wts
                delta = delta.dot(self.wts[l]) * arr(self.dSig(Z[l]))   # propagate gradient down
                delta = delta[:, 1:]                                # discard the constant feature
                self.wts[l] -= step_i * grad                        # take gradient step on current layer wts

        Jsur.append(self.mse(X, Y2d))             # surrogate loss (MSE on output)

        if it >= nextPrint:
            print('it {} : J = {}'.format(it, Jsur[-1]))
            nextPrint *= 2

        # check if finished
        done = ((it > 1) and (np.abs(Jsur[-1] - Jsur[-2]) < stopTol)) or (it >= stopIter)
        it += 1
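# A standalone sketch of the backpropagation step used in the loop above (an assumption:
# illustration only, not part of the library).  It runs one forward/backward pass for a
# tiny 2-3-1 network with logistic hidden units and a linear output; the helper name
# demo_backprop_step is hypothetical.
def demo_backprop_step(step=0.1, seed=0):
    np.random.seed(seed)
    sig = lambda z: 1.0 / (1.0 + np.exp(-z))
    dsig = lambda z: z * (1 - z)                          # derivative written in terms of the activation value
    W = [np.random.randn(3, 3), np.random.randn(1, 4)]    # W[0]: 3 hidden x (2+1) inputs; W[1]: 1 output x (3+1) hidden
    x, y = np.array([[1.0, 0.5, -0.5]]), np.array([[1.0]])    # input with constant feature, and its target

    Z = [x]                                               # layer responses, constant feature prepended
    h = sig(Z[0].dot(W[0].T))                             # hidden-layer activation
    Z.append(np.hstack((np.ones((1, 1)), h)))             # prepend the constant feature
    Z.append(Z[1].dot(W[1].T))                            # linear output layer

    delta = Z[2] - y                                      # output-layer error (linear output => derivative is 1)
    for l in range(1, -1, -1):                            # walk the layers backwards
        grad = delta.T.dot(Z[l])                          # gradient on this layer's weights
        delta = delta.dot(W[l]) * dsig(Z[l])              # propagate the error downwards (last delta is unused)
        delta = delta[:, 1:]                              # discard the constant feature's column
        W[l] -= step * grad                               # gradient step
    return W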
def train(self, X, Y, initStep=1.0, stopTol=1e-4, stopIter=5000, plot=None):
    """ Train the logistic regression using stochastic gradient descent. """
    ## First do some bookkeeping and setup:
    self.theta, X, Y = twod(self.theta), arr(X), arr(Y)    # convert to numpy arrays
    M, N = X.shape
    if Y.shape[0] != M:
        raise ValueError("Y must have the same number of data (rows) as X")
    self.classes = np.unique(Y)
    if len(self.classes) != 2:
        raise ValueError("Y should have exactly two classes (binary problem expected)")
    if self.theta.shape[1] != N + 1:                       # if self.theta is empty, initialize it!
        self.theta = np.random.randn(1, N + 1)

    # Some useful modifications of the data matrices:
    X1 = np.hstack((np.ones((M, 1)), X))                   # make data array with constant feature
    Y01 = toIndex(Y, self.classes)                         # convert Y to canonical "0 vs 1" classes

    it = 0
    done = False
    Jsur = []
    J01 = []
    while not done:
        step = (2.0 * initStep) / (2.0 + it)               # common 1/iter step size decrease
        Jloss = 0.0                                        # accumulate surrogate loss over this pass
        for i in range(M):                                 # for each data point i:
            ## compute zi = linear response of X[i,:]
            zi = np.dot(X1[i, :], self.theta.T).item()
            ## compute soft response si = logistic(zi)
            si = 1.0 / (1.0 + np.exp(-zi))
            ## compute gradient of logistic loss wrt data point i:
            gradi = (si - Y01[i]) * X1[i, :]
            # Take a step down the gradient
            self.theta = self.theta - step * gradi
            ## accumulate the negative log-likelihood of this point
            Ji = -Y01[i] * np.log(si) - (1 - Y01[i]) * np.log(1 - si)
            Jloss += Ji

        # each pass, compute surrogate loss & error rate:
        J01.append(self.err(X, Y))
        ## surrogate loss (logistic negative log-likelihood), averaged over the data
        Jsur.append(Jloss / M)

        ## For debugging: print current parameters & losses
        # print(self.theta, ' => ', Jsur[-1], ' / ', J01[-1])
        # input()    # pause for keystroke

        # check stopping criteria:
        it += 1
        done = (it > stopIter) or ((it > 1) and (abs(Jsur[-1] - Jsur[-2]) < stopTol))

    return [it, J01, Jsur]
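# A short worked check of the surrogate loss above (an assumption: illustration only, not
# part of the library).  For soft responses s = [0.9, 0.2] and labels y = [1, 0], the
# average negative log-likelihood is -(log 0.9 + log 0.8) / 2, roughly 0.164.  The helper
# name demo_logistic_nll is hypothetical.
def demo_logistic_nll():
    s = np.array([0.9, 0.2])
    y = np.array([1, 0])
    J = np.mean(-y * np.log(s) - (1 - y) * np.log(1 - s))
    return J     # approximately 0.164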