def set_activation(self, method, sig=None, d_sig=None, sig_0=None, d_sig_0=None):
		"""
		This method sets the activation functions. 

		Parameters
		----------
		method : str
			'logistic', 'htangent', or 'custom'
		sig, d_sig, sig_0, and d_sig_0 : function objects
			Optional arguments intended for use with the 
			'custom' method option. They should be functions. 
			sig_0 and d_sig_0 are the output layer activation functions.
		"""
		method = method.lower()

		if method == 'logistic':
			self.sig = lambda z: twod(1 / (1 + np.exp(-z)))
			self.d_sig = lambda z: twod(np.multiply(self.sig(z), (1 - self.sig(z))))
			self.sig_0 = self.sig
			self.d_sig_0 = self.d_sig
		elif method == 'htangent':
			self.sig = lambda z: twod(np.tanh(z))
			self.d_sig = lambda z: twod(1 - np.power(np.tanh(z), 2))
			self.sig_0 = self.sig
			self.d_sig_0 = self.d_sig
		elif method == 'custom':
			self.sig = sig
			self.d_sig = d_sig
			self.sig_0 = sig_0
			self.d_sig_0 = d_sig_0
		else:
			raise ValueError('NNetClassify.set_activation: ' + str(method) + ' is not a valid option for method')

		self.activation = method
	def train(self, base, n, X, Y, *args, **kargs):
		"""
		Learn n new instances of base class. Refer to constructor docstring for
		descriptions of arguments.
		"""
		self.base = base

		N,D = twod(X).shape
		n_init = self.n_use
		step = 1

		if n_init == 0:
			# no learners yet: initialize a constant predictor
			self.const = np.mean(Y)								# (specialized to quadratic loss)

		y_hat = np.zeros(N) + self.const						# figure out current prediction value
		for i in range(n_init):									# if we already have learners...
			yi = self[i].predict(X).flatten()					# ...figure out prediction for the
			y_hat += (self.alpha[i] * yi)						# training data

		for i in range(n_init, n_init + n):
			Ri = (Y - y_hat) + 1e-64							# compute residuals (specialized to quadratic loss)
			self.ensemble.append(base(X, Ri, *args, **kargs))	# fit a model to the gradient residual
			yi = self[-1].predict(X)
			# minimize loss over alpha (specialized to quadratic loss)
			min_loss = step * np.divide((Ri.dot(yi)), (twod(yi).T.dot(yi)))
			self.alpha.append(min_loss.flatten()[0])			 
			y_hat = (twod(y_hat).T + self.alpha[-1] * yi).flatten()
			self.n_use += 1
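
# Sketch (not part of the original listing): the quadratic-loss line search used above,
# on toy numbers and using only numpy; `R` plays the role of the residuals Y - y_hat and
# `yi` the new base learner's predictions.
import numpy as np
R  = np.array([1.0, -0.5, 2.0])          # residuals on the training data
yi = np.array([0.8, -0.4, 1.5])          # predictions of the newly fit base learner
alpha = R.dot(yi) / yi.dot(yi)           # step * <R,yi> / <yi,yi> with step = 1, as in train() above
# the ensemble prediction is then updated by alpha * yi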
def fsvd(X, K, T=None):
    """
    Reduce the dimensionality of X to K features using singular value 
    decomposition. 

    Parameters
    ----------
    X : numpy array
        M x N array of data.
    K : int
        Number of desired output features.
    T : numpy array (optional)
        Transform matrix. Including T will use T instead of computing the
        SVD.

    Returns
    -------
    Xsvd : numpy array
        M x K array of reduced-dimension data.
    T : numpy array (optional)
        Transform matrix
    """
    n, m = twod(X).shape

    if type(T) is type(None):
        U, S, V = np.linalg.svd(X, full_matrices=False)  # compute SVD (Ihler uses svds here)
        U = U[:, :K]
        S = np.diag(S[:K])
        V = V.T[:, :K]
        Xsvd = U.dot(np.sqrt(S))  # new data coefficients
        T = np.sqrt(S[0:K, 0:K]).dot(twod(V).T)  # new bases for data
        return (Xsvd, T)

    Xsvd = X.dot(np.linalg.pinv(T))  # or, use the given set of bases (Matlab-style right division X/T)
    return Xsvd, T
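
# Usage sketch (not in the original; assumes numpy and this module's twod helper are importable):
import numpy as np
X = np.random.randn(100, 5)              # 100 data points with 5 features
Xsvd, T = fsvd(X, 2)                     # 100 x 2 reduced data and the 2 x 5 transform T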
Example #4
    def setActivation(self, method, sig=None, sig0=None): 
        """
        This method sets the activation functions. 

        Parameters
        ----------
        method : string, {'logistic' , 'htangent', 'custom'} -- which activation type
        Optional arguments for "custom" activation:
        sig : function object F(z) returns activation function & its derivative at z (as a tuple)
        sig0: activation function object F(z) for final layer of the nnet
        """
        raise NotImplementedError  # unfinished / untested
        method = method.lower()

        if method == 'logistic':
            self.sig = lambda z: twod(1 / (1 + np.exp(-z)))
            self.d_sig = lambda z: twod(np.multiply(self.sig(z), (1 - self.sig(z))))
            self.sig_0 = self.sig
            self.d_sig_0 = self.d_sig
        elif method == 'htangent':
            self.sig = lambda z: twod(np.tanh(z))
            self.d_sig = lambda z: twod(1 - np.power(np.tanh(z), 2))
            self.sig_0 = self.sig
            self.d_sig_0 = self.d_sig
        elif method == 'custom':
            self.sig = sig
            self.d_sig = d_sig
            self.sig_0 = sig_0
            self.d_sig_0 = d_sig_0
        else:
            raise ValueError('nnetRegress.set_activation: ' + str(method) + ' is not a valid option for method')

        self.activation = method
def fkitchensink(X, K, typ, W=None):
    """
    Random kitchen sink features from data. Selects K random "kitchen sink"
    features of X. 

    Parameters
    ----------
    X : numpy array
        M x N numpy array containing data.
    K : int
        Number of features to select.
    typ : str
        One of: 'stump', 'sigmoid', 'sinusoid', or 'linear'.
    W : numpy array (optional)
        N x K array of parameters. If provided, these fixed parameters are
        used instead of drawing random ones.

    Returns
    -------
    Z : numpy array
        M x K array of features selected from X.
    W : numpy array (optional)
        N x K array of random parameters. Only returned if the argument W
        isn't provided.
    """
    to_return = ()

    N, M = twod(X).shape
    typ = typ.lower()

    if type(W) is type(None):  # numpy complains about truth value of arrays
        if typ == "stump":
            W = np.zeros((2, K))
            s = np.sqrt(np.var(X, axis=0))
            # random feature index 0..M-1
            W[0, :] = np.floor(np.random.rand(K) * M)
            W[1, :] = np.random.randn(K) * s[W[0, :].astype(int)]  # random threshold (w/ same variance as that feature)
        elif typ in ["sigmoid", "sinusoid", "linear"]:
            # random direction for sigmodal ridge, random freq for sinusoids, random linear projections
            W = np.random.randn(M, K)

        to_return = (W,)

    Z = np.zeros((N, K))

    if typ == "stump":  # decision stump w/ random threshold
        for i in range(K):
            Z[:, i] = X[:, int(W[0, i])] >= W[1, i]
    elif typ == "sigmoid":  # sigmoidal ridge w/ random direction
        Z = twod(X).dot(W)
        Z = 1 / (1 + np.exp(Z))
    elif typ == "sinusoid":  # sinusoid w/ random frequency
        Z = np.sin(twod(X).dot(W))
    elif typ == "linear":  # straight linear projection
        Z = twod(X).dot(W)

    return Z if len(to_return) == 0 else (Z,) + to_return
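
# Usage sketch (not in the original; assumes numpy and this module's twod helper are importable):
import numpy as np
X = np.random.randn(50, 4)
Z, W = fkitchensink(X, 20, 'sinusoid')                        # 50 x 20 random sinusoid features + 4 x 20 params
Z2 = fkitchensink(np.random.randn(5, 4), 20, 'sinusoid', W)   # reuse the same W on new data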
	def log_likelihood(self, X, Y):
		"""
		Compute the empirical avg. log likelihood of 'obj' on test data (X,Y).
		See constructor doc string for argument descriptions.
		"""
		r,c = twod(Y).shape
		if r == 1 and c != 1:
			Y = twod(Y).T

		soft = self.predict_soft(X)
		return np.mean(np.sum(np.log(np.power(soft, Y, )), 1), 0)
	def __logistic(self, X):
		"""
		This is a helper method that evaluates the logistic function
		for weights self.wts (1 x d + 1) on data X (n x d). Used in:
			__gradient_descent
			predict
		"""
		n,d = twod(X).shape

		X_train = cols((np.ones((n,1)), twod(X)))

		f = twod(X_train).dot(twod(self.wts).T)
		return 1 / (1 + np.exp(-f))
def plot_classify_2D(learner, X, Y, pre=lambda x: x):
	"""
	Plot data and classifier outputs on two-dimensional data.
	This function plots the data (X,Y) together with the learner's
	predictions. The learner is evaluated on a dense grid covering
	the data X, to show its decision boundary.

	Parameters
	----------
	learner : learner object
		A trained learner object that inherits from one of
		the 'Classify' or 'Regressor' base classes.
	X : numpy array
		N x M array of data; N = number of data, M = dimension
		(number of features) of data.
	Y : numpy array
		1 x N array containing labels corresponding to data points
		in X.
	pre : function object (optional)
		Function that is applied to X before prediction.
	"""
	if twod(X).shape[1] != 2:
		raise ValueError('plot_classify_2d: function can only be called using two-dimensional data (features)')

	plt.plot(X[:,0], X[:,1], 'k.')
	ax = plt.xlim() + plt.ylim()					# get current axis limits
	N = 256											# density of evaluation

	# evaluate each point of feature space and predict the class
	X1 = np.linspace(ax[0], ax[1], N)
	X1sp = twod(X1).T * np.ones(N)
	X2 = np.linspace(ax[2], ax[3], N)
	X2sp = np.ones((N,1)) * X2
	
	Xfeat = cols((twod(X1sp.flatten()).T, twod(X2sp.flatten()).T))

	# preprocess/create feature vector if necessary
	Xfeat = pre(Xfeat)

	# predict using learner
	pred = learner.predict(Xfeat)

	# plot decision values for space in 'faded' color
	clim = np.unique(Y)
	clim = [clim[0], clim[0] + 1] if len(clim) == 1 else list(clim)
	plt.imshow(np.reshape(pred, (N,N)).T, extent=[X1[0], X1[-1], X2[0], X2[-1]], cmap=plt.cm.Pastel2)
	plt.clim(*clim)

	plt.show()
def flda(X, Y, K, T=None):
    """
    Reduce the dimension of X to K features using (multiclass) discriminant
    analysis.

    Parameters
    ----------
    X : numpy array
        M x N array of data.
    Y : numpy array
        M x 1 array of labels corresponding to data in X.
    K : int
        New dimension (number of features) of X.
    T : numpy array (optional)
        The transform matrix. If this argument is provided, function uses T
        instead of computing the LDA.

    Returns
    -------
    Xlda : numpy array
    T : numpy array (optional)

    TODO: Test; check/test Matlab version?
    """
    if T is not None:
        return X.dot(np.linalg.pinv(T))  # apply the given transform (Matlab-style right division X/T)

    n, m = twod(X).shape

    c = np.unique(Y)
    nc = np.zeros(len(c))
    mu = np.zeros((len(c), n))
    sig = np.zeros((len(c), n, n))

    for i in range(len(c)):
        idx = np.where(Y == c[i])[0]
        nc[i] = len(idx)
        mu[i, :] = np.mean(X[:, idx], axis=0)
        sig[i, :, :] = np.cov(X[:, idx])

    S = (nc / n).dot(np.reshape(sig, (len(c), n * n)))
    S = np.reshape(S, (n, n))

    U, S, V = np.linalg.svd(X, full_matrices=False)  # compute SVD (Ihler uses svds here)
    U = U[:, :K]
    S = np.diag(S[:K])
    V = V.T[:, :K]
    Xlda = U.dot(np.sqrt(S))  # new data coefficients
    T = np.sqrt(S).dot(twod(V).T)  # new bases for data

    return Xlda, T
Example #10
def bootstrapData(X, Y=None, n_boot=None):
    """
    Bootstrap resample a data set (with replacement): 
    draw data points (x_i,y_i) from (X,Y) n_boot times.

    Parameters
    ----------
    X : MxN numpy array of data points to be resampled.
    Y : Mx1 numpy array of labels associated with each datum (optional)
    n_boot : int, number of samples to draw (default: M)

    Returns
    -------
    Xboot, Yboot : (tuple of) numpy arrays for the resampled data set
    If Y is not present or None, returns only Xboot (non-tuple)
    """
    nx,dx = twod(X).shape
    if n_boot is None: n_boot = nx

    idx = np.floor(np.random.rand(n_boot) * nx).astype(int)
    X = X[idx,:]

    if Y is None:                       # no labels given: return only the resampled X
        return X

    Y = Y.flatten()
    ny = len(Y)
    assert ny > 0, 'bootstrapData: Y must contain data'
    assert nx == ny, 'bootstrapData: X and Y should have the same length'
    Y = Y[idx]

    return (X,Y)
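
# Usage sketch (not in the original; assumes numpy and this module's twod helper are importable):
import numpy as np
X = np.random.randn(30, 3)
Y = np.random.randint(0, 2, 30)
Xb, Yb = bootstrapData(X, Y, 100)        # 100 rows drawn with replacement from (X, Y)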
def shuffle_data(X, Y):
	"""
	Shuffle data in X and Y.

	Parameters
	----------
	X : numpy array
		N x M array of data to shuffle.
	Y : numpy array
		1 x N array of labels that correspond to data in X.

	Returns
	-------
	X or (X,Y) : numpy array or tuple of arrays
		Shuffled data (only returns X and Y if Y contains data).
	
	TODO: test more
	"""
	nx,dx = twod(X).shape
	Y = arr(Y).flatten()
	ny = len(Y)

	pi = np.random.permutation(nx)
	X = X[pi,:]

	if ny > 0:
		assert ny == nx, 'shuffle_data: X and Y must have the same length'
		Y = Y[pi]
		return X,Y

	return X
def bootstrap_data(X, Y, n_boot):
	"""
	Function that resamples (bootstrap) data set: it resamples 
	data points (x_i,y_i) with replacement n_boot times.

	Parameters
	----------
	X : numpy array
		N x M numpy array that contains data points to be sampled.
	Y : numpy array
		1 x N numpy array that contains labels that map to data
		points in X.
	n_boot : int
		The number of samples to take.

	Returns
	-------
	(array,array)
		Tuple containing samples from X and Y.

	TODO: test more
	"""
	Y = Y.flatten()

	nx,dx = twod(X).shape
	idx = np.floor(np.random.rand(n_boot) * nx).astype(int)
	X = X[idx,:]

	ny = len(Y)
	assert ny > 0, 'bootstrap_data: Y must contain data'
	assert nx == ny, 'bootstrap_data: X and Y should have the same length'
	Y = Y[idx]

	return (X,Y)
Example #13
def splitData(X, Y=None, train_fraction=0.80):
    """
    Split data into training and test data.

    Parameters
    ----------
    X : MxN numpy array of data to split
    Y : Mx1 numpy array of associated target values
    train_fraction : float, fraction of data used for training (default 80%)

    Returns
    -------
    to_return : (Xtr,Xte,Ytr,Yte) or (Xtr,Xte)
        A tuple containing the following arrays (in order): training
        data from X, testing data from X, training labels from Y
        (if Y contains data), and testing labels from Y (if Y 
        contains data).
    """
    nx,dx = twod(X).shape
    ne = round(train_fraction * nx)

    Xtr,Xte = X[:ne,:], X[ne:,:]
    to_return = (Xtr,Xte)

    if Y is not None:
        Y = arr(Y).flatten()
        ny = len(Y)
        if ny > 0:
            assert ny == nx, 'splitData: X and Y must have the same length'
            Ytr,Yte = Y[:ne], Y[ne:]
            to_return += (Ytr,Yte)

    return to_return
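
# Usage sketch (not in the original; assumes numpy and this module's twod/arr helpers are importable):
import numpy as np
X = np.random.randn(40, 2)
Y = np.random.randint(0, 2, 40)
Xtr, Xte, Ytr, Yte = splitData(X, Y, 0.75)   # first 75% of rows for training, the rest for testing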
Example #14
def fsubset(X, K, feat=None):
    """
    Select subset of features from data. Selects a fixed or random subset
    of K features from X.

    Parameters
    ----------
    X : numpy array
        M x N array of data.
    K : int
        Number of features in output.
    feat : array like (optional)
        Flat array of indices specifying which features to select.

    Returns
    -------
    X_sub : numpy array
        M x K numpy array of data.
    feat : numpy array (optional)
        1 x K array of indices of selected features. Only returned if feat
        argument isn't provided.
    """
    n, m = twod(X).shape

    to_return = ()
    if type(feat) is type(None):
        feat = np.random.permutation(m)
        feat = feat[0:K]
        to_return = (feat,)

    X_sub = X[:, feat]
    return X_sub if len(to_return) == 0 else (X_sub,) + to_return
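
# Usage sketch (not in the original; assumes numpy and this module's twod helper are importable):
import numpy as np
Xtr = np.random.randn(40, 10)
Xtr_sub, feat = fsubset(Xtr, 3)                        # pick 3 random feature columns, remember which
Xte_sub = fsubset(np.random.randn(5, 10), 3, feat)     # apply the same selection to new data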
Example #15
def shuffleData(X, Y=None):
    """
    Shuffle (randomly reorder) data in X and Y.

    Parameters
    ----------
    X : MxN numpy array: N feature values for each of M data points
    Y : Mx1 numpy array (optional): target values associated with each data point

    Returns
    -------
    X,Y  :  (tuple of) numpy arrays of shuffled features and targets
            only returns X (not a tuple) if Y is not present or None
    
    Ex:
    X2    = shuffleData(X)   : shuffles the rows of the data matrix X
    X2,Y2 = shuffleData(X,Y) : shuffles rows of X,Y, preserving correspondence
    """
    nx,dx = twod(X).shape
    Y = arr([]) if Y is None else arr(Y)    # handle optional Y (may be None)
    ny = len(Y)

    pi = np.random.permutation(nx)
    X = X[pi,:]

    if ny > 0:
        assert ny == nx, 'shuffleData: X and Y must have the same length'
        Y = Y[pi] if Y.ndim <= 1 else Y[pi,:]
        return X,Y

    return X
Example #16
def fproject(X, K, proj=None):
    """
    Random projection of features from data. Selects a fixed or random linear
    projection of K features from X.

    Parameters
    ----------
    X : numpy array
        M x N array of data.
    K : int
        Number of features in output.
    proj : numpy array (optional)
        The projection matrix. If this argument is provided, function uses proj
        instead of random matrix.

    Returns
    -------
    X2 : numpy array
        M x K array of the projection of data in X.
    proj : numpy array (optional)
        N x K numpy array that is the projection matrix. Only returned if proj
        argument isn't provided.
    """
    n, m = twod(X).shape

    to_return = ()
    if type(proj) is type(None):
        proj = np.random.randn(m, K)
        to_return = (proj,)

    X2 = X.dot(proj)

    return X2 if len(to_return) == 0 else (X2,) + to_return
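
# Usage sketch (not in the original; assumes numpy and this module's twod helper are importable):
import numpy as np
Xtr = np.random.randn(40, 10)
Ztr, proj = fproject(Xtr, 2)                         # 40 x 2 random projection and the 10 x 2 matrix used
Zte = fproject(np.random.randn(5, 10), 2, proj)      # reuse proj on new data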
Example #17
def fpoly_mono(X, degree, bias=True):
    """
    Create polynomial features of each individual feature (no cross products).

    Parameters
    ----------
    X : MxN numpy array of data (each row one data point)
    degree : int, the polynomial degree
    bias : bool, include constant feature if true (default)

    Returns
    -------
    Xext : MxN' numpy array with each data point's higher order features
    """
    m, n = twod(X).shape

    if bias:
        Xext = np.zeros((m, n * degree + 1))
        Xext[:, 0] = 1
        k = 1
    else:
        Xext = np.zeros((m, n * degree))
        k = 0

    for p in range(degree):
        for j in range(n):
            Xext[:, k] = np.power(X[:, j], p + 1)
            k += 1

    return Xext
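
# Usage sketch (not in the original; assumes numpy and this module's twod helper are importable):
import numpy as np
X = np.random.randn(10, 3)
Xext = fpoly_mono(X, degree=2)   # columns [1, x1, x2, x3, x1^2, x2^2, x3^2] -> shape (10, 7)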
def kmeans(X, K, init='random', max_iter=100, do_plot=False, to_return=[1, 0, 0]):
	"""
	Perform K-means clustering on data X.

	Parameters
	----------
	X : numpy array
		N x M array containing data to be clustered.
	K : int
		Number of clusters.
	init : str or array (optional)
		Either a K x N numpy array containing initial clusters, or
		one of the following strings that specifies a cluster init
		method: 'random' (K random data points (uniformly) as clusters),
		'farthest' (choose cluster 1 uniformly, then the point farthest
		from all cluster so far, etc.), or 'k++' (choose cluster 1 
		uniformly, then points randomly proportional to distance from
		current clusters).
	max_iter : int (optional)
		Maximum number of optimization iterations.
	do_plot : bool (optional)
		Plot 2D data?
	to_return : [bool] (optional)
		Array of bools that specifies which values to return. The bool
		at to_return[0] indicates whether z should be returned; the bool
		at to_return[1] indicates whether c should be returned, etc.

	Returns
	-------
	z : numpy array
		N x 1 array containing cluster numbers of data at indices in X.
	c : numpy array (optional)
		K x M array of cluster centers.
	sumd : scalar (optional)
		Sum of squared euclidean distances.
		
	TODO: test more
	"""
	n,d = twod(X).shape							# get data size

	if type(init) is str:
		init = init.lower()
		if init == 'random':
			pi = np.random.permutation(n)
			c = X[pi[0:K],:]
		elif init == 'farthest':
			c = k_init(X, K, True)
		elif init == 'k++':
			c = k_init(X, K, False)
		else:
			raise ValueError('kmeans: value for "init" ( ' + init +  ') is invalid')
	else:
		c = init

	z,c,sumd = __optimize(X, n, K, c,  max_iter)

	return optional_return(to_return, z - 1, c, sumd)
Example #19
def checkDataShape(X,Y):
    """
    Simple helper function to convert vectors to matrices and check the shape of
    the data matrices X,Y
    """
    X = twod(X).T if X.ndim < 2 else X
    #Y = twod(Y).T if Y.ndim < 2 else Y
    if X.shape[0] != Y.shape[0]:
        raise ValueError("X and Y do not have the same number of data points!")
    return X,Y
def data_gauss(N0, N1=None, mu0=arr([0, 0]), mu1=arr([1, 1]), sig0=np.eye(2), sig1=np.eye(2)):
	"""
	Sample data from a Gaussian model.  	

	Parameters
	----------
	N0 : int
		Number of data to sample for class -1.
	N1 : int
		Number of data to sample for class 1.
	mu0 : numpy array
	mu1 : numpy array
	sig0 : numpy array
	sig1 : numpy array

	Returns
	-------
	X : numpy array
		Array of sampled data.
	Y : numpy array
		Array of class values that correspond to the data points in X.

	TODO: test more
	"""
	if not N1:
		N1 = N0

	d1,d2 = twod(mu0).shape[1],twod(mu1).shape[1]
	if d1 != d2 or np.any(twod(sig0).shape != arr([d1, d1])) or np.any(twod(sig1).shape != arr([d1, d1])):
		raise ValueError('data_gauss: dimensions should agree')

	X0 = np.dot(np.random.randn(N0, d1), sqrtm(sig0))
	X0 += np.ones((N0,1)) * mu0
	Y0 = -np.ones(N0)

	X1 = np.dot(np.random.randn(N1, d1), sqrtm(sig1))
	X1 += np.ones((N1,1)) * mu1
	Y1 = np.ones(N1)

	X = np.row_stack((X0,X1))
	Y = np.concatenate((Y0,Y1))

	return X,Y
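
# Usage sketch (not in the original; assumes numpy, scipy.linalg.sqrtm, and this module's arr/twod helpers):
import numpy as np
X, Y = data_gauss(50, 50, mu0=np.array([0, 0]), mu1=np.array([2, 2]))   # two Gaussian classes, labels -1 / +1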
Example #21
def plotGauss2D(mu,cov,*args,**kwargs):
    """
    Plot an ellipsoid indicating (one std deviation of) a 2D Gaussian distribution
    All additional arguments are passed into plot(.)
    """
    from scipy.linalg import sqrtm
    theta = np.linspace(0,2*np.pi,50)
    circle = np.array([np.sin(theta),np.cos(theta)])
    ell = sqrtm(cov).dot(circle)
    ell += twod(mu).T

    plt.plot( mu[0],mu[1], 'x', ell[0,:],ell[1,:], **kwargs)
	def predict(self, X):
		"""
		Predict on X. Refer to constructor docstring for description of X.
		"""
		N,D = twod(X).shape
		Y_te = np.zeros((N,1)) + self.const

		for i,l in enumerate(self):
			yi = l.predict(X)									# figure out current prediction value
			# if we already have learners, figure out the prediction on the training data
			Y_te = Y_te + self.alpha[i] * yi			

		return Y_te
Example #23
  def roc(self, X, Y):
    """
    This method computes the "receiver operating characteristic" curve on
    test data.  This method is only defined for binary classifiers. Refer 
    to the auc doc string for descriptions of X and Y. Method returns
      (tpr, fpr, tnr), where
      fpr = false positive rate (1xN numpy vector)
      tpr = true positive rate (1xN numpy vector)
      tnr = true negative rate (1xN numpy vector)
    Plot fpr vs. tpr to see the ROC curve. 
    Plot tpr vs. tnr to see the sensitivity/specificity curve.
    """
    if len(self.classes) > 2:
      raise ValueError('This method only supports binary classification')

    try:                  # compute 'response' (soft binary classification score)
      soft = self.predictSoft(X)[:,1]  # p(class = 2nd)
    except (AttributeError, IndexError):
      soft = self.predict(X)        # or we can use 'hard' binary prediction if soft is unavailable
    n,d = twod(soft).shape

    if n == 1:
      soft = soft.flatten()
    else:
      soft = soft.T.flatten()

    # number of true negatives and positives
    n0 = np.sum(Y == self.classes[0])
    n1 = np.sum(Y == self.classes[1])

    if n0 == 0 or n1 == 0:
      raise ValueError('Data of both class values not found')

    # sort data by score value
    sorted_soft = np.sort(soft)
    indices = np.argsort(soft)

    Y = Y[indices]

    # compute false positives and true positive rates
    tpr = np.divide(np.cumsum(Y[::-1] == self.classes[1]), n1)
    fpr = np.divide(np.cumsum(Y[::-1] == self.classes[0]), n0)
    tnr = np.divide(np.cumsum(Y == self.classes[0]), n0)[::-1]

    # find ties in the sorting score
    same = np.append(np.asarray(sorted_soft[0:-1] == sorted_soft[1:]), 0)
    tpr = np.append([0], tpr[np.logical_not(same)])
    fpr = np.append([0], fpr[np.logical_not(same)])
    tnr = np.append([1], tnr[np.logical_not(same)])
    return [tpr, fpr, tnr]
Example #24
def plotClassify2D(learner, X, Y, pre=lambda x: x, axis=None, nGrid=128, **kwargs):
    """
    Plot data and classifier outputs on two-dimensional data.
    This function plots the data (X,Y) together with the learner's
    predictions. The learner is evaluated on a dense grid covering
    the data X, to show its decision boundary.

    Parameters
    ----------
    learner : learner object
        A trained learner object that inherits from one of
        the 'Classify' or 'Regressor' base classes.
    X : numpy array
        N x M array of data; N = number of data, M = dimension
        (number of features) of data.
    Y : numpy array
        1 x N array containing labels corresponding to data points
        in X.
    pre : function object (optional)
        Function that is applied to X before prediction.
    axis  : a matplotlib axis / plottable object (optional)
    nGrid : density of 2D grid points (default 128)
    """

    if twod(X).shape[1] != 2:
        raise ValueError('plotClassify2D: function can only be called using two-dimensional data (features)')

    # TODO: Clean up code

    if axis is None: axis = plt
    axis.plot( X[:,0],X[:,1], 'k.', visible=False )
    # TODO: can probably replace with final dot plot and use transparency for image (?)
    ax = axis.axis()
    xticks = np.linspace(ax[0],ax[1],nGrid)
    yticks = np.linspace(ax[2],ax[3],nGrid)
    grid = np.meshgrid( xticks, yticks )

    XGrid = np.column_stack( (grid[0].flatten(), grid[1].flatten()) )
    if learner is not None:
        YGrid = learner.predict( pre(XGrid) )
        #axis.contourf( xticks,yticks,YGrid.reshape( (len(xticks),len(yticks)) ), nClasses )
        axis.imshow( YGrid.reshape( (len(xticks),len(yticks)) ), extent=axis.axis(), interpolation='nearest',origin='lower',alpha=0.5 )
    cmap = plt.cm.get_cmap()
    # TODO: if Soft: predictSoft; get colors for each class from cmap; blend pred with colors & show
    #  
    classes = np.unique(Y)
    cvals = (classes - min(classes))/(max(classes)-min(classes)+1e-100)
    for i,c in enumerate(classes): 
        axis.plot( X[Y==c,0],X[Y==c,1], 'ko', color=cmap(cvals[i]), **kwargs )  
	def auc(self, X, Y):
		"""
		This method computes the area under the roc curve on the given test data.
		This method only works on binary classifiers. 

		Parameters
		----------
		X : N x M numpy array 
			N = number of data points; M = number of features. 
		Y : 1 x N numpy array 
			Array of classes that refer to the data points in X.
		"""
		if len(self.classes) > 2:
			raise ValueError('This method only supports binary classification')

		try:									# compute 'response' (soft binary classification score)
			soft = self.predict_soft(X)[:,1]	# p(class = 2nd)
		except (AttributeError, IndexError):	# or we can use 'hard' binary prediction if soft is unavailable
			soft = self.predict(X)

		n,d = twod(soft).shape

		if n == 1:
			soft = soft.flatten()
		else:
			soft = soft.T.flatten()

		sorted_soft = np.sort(soft)				# sort data by score value
		indices = np.argsort(soft)				
		Y = Y[indices]
		# find ties in the sorting score
		same = np.append(np.asarray(sorted_soft[0:-1] == sorted_soft[1:]), 0)

		n = len(soft)
		rnk = self.__compute_ties(n, same)		# compute tied rank values
		
		# number of true negatives and positives
		n0 = sum(Y == self.classes[0])
		n1 = sum(Y == self.classes[1])

		if n0 == 0 or n1 == 0:
			raise ValueError('Data of both class values not found')

		# compute AUC using Mann-Whitney U statistic
		result = (np.sum(rnk[Y == self.classes[1]]) - n1 * (n1 + 1) / 2) / n1 / n0
		return result
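
# Worked sketch (not in the original): the Mann-Whitney view of AUC on four hand-made scores,
# using only numpy; there are no ties here, so the ranks are simply 1..n.
import numpy as np
scores = np.array([0.1, 0.35, 0.4, 0.8])          # already sorted by score
labels = np.array([0, 1, 0, 1])                   # class of each sorted score
rnk = np.arange(1, len(scores) + 1)               # ranks 1..4 (no ties to average)
n1, n0 = labels.sum(), (1 - labels).sum()         # 2 positives, 2 negatives
auc = (rnk[labels == 1].sum() - n1 * (n1 + 1) / 2) / (n1 * n0)   # (6 - 3) / 4 = 0.75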
Example #26
def from1ofK(Y, values=None):
    """
    Function that converts Y from 1-of-K ("1-hot") rep back to single col/row form.

    Parameters
    ----------
    Y : arraylike
        Matrix to convert from 1-of-k rep.
    values : list (optional)
        List that specifies which values to use for which index.

    Returns
    -------
    array
        Y in single row/col form.
    """
    return Y.argmax(1) if not values else twod([values[i] for i in Y.argmax(1)]).T
    def train(self, X, Y, initStep=1.0, stopTol=1e-4, stopIter=5000, plot=None):
        """ Train the logistic regression using stochastic gradient descent """
        ## First do some bookkeeping and setup:
        self.theta,X,Y = twod(self.theta), arr(X), arr(Y)   # convert to numpy arrays
        M,N = X.shape
        if Y.shape[0] != M:
            raise ValueError("Y must have the same number of data (rows) as X")
        self.classes = np.unique(Y)
        if len(self.classes) != 2:
            raise ValueError("Y should have exactly two classes (binary problem expected)")
        if self.theta.shape[1] != N+1:         # if self.theta is empty, initialize it!
            self.theta = np.random.randn(1,N+1)
        # Some useful modifications of the data matrices:
        X1  = np.hstack((np.ones((M,1)),X))    # make data array with constant feature
        Y01 = toIndex(Y, self.classes)         # convert Y to canonical "0 vs 1" classes

        it   = 0
        done = False
        Jsur = []
        J01  = []
        while not done:
            step = (2.0 * initStep) / (2.0 + it)   # common 1/iter step size change

            for i in range(M):  # for each data point i:
                zi = X1[i, :].dot(self.theta.T)       # linear response of X[i,:]
                si = 1.0 / (1.0 + np.exp(-zi))        # soft response si = logistic( zi )
                gradi = (si - Y01[i]) * X1[i, :]      # gradient of logistic loss wrt data point i

                # Take a step down the gradient
                self.theta = self.theta - step * gradi

            # each pass, compute surrogate loss & error rates:
            J01.append( self.err(X,Y) )
            # surrogate loss: average logistic negative log-likelihood,
            #   Jsur = -mean_i [ log(si) if yi==1 else log(1-si) ]
            si = (1.0 / (1.0 + np.exp(-X1.dot(self.theta.T)))).flatten()
            Jsur.append( -np.mean(Y01 * np.log(si + 1e-64) + (1 - Y01) * np.log(1 - si + 1e-64)) )

            ## For debugging: print current parameters & losses
            # print(self.theta, ' => ', Jsur[-1], ' / ', J01[-1])
            # input()   # pause for keystroke

            # check stopping criteria:
            it += 1
            done = (it > stopIter) or ( (it > 1) and (abs(Jsur[-1] - Jsur[-2]) < stopTol) )
Example #28
def from1ofK(Y, values=None):
    """
    Function that converts Y from 1-of-K ("1-hot") rep back to single col/row form.

    Parameters
    ----------
    Y : arraylike
        Matrix to convert from 1-of-k rep.
    values : list (optional)
        List that specifies which values to use for which index.

    Returns
    -------
    array
        Y in single row/col form.
    """
    return Y.argmax(1) if not values else twod(
        [values[i] for i in Y.argmax(1)]).T
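
# Usage sketch (not in the original; assumes numpy and this module's twod helper are importable):
import numpy as np
Y1k = np.array([[0, 1, 0], [1, 0, 0], [0, 0, 1]])
idx = from1ofK(Y1k)                          # -> array([1, 0, 2]) (column index of each 1)
lab = from1ofK(Y1k, values=['a', 'b', 'c'])  # -> column vector of ['b', 'a', 'c']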
def cross_validate(X, Y, n_folds, i_fold):
    """
	Function that splits data for n-fold cross validation.

	Parameters
	----------
	X : numpy array
		N x M numpy array that contains data points.
	Y : numpy array
		1 x N numpy array that contains labels that correspond to
		data points in X.
	n_folds : int
		Total number of data folds.
	i_fold : int
		The fold for which the current call of cross_validate
		will partition.

	Returns
	-------
	to_return : (Xtr,Xte,Ytr,Yte)
		Tuple that contains (in this order) training data from X, testing
		data from X, training labels from Y, and testing labels from Y.
	"""
    Y = arr(Y).flatten()

    nx, dx = twod(X).shape
    ny = len(Y)
    idx = range(nx)

    if ny > 0:
        assert nx == ny, 'cross_validate: X and Y must have the same length'

    n = np.fix(nx / n_folds)

    te_start = int((i_fold - 1) * n)
    te_end = int((i_fold * n)) if (i_fold * n) <= nx else int(nx)
    test_range = list(range(te_start, te_end))
    train_range = sorted(set(idx) - set(test_range))

    to_return = (X[train_range, :], X[test_range, :])
    if ny > 0:
        to_return += (Y[train_range], Y[test_range])

    return to_return
Example #30
def k_init(X, K, determ):
	"""
	Distance based initialization. Randomly choose a start point, then:
	if determ == True: choose point farthest from the clusters chosen so
	far, otherwise: randomly choose new points proportionally to their
	distance.

	Parameters
	----------
	X : numpy array
		See kmeans docstring.
	K : int
		See kmeans docstring.
	determ : bool
		See description.

	Returns
	-------
	c : numpy array
		K x M array of cluster centers.
	"""
	m,n = twod(X).shape
	clusters = np.zeros((K,n))
	clusters[0,:] = X[int(np.floor(np.random.rand() * m)),:]			# take random point as first cluster
	dist = np.sum(np.power((X - np.ones((m,1)) * clusters[0,:]), 2), axis=1).ravel()
	#print 'dist:',dist

	for i in range(1,K):
		#print dist
		#print np.argmax(dist)
		if determ:
			j = np.argmax(dist)									# choose farthest point...
		else:
			pr = np.cumsum(np.array(dist));								# ...or choose a random point by distance
			pr = pr / pr[-1]
			j = np.where(np.random.rand() < pr)[0][0]

		clusters[i,:] = X[j,:]									# update that cluster
		# update min distances
		new_dist = np.sum(np.power((X - np.ones((m,1)) * clusters[i,:]), 2), axis=1).ravel()
		dist = np.minimum(dist, new_dist)
		#print "dist",dist

	return clusters
def cross_validate(X, Y, n_folds, i_fold):
    """
	Function that splits data for n-fold cross validation.

	Parameters
	----------
	X : numpy array
		N x M numpy array that contains data points.
	Y : numpy array
		1 x N numpy array that contains labels that correspond to
		data points in X.
	n_folds : int
		Total number of data folds.
	i_fold : int
		The fold for which the current call of cross_validate
		will partition.

	Returns
	-------
	to_return : (Xtr,Xte,Ytr,Yte)
		Tuple that contains (in this order) training data from X, testing
		data from X, training labels from Y, and testing labels from Y.
	"""
    Y = arr(Y).flatten()

    nx, dx = twod(X).shape
    ny = len(Y)
    idx = range(nx)

    if ny > 0:
        assert nx == ny, "cross_validate: X and Y must have the same length"

    n = np.fix(nx / n_folds)

    te_start = int((i_fold - 1) * n)
    te_end = int((i_fold * n)) if (i_fold * n) <= nx else int(nx)
    test_range = list(range(te_start, te_end))
    train_range = sorted(set(idx) - set(test_range))

    to_return = (X[train_range, :], X[test_range, :])
    if ny > 0:
        to_return += (Y[train_range], Y[test_range])

    return to_return
Example #32
def k_init(X, K, determ):
	"""
	Distance based initialization. Randomly choose a start point, then:
	if determ == True: choose point farthest from the clusters chosen so
	far, otherwise: randomly choose new points proportionally to their
	distance.

	Parameters
	----------
	X : numpy array
		See kmeans docstring.
	K : int
		See kmeans docstring.
	determ : bool
		See description.

	Returns
	-------
	c : numpy array
		K x M array of cluster centers.
	"""
	m,n = twod(X).shape
	clusters = np.zeros((K,n))
	clusters[0,:] = X[int(np.floor(np.random.rand() * m)),:]			# take random point as first cluster
	dist = np.sum(np.power((X - np.ones((m,1)) * clusters[0,:]), 2), axis=1).ravel()
	#print 'dist:',dist

	for i in range(1,K):
		#print dist
		#print np.argmax(dist)
		if determ:
			j = np.argmax(dist)									# choose farthest point...
		else:
			pr = np.cumsum(np.array(dist));								# ...or choose a random point by distance
			pr = pr / pr[-1]
			j = np.where(np.random.rand() < pr)[0][0]

		clusters[i,:] = X[j,:]									# update that cluster
		# update min distances
		new_dist = np.sum(np.power((X - np.ones((m,1)) * clusters[i,:]), 2), axis=1).ravel()
		dist = np.minimum(dist, new_dist)
		#print "dist",dist

	return clusters
Example #33
  def auc(self, X, Y):
    """
    This method computes the area under the roc curve on the given test data.
    This method only works on binary classifiers. 

    Parameters
    ---------
    X : M x N numpy array 
      M = number of data points; N = number of features. 
    Y : M x 1 numpy array 
      Array of classes (targets) corresponding to the data points in X.
    """
    if len(self.classes) != 2:
      raise ValueError('This method only supports binary classification')

    try:                  # compute 'response' (soft binary classification score)
      soft = self.predictSoft(X)[:,1]  # p(class = 2nd)
    except (AttributeError, IndexError):  # or we can use 'hard' binary prediction if soft is unavailable
      soft = self.predict(X)

    n,d = twod(soft).shape             # ensure soft is the correct shape
    soft = soft.flatten() if n==1 else soft.T.flatten()

    indices = np.argsort(soft)         # sort data by score value
    Y = Y[indices]
    sorted_soft = soft[indices]

    # compute rank (averaged for ties) of sorted data
    dif = np.hstack( ([True],np.diff(sorted_soft)!=0,[True]) )
    r1  = np.argwhere(dif).flatten()
    r2  = r1[0:-1] + 0.5*(r1[1:]-r1[0:-1]) + 0.5
    rnk = r2[np.cumsum(dif[:-1])-1]

    # number of true negatives and positives
    n0,n1 = sum(Y == self.classes[0]), sum(Y == self.classes[1])

    if n0 == 0 or n1 == 0:
      raise ValueError('Data of both class values not found')

    # compute AUC using Mann-Whitney U statistic
    result = (np.sum(rnk[Y == self.classes[1]]) - n1 * (n1 + 1) / 2) / n1 / n0
    return result
Example #34
    def auc(self, X, Y):
        """Compute the area under the roc curve on the given test data.

        Args:
                X (arr): M,N array of M data points with N features each
                Y (arr): M, or M,1 array of target class values for each data point

        Returns:
                float: Area under the ROC curve

        This method only works on binary classifiers.
        """
        if len(self.classes) != 2:
            raise ValueError('This method only supports binary classification')

        try:                                    # compute 'response' (soft binary classification score)
            soft = self.predictSoft(X)[:,1]    # p(class = 2nd)
        except (AttributeError, IndexError):    # or we can use 'hard' binary prediction if soft is unavailable
            soft = self.predict(X)

        n,d = twod(soft).shape                         # ensure soft is the correct shape
        soft = soft.flatten() if n==1 else soft.T.flatten()

        indices = np.argsort(soft)                 # sort data by score value
        Y = Y[indices]
        sorted_soft = soft[indices]

        # compute rank (averaged for ties) of sorted data
        dif = np.hstack( ([True],np.diff(sorted_soft)!=0,[True]) )
        r1 = np.argwhere(dif).flatten()
        r2 = r1[0:-1] + 0.5*(r1[1:]-r1[0:-1]) + 0.5
        rnk = r2[np.cumsum(dif[:-1])-1]

        # number of true negatives and positives
        n0,n1 = sum(Y == self.classes[0]), sum(Y == self.classes[1])

        if n0 == 0 or n1 == 0:
            raise ValueError('Data of both class values not found')

        # compute AUC using Mann-Whitney U statistic
        result = (np.sum(rnk[Y == self.classes[1]]) - n1 * (n1 + 1.0) / 2.0) / n1 / n0
        return result
Example #35
    def predictSoft(self, X):
        """
        This method makes a "soft" linear classification predition on the data
        Uses a (multi)-logistic function to convert linear response to [0,1] confidence

        Parameters
        ----------
        X : M x N numpy array 
            M = number of testing instances; N = number of features.  
        """
        theta, X = twod(self.theta), arr(X)  # convert to numpy if needed
        resp = theta[:, 0].T + X.dot(theta[:, 1:].T)  # linear response (MxC)
        prob = np.exp(resp)
        if resp.shape[1] == 1:  # binary classification (C=1)
            prob /= prob + 1.0  # logistic transform (binary classification; C=1)
            prob = np.hstack((1 - prob, prob))  # make a column for each class
        else:
            prob /= np.sum(prob, axis=1, keepdims=True)  # normalize each row (for multi-class)

        return prob
Example #36
def crossValidate(X, Y=None, K=5, i=0):
    """
    Create a K-fold cross-validation split of a data set:
    crossValidate(X,Y, 5, i) : return the ith of 5 80/20 train/test splits

    Parameters
    ----------
    X : MxN numpy array of data points to be resampled.
    Y : Mx1 numpy array of labels associated with each datum (optional)
    K : number of folds of cross-validation
    i : current fold to return (0...K-1)

    Returns
    -------
    Xtr,Xva,Ytr,Yva : (tuple of) numpy arrays for the split data set
    If Y is not present or None, returns only Xtr,Xva
    """
    nx, dx = twod(X).shape
    start = round(float(nx) * i / K)
    end = round(float(nx) * (i + 1) / K)

    Xte = X[start:end, :]
    Xtr = np.vstack((X[0:start, :], X[end:, :]))
    to_return = (Xtr, Xte)

    Y = arr([]) if Y is None else arr(Y)    # handle optional Y (may be None)
    ny = len(Y)

    if ny > 0:
        assert ny == nx, 'crossValidate: X and Y must have the same length'
        if Y.ndim <= 1:
            Yte = Y[start:end]
            Ytr = np.hstack((Y[0:start], Y[end:]))
        else:  # in case targets are multivariate
            Yte = Y[start:end, :]
            Ytr = np.vstack((Y[0:start, :], Y[end:, :]))
        to_return += (Ytr, Yte)

    return to_return
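
# Usage sketch (not in the original; assumes numpy and this module's twod/arr helpers are importable):
import numpy as np
X = np.random.randn(25, 2)
Y = np.random.randint(0, 2, 25)
for i in range(5):                                    # 5-fold cross-validation, folds i = 0..4
    Xtr, Xva, Ytr, Yva = crossValidate(X, Y, 5, i)
    # ... train on (Xtr, Ytr), evaluate on (Xva, Yva) ...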
def fpoly(X, degree, bias=True):
    """
    Create expanded polynomial features of up to a given degree.

    Parameters
    ----------
    X : MxN numpy array of data (each row one data point)
    degree : int, the polynomial degree
    bias : bool, include constant feature if true (default)

    Returns
    -------
    Xext : MxN' numpy array with each data point's higher order features
    """
    n, m = twod(X).shape

    if (degree + 1)**(m) > 1e7:
        err_string = 'fpoly: {}**{} = too many potential output features'.format(
            degree + 1, m)
        raise ValueError(err_string)

    if m == 1:  # faster shortcut for scalar data
        p = arr(range(0, degree + 1))
        Xext = np.power(np.tile(X, (1, len(p))), np.tile(p, (n, 1)))
    else:
        K = 0
        for i in range((degree + 1)**(m)):
            powers = np.unravel_index(i, (degree + 1, ) * m)
            if sum(powers) > degree: continue
            K += 1
        Xext = np.zeros((n, K))
        k = 0
        for i in range((degree + 1)**(m)):
            powers = np.unravel_index(i, (degree + 1, ) * m)
            if sum(powers) > degree: continue
            Xext[:, k] = np.prod(X**list(powers), axis=1)
            k += 1

    return Xext if bias else Xext[:, 1:]
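
# Usage sketch (not in the original; assumes numpy and this module's arr/twod helpers are importable):
import numpy as np
X = np.random.randn(10, 2)
Xext = fpoly(X, 2)   # every monomial of total degree <= 2 (including the constant): shape (10, 6)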
Example #38
def bootstrapData(X, Y=None, n_boot=None):
    """
    Bootstrap resample a data set (with replacement): 
    draw data points (x_i,y_i) from (X,Y) n_boot times.

    Parameters
    ----------
    X : MxN numpy array of data points to be resampled.
    Y : Mx1 numpy array of labels associated with each datum (optional)
    n_boot : int, number of samples to draw (default: M)

    Returns
    -------
    Xboot, Yboot : (tuple of) numpy arrays for the resampled data set
    If Y is not present or None, returns only Xboot (non-tuple)
    """
    nx, dx = twod(X).shape
    if n_boot is None: n_boot = nx
    idx = np.floor(np.random.rand(n_boot) * nx).astype(int)
    if Y is None: return X[idx, :]
    Y = Y.flatten()
    assert nx == len(Y), 'bootstrapData: X and Y should have the same length'
    return (X[idx, :], Y[idx])
Example #39
def fpoly_mono(X, degree, use_constant=True):
    """
	Create polynomial features of each individual feature (no cross products).

	Parameters
	----------
	X : numpy array
		N x M array of data.
	degree : int
		The degree.
	use_constant : bool (optional)
		If True (default), include a constant feature.

	Returns
	-------
	Xext : numpy array
		N x M * degree (+ 1 if use_constant) array of polynomial
		features from X.
		
	TODO: test more
	"""
    m, n = twod(X).shape

    if use_constant:
        Xext = np.zeros((m, n * degree + 1))
        Xext[:, 0] = 1
        k = 1
    else:
        Xext = np.zeros((m, n * degree))
        k = 0

    for p in range(degree):
        for j in range(n):
            Xext[:, k] = np.power(X[:, j], p + 1)
            k += 1

    return Xext
Example #40
def gmm_draw_params(m, c, n=2, scale=.05):
    """Create a random Gaussian mixture model.  

    Builds a random GMM with C components and draws M data x^{(i)} from a mixture
    of Gaussians in D dimensions

    Args:
	    m (int): Number of data intended to be drawn from the mixture (not used
	        by this function, which returns only the mixture parameters).
	    c (int): Number of clusters.
	    n (int): Number of dimensions.
	    scale (float): relative scale of the inter- to intra-cluster variance (small = clumpy)

    Returns:
       tuple of tuples, (pi,mu,sig) = mixture weight, mean, and covariance of each component
    """
    pi = np.zeros(c)
    for cc in range(c):
        pi[cc] = gamrand(10, 0.5)
    pi = pi / np.sum(pi)

    rho = np.random.rand(n, n)
    rho = rho + twod(rho).T
    rho = rho + n * np.eye(n)
    rho = sqrtm(rho)

    mu = np.random.randn(c, n).dot(rho)

    ccov = []
    for i in range(c):
        tmp = np.random.rand(n, n)
        tmp = tmp + tmp.T
        tmp = scale * (tmp + n * np.eye(n))
        ccov.append(tmp)
    #print(pi,mu,ccov)

    return tuple((pi[cc], mu[cc], ccov[cc]) for cc in range(c))
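
# Usage sketch (not in the original; assumes numpy, sqrtm, and the module's gamrand/twod helpers):
import numpy as np
comps = gmm_draw_params(m=500, c=3, n=2)      # 3 components: (weight, mean, covariance) each
weights = np.array([w for w, _, _ in comps])
ks = np.random.choice(len(comps), size=500, p=weights)   # sample data from the mixture by hand
X = np.array([np.random.multivariate_normal(comps[k][1], comps[k][2]) for k in ks])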
def split_data(X, Y, train_fraction):
    """
	Split data into training and test data.

	Parameters
	----------
	X : numpy array
		N x M array of data to split.
	Y : numpy array
		1 x N array of labels that correspond to data in X.
	train_fraction : float
		Fraction of data to use for training.

	Returns
	-------
	to_return : (Xtr,Xte,Ytr,Yte) or (Xtr,Xte)
		A tuple containing the following arrays (in order): training
		data from X, testing data from X, training labels from Y
		(if Y contains data), and testing labels from Y (if Y 
		contains data).
	"""
    nx, dx = twod(X).shape
    ne = round(train_fraction * nx)

    Xtr, Xte = X[:ne, :], X[ne:, :]
    to_return = (Xtr, Xte)

    Y = arr(Y).flatten()
    ny = len(Y)

    if ny > 0:
        assert ny == nx, 'split_data: X and Y must have the same length'
        Ytr, Yte = Y[:ne], Y[ne:]
        to_return += (Ytr, Yte)

    return to_return
def fhash(X, K, hash=None):
    """
    Random hash of features from data. Selects a fixed or random hash of features
    from X.

    Parameters
    ----------
    X : numpy array
        M x N numpy array containing data.
    K : int
        Number of features to select.
    hash : function object (optional)
        Hash function to use. If provided, 'hash' uses fixed hash.

    Returns
    -------
    Z : numpy array
        M x K array of hashed features of X.
    hash : hash function (optional)
        Hash function used to hash features. Only returned if 'hash' argument
        isn't provided.
    """
    to_return = ()

    n, m = twod(X).shape

    if hash is None:
        buckets = np.floor(np.random.rand(m) * K).astype(int)  # fixed random bucket per feature (consistent across calls)
        hash = lambda i: buckets[i]
        to_return = (hash, )

    # do the hashing
    Z = np.zeros((n, K))
    for i in range(m):
        Z[:, hash(i)] = Z[:, hash(i)] + X[:, i]

    return Z if len(to_return) == 0 else (Z, ) + to_return
Example #43
def agglomerative(X, K, method='means', join=None):
    """
	Perform hierarchical agglomerative clustering.

	Parameters
	----------
	X : numpy array
		N x M array of Data to be clustered.
	K : int
		The number of clusters into which data should be grouped.
	method : str (optional)
		str that specifies the method to use for calculating distance between
		clusters. Can be one of: 'min', 'max', 'means', or 'average'.
	join : numpy array (optional)
		N - 1 x 3 array that contains a sequence of joining operations. Pass to
		avoid reclustering when re-cutting the same X at a different K.

	Returns (tuple)
	-------
	z    : N x 1 array of cluster assignments.
	join : N - 1 x 3 array that contains the sequence of joining operations 
		peformed by the clustering algorithm.
	"""
    m, n = twod(X).shape  # get data size
    D = np.zeros(
        (m, m)
    ) + np.inf  # store pairwise distances b/w clusters (D is an upper triangular matrix)
    z = arr(range(m))  # assignments of data
    num = np.ones(m)  # number of data in each cluster
    mu = arr(X)  # centroid of each cluster
    method = method.lower()

    if type(join) == type(None):  # if join not precomputed

        join = np.zeros((m - 1, 3))  # keep track of join sequence
        # use standard Euclidean distance
        dist = lambda a, b: np.sum(np.power(a - b, 2))
        for i in range(m):  # compute initial distances
            for j in range(i + 1, m):
                D[i][j] = dist(X[i, :], X[j, :])

        opn = np.ones(m)  # store list of clusters still in consideration
        val, k = np.min(D), np.argmin(
            D)  # find first join (closest cluster pair)

        for c in range(m - 1):
            i, j = np.unravel_index(k, D.shape)
            join[c, :] = arr([i, j, val])

            # centroid of new cluster
            mu_new = (num[i] * mu[i, :] + num[j] * mu[j, :]) / (num[i] +
                                                                num[j])

            # compute new distances to cluster i
            for jj in np.where(opn)[0]:
                if jj in [i, j]:
                    continue

                # sort indices because D is an upper triangluar matrix
                idxi = tuple(sorted((i, jj)))
                idxj = tuple(sorted((j, jj)))

                if method == 'min':
                    D[idxi] = min(D[idxi],
                                  D[idxj])  # single linkage (min dist)
                elif method == 'max':
                    D[idxi] = max(D[idxi],
                                  D[idxj])  # complete linkage (max dist)
                elif method == 'means':
                    D[idxi] = dist(
                        mu_new, mu[jj, :])  # mean linkage (dist b/w centroids)
                elif method == 'average':
                    # average linkage
                    D[idxi] = (num[i] * D[idxi] + num[j] * D[idxj]) / (num[i] +
                                                                       num[j])

            opn[j] = 0  # close cluster j (fold into i)
            num[i] = num[i] + num[
                j]  # update total membership in cluster i to include j
            mu[i, :] = mu_new  # update centroid list

            # remove cluster j from consideration as min
            for ii in range(m):
                if ii != j:
                    # sort indices because D is an upper triangular matrix
                    idx = tuple(sorted((ii, j)))
                    D[idx] = np.inf

            val, k = np.min(D), np.argmin(D)  # find next smallest pair

    # compute cluster assignments given sequence of joins
    for c in range(m - K):
        z[z == join[c, 1]] = join[c, 0]

    uniq = np.unique(z)
    for c in range(len(uniq)):
        z[z == uniq[c]] = c

    return z, join
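
# Usage sketch (not in the original; assumes numpy and this module's arr/twod helpers are importable):
import numpy as np
X = np.random.randn(30, 2)
z3, join = agglomerative(X, 3)                # cluster into 3 groups, keep the join sequence
z5, _ = agglomerative(X, 5, join=join)        # re-cut the same dendrogram at 5 clusters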
Example #44
def kmeans(X, K, init='random', max_iter=100):
    """
	Perform K-means clustering on data X.

	Parameters
	----------
	X : numpy array
		N x M array containing data to be clustered.
	K : int
		Number of clusters.
	init : str or array (optional)
		Either a K x N numpy array containing initial clusters, or
		one of the following strings that specifies a cluster init
		method: 'random' (K random data points (uniformly) as clusters),
		'farthest' (choose cluster 1 uniformly, then the point farthest
		from all cluster so far, etc.), or 'k++' (choose cluster 1 
		uniformly, then points randomly proportional to distance from
		current clusters).
	max_iter : int (optional)
		Maximum number of optimization iterations.

	Returns (as tuple)
	-------
	z    : N x 1 array containing cluster numbers of data at indices in X.
	c    : K x M array of cluster centers.
	sumd : (scalar) sum of squared euclidean distances.
	"""
    n, d = twod(X).shape

    # First, initialize the clusters to something:
    if type(init) is str:
        init = init.lower()
        if init == 'random':
            pi = np.random.permutation(n)
            c = X[pi[0:K], :]
        elif init == 'farthest':
            c = k_init(X, K, True)
        elif init == 'k++':
            c = k_init(X, K, False)
        else:
            raise ValueError('kmeans: value for "init" ( ' + init +
                             ') is invalid')
    else:
        c = init

    # Now, optimize the objective using coordinate descent:
    iter = 1
    done = (iter > max_iter)
    sumd = np.inf
    sum_old = np.inf

    z = np.zeros((n, ))
    #print c

    while not done:
        sumd = 0

        for i in range(n):
            # compute distances for each cluster center
            dists = np.sum((c - twod(X[i, :]))**2, axis=1)
            #dists = np.sum(np.power((c - np.tile(X[i,:], (K,1))), 2), axis=1)
            val = np.min(dists, axis=0)  # assign datum i to nearest cluster
            z[i] = np.argmin(dists, axis=0)
            sumd = sumd + val

        #print z
        for j in range(K):  # now update each cluster center j...
            if np.any(z == j):
                c[j, :] = np.mean(
                    X[(z == j).flatten(), :],
                    0)  # ...to be the mean of the assigned data...
            else:
                c[j, :] = X[int(np.floor(np.random.rand() * n)), :]  # ...or random restart if no assigned data

        done = (iter > max_iter) or (sumd == sum_old)
        sum_old = sumd
        iter += 1

    return z, c, sumd
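
# Usage sketch (not in the original; assumes numpy and this module's twod helper; k_init is defined above):
import numpy as np
X = np.random.randn(60, 2)
z, c, sumd = kmeans(X, 3, init='k++')         # assignments, 3 x 2 cluster centers, total squared distance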
Example #45
def fpoly_pair(X, degree, use_constant=True):
    """
	Create polynomial features of each individual feature (too many cross 
	products).

	Parameters
	----------
	X : numpy array
		N x M array of data.
	degree : int
		The degree.
	use_constant : bool (optional)
		If True (default), include a constant feature.

	Returns
	-------
	Xext : numpy array

	TODO: test more
	"""
    m, n = twod(X).shape

    npoly = int(np.ceil(
        (n**(degree + 1) - 1) / (n - 1)))  # ceil (as int) to fix possible roundoff error
    if use_constant:
        Xext = np.zeros((m, npoly))
        Xext[:, 0] = 1
        Xcur = 1
        k = 1
    else:
        Xext = np.zeros((m, npoly - 1))
        Xcur = 1
        k = 0

    # hard coded to be a shorter length
    if degree == 2:
        Xext[:, k:k + n] = X
        k += n
        Z = np.reshape(X, (m, 1, n))
        X2 = np.zeros((m, 1))
        for i in range(twod(Z).shape[2]):
            X2 = cols((X2, X * Z[:, :, i]))
        X2 = X2[:, 1:]
        idx = np.where((twod(arr(range(1, n + 1))).T >= arr(range(
            1, n + 1))).T.ravel())[0]
        K = len(idx)
        Xext[:, k:k + K] = X2[:, idx]
        return Xext[:, 0:k + K]

    for p in range(degree):

        # workaround to make up for numpy's lack of bsxfun
        if type(Xcur) is int:
            Xcur = X * Xcur
        else:
            new_Xcur = np.zeros((m, 1))
            for i in range(Xcur.shape[2]):
                new_Xcur = cols((new_Xcur, X * Xcur[:, :, i]))
            Xcur = new_Xcur[:, 1:]

        Xcur = Xcur.reshape((m, np.size(Xcur) // m))
        K = Xcur.shape[1]
        Xext[:, k:k + K] = Xcur
        k = k + K
        Xcur = Xcur.reshape((m, 1, K))

    return Xext
    def train(self,
              X,
              Y,
              init='zeros',
              stepsize=.01,
              stopTol=1e-4,
              stopIter=5000):
        """Train the neural network.

        Args:
          X : MxN numpy array containing M data points with N features each
          Y : Mx1 numpy array of targets (class labels) for each data point in X
          sizes : [Nin, Nh1, ... , Nout]
              Nin is the number of features, Nout is the number of outputs,
              which is the number of classes. Member weights are {W1, ... , WL-1},
              where W1 is Nh1 x Nin, etc.
          init : str
              'none', 'zeros', or 'random'.  inits the neural net weights.
          stepsize : scalar
              The stepsize for gradient descent (decreases as 1 / iter).
          stopTol : scalar
              Tolerance for stopping criterion.
          stopIter : int
              The maximum number of steps before stopping.
          activation : str
              'logistic', 'htangent', or 'custom'. Sets the activation functions.

        """
        if self.wts[0].shape[1] - 1 != len(X[0]):
            raise ValueError(
                'layer[0] must equal the number of columns of X (number of features)'
            )

        self.classes = self.classes if len(self.classes) else np.unique(Y)

        if len(self.classes) != self.wts[-1].shape[
                0]:  # and (self.wts[-1].shape[0]!=1 or len(self.classes)!=2):
            raise ValueError(
                'layers[-1] must equal the number of classes in Y, or 1 for binary Y'
            )

        M, N = mat(X).shape  # M = number of data points, N = number of features (dim of data)
        C = len(self.classes)  # number of classes
        L = len(self.wts)  # get number of layers

        Y_tr_k = to1ofK(Y, self.classes)  # convert Y to 1-of-K format

        # outer loop of stochastic gradient descent
        it = 1  # iteration number
        nextPrint = 1  # next time to print info
        done = 0  # end of loop flag
        J01, Jsur = [], []  # misclassification rate & surrogate loss values

        while not done:
            step_i = float(
                stepsize) / it  # step size evolution; classic 1/t decrease

            # stochastic gradient update (one pass)
            for j in range(M):
                A, Z = self.__responses(twod(
                    X[j, :]))  # compute all layers' responses, then backprop
                delta = (Z[L] - Y_tr_k[j, :]) * arr(self.dSig0(
                    Z[L]))  # take derivative of output layer

                for l in range(L - 1, -1, -1):
                    grad = delta.T.dot(
                        Z[l])  # compute gradient on current layer wts
                    delta = delta.dot(self.wts[l]) * arr(self.dSig(
                        Z[l]))  # propagate gradient down
                    delta = delta[:, 1:]  # discard constant feature
                    self.wts[
                        l] -= step_i * grad  # take gradient step on current layer wts

            J01.append(self.err_k(X, Y_tr_k))  # error rate (classification)
            Jsur.append(self.mse_k(X, Y_tr_k))  # surrogate (mse on output)

            if it >= nextPrint:
                print('it {} : Jsur = {}, J01 = {}'.format(
                    it, Jsur[-1], J01[-1]))
                nextPrint *= 2

            # check if finished
            done = (it > 1) and (np.abs(Jsur[-1] - Jsur[-2]) <
                                 stopTol) or it >= stopIter
            it += 1
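
# The loop above trains against Y_tr_k = to1ofK(Y, self.classes).  A minimal sketch
# of what that 1-of-K (one-hot) conversion is assumed to do; this is illustrative,
# not the library's exact implementation:
import numpy as np

def to_one_hot(Y, classes):
    """Return an (M, C) 0/1 array with a single 1 per row, in the label's column."""
    Y = np.asarray(Y).flatten()
    classes = list(classes)
    K = np.zeros((len(Y), len(classes)))
    for i, y in enumerate(Y):
        K[i, classes.index(y)] = 1.0
    return K

# to_one_hot([0, 2, 1], classes=[0, 1, 2]) -> rows [1,0,0], [0,0,1], [0,1,0]
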
    if Y is not None:
        Y = arr(Y).flatten()
        ny = len(Y)
        if ny > 0:
            assert ny == nx, 'splitData: X and Y must have the same length'
            Ytr, Yte = Y[:ne], Y[ne:]
            to_return += (Ytr, Yte)

    return to_return
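
# Only the tail of splitData is visible above.  For reference, a minimal sketch of a
# fraction-based split with the same (Xtr, Xte[, Ytr, Yte]) return convention; this
# is an assumption about the missing portion, not the library's exact code:
import numpy as np

def split_data_sketch(X, Y=None, train_fraction=0.8):
    X = np.asarray(X)
    nx = X.shape[0]
    ne = int(round(train_fraction * nx))      # number of training examples
    to_return = (X[:ne], X[ne:])
    if Y is not None:
        Y = np.asarray(Y).flatten()
        assert len(Y) == nx, 'splitData: X and Y must have the same length'
        to_return += (Y[:ne], Y[ne:])
    return to_return
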


if __name__ == "__main__":
    amazon = np.genfromtxt("amazon_cells_labelled.txt",
                           delimiter="\t", dtype=None, encoding='utf-8')
    # amazon = [(string,0),(string,1)....]
    tupleX, tupleY = zip(*amazon)
    #tupleX = (string1, string2, ...)
    #tupleY = (1,0,1,1,0,...)
    X = np.array(tupleX)
    Y = np.array(tupleY)
    X = twod(X).T
    Xtr, Xte, Ytr, Yte = splitData(X, Y, .75)
    """
    model = Sequential()
    model.add(Dense(256,activation='relu',input_shape=(999,),W_regularizer=l2(0.001)))
    model.add(Dense(256,activation='relu'))
    model.add(Dense(10,activation='softmax'))
    model.summary()
    model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
    """
Beispiel #48
def plotClassify2D(learner,
                   X,
                   Y,
                   pre=lambda x: x,
                   axis=None,
                   nGrid=128,
                   **kwargs):
    """
    Plot data and classifier outputs on two-dimensional data.
    This function plots the data (X, Y) together with the learner's
    predictions. The learner is evaluated on a dense grid covering the
    data X, to show its decision boundary.

    Parameters
    ----------
    learner : learner object
        A trained learner object that inherits from one of
        the 'Classify' or 'Regressor' base classes.
    X : numpy array
        N x M array of data; N = number of data, M = dimension
        (number of features) of data.
    Y : numpy array
        1 x N array containing labels corresponding to data points
        in X.
    pre : function object (optional)
        Function that is applied to X before prediction.
    axis  : a matplotlib axis / plottable object (optional)
    nGrid : density of 2D grid points (default 128)
    """

    if twod(X).shape[1] != 2:
        raise ValueError(
            'plotClassify2D: function can only be called using two-dimensional data (features)'
        )

    # TODO: Clean up code

    if axis is None: axis = plt
    # hld = axis.ishold();
    # axis.hold(True);
    axis.plot(X[:, 0], X[:, 1], 'k.', visible=False)
    # TODO: can probably replace with final dot plot and use transparency for image (?)
    ax = axis.axis()
    xticks = np.linspace(ax[0], ax[1], nGrid)
    yticks = np.linspace(ax[2], ax[3], nGrid)
    grid = np.meshgrid(xticks, yticks)

    XGrid = np.column_stack((grid[0].flatten(), grid[1].flatten()))
    if learner is not None:
        YGrid = learner.predict(pre(XGrid))
        #axis.contourf( xticks,yticks,YGrid.reshape( (len(xticks),len(yticks)) ), nClasses )
        axis.imshow(YGrid.reshape((len(xticks), len(yticks))),
                    extent=ax,
                    interpolation='nearest',
                    origin='lower',
                    alpha=0.5,
                    aspect='auto')
    cmap = plt.cm.get_cmap()
    # TODO: if Soft: predictSoft; get colors for each class from cmap; blend pred with colors & show
    #
    try:
        classes = np.array(learner.classes)
    except Exception:
        classes = np.unique(Y)
    cvals = (classes - min(classes)) / (max(classes) - min(classes) + 1e-100)
    for i, c in enumerate(classes):
        axis.plot(X[Y == c, 0],
                  X[Y == c, 1],
                  'ko',
                  color=cmap(cvals[i]),
                  **kwargs)
    axis.axis(ax)
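
# A minimal usage sketch for plotClassify2D above.  The tiny nearest-mean classifier
# below is purely illustrative (it only needs .predict and .classes); the data are
# made up:
import numpy as np

class NearestMeanSketch:
    def __init__(self, X, Y):
        self.classes = np.unique(Y)
        self.means = np.array([X[Y == c].mean(axis=0) for c in self.classes])
    def predict(self, X):
        d = ((X[:, None, :] - self.means[None, :, :]) ** 2).sum(axis=2)
        return self.classes[np.argmin(d, axis=1)]

# X2 = np.random.randn(100, 2)
# Y2 = (X2[:, 0] + X2[:, 1] > 0).astype(int)
# plotClassify2D(NearestMeanSketch(X2, Y2), X2, Y2)
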
Beispiel #49
def gmmEM(X, K, init='random', max_iter=100, tol=1e-6):
    """
	Perform Gaussian mixture EM (expectation-maximization) clustering on data X.

	Parameters
	----------
	X : numpy array
		N x M array containing data to be clustered.
	K : int
		Number of clusters.
	init : str or array (optional)
		Either a K x M numpy array containing initial cluster centers, or
		one of the following strings specifying an initialization method:
			'random'   (choose K data points uniformly at random as centers)
			'farthest' (choose center 1 uniformly, then repeatedly the point
			            farthest from all centers chosen so far)
			'k++'      (choose center 1 uniformly, then further points randomly,
			            with probability proportional to distance from the
			            current centers)
	max_iter : int (optional)
		Maximum number of iterations.
	tol : scalar (optional)
		Stopping tolerance.

	Returns
	-------
	z    : 1 x N numpy array of cluster assignments (int indices).
	T    : {'pi': np.array, 'mu': np.array, 'sig': np.array} : Gaussian component parameters
	soft : numpy array; soft assignment probabilities (rounded for assign)
	ll   : float; Log-likelihood under the returned model.
	"""
    # init
    N, D = twod(X).shape  # get data size

    if type(init) is str:
        init = init.lower()
        if init == 'random':
            pi = np.random.permutation(N)
            mu = X[pi[0:K], :]
        elif init == 'farthest':
            mu = k_init(X, K, True)
        elif init == 'k++':
            mu = k_init(X, K, False)
        else:
            raise ValueError('gmmEM: value for "init" ( ' + init +
                             ') is invalid')
    else:
        mu = init

    sig = np.zeros((D, D, K))
    for c in range(K):
        sig[:, :, c] = np.eye(D)
    alpha = np.ones(K) / K
    R = np.zeros((N, K))

    iter, ll, ll_old = 1, np.inf, np.inf
    done = iter > max_iter
    C = np.log(2 * np.pi) * D / 2

    while not done:
        ll = 0
        for c in range(K):
            # compute log prob of all data under model c
            V = X - np.tile(mu[c, :], (N, 1))
            R[:, c] = -0.5 * np.sum(
                (V.dot(np.linalg.inv(sig[:, :, c]))) * V,
                axis=1) - 0.5 * np.log(np.linalg.det(sig[:, :, c])) + np.log(
                    alpha[c]) - C

        # avoid numerical issues by subtracting the per-row max first
        mx = R.max(1)
        R -= np.tile(twod(mx).T, (1, K))
        # exponentiate and compute sum over components
        R = np.exp(R)
        nm = R.sum(1)
        # update log-likelihood of data
        ll = np.sum(np.log(nm) + mx)
        R /= np.tile(twod(nm).T,
                     (1, K))  # normalize to give membership probabilities

        alpha = R.sum(0)  # total weight for each component
        for c in range(K):
            # weighted mean estimate
            mu[c, :] = (R[:, c] / alpha[c]).T.dot(X)
            tmp = X - np.tile(mu[c, :], (N, 1))
            # weighted covar estimate
            sig[:, :, c] = tmp.T.dot(
                tmp * np.tile(twod(R[:, c]).T / alpha[c],
                              (1, D))) + 1e-32 * np.eye(D)
        alpha /= N

        # stopping criteria
        done = (iter >= max_iter) or np.abs(ll - ll_old) < tol
        ll_old = ll
        iter += 1

    z = from1ofK(R)
    soft = R
    T = {'pi': alpha, 'mu': mu, 'sig': sig}

    return z, T, soft, ll
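
# A short usage sketch for gmmEM above (the two-blob data are made up; assumes numpy):
import numpy as np

# X_demo = np.vstack((np.random.randn(100, 2) + [3.0, 3.0],
#                     np.random.randn(100, 2) - [3.0, 3.0]))
# z, T, soft, ll = gmmEM(X_demo, K=2, init='random', max_iter=100, tol=1e-6)
# print(T['pi'], T['mu'])       # mixture weights and component means
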
Beispiel #50
def data_GMM(N, C, D=2, get_Z=False):
    """
	Sample data from a Gaussian mixture model.  Draws N data x_i from a mixture
	of Gaussians, with C clusters in D dimensions.

	Parameters
	----------
	N : int
		Number of data to be drawn from a mixture of Gaussians.
	C : int
		Number of clusters.
	D : int
		Number of dimensions.
	get_Z : bool
		If True, returns an array indicating the cluster from which each 
		data point was drawn.

	Returns
	-------
	X : numpy array
		N x D array of data.
	Z : numpy array (optional)
		1 x N array of cluster ids.

	TODO: test more
	"""
    C += 1
    pi = np.zeros(C)
    for c in range(C):
        pi[c] = gamrand(10, 0.5)
    pi = pi / np.sum(pi)
    cpi = np.cumsum(pi)

    rho = np.random.rand(D, D)
    rho = rho + twod(rho).T
    rho = rho + D * np.eye(D)
    rho = sqrtm(rho)

    mu = mat(np.random.randn(c, D)) * mat(rho)  # note: c is the loop variable above, so c == C - 1 (the original C before the increment)

    ccov = []
    for i in range(C):
        tmp = np.random.rand(D, D)
        tmp = tmp + tmp.T
        tmp = 0.5 * (tmp + D * np.eye(D))
        ccov.append(sqrtm(tmp))

    p = np.random.rand(N)
    Z = np.ones(N)

    for c in range(C - 1):
        Z[p > cpi[c]] = c
    Z = Z.astype(int)

    X = mu[Z, :]

    for c in range(C):
        X[Z == c, :] = X[Z == c, :] + mat(np.random.randn(np.sum(Z == c),
                                                          D)) * mat(ccov[c])

    if get_Z:
        return (arr(X), Z)
    else:
        return arr(X)
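
# data_GMM above calls a helper gamrand(a, b) to draw the mixture weights.  A minimal
# stand-in (an assumption about its behaviour: one Gamma(shape=a, scale=b) sample):
import numpy as np

def gamrand_sketch(a, b):
    """Draw one sample from a Gamma distribution with shape a and scale b."""
    return np.random.gamma(a, b)
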
Beispiel #51
    def train(self,
              X,
              Y,
              initStep=1.0,
              stopTol=1e-4,
              stopIter=5000,
              plot=None):
        """ Train the logistic regression using stochastic gradient descent """
        ## First do some bookkeeping and setup:
        self.theta, X, Y = twod(self.theta), arr(X), arr(
            Y)  # convert to numpy arrays
        M, N = X.shape
        if Y.shape[0] != M:
            raise ValueError("Y must have the same number of data (rows) as X")
        self.classes = np.unique(Y)
        if len(self.classes) != 2:
            raise ValueError(
                "Y should have exactly two classes (binary problem expected)")
        if self.theta.shape[
                1] != N + 1:  # if self.theta is empty, initialize it!
            self.theta = np.random.randn(1, N + 1)
        # Some useful modifications of the data matrices:
        X1 = np.hstack((np.ones(
            (M, 1)), X))  # make data array with constant feature
        Y01 = toIndex(Y,
                      self.classes)  # convert Y to canonical "0 vs 1" classes

        it = 0
        done = False
        Jsur = []
        J01 = []
        while not done:
            step = (2.0 * initStep) / (2.0 + it
                                       )  # common 1/iter step size change
            si = []
            for i in range(M):  # for each data point i:
                ## Computing the linear response
                zi = X1[i, :].dot(self.theta.T)
                ## Look up the target value yi (0/1 class) for this point
                yi = Y01[i]
                ## Computing soft response
                si.append(self.logistic(zi))
                ## Computing gradient of logistic loss
                gradi = (si[i] - yi) * X1[i, :]
                # Take a step down the gradient
                self.theta = self.theta - step * gradi

            # each pass, compute surrogate loss & error rates:
            J01.append(self.err(X, Y))
            ## Computing surrogate loss (mean negative log-likelihood)
            eps = 1e-12                                  # guard against log(0)
            sum_i = 0
            for i in range(M):
                sum_i += -(Y01[i] * np.log(si[i] + eps) +
                           (1 - Y01[i]) * np.log(1 - si[i] + eps))
            Jsur.append(sum_i / M)

            ## For debugging: print current parameters & losses
            # print self.theta, ' => ', Jsur[-1], ' / ', J01[-1]
            # raw_input()   # pause for keystroke

            # check stopping criteria:
            it += 1
            done = (it > stopIter) or ((it > 1) and
                                       (abs(Jsur[-1] - Jsur[-2]) < stopTol))
        self.numberOfIterations = it
        if self.plotFlag:
            plt.semilogx(range(it), np.abs(Jsur), label='Surrogate Loss')
            plt.semilogx(range(it), np.abs(J01), label='Error Rate')
            plt.legend(loc='upper right')
            plt.xlabel('# of iterations')
            plt.ylabel('Losses')
            plt.show()
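
# The training loop above relies on self.logistic(zi).  A numerically stable sketch
# of that sigmoid (an assumption, not necessarily the class's own implementation):
import numpy as np

def stable_logistic(z):
    """Sigmoid 1/(1+exp(-z)) computed without overflow for large |z|."""
    z = np.atleast_1d(np.asarray(z, dtype=float))
    out = np.empty_like(z)
    pos = z >= 0
    out[pos] = 1.0 / (1.0 + np.exp(-z[pos]))
    ez = np.exp(z[~pos])
    out[~pos] = ez / (1.0 + ez)
    return out
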
Beispiel #52
def plotClassify2D(learner,
                   X,
                   Y,
                   pre=lambda x: x,
                   ax=None,
                   nGrid=128,
                   cm=None,
                   bgalpha=0.3,
                   soft=False,
                   **kwargs):
    """
    Plot data and classifier outputs on two-dimensional data.
    This function plots the data (X, Y) together with the learner's
    predictions. The learner is evaluated on a dense grid covering the
    data X, to show its decision boundary.

    Parameters
    ----------
    learner : a classifier with "predict" function and optionally "classes" list
    X : (m,n) numpy array of data (m points in n=2 dimension)
    Y : (m,) or (m,1) int array of class values OR (m,c) array of class probabilities (see predictSoft)
    pre   : function object (optional) applied to X before learner.predict()
    ax    : a matplotlib axis / plottable object (optional)
    nGrid : density of 2D grid points (default 128)
    soft  : use predictSoft & blend colors (default: False => use predict() and show decision regions)
    bgalpha: alpha transparency (1=opaque, 0=transparent) for decision function image
    cm    : pyplot colormap (default: None = use default colormap)
    [other arguments will be passed through to the pyplot scatter function on the data points]
    """

    if twod(X).shape[1] != 2:
        raise ValueError(
            'plotClassify2D: function can only be called using two-dimensional data (features)'
        )
    # make robust to differing arguments in scatter vs plot, e.g. "s"/"ms" (marker size)
    if "s" not in kwargs and "ms" in kwargs: kwargs["s"] = kwargs.pop("ms")

    try:
        classes = np.array(learner.classes)
        # learner has explicit list of classes; use those
    except Exception:
        if len(Y.shape) == 1 or Y.shape[1] == 1:
            classes = np.unique(
                Y)  # or, use data points' class values to guess
        else:
            classes = np.arange(Y.shape[1], dtype=int)
            # or, get number of classes from soft predictions

    vmin, vmax = classes.min() - .1, classes.max() + .1
    # get (slightly expanded) value range for class values
    if ax is None:
        ax = plt.gca()
        # default: use current axes
    if cm is None:
        cm = plt.cm.get_cmap()
        # get the colormap
    classvals = (classes - vmin) / (vmax - vmin + 1e-100)
    # map class values to [0,1] for colormap
    classcolor = cm(classvals)
    # and get the RGB values for each class

    ax.plot(X[:, 0], X[:, 1], 'k.', visible=False, ms=0)
    # invisible plot to set axis range if required
    axrng = ax.axis()

    if learner is not None:  # if we were given a learner to predict with:
        xticks, yticks = np.linspace(axrng[0], axrng[1], nGrid), np.linspace(
            axrng[2], axrng[3], nGrid)
        grid = np.meshgrid(xticks, yticks)
        # apply it to a dense grid of points
        XGrid = np.column_stack((grid[0].flatten(), grid[1].flatten()))
        if soft:
            YGrid = learner.predictSoft(pre(XGrid)).dot(classcolor)
            # soft prediction: blend class colors
            YGrid[YGrid < 0] = 0
            YGrid[YGrid > 1] = 1
            YGrid = YGrid.reshape((nGrid, nGrid, classcolor.shape[1]))
            #axis.contourf( xticks,yticks,YGrid[:,0].reshape( (len(xticks),len(yticks)) ), nContours )
            ax.imshow(YGrid,
                      extent=axrng,
                      interpolation='nearest',
                      origin='lower',
                      alpha=bgalpha,
                      aspect='auto',
                      vmin=vmin,
                      vmax=vmax,
                      cmap=cm)
        else:
            YGrid = learner.predict(pre(XGrid)).reshape((nGrid, nGrid))
            # hard prediction: use class colors
            vmin, vmax = min(YGrid.min() - .1,
                             vmin), max(YGrid.max() + .1,
                                        vmax)  # check outputs for new classes?
            classvals = (classes - vmin) / (vmax - vmin + 1e-100)
            classcolor = cm(classvals)
            # if so, recalc colors?
            #axis.contourf( xticks,yticks,YGrid.reshape( (len(xticks),len(yticks)) ), nClasses )
            ax.imshow(YGrid,
                      extent=axrng,
                      interpolation='nearest',
                      origin='lower',
                      alpha=bgalpha,
                      aspect='auto',
                      vmin=vmin,
                      vmax=vmax,
                      cmap=cm)

    if len(Y.shape) == 1 or Y.shape[1] == 1:
        data_colors = classcolor[np.searchsorted(classes, Y)]
        # use colors if Y is discrete class
    else:
        data_colors = Y.dot(classcolor)
        data_colors[data_colors > 1] = 1
        # blend colors if Y is a soft confidence
    ax.scatter(X[:, 0], X[:, 1], c=data_colors, **kwargs)
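
# The soft-prediction branch above blends class colours with the matrix product
# predictSoft(X).dot(classcolor).  A tiny numeric illustration (values made up):
import numpy as np

# classcolor = np.array([[1., 0., 0., 1.],   # class 0 -> red  (RGBA)
#                        [0., 0., 1., 1.]])  # class 1 -> blue (RGBA)
# P = np.array([[0.8, 0.2],                  # mostly class 0
#               [0.5, 0.5]])                 # undecided
# blended = P.dot(classcolor)                # each row is a blended RGBA colour
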
    def train(self,
              X,
              Y,
              init='zeros',
              stepsize=.01,
              stopTol=1e-4,
              stopIter=5000):
        """Train the neural network.

        Args:
          X : MxN numpy array containing M data points with N features each
          Y : Mx1 numpy array of targets for each data point in X
          sizes (list of int): [Nin, Nh1, ... , Nout]
              Nin is the number of features, Nout is the number of outputs,
              which is the number of target dimensions (usually 1). Weights are {W1, ... , WL-1},
              where W1 is Nh1 x Nin, etc.
          init (str): 'none', 'zeros', or 'random'.  inits the neural net weights.
          stepsize (float): The stepsize for gradient descent (decreases as 1 / iter).
          stopTol (float): Tolerance for stopping criterion.
          stopIter (int): The maximum number of steps before stopping.
          activation (str): 'logistic', 'htangent', or 'custom'. Sets the activation functions.
        """
        if self.wts[0].shape[1] - 1 != len(X[0]):
            raise ValueError(
                'layer[0] must equal the number of columns of X (number of features)'
            )

        if self.wts[-1].shape[0] > 1 and self.wts[-1].shape[0] != Y.shape[1]:
            raise ValueError(
                'layers[-1] must equal the number of classes in Y, or 1 for binary Y'
            )

        M, N = arr(X).shape  # M = number of data points, N = number of features (dim of data)
        L = len(self.wts)  # get number of layers
        Y = arr(Y)
        Y2d = Y if len(Y.shape) > 1 else Y[:, np.newaxis]

        # outer loop of stochastic gradient descent
        it = 1  # iteration number
        nextPrint = 1  # next time to print info
        done = 0  # end of loop flag
        Jsur = []  # surrogate loss (MSE) values

        while not done:
            step_i = (2.0 * stepsize) / (
                2.0 + it)  # step size evolution; classic 1/t decrease

            # stochastic gradient update (one pass)
            for j in range(M):
                A, Z = self.__responses(twod(
                    X[j, :]))  # compute all layers' responses, then backprop
                delta = (Z[L] - Y2d[j, :]) * arr(self.dSig0(
                    Z[L]))  # take derivative of output layer

                for l in range(L - 1, -1, -1):
                    grad = delta.T.dot(
                        Z[l])  # compute gradient on current layer wts
                    delta = delta.dot(self.wts[l]) * arr(self.dSig(
                        Z[l]))  # propagate gradient down
                    delta = delta[:, 1:]  # discard constant feature
                    self.wts[
                        l] -= step_i * grad  # take gradient step on current layer wts

            Jsur.append(self.mse(X, Y2d))  # surrogate (mse on output)

            if it >= nextPrint:
                print('it {} : J = {}'.format(it, Jsur[-1]))
                nextPrint *= 2

            # check if finished
            done = (it > 1) and (np.abs(Jsur[-1] - Jsur[-2]) <
                                 stopTol) or it >= stopIter
            it += 1
Beispiel #54
    def train(self, X, Y, initStep=1.0, stopTol=1e-4, stopIter=5000, plot=None):
        """ Train the logistic regression using stochastic gradient descent """
        ## First do some bookkeeping and setup:
        self.theta,X,Y = twod(self.theta), arr(X), arr(Y)   # convert to numpy arrays
        M,N = X.shape
        if Y.shape[0] != M:
            raise ValueError("Y must have the same number of data (rows) as X")
        self.classes = np.unique(Y)
        if len(self.classes) != 2:
            raise ValueError("Y should have exactly two classes (binary problem expected)")
        if self.theta.shape[1] != N+1:         # if self.theta is empty, initialize it!
            self.theta = np.random.randn(1,N+1)
        # Some useful modifications of the data matrices:
        X1  = np.hstack((np.ones((M,1)),X))    # make data array with constant feature
        Y01 = toIndex(Y, self.classes)         # convert Y to canonical "0 vs 1" classes

        it   = 0
        done = False
        Jsur = []
        J01  = []

        while not done:
            step = (2.0 * initStep) / (2.0 + it)   # common 1/iter step size change
            Jloss = 0                              # accumulate surrogate loss over this pass

            for i in range(M):  # for each data point i:
                ## compute zi = linear response of X[i,:]
                zi = X1[i,:].dot(self.theta.T).item()
                ## compute soft response si = logistic(zi), in a numerically stable form
                if zi >= 0:
                    si = 1.0 / (1.0 + math.exp(-zi))
                else:
                    si = math.exp(zi) / (1.0 + math.exp(zi))
                ## compute hard prediction yi from the soft response
                yi = 1 if si >= 0.5 else 0
                ## compute gradient of logistic loss wrt data point i:
                gradi = (si - Y01[i])*X1[i,:]
                # Take a step down the gradient
                self.theta = self.theta - step * gradi
                # negative log-likelihood of this point under the current model
                eps = 1e-12                        # guard against log(0)
                Ji = -Y01[i]*math.log(si + eps) - (1 - Y01[i])*math.log(1 - si + eps)
                Jloss = Jloss + Ji

            # each pass, compute surrogate loss & error rates:
            J01.append( self.err(X,Y) )
            ## surrogate loss = mean negative log-likelihood over the data
            Jsur.append(Jloss / M)

            ## For debugging: print current parameters & losses
            # print(self.theta, ' => ', Jsur[-1], ' / ', J01[-1])
            # raw_input()   # pause for keystroke

            # check stopping criteria:
            it += 1
            done = (it > stopIter) or ( (it>1) and (abs(Jsur[-1]-Jsur[-2])<stopTol) )
        return [it, J01, Jsur]
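
# The method above returns [it, J01, Jsur]; a short sketch of plotting the two loss
# curves it produces (assumes matplotlib and a trained 'learner' object, both of
# which are placeholders here):
import matplotlib.pyplot as plt

# it, J01, Jsur = learner.train(Xtr, Ytr, initStep=1.0)
# plt.semilogx(range(1, it + 1), Jsur, label='Surrogate loss (NLL)')
# plt.semilogx(range(1, it + 1), J01, label='Error rate')
# plt.legend(); plt.xlabel('iteration'); plt.ylabel('loss'); plt.show()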