Example #1
def splitData(X,y,portion,seed):

	startLog(__name__)
	logger = logging.getLogger(__name__)
	records = {'portion':portion, 'seed':seed}
	logger.info('split data into train and test %s',records)

	if not isinstance(X, np.ndarray):
		logger.debug('X is not an ndarray, converting')
		X = np.array(X)
		y = np.array(y)
	(rows, cols) = np.shape(X)
	size_data = rows
	np.random.seed(seed)  # seed was logged but never applied; set it so the split is reproducible
	index = np.random.permutation(size_data)

	Ntr = int(math.floor(portion*size_data))  # number of test samples, despite the name

	idx_test = index[0:Ntr]
	idx_train = index[Ntr:]
	X_test = X[idx_test,:]
	X_train = X[idx_train,:]

	y_test = y[idx_test]
	y_train = y[idx_train]

	logger.info('split done, checking split')

	# check split
	(rows_test, cols_test) = np.shape(X_test)
	(rows_train, cols_train) = np.shape(X_train)
	if rows_train < rows_test:
		logger.warning('test set larger than training set')

	logger.info('size of X_test is %d and X_train %d, #fts %d',rows_test,rows_train,cols_train)
	return X_test, X_train, y_train, y_test
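A minimal usage sketch for splitData, assuming the surrounding module's logging helpers are set up; the data and names here are synthetic:

import numpy as np

# hypothetical data: 100 samples, 5 features, binary labels
X_demo = np.random.rand(100, 5)
y_demo = np.random.randint(0, 2, 100)

# hold out 20% of the rows as the test set, seeded for reproducibility
X_test, X_train, y_train, y_test = splitData(X_demo, y_demo, portion=0.2, seed=1)
assert len(X_test) + len(X_train) == 100

If depending on scikit-learn is acceptable, sklearn.model_selection.train_test_split does the same job and adds stratification options.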
Example #2
def kmeansClustering(X, k, maxIters = 10):
    """
        X: data in shape N X D
        k: number of cluster
    """
    my_io.startLog(__name__)
    logger = logging.getLogger(__name__)

    X = normalize(X)  # normalize is assumed to be imported in the surrounding module (e.g. sklearn.preprocessing)
    #X = np.transpose(X)

    (N, D) = np.shape(X)
 
    # plot the raw data in a single color before clustering
    ax = plt.gca()
    colors = 'r'
    ax.scatter(X[:,0], X[:,1], c=colors, alpha=0.8)
    # plt.show()

    # initialize: pick k distinct data points as the starting means
    # (random.sample rejects an ndarray in Python 3, so index with np.random.choice)
    # shape of mu: K x D
    init_idx = np.random.choice(N, k, replace=False)
    mu_old = X[init_idx, :]
    logger.info('randomly initialize mean vector, check dimension:')
    if np.shape(mu_old) == (k, D):
        logger.info('pass')
    else:
        logger.error('failed')

    
    logger.info('maximum iteration: %s start iteration',
                str(maxIters))

    loss = np.zeros(maxIters)  # one distortion value per iteration
    for i in range(maxIters):
        (Ln, r, Mu) = kmeans_update(X, mu_old)

        loss[i] = Ln
        logger.info('iteration %d, loss %.4f', i, Ln)

        mu_old = Mu

    logger.info('done, final loss %.3f', Ln)

    ax = plt.gca()
    colors_list = ['r','g','b','y']  # palette covers up to 4 clusters
    colors = [colors_list[int(r[i])] for i in range(N)]
    # colors = 'r'
    ax.scatter(X[:,0], X[:,1], c=colors,alpha=0.8)
    # plt.show()
    return X, r
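A hedged usage sketch for kmeansClustering on two synthetic 2-D blobs; normalize and the my_io logging helper are assumed to be available in the module:

import numpy as np

rng = np.random.RandomState(0)
blob_a = rng.randn(50, 2)                        # points around the origin
blob_b = rng.randn(50, 2) + np.array([5.0, 5.0]) # points around (5, 5)
X_demo = np.vstack([blob_a, blob_b])

X_out, r = kmeansClustering(X_demo, k=2, maxIters=10)
# r[n] is the cluster index assigned to point n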
Example #3
File: gmm.py Project: ZENGXH/kaggle
def gmmClustering(X, k = 2, maxiter = 3):
	my_io.startLog(__name__)
	logger = logging.getLogger(__name__)
	X, r = kmeans.kmeansClustering(X, k, 1)  # warm-start from one k-means pass (was hard-coded to 2 clusters)
	(N, D) = np.shape(X)
	pi_k_old = [np.divide(len(np.where(r==kth)[0]), float(N)) 
			for kth in range(k) ]

	# mu_old = compute_muk(X, k, r)
	cova_old, mu_old = compute_cova(k, X, r)

	for i in range(maxiter):
		logger.info('ite: %d loss: %f', i,
					loss(X, mu_old, pi_k_old, cova_old))

		p = gmm_Esteps(X, pi_k_old, k, cova_old, mu_old)

		mu_new, cova_new, pi_k_new = gmm_Msteps(mu_old, X, p)
		
		pi_k_old = pi_k_new
		mu_old = mu_new
		cova_old = cova_new
	# color each point by its responsibility under a component, via the jet colormap
	cm = plt.get_cmap('jet')
	for j in range(N):
		likelihood = p[1][j]  # responsibility of component 1 for point j
		color = cm(likelihood)
		plt.plot(X[j,0], X[j,1], "o", color=color)
	plt.show()

	for j in range(N):
		likelihood = p[0][j]  # responsibility of component 0 for point j
		color = cm(likelihood)
		plt.plot(X[j,0], X[j,1], "o", color=color)
	plt.show()
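
gmm_Esteps is not shown in this example; below is a minimal sketch of what the call above plausibly computes, i.e. the standard E-step responsibilities, written with scipy.stats.multivariate_normal. The signature matches the call site, but the body is an assumption, not the project's actual code:

import numpy as np
from scipy.stats import multivariate_normal

def gmm_Esteps(X, pi_k, k, cova, mu):
    # sketch only: p[j][n] = pi_j * N(x_n | mu_j, cova_j), normalized over j
    N = X.shape[0]
    p = np.zeros((k, N))
    for j in range(k):
        p[j] = pi_k[j] * multivariate_normal.pdf(X, mean=mu[j], cov=cova[j])
    p /= p.sum(axis=0, keepdims=True)  # responsibilities sum to 1 for each point
    return p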
	
Example #4
# svm_bench.py
import logging
import my_io
import classification_baseline
from sklearn import svm
import numpy as np
from sklearn.metrics import zero_one_loss
from numpy import linalg as LA

my_io.setUp('./biological_response/')

my_io.startLog(__name__)
logger = logging.getLogger(__name__)

y,X,trainData,testData = my_io.readCsv()
portion = 0.2
seed = 1
X_test, X_train, y_train, y_test = classification_baseline.splitData(X,y,portion,seed)

logger.info('init svm classifier')
svc = svm.SVC(probability = True)
logger.info('fitting svc')
svc.fit(X_train, y_train)
logger.info('start predict')
predict_probs = svc.predict_proba(X_test)

predict = my_io.toZeroOne(predict_probs)
# error = zero_one_loss(y_test, predict)
loss = np.subtract(predict, y_test)

# L2 norm of the 0/1 differences, i.e. sqrt(number of misclassified samples)
error = LA.norm(loss)
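
The commented-out zero_one_loss line suggests the intended metric; a short sketch of the standard scikit-learn way to score the same predictions (assumes predict and y_test as above):

from sklearn.metrics import zero_one_loss, accuracy_score

# fraction of misclassified samples (0.0 is perfect) and its complement
err_rate = zero_one_loss(y_test, predict)
acc = accuracy_score(y_test, predict)
logger.info('zero-one loss %.4f, accuracy %.4f', err_rate, acc)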
Example #5
def kmeans_update(X, Mu):
    """update r and Mu given X and Mu
        X is [N D] data
        Mu is [K D] mean vector
        r is an N x 1 assignment vector,
        e.g. r = [0,1,0] for 2 clusters and 3 data points
        Ln[n] is the distance from point n to its assigned center;
        the function returns the mean of Ln (the distortion), r, and Mu
    """

    my_io.startLog(__name__)
    logger = logging.getLogger(__name__)
    
    # initialize
    (K, D1) = np.shape(Mu)
    (N, D) = np.shape(X)
    logger.info('check shape mu: %s X: %s',
                str(np.shape(Mu)), str(np.shape(X)))
    if D1 == D:
        logger.info('pass')
    else:
        logger.error('failed')
    r = np.zeros((N, 1))        # cluster assignment for each point
    dis2Muk = np.zeros((K, 1))  # distance from the current point to each center

    for n in range(0, N):
        """for each point
          assign x_n to the nearest group
        """
        xn = X[n,:]
        
        # for each cluster, compute the error
        for k in range(0, K):
            """for each group
              calculate the distance from point x_n to mu_k
            """
            dis2Muk[k] = LA.norm(np.subtract(xn, Mu[k,:]))

        """ find the minimum distance, ie. the nearest group
            assigh r_n
        """
        indexk = np.argmin(dis2Muk)
        # (minVal, indexK) = min(dis2Muk)
        # assigh cluster
        # compute r

        try:
            r[n] = indexk
        except Exception as err:
            logger.exception('distance to mean vector should be 1D array,'
                             ' now ', np.shape(dis2Muk))
            # fprintf('assign to cluster: %d \n',r(n))
    
    # compute Mu for each k
    """
      for each group:
      update the mean vector from the points currently assigned to it
    """
    for k in range(0, K):
        xk = [X[idx,:] for idx in range(len(r)) if r[idx] == k]
        # xk: Nk x D
        logger.info(' in group %d, num of points %s', k, str(np.shape(xk)))
        Mu[k,:] = np.mean(xk, 0)  # 0: mean along each column (axis 0)

    """for each point,
        cal the distance to new mean vector u_k, k = r[n]
    """
    Ln = [LA.norm(np.subtract(X[n, :], Mu[int(r[n]), :])) for n in range(0, N)]
    
    if len(Ln) == N:
        logger.info('computed the distortion measurement for each x, dimension check OK')
    else:
        logger.error('distortion vector has length %d, expected %d', len(Ln), N)
    Ln_mean = np.mean(Ln)
    return Ln_mean, r, Mu
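
For comparison, a vectorized sketch of the same update using numpy broadcasting; kmeans_update_vec is a hypothetical name, and the sketch assumes no cluster ends up empty:

import numpy as np

def kmeans_update_vec(X, Mu):
    # pairwise distances: dists[n, k] = ||x_n - mu_k||
    dists = np.linalg.norm(X[:, None, :] - Mu[None, :, :], axis=2)
    r = np.argmin(dists, axis=1)  # nearest center for each point
    Mu_new = np.stack([X[r == k].mean(axis=0) for k in range(Mu.shape[0])])
    # distortion measured against the updated centers, matching the loop version
    Ln_mean = np.linalg.norm(X - Mu_new[r], axis=1).mean()
    return Ln_mean, r, Mu_new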