def initclusterKA(X, K, distance='euclidean'):
    """Select K initial cluster centres from the rows of X (KA-style seeding).

    The first centre is the row of X closest to the overall row-mean; each
    subsequent centre is the point that maximises the summed reduction in
    distance from all points to their nearest already-selected centre.

    Parameters
    ----------
    X : ndarray (M x D)
        Data matrix; rows are objects (e.g. genes).
    K : int
        Number of initial centres to select.
    distance : str
        Any metric name accepted by scipy.spatial.distance.pdist/cdist.

    Returns
    -------
    ndarray (K x D)
        The K selected rows of X, in selection order.
    """
    M = X.shape[0]
    # Full MxM pairwise distance matrix (squareform of the condensed pdist).
    Dist = spdist.squareform(spdist.pdist(X, metric=distance))
    ResultInd = [0 for i in range(K)]
    Xmean = np.mean(X, axis=0)  # The mean of all rows of X
    # BUG FIX: `metric=distance` was previously not forwarded here, so the
    # first centre was always picked by Euclidean distance regardless of the
    # requested metric (the memory-saver variant of this function forwards it).
    Dmean = spdist.cdist(X, [Xmean], metric=distance)  # Mx1 distances to the mean
    # The first centre is the closest point to the mean:
    ResultInd[0] = np.argmin(Dmean)
    io.updateparallelprogress(K)

    for k in range(K - 1):
        # Mx1: distance of each object to its closest already-selected centre.
        D = np.min(Dist[:, ResultInd[0:k + 1]], axis=1)
        C = [0 for m in range(M)]  # Mx1 gain scores, one per candidate object
        for m in range(M):
            if m in ResultInd:
                continue  # already a centre; leave its gain at 0
            tmp = D - Dist[:, m]  # Mx1: per-object distance reduction if m were a centre
            tmp[tmp < 0] = 0      # only count objects that would actually get closer
            C[m] = np.sum(tmp)
        # Next centre is the candidate with the largest total gain.
        ResultInd[k + 1] = np.argmax(C)
        io.updateparallelprogress(K)

    Result = X[ResultInd]  # These points are the selected K initial cluster centres
    return Result
def mseclusters(X, B, donormalise=True, GDM=None):
    """Compute the mean squared error of each cluster, averaged over datasets.

    Parameters
    ----------
    X : array-like
        One dataset (2D: genes x samples) or a list/array of datasets; a 2D
        input is promoted to a single-dataset 3D structure.
    B : array-like (M x K, boolean membership)
        Partition matrix: B[m, k] is True when gene m belongs to cluster k.
        A 1D input is reshaped to a single-cluster column.
    donormalise : bool
        If True, each dataset is normalised via
        pp.normaliseSampleFeatureMat(x, 4) before the MSE computation.
    GDM : array-like (M x Nx, boolean) or None
        Gene-dataset map; GDM[m, nx] marks gene m as present in dataset nx.
        Defaults to all-True (every gene present in every dataset).

    Returns
    -------
    ndarray (K,)
        Per-cluster MSE averaged across the Nx datasets. Empty clusters
        yield NaN, which propagates into the mean.
    """
    Xloc = np.array(X)
    Bloc = np.array(B)

    # A plain 2D matrix is treated as a single dataset (depth-3 structure).
    if ds.maxDepthOfArray(Xloc) == 2:
        Xloc = np.expand_dims(Xloc, axis=0)
    Nx = len(Xloc)  # Number of datasets

    # A 1D partition vector becomes a single-cluster column matrix.
    if len(Bloc.shape) == 1:
        Bloc = Bloc.reshape(-1, 1)
    M = Bloc.shape[0]  # Number of genes
    K = Bloc.shape[1]  # Number of clusters

    if GDM is None:
        GDMloc = np.ones([Bloc.shape[0], Nx], dtype=bool)
    else:
        GDMloc = np.array(GDM)

    # I commented these two lines after adding GDM
    #if any([True if x.shape[0] != M else False for x in Xloc]):
    #    raise ValueError('Unequal number of genes in datasets and partitions')

    mseC = np.zeros([Nx, K], dtype=float)  # per-(dataset, cluster) MSE
    # NOTE(review): Nk counts members over the FULL partition matrix, while
    # the rows actually used below are filtered by GDMloc — presumably the
    # partition is already consistent with GDM; confirm against callers.
    Nk = [np.sum(b) for b in Bloc.transpose()]  # Number of genes per cluster
    Nd = [x.shape[1] for x in Xloc]  # Number of dimensions per dataset

    # Normalise if needed
    if donormalise:
        Xloc = [pp.normaliseSampleFeatureMat(x, 4) for x in Xloc]

    # Calculations
    for nx in range(Nx):
        reportedprogress = 0
        for k in range(K):
            # Report progress in batches of 100 clusters.
            if (k - reportedprogress == 100):
                io.updateparallelprogress(100)
                reportedprogress = k
            # WORK
            if not any(Bloc[:, k]):
                # Empty cluster: MSE is undefined.
                mseC[nx, k] = float('nan')
            else:
                # Rows of dataset nx belonging to cluster k, restricted to
                # the genes present in this dataset (GDM column nx).
                Xlocloc = Xloc[nx][Bloc[GDMloc[:, nx], k], :]
                # Squared deviations from the cluster's per-sample mean.
                tmp = nu.subtractaxis(Xlocloc, np.mean(Xlocloc, axis=0), axis=0)
                tmp = np.sum(np.power(tmp, 2))
                # Normalise by dataset width and cluster size.
                mseC[nx, k] = tmp / Nd[nx] / Nk[k]
        # Report progress for the remainder not yet reported.
        if (K > reportedprogress):
            io.updateparallelprogress(K - reportedprogress)

    return np.mean(mseC, axis=0)
def clusterdataset(X, K, methods=None, datasetID=-1):
    """Cluster dataset X into K clusters with each of the requested methods.

    Each entry of `methods` is either a method name or a list whose first
    element is the name and whose remaining elements are method parameters.
    Supported names (case-insensitive): 'k-means'/'kmeans' and
    'hc'/'hierarchical'. Unrecognised names leave a None placeholder.

    Returns a list with one clustering result (or None) per method.
    """
    if methods is None:
        methods = [['k-means']]

    # Normalise every entry to the [name, *params] list form.
    specs = [
        entry if isinstance(entry, (list, tuple, np.ndarray)) else [entry]
        for entry in methods
    ]

    results = [None] * len(specs)
    for idx, spec in enumerate(specs):
        kind = spec[0].lower()
        if kind in ('k-means', 'kmeans'):
            results[idx] = ckmeans(X, K, datasetID, spec[1:])
        elif kind in ('hc', 'hierarchical'):
            results[idx] = chc(X, K, spec[1:])

    io.updateparallelprogress(K * len(specs))
    return results
def initclusterKA_memorysaver(X, K, distance='euclidean'):
    """Select K initial cluster centres from the rows of X (KA-style seeding),
    without materialising the full MxM pairwise distance matrix.

    Behaves like initclusterKA but recomputes distances on demand via cdist,
    trading CPU time for O(M) rather than O(M^2) memory.

    Parameters
    ----------
    X : ndarray (M x D)
        Data matrix; rows are objects (e.g. genes).
    K : int
        Number of initial centres to select.
    distance : str
        Any metric name accepted by scipy.spatial.distance.cdist.

    Returns
    -------
    ndarray (K x D)
        The K selected rows of X, in selection order.
    """
    M = X.shape[0]
    ResultInd = [0 for i in range(K)]
    Xmean = np.mean(X, axis=0)  # The mean of all rows of X
    # Distances between rows of X and the mean of X.
    Dmean = spdist.cdist(X, [Xmean], metric=distance)
    # The first centre is the closest point to the mean:
    ResultInd[0] = np.argmin(Dmean)
    io.updateparallelprogress(K)

    for k in range(K - 1):
        # (M)x(k+1): distances of all objects to the selected centres.
        D = spdist.cdist(X, X[ResultInd[0:k + 1]], metric=distance)
        # Mx1: distance of each object to its closest already-selected centre.
        D = np.min(D, axis=1)
        C = [0 for m in range(M)]  # Mx1 gain scores, one per candidate object
        for m in range(M):
            if m in ResultInd:
                continue  # already a centre; leave its gain at 0
            # BUG FIX: `metric=distance` was previously not forwarded here, so
            # candidate distances were always Euclidean even when another
            # metric was requested — inconsistent with D computed just above.
            Dists_m = spdist.cdist(X, [X[m]], metric=distance)  # Mx1
            tmp = D.reshape(M, 1) - Dists_m  # Mx1: per-object distance reduction
            tmp[tmp < 0] = 0                 # only count objects that get closer
            C[m] = np.sum(tmp)
        # Next centre is the candidate with the largest total gain.
        ResultInd[k + 1] = np.argmax(C)
        io.updateparallelprogress(K)

    Result = X[ResultInd]  # These objects are the selected K initial cluster centres
    return Result