# Example #1
def initclusterKA(X, K, distance='euclidean'):
    """Select K initial cluster centres from the rows of X (KA-style seeding).

    The first centre is the point closest to the mean of all points; each
    subsequent centre is the point whose selection most reduces the total
    distance of all points to their nearest already-selected centre.

    Parameters
    ----------
    X : np.ndarray
        (M x D) data matrix, one object (e.g. gene) per row.
    K : int
        Number of initial centres to select.
    distance : str
        Metric name accepted by scipy.spatial.distance pdist/cdist.

    Returns
    -------
    np.ndarray
        (K x D) matrix of the selected initial cluster centres (rows of X).
    """
    M = X.shape[0]
    Dist = spdist.pdist(X, metric=distance)  # MxM (condensed)
    Dist = spdist.squareform(Dist)  # MxM
    ResultInd = [0 for i in range(K)]
    Xmean = np.mean(X, axis=0)  # The mean of all rows of X
    # BUGFIX: pass the requested metric; previously this call silently
    # defaulted to euclidean even when `distance` was something else
    # (the memory-saver variant of this function passes it correctly).
    Dmean = spdist.cdist(
        X, [Xmean],
        metric=distance)  # Distances between rows of X and the mean of X

    # The first centre is the closest point to the mean:
    ResultInd[0] = np.argmin(Dmean)
    io.updateparallelprogress(K)

    for k in range(K - 1):
        # Distance of every point to its nearest already-chosen centre
        D = np.min(Dist[:, ResultInd[0:k + 1]], axis=1)  # Mx1
        C = [0 for m in range(M)]  # M points (e.g. genes)  # Mx1
        for m in range(M):
            if m in ResultInd:
                continue
            # Total improvement if point m became a centre: sum of
            # positive reductions in nearest-centre distance.
            tmp = D - Dist[:, m]  # Mx1 differences
            tmp[tmp < 0] = 0  # All negatives make them zeros
            C[m] = np.sum(tmp)
        ResultInd[k + 1] = np.argmax(C)
        io.updateparallelprogress(K)

    Result = X[
        ResultInd]  # These points are the selected K initial cluster centres

    return Result
# Example #2
def mseclusters(X, B, donormalise=True, GDM=None):
    """Mean squared error of each cluster, averaged over datasets.

    Parameters
    ----------
    X : array-like
        One dataset (genes x samples) or a list of such datasets.
    B : array-like
        (M x K) boolean-like partition matrix; column k marks the genes
        belonging to cluster k.
    donormalise : bool
        If True, each dataset is normalised via
        pp.normaliseSampleFeatureMat(x, 4) before MSE computation.
        # NOTE(review): the meaning of mode 4 is defined in pp — confirm.
    GDM : array-like or None
        (M x Nx) gene-dataset presence matrix; defaults to all-True,
        i.e. every gene present in every dataset.

    Returns
    -------
    np.ndarray
        Length-K vector: per-cluster MSE averaged across datasets
        (clusters empty in B yield NaN).
    """
    Xloc = np.array(X)
    Bloc = np.array(B)

    # Promote a single 2D dataset to a list-of-datasets shape.
    if ds.maxDepthOfArray(Xloc) == 2:
        Xloc = np.expand_dims(Xloc, axis=0)
    Nx = len(Xloc)  # Number of datasets
    # Promote a 1D partition vector to a single-cluster column matrix.
    if len(Bloc.shape) == 1:
        Bloc = Bloc.reshape(-1, 1)
    M = Bloc.shape[0]  # Number of genes
    K = Bloc.shape[1]  # Number of clusters

    if GDM is None:
        GDMloc = np.ones([Bloc.shape[0], Nx], dtype=bool)
    else:
        GDMloc = np.array(GDM)

    # I commented these two lines after adding GDM
    #if any([True if x.shape[0] != M else False for x in Xloc]):
    #    raise ValueError('Unequal number of genes in datasets and partitions')

    mseC = np.zeros([Nx, K], dtype=float)

    Nk = [np.sum(b) for b in Bloc.transpose()]  # Number of genes per cluster
    Nd = [x.shape[1] for x in Xloc]  # Number of dimensions per dataset

    # Normalise if needed
    if donormalise:
        Xloc = [pp.normaliseSampleFeatureMat(x, 4) for x in Xloc]

    # Calculations
    for nx in range(Nx):
        reportedprogress = 0
        for k in range(K):
            # Report progress in batches of 100 clusters.
            if (k - reportedprogress == 100):
                io.updateparallelprogress(100)
                reportedprogress = k
            # WORK
            if not any(Bloc[:, k]):
                # Empty cluster: MSE is undefined.
                mseC[nx, k] = float('nan')
            else:
                # Rows of dataset nx belonging to cluster k, restricted to
                # genes present in this dataset per GDM.
                # NOTE(review): Bloc is first filtered by GDMloc[:, nx],
                # which assumes Xloc[nx] has one row per GDM-present gene
                # — confirm against callers.
                Xlocloc = Xloc[nx][Bloc[GDMloc[:, nx], k], :]
                # Deviations from the cluster's mean profile.
                tmp = nu.subtractaxis(Xlocloc,
                                      np.mean(Xlocloc, axis=0),
                                      axis=0)
                tmp = np.sum(np.power(tmp, 2))
                # Normalise by dimensions and by the cluster's gene count.
                mseC[nx, k] = tmp / Nd[nx] / Nk[k]
        # Report progress
        if (K > reportedprogress):
            io.updateparallelprogress(K - reportedprogress)

    # Average per-cluster MSE across datasets.
    return np.mean(mseC, axis=0)
# Example #3
def clusterdataset(X, K, methods=None, datasetID=-1):
    """Cluster dataset X into K clusters with each requested method.

    Parameters
    ----------
    X : array-like
        Data matrix to be clustered.
    K : int
        Number of clusters.
    methods : list or None
        Each entry is either a method name or a [name, *params] sequence.
        Recognised names: 'k-means'/'kmeans' and 'hc'/'hierarchical'.
        Defaults to a single k-means run.
    datasetID : int
        Identifier forwarded to the k-means routine.

    Returns
    -------
    list
        One partition result per method (None for unrecognised names).
    """
    if methods is None:
        methods = [['k-means']]

    # Wrap bare method names so every entry has the shape [name, *params].
    specs = []
    for entry in methods:
        if isinstance(entry, (list, tuple, np.ndarray)):
            specs.append(entry)
        else:
            specs.append([entry])

    # Clustering loop: dispatch each spec to its clustering routine.
    results = [None] * len(specs)
    for idx, spec in enumerate(specs):
        name = spec[0].lower()
        if name in ('k-means', 'kmeans'):
            results[idx] = ckmeans(X, K, datasetID, spec[1:])
        elif name in ('hc', 'hierarchical'):
            results[idx] = chc(X, K, spec[1:])

    io.updateparallelprogress(K * len(specs))

    return results
# Example #4
def initclusterKA_memorysaver(X, K, distance='euclidean'):
    """Select K initial cluster centres without the full MxM distance matrix.

    Same KA-style seeding as initclusterKA — first centre is the point
    closest to the mean, then greedily add the point that most reduces
    total nearest-centre distance — but distances are computed on demand
    with cdist instead of materialising squareform(pdist(X)).

    Parameters
    ----------
    X : np.ndarray
        (M x D) data matrix, one object (e.g. gene) per row.
    K : int
        Number of initial centres to select.
    distance : str
        Metric name accepted by scipy.spatial.distance cdist.

    Returns
    -------
    np.ndarray
        (K x D) matrix of the selected initial cluster centres (rows of X).
    """
    M = X.shape[0]
    ResultInd = [0 for i in range(K)]
    Xmean = np.mean(X, axis=0)  # The mean of all rows of X
    Dmean = spdist.cdist(
        X, [Xmean],
        metric=distance)  # Distances between rows of X and the mean of X

    # The first centre is the closest point to the mean:
    ResultInd[0] = np.argmin(Dmean)
    io.updateparallelprogress(K)

    for k in range(K - 1):
        D = spdist.cdist(
            X, X[ResultInd[0:k + 1]], metric=distance
        )  # (M)x(k+1) Dists of objects to the selected centres
        D = np.min(
            D, axis=1
        )  # Mx1: Distances of each of the M objects to its closest already selected centre
        C = [0 for m in range(M)]  # M objects (e.g. genes)  # Mx1
        for m in range(M):
            if m in ResultInd:
                continue
            # BUGFIX: pass the requested metric; previously this inner call
            # defaulted to euclidean while the outer calls honoured
            # `distance`, making the two inconsistent for any other metric.
            Dists_m = spdist.cdist(
                X, [X[m]],
                metric=distance
            )  # Mx1 distances between all M objects and the m_th object
            # Total improvement if object m became a centre: sum of
            # positive reductions in nearest-centre distance.
            tmp = D.reshape(M, 1) - Dists_m  # Mx1 differences
            tmp[tmp < 0] = 0  # All negatives make them zeros
            C[m] = np.sum(tmp)
        ResultInd[k + 1] = np.argmax(C)
        io.updateparallelprogress(K)

    Result = X[
        ResultInd]  # These objects are the selected K initial cluster centres

    return Result