Code Example #1
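The snippets below reference several module-level names without showing their imports. A minimal header that would make them resolvable might look like the following; `np` and `spmstats` are standard libraries, while `nu`, `ds`, `st`, `mn` and the helpers `fixnans`, `filterBimodal`, `autoNormalise`, `relabelClusts` appear to be package-internal modules and functions, inferred here purely from usage:

import numpy as np
import scipy.stats.mstats as spmstats  # assumed source of rankdata(..., axis=...)

# Package-internal helpers (assumed; adjust to the actual package layout):
# nu: numeric utilities (dist_matrices, divideaxis, subtractaxis, multiplyaxis)
# ds: data-structure utilities (listofarrays2arrayofarrays, numel)
# st: statistics utilities (weighted_std_axis)
# mn: clustering metrics (mseclustersfuzzy)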
def clustDist(B1, B2, X=None, criterion='direct_euc'):
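    """
    Compute a K1 x K2 matrix of pairwise distances between the clusters of two
    partition matrices B1 (Ng x K1) and B2 (Ng x K2). The data matrix X
    (Ng x D) is required by the 'centres_euc' and 'union_std' criteria.
    """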
    if criterion == 'direct_euc':
        D = nu.dist_matrices(B1.transpose(), B2.transpose())
    elif criterion == 'centres_euc':
        centres1 = nu.divideaxis(np.dot(B1.transpose(), X),
                                 np.sum(B1, axis=0),
                                 axis=1)
        centres2 = nu.divideaxis(np.dot(B2.transpose(), X),
                                 np.sum(B2, axis=0),
                                 axis=1)

        D = nu.dist_matrices(centres1, centres2)
        if np.any(~np.isnan(D)):
            m = np.max(D[~np.isnan(D)])
            D[np.isnan(D)] = m + 1
        else:
            D = np.ones(D.shape)
    elif criterion == 'union_std':
        K1 = B1.shape[1]
        K2 = B2.shape[1]
        D = np.zeros([K1, K2])

        for k1 in range(K1):
            for k2 in range(K2):
                bUnion = np.max([B1[:, k1], B2[:, k2]], axis=0)  # (1)x(Ng)
                bCentre = np.dot(bUnion, X) / np.sum(bUnion)  # (1)x(Xdim)
                distsFromCentre = nu.dist_matrices(X, bCentre)  # (Ng)x(1)
                D[k1, k2] = np.dot(bUnion, distsFromCentre) / np.sum(
                    bUnion)  # (1)x(1)
    elif criterion == 'hamming':
        D = nu.dist_matrices(B1.transpose(),
                             B2.transpose(),
                             criterion='hamming')
    else:
        raise ValueError('Invalid distance criterion provided.')

    return D
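A minimal usage sketch for `clustDist` (hypothetical shapes and data, assuming the imports above):

# Two partitions of 100 genes into 3 and 4 clusters, over a 100 x 5 data matrix
B1 = np.random.rand(100, 3) > 0.7   # binary gene-cluster memberships
B2 = np.random.rand(100, 4) > 0.7
X = np.random.rand(100, 5)

D = clustDist(B1, B2, X=X, criterion='centres_euc')  # D has shape (3, 4)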
Code Example #2
def normaliseSampleFeatureMat(X, type):
    """
    X = normaliseSampleFeatureMat(X, type)

    type: 0 (none), 1 (divide by mean), 2 (divide by the first),
        3 (take log2), 31 (take log2 after setting all values < 1.0 to 1.0, i.e. guarantee positive log),
        4 (subtract the mean and divide by the std),
        5 (divide by the sum), 6 (subtract the mean),
        7 (divide by the max), 8 (2 to the power X), 9 (subtract the min),
        10 (rank: 0 for lowest, then 1, 2, ...; average on ties),
        11 (rank, like 10 but with arbitrary order on ties),
        12 (normalise to the [0 1] range),
        13 (genes with low values everywhere are set to zeros; a bimodal distribution is fit to the maxima of rows)

        101 (quantile), 102 (subtract columns (samples) means),
        103 (subtract global mean)

        1000 (Automatically detect normalisation)

    If (type) is a vector like [3, 1], normalisation type (3) is applied
    over (X), then type (1) over the result, and so on.

    :param X:
    :param type:
    :return:
    """
    Xout = np.array(X)
    # codes stays as the input types unless auto-normalisation (type 1000) changes it
    codes = np.array(type)

    if isinstance(type, (list, tuple, np.ndarray)):
        # Apply each type in order; if an entry is (1000), the recursive call returns the actual codes applied, which replace it
        j = 0
        for i in range(len(type)):
            Xout, codesi = normaliseSampleFeatureMat(Xout, type[i])
            codesi = np.array(codesi)
            if codesi.ndim > 0:
                codes[j] = codesi[0]
                codes = np.insert(codes, j + 1, codesi[1:])
                j = j + len(codesi)
            else:
                j = j + 1
        return Xout, codes

    if type == 1:
        # 1: Divide by the mean
        Xout = nu.divideaxis(Xout, np.mean(Xout, axis=1), 1)

    if type == 2:
        # 2: Divide by the first value of each row
        Xout = nu.divideaxis(Xout, Xout[:, 0], 1)

    if type == 3:
        # 3: Take log2 (non-positive values become NaN and are then fixed)
        Xout = Xout.astype(float)
        Xout[Xout <= 0] = np.nan
        Xout = np.log2(Xout)
        ind1 = np.any(np.isnan(Xout), axis=1)
        Xout[ind1] = fixnans(Xout[ind1])

    if type == 31:
        # 31: Set all values < 1 to 1 then take log (guarantee a positive log)
        Xout[Xout <= 1] = 1
        Xout = np.log2(Xout)

    if type == 4:
        # 4: Subtract the mean and divide by the std
        Xout = nu.subtractaxis(Xout, np.mean(Xout, axis=1), axis=1)
        ConstGenesIndices = np.std(Xout, axis=1) == 0
        Xout = nu.divideaxis(Xout, np.std(Xout, axis=1), axis=1)
        Xout[ConstGenesIndices] = 0

    if type == 5:
        # 5: Divide by the sum
        Xout = nu.divideaxis(Xout, np.sum(Xout, axis=1), axis=1)

    if type == 6:
        # 6: Subtract the mean
        Xout = nu.subtractaxis(Xout, np.mean(Xout, axis=1), axis=1)

    if type == 7:
        # 7: Divide by the maximum
        Xout = nu.divideaxis(Xout, np.max(Xout, axis=1), axis=1)

    if type == 8:
        # 8: (2 to the power X)
        Xout = np.power(2, Xout)

    if type == 9:
        # 9: Subtract the min
        Xout = nu.subtractaxis(Xout, np.min(Xout, axis=1), axis=1)

    if type == 10:
        # 10: Rank: 0 for lowest, then 1, 2, ...; average on ties
        Xout = spmstats.rankdata(Xout, axis=0) - 1

    if type == 11:
        # 11: Rank: 0 for lowest, then 1, 2, ...; arbitrary order on ties
        Xout = np.argsort(np.argsort(Xout, axis=0), axis=0)

    if type == 12:
        # 12: Normalise to the [0 1] range
        Xout = nu.subtractaxis(Xout, np.min(Xout, axis=1), axis=1)
        Xout = nu.divideaxis(Xout, np.max(Xout, axis=1), axis=1)

    if type == 13:
        # 13: Genes with low values everywhere are set to zeros; a bimodal distribution is fit to the maxima of rows
        Xout = filterBimodal(X)

    # 100s
    if type == 101:
        # 101: quantile
        av = np.mean(np.sort(Xout, axis=0), axis=1)
        II = np.argsort(np.argsort(Xout, axis=0), axis=0)
        Xout = av[II]

    if type == 102:
        # 102: subtract the mean of each sample (column) from it
        Xout = nu.subtractaxis(Xout, np.mean(Xout, axis=0), axis=0)

    if type == 103:
        # 103: subtract the global mean of the data
        Xout -= np.mean(Xout)

    if type == 1000:
        # 1000: automatically detect normalisation
        codes = autoNormalise(Xout)
        Xout = normaliseSampleFeatureMat(Xout, codes)[0]
        codes = np.append([101], codes)

    return Xout, codes
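A short, hypothetical sketch of the chained-codes form of the `type` argument:

X = np.abs(np.random.randn(50, 6)) * 100 + 1  # hypothetical 50 genes x 6 samples

# Quantile normalisation (101), then log2 with a positivity guard (31),
# then per-gene z-scoring (4); the returned codes echo the applied pipeline
Xn, codes = normaliseSampleFeatureMat(X, [101, 31, 4])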
Code Example #3
def generateCoPaM(U,
                  relabel_technique='minmin',
                  w=None,
                  X=None,
                  distCriterion='direct_euc',
                  K=0,
                  GDM=None):
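    """
    Fuse a list of partitions (U) into a single consensus partition matrix
    (CoPaM): partitions are ordered, relabelled against the running consensus,
    and accumulated as a weighted average of cluster memberships.
    """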
    # Helping functions
    def calwmeans(w):
        wm = [
            np.mean(calwmeans(ww)) if isinstance(ww,
                                                 (list, tuple,
                                                  np.ndarray)) else np.mean(ww)
            for ww in w
        ]
        return np.array(wm)

    def CoPaMsdist(CoPaM1, CoPaM2):
        return np.linalg.norm(CoPaM1 - CoPaM2)

    def orderpartitions(U, method='rand', X=None, GDM=None):
        if method == 'rand':
            return np.random.permutation(range(len(U))), None
        elif method == 'mn':
            # TODO: Implement ranking partitions based on M-N plots
            raise NotImplementedError(
                'Ranking partitions based on the M-N plots logic has not been implemented yet.'
            )
        elif method == 'mse':
            R = len(U)
            mses = np.zeros(R)
            for r in range(R):
                if isinstance(U[r][0][0], (list, tuple, np.ndarray)):
                    mses[r] = np.mean(
                        orderpartitions(U[r], method=method, X=X, GDM=GDM)[1])
                else:
                    mses[r] = np.mean([
                        mn.mseclustersfuzzy(X,
                                            U[r],
                                            donormalise=False,
                                            GDM=GDM)
                    ])
            order = np.argsort(mses)
            return order, mses[order]
        else:
            raise ValueError('Invalid partition ordering method provided.')

    # Fix parameters
    Uloc = ds.listofarrays2arrayofarrays(U)
    R = len(Uloc)
    if GDM is None:
        GDMloc = np.ones([Uloc[0].shape[0], R], dtype=bool)
    elif GDM.shape[1] == 1:
        if R > 1:
            GDMloc = np.tile(GDM, [1, R])
        else:
            GDMloc = np.array(GDM)
    else:
        GDMloc = np.array(GDM)
    if w is None or (isinstance(w, str) and w in ['all', 'equal']):
        w = np.ones(R)
    elif ds.numel(w) == 1:
        w = np.array([w for i in range(R)])
    wmeans = calwmeans(w)

    # Work!
    #permR = orderpartitions(Uloc, method='rand', X=X, GDM=GDM)[0]
    if GDM is None:
        permR = orderpartitions(Uloc, method='mse', X=X, GDM=None)[0]
    else:
        permR = orderpartitions(Uloc, method='mse', X=X, GDM=GDMloc)[0]
    Uloc = Uloc[permR]
    if GDMloc.shape[1] > 1:
        GDMloc = GDMloc[:, permR]
    wmeans = wmeans[permR]

    if isinstance(Uloc[0][0][0], (list, tuple, np.ndarray)):
        Uloc[0] = generateCoPaM(Uloc[0],
                                relabel_technique=relabel_technique,
                                w=w[0],
                                X=X,
                                distCriterion=distCriterion,
                                K=K,
                                GDM=GDMloc)
    #CoPaM = np.zeros([GDMloc.shape[0], Uloc[0].shape[1]], float)
    CoPaM = np.array(Uloc[0], dtype=float)
    K = CoPaM.shape[1]
    for r in range(1, R):
        if isinstance(Uloc[r][0][0], (list, tuple, np.ndarray)):
            Uloc[r] = generateCoPaM(Uloc[r],
                                    relabel_technique=relabel_technique,
                                    w=w[r],
                                    X=X,
                                    distCriterion=distCriterion,
                                    K=K,
                                    GDM=GDMloc)
        if Uloc[r].shape[1] != K:
            raise ValueError(
                'Unequal numbers of clusters in partition {}.'.format(r))

        Uloc[r] = relabelClusts(CoPaM,
                                Uloc[r],
                                method=relabel_technique,
                                X=X,
                                distCriterion=distCriterion)

        dotprod = np.dot(GDMloc[:, 0:r],
                         wmeans[0:r].transpose())  # (Mxr) * (rx1) = (Mx1)
        CoPaM[dotprod > 0] = nu.multiplyaxis(CoPaM[dotprod > 0],
                                             dotprod[dotprod > 0],
                                             axis=1)
        CoPaM[dotprod > 0] += wmeans[r] * Uloc[r][dotprod > 0]
        dotprod = np.dot(GDMloc[:, 0:(r + 1)], wmeans[0:(r + 1)].transpose())
        CoPaM[dotprod > 0] = nu.divideaxis(CoPaM[dotprod > 0],
                                           dotprod[dotprod > 0],
                                           axis=1)

    return CoPaM
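A hypothetical sketch fusing three fuzzy partitions of the same 100 genes (the 'mse' ordering inside `generateCoPaM` needs `X` and the package's `mn` helper):

Ng, K = 100, 4
U = [np.random.rand(Ng, K) for _ in range(3)]      # three fuzzy partitions
U = [u / u.sum(axis=1, keepdims=True) for u in U]  # memberships sum to 1 per gene
X = np.random.rand(Ng, 5)

CoPaM = generateCoPaM(U, relabel_technique='minmin', X=X)  # Ng x K consensus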
Code Example #4
def correcterrors_weighted_outliers2(B,
                                     X,
                                     GDM,
                                     clustdists=None,
                                     stds=3,
                                     smallestClusterSize=11):
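    """
    Refine binary cluster memberships (B) across L datasets: a gene stays in a
    cluster only if, in every dataset and dimension, its distance from the
    cluster median is within the worst (outlier-trimmed, weighted) deviation
    observed among clustered genes. Clusters smaller than smallestClusterSize
    are dropped.
    """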
    Bloc = np.array(B)
    Xloc = ds.listofarrays2arrayofarrays(X)

    [Ng, K] = Bloc.shape  # Ng genes and K clusters
    L = Xloc.shape[0]  # L datasets

    # Normalise clustdists to provide weights. If not provided, make it unity for all
    if clustdists is None:
        clustweights = np.ones(K)
    else:
        clustweights = np.min(clustdists) / clustdists

    # Find the clusters' centres (Cmeans, computed as medians) and the
    # absolute shifted cluster genes (SCG)
    Cmeans = np.array([None] * L, dtype=object)
    SCG = np.array([None] * L, dtype=object)
    for l in range(L):
        Cmeans[l] = np.zeros([K,
                              Xloc[l].shape[1]])  # K clusters x D dimensions
        SCG[l] = np.zeros(
            [np.sum(np.sum(Bloc[GDM[:, l], :], axis=0)),
             Xloc[l].shape[1]])  # M* genes x D dimensions
        w = np.zeros(np.sum(np.sum(Bloc[GDM[:, l], :], axis=0)))  # M* genes
        # (M* counts every gene in every cluster, i.e. with repetition)

        gi = 0
        for k in range(K):
            Cmeans[l][k] = np.median(Xloc[l][Bloc[GDM[:, l], k], :], axis=0)
            csize = np.sum(Bloc[GDM[:, l], k])
            tmpSCG = nu.subtractaxis(Xloc[l][Bloc[GDM[:, l], k], :],
                                     Cmeans[l][k],
                                     axis=0)
            SCG[l][gi:(gi + csize), :] = np.abs(tmpSCG)
            # Added this in this version
            w[gi:(gi + csize)] = clustweights[k]
            gi += csize
        nonzero = np.any(SCG[l], axis=1)
        SCG[l] = SCG[l][nonzero]  # Remove all-zero genes (rows of SCG[l])
        w = w[nonzero]  # keep the weights aligned with the remaining rows
        SCG[l] = np.sort(SCG[l], axis=0)
        SCGmeans = np.average(SCG[l], weights=w, axis=0)
        SCGstds = st.weighted_std_axis(SCG[l], weights=w, axis=0)
        SCGouts = nu.divideaxis(nu.subtractaxis(SCG[l], SCGmeans, axis=0),
                                SCGstds,
                                axis=0)  # No. of stds away
        SCGouts = SCGouts > stds  # TRUE for outliers, FALSE otherwise (bool: M* genes x D dimensions)
        SCG[l][SCGouts] = 0.0  # Zero the outlier values so they do not affect later decisions

    # Helping function
    def iswithinworse(ref, x):
        return x <= np.max(ref)

    # Find who belongs
    belongs = np.ones([Ng, K, L],
                      dtype=bool)  # Ng genes x K clusters x L datasets
    for l in range(L):
        for k in range(K):
            for d in range(Xloc[l].shape[1]):
                tmpX = np.abs(Xloc[l][:, d] - Cmeans[l][k, d])
                belongs[GDM[:, l], k, l] &= iswithinworse(SCG[l][:, d], tmpX)

    # Include in clusters genes which belongs everywhere
    B_out = np.all(belongs, axis=2)

    # Solve genes included in two clusters:
    solution = 2
    if solution == 1:
        # Genes included in two clusters are assigned to the cluster with the smallest
        # worst-case (maximum over datasets) distance to the gene
        # (this guarantees that the worst belongingness of a gene to a cluster is optimised)
        f = np.nonzero(np.sum(B_out, axis=1) > 1)[0]
        for fi in f:
            ficlusts = np.nonzero(
                B_out[fi])[0]  # Clusters competing over gene fi
            fidatasets = np.nonzero(GDM[fi])[0]  # Datasets that have gene fi
            localdists = np.zeros([
                len(ficlusts), len(fidatasets)
            ])  # (Clusts competing) x (datasets that have fi)
            for l in range(len(fidatasets)):
                ll = fidatasets[l]  # Actual dataset index
                fi_ll = np.sum(GDM[:fi, ll])  # Index of fi in this Xloc[ll]
                localdists[:, l] = nu.dist_matrices(Cmeans[ll][ficlusts],
                                                    Xloc[ll][fi_ll]).reshape(
                                                        [len(ficlusts)])
            localdists = np.max(localdists, axis=1)  # (Clusts competing) x 1
            ficlosest = np.argmin(localdists)  # Closest cluster
            B_out[fi] = False
            B_out[fi, ficlusts[ficlosest]] = True
    elif solution == 2:
        # Genes included in two clusters are assigned to the earliest cluster (smallest k)
        f = np.nonzero(np.sum(B_out, axis=1) > 1)[0]
        for fi in f:
            ficlusts = np.nonzero(
                B_out[fi])[0]  # Clusters competing over gene fi
            ficlosest = np.argmin(ficlusts)  # earliest cluster (smallest k)
            B_out[fi] = False
            B_out[fi, ficlusts[ficlosest]] = True

    # Remove clusters smaller than minimum cluster size
    ClusterSizes = np.sum(B_out, axis=0)
    B_out = B_out[:, ClusterSizes >= smallestClusterSize]

    return B_out
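Finally, a hypothetical sketch of the error-correction step, with two datasets that both cover all genes:

Ng, K, L = 100, 3, 2
B = np.zeros((Ng, K), dtype=bool)
B[np.arange(Ng), np.random.randint(K, size=Ng)] = True  # one cluster per gene
GDM = np.ones((Ng, L), dtype=bool)                       # every gene in every dataset
X = [np.random.rand(Ng, 10), np.random.rand(Ng, 8)]      # one matrix per dataset

B_out = correcterrors_weighted_outliers2(B, X, GDM, stds=3, smallestClusterSize=5)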