Example #1
def fuzzystretch(X, x0=None):
    Xloc = np.array(X)
    if x0 is None:
        x0 = np.array([np.mean(xrow[xrow > 0]) for xrow in Xloc])
        x0[x0 == 1] = 0.5
    elif ds.numel(x0) == 1:
        x0 = np.array([x0 for i in range(Xloc.shape[0])])
    elif ds.numel(x0) != Xloc.shape[0]:
        raise ValueError(
            'The parameter x0 should either be a single value or a vector of length equal to the number '
            'of rows in X. It can also be left ungiven as it has a default value.'
        )

    y = np.zeros(Xloc.shape)
    for i in range(Xloc.shape[0]):
        xrow = Xloc[i]
        # Compute the branch masks once; 'xt = xrow' would alias xrow, so the
        # in-place edits below would corrupt the masks between the two steps
        lo = xrow < x0[i]
        hi = ~lo
        xt = np.array(xrow)
        xt[lo] = (np.pi * xrow[lo]) / (2 * x0[i]) - np.pi / 2
        xt[hi] = (xrow[hi] - x0[i]) * np.pi / (2 * (1 - x0[i]))

        yt = np.zeros(len(xt))
        yt[lo] = x0[i] + x0[i] * np.sin(xt[lo])
        yt[hi] = x0[i] + (1 - x0[i]) * np.sin(xt[hi])

        y[i] = yt

    return y
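A minimal usage sketch with a hypothetical input (these snippets assume the source package's module-level imports, e.g. numpy as np, math, scipy.stats as spst, and internal helpers such as ds and nu; with the default x0, fuzzystretch needs only NumPy):

import numpy as np

# Each row is stretched around its pivot x0 (by default the mean of its
# positive values): entries below x0 are pushed towards 0 and entries
# above x0 towards 1, following a sine-shaped curve.
X = np.array([[0.1, 0.3, 0.5, 0.7, 0.9]])
y = fuzzystretch(X)  # same shape as X; inputs in [0, 1] stay in [0, 1]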
Example #2
def percentage_less_than(X, v):
    """
    Percentage of elements in matrix X that are less than the value v
    :param X: Matrix of numbers (numpy array)
    :param v: A value to be compared with
    :return: A percentage in the range [0.0, 1.0]
    """
    return np.sum(X < v) * 1.0 / ds.numel(X)
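Assuming ds.numel counts all elements like NumPy's size, the function reduces to a plain-NumPy one-liner; a quick sanity check:

import numpy as np

X = np.array([[1.0, 2.0], [3.0, 4.0]])
frac = np.mean(X < 2.5)  # 0.5: half of the elements are below 2.5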
Example #3
def isnan(X):
    if ds.numel(X) == 1:
        return math.isnan(X)
    elif len(np.shape(X)) == 1:
        res = np.zeros(np.shape(X), dtype=bool)
        for i in range(len(X)):
            res[i] = math.isnan(X[i])
        return res
    elif len(np.shape(X)) == 2:
        res = np.zeros(np.shape(X), dtype=bool)
        for i in range(np.size(X, 0)):
            for j in range(np.size(X, 1)):
                res[i, j] = math.isnan(X[i, j])
        return res
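For 1-D and 2-D arrays this is an element-wise loop over math.isnan; np.isnan is the vectorised equivalent, shown here as a sanity check:

import numpy as np

X = np.array([[1.0, np.nan], [np.nan, 4.0]])
mask = np.isnan(X)  # [[False, True], [True, False]]: same result, no loops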
Example #4
def autoNormalise(X):
    """
    Automatically normalise dataset X and filter it if needed

    :param X: Dataset matrix (numpy array)
    :return: array of normalisation codes
    """
    Xloc = np.array(X)

    twosided = np.sum(Xloc < 0) > 0.2 * np.sum(Xloc > 0)  # negative values number more than 20% of the positive values
    alreadylogs = np.sum(abs(Xloc) < 30) > 0.98 * ds.numel(Xloc)  # more than 98% of absolute values are below 30.0

    if twosided:
        return np.array([4])
        #return np.array([101, 4])
    else:
        Xloc[isnan(Xloc)] = 0.0
        Xloc[Xloc < 0] = 0.0
        if alreadylogs:
            Xf = normaliseSampleFeatureMat(Xloc, [13])[0]
            if isnormal_68_95_99p7_rule(Xf)[1] < isnormal_68_95_99p7_rule(Xloc)[1]:
                return np.array([13, 4])
            else:
                return np.array([4])
        else:
            Xl = normaliseSampleFeatureMat(Xloc, [3])[0]  # index 1  (Xloc, i.e. original X is index 0)
            Xlp = normaliseSampleFeatureMat(Xloc, [31])[0]  # index 2
            Xf = normaliseSampleFeatureMat(Xloc, [13])[0]  # index 3
            Xlf = normaliseSampleFeatureMat(Xl, [13])[0]  # index 4
            Xlpf = normaliseSampleFeatureMat(Xlp, [13])[0]  # index 5
            isnormal_stats = [isnormal_68_95_99p7_rule(Xloc)[1], isnormal_68_95_99p7_rule(Xl)[1],
                              isnormal_68_95_99p7_rule(Xlp)[1], isnormal_68_95_99p7_rule(Xf)[1],
                              isnormal_68_95_99p7_rule(Xlf)[1], isnormal_68_95_99p7_rule(Xlpf)[1]]
            most_normal_index = np.argmin(isnormal_stats)
            if most_normal_index == 0:
                return np.array([4])
            elif most_normal_index == 1:
                return np.array([3, 4])
            elif most_normal_index == 2:
                return np.array([31, 4])
            elif most_normal_index == 3:
                return np.array([13, 4])
            elif most_normal_index == 4:
                return np.array([3, 13, 4])
            elif most_normal_index == 5:
                return np.array([31, 13, 4])
            else:
                raise ValueError('You should never reach this error. Please contact {0}'.format(glob.email))
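The returned codes are not applied here; they are meant to be fed back into normaliseSampleFeatureMat, which is exactly what its type 1000 branch does. A sketch of the intended flow, assuming both functions are importable from the same module:

import numpy as np

X = np.abs(np.random.randn(100, 6)) * 1000.0  # hypothetical positive, unlogged data
codes = autoNormalise(X)                      # e.g. array([3, 4]) or array([31, 4])
Xn = normaliseSampleFeatureMat(X, codes)[0]   # apply the detected chain of codes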
Example #5
def filterFlat(X, GDM, Iincluded):
    Xloc = np.array(X)
    GDMloc = np.array(GDM)
    Iincludedloc = np.array(Iincluded)
    if ds.numel(GDMloc) == 0:
        return Xloc, GDMloc, Iincludedloc  # If input dataset is empty
    L = len(Xloc)
    Ng = GDMloc.shape[0]
    Iincluded2 = np.array([False for i in range(Ng)])
    for l in range(L):
        Iincluded2[GDMloc[:, l]] = np.bitwise_or(Iincluded2[GDMloc[:, l]], np.std(Xloc[l], axis=1) > 0)

    for l in range(L):
        Xloc[l] = Xloc[l][Iincluded2[GDMloc[:, l]]]
    GDMloc = GDMloc[Iincluded2]
    Iincludedloc[Iincludedloc] = Iincluded2
    return Xloc, GDMloc, Iincludedloc
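The core test is np.std(Xloc[l], axis=1) > 0: a gene is kept if its profile varies in at least one dataset. In miniature:

import numpy as np

Xl = np.array([[1.0, 1.0, 1.0],   # flat gene: std == 0, dropped
               [0.0, 2.0, 4.0]])  # varying gene: std > 0, kept
keep = np.std(Xl, axis=1) > 0     # array([False, True])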
Example #6
def calculateGDMandUpdateDatasets(X, Genes, Map=None, mapheader=True, OGsFirstColMap=True, delimGenesInMap='\\W+',
                                  OGsIncludedIfAtLeastInDatasets=1):
    Xloc = ds.listofarrays2arrayofarrays(X)
    Genesloc = deepcopy(Genes)
    if Map is None:
        OGsDatasets = deepcopy(Genes)
        OGs = np.unique(ds.flattenAList(OGsDatasets))  # Unique list of genes (or mapped genes)
        MapNew = None
        MapSpecies = None
    else:
        (OGs, OGsDatasets, MapNew, MapSpecies) = mapGenesToCommonIDs(Genes, Map, mapheader,
                                                                     OGsFirstColMap, delimGenesInMap)

    L = len(Genesloc)  # Number of datasets
    # Ng = len(OGs)  # Number of unique genes

    GDMall = np.transpose([np.in1d(OGs, gs) for gs in OGsDatasets])  # GDM: (Ng)x(L) boolean

    # Exclude OGs that do not exist in at least (OGsIncludedIfAtLeastInDatasets) datasets
    IncludedOGs = np.sum(GDMall, axis=1) >= OGsIncludedIfAtLeastInDatasets
    GDM = GDMall[IncludedOGs]
    OGs = OGs[IncludedOGs]
    if MapNew is not None:
        MapNew = MapNew[IncludedOGs]

    Ngs = np.sum(GDM, axis=0)  # Numbers of unique mapped genes in each dataset

    Xnew = np.array([None] * L, dtype=object)
    GenesDatasets = np.array([None] * L, dtype=object)
    for l in range(L):
        arelogs = np.nansum(abs(Xloc[l][~isnan(Xloc[l])]) < 30) > 0.98 * ds.numel(Xloc[l][~isnan(Xloc[l])])  # More than 98% of values are below 30.0
        d = Xloc[l].shape[1]  # Number of dimensions (samples) in this dataset
        Xnew[l] = np.zeros([Ngs[l], d], dtype=float)
        GenesDatasets[l] = np.empty(Ngs[l], dtype=object)
        OGsInThisDS = OGs[GDM[:, l]]  # Unique OGs in this dataset
        # TODO: Optimise the code below by exploiting ds.findArrayInSubArraysOfAnotherArray1D (like in line 203 above)
        for ogi in range(len(OGsInThisDS)):
            og = OGsInThisDS[ogi]
            if arelogs:
                Xnew[l][ogi] = np.log2(np.sum(np.power(2.0, Xloc[l][np.in1d(OGsDatasets[l], og)]), axis=0))
            else:
                Xnew[l][ogi] = np.sum(Xloc[l][np.in1d(OGsDatasets[l], og)], axis=0)
            GenesDatasets[l][ogi] = ds.concatenateStrings(Genesloc[l][np.in1d(OGsDatasets[l], og)])

    return Xnew, GDM, GDMall, OGs, MapNew, MapSpecies
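The GDM itself is one boolean column per dataset over the unique gene (OG) list, built with np.in1d as above. In miniature, with hypothetical gene names:

import numpy as np

OGs = np.array(['g1', 'g2', 'g3'])  # unique genes across all datasets
OGsDatasets = [np.array(['g1', 'g3']), np.array(['g2', 'g3'])]
GDM = np.transpose([np.in1d(OGs, gs) for gs in OGsDatasets])
# GDM[i, l] is True iff gene i appears in dataset l; here g3 appears in both.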
Example #7
def combineReplicates(X, replicatesIDs, flipSamples):
    Xloc = np.array(X)
    L = len(Xloc)

    for l in range(L):
        Xtmp = Xloc[l]
        arelogs = np.sum(abs(Xtmp) < 30) > 0.98 * ds.numel(Xtmp)  # More than 98% of values are below 30.0
        if flipSamples is not None and flipSamples[l] is not None and len(flipSamples[l]) == Xtmp.shape[1]:
            if arelogs:
                Xtmp[:, flipSamples[l] == 1] = -Xtmp[:, flipSamples[l] == 1]
            else:
                Xtmp[:, flipSamples[l] == 1] = np.divide(1.0, Xtmp[:, flipSamples[l] == 1])
        uniqueSamples = np.unique(replicatesIDs[l])
        uniqueSamples = uniqueSamples[uniqueSamples != -1]
        Xloc[l] = np.zeros([Xtmp.shape[0], len(uniqueSamples)])
        ss = 0
        for s in range(len(uniqueSamples)):
            if uniqueSamples[s] > -1:
                Xloc[l][:, ss] = np.median(Xtmp[:, replicatesIDs[l] == uniqueSamples[s]], axis=1)
                ss += 1

    return Xloc
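The replicate-combination step itself reduces to a per-ID column median; a plain-NumPy sketch of that step for one dataset (ignoring the log/flip handling):

import numpy as np

Xtmp = np.array([[1.0, 1.2, 5.0],
                 [2.0, 2.2, 6.0]])
replicatesIDs = np.array([0, 0, 1])  # first two columns are replicates of sample 0
combined = np.column_stack([np.median(Xtmp[:, replicatesIDs == r], axis=1)
                            for r in np.unique(replicatesIDs)])
# combined: [[1.1, 5.0], [2.1, 6.0]]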
Example #8
def generateCoPaM(U,
                  relabel_technique='minmin',
                  w=None,
                  X=None,
                  distCriterion='direct_euc',
                  K=0,
                  GDM=None):
    # Helping functions
    def calwmeans(w):
        wm = [
            np.mean(calwmeans(ww)) if isinstance(ww,
                                                 (list, tuple,
                                                  np.ndarray)) else np.mean(ww)
            for ww in w
        ]
        return np.array(wm)

    def CoPaMsdist(CoPaM1, CoPaM2):
        return np.linalg.norm(CoPaM1 - CoPaM2)

    def orderpartitions(U, method='rand', X=None, GDM=None):
        if method == 'rand':
            return np.random.permutation(range(len(U))), None
        elif method == 'mn':
            # TODO: Implement ranking partitions based on M-N plots
            raise NotImplementedError(
                'Ranking partitions based on the M-N plots logic has not been implemented yet.'
            )
        elif method == 'mse':
            R = len(U)
            mses = np.zeros(R)
            for r in range(R):
                if isinstance(U[r][0][0], (list, tuple, np.ndarray)):
                    mses[r] = np.mean(
                        orderpartitions(U[r], method=method, X=X, GDM=GDM)[1])
                else:
                    mses[r] = np.mean([
                        mn.mseclustersfuzzy(X,
                                            U[r],
                                            donormalise=False,
                                            GDM=GDM)
                    ])
            order = np.argsort(mses)
            return order, mses[order]

    # Fix parameters
    Uloc = ds.listofarrays2arrayofarrays(U)
    R = len(Uloc)
    if GDM is None:
        GDMloc = np.ones([Uloc[0].shape[0], R], dtype=bool)
    elif GDM.shape[1] == 1:
        if R > 1:
            GDMloc = np.tile(GDM, [1, R])
        else:
            GDMloc = np.array(GDM)
    else:
        GDMloc = np.array(GDM)
    if w is None or (isinstance(w, str) and w in ['all', 'equal']):
        w = np.ones(R)
    elif ds.numel(w) == 1:
        w = np.array([w for i in range(R)])
    wmeans = calwmeans(w)

    # Work!
    #permR = orderpartitions(Uloc, method='rand', X=X, GDM=GDM)[0]
    if GDM is None:
        permR = orderpartitions(Uloc, method='mse', X=X, GDM=None)[0]
    else:
        permR = orderpartitions(Uloc, method='mse', X=X, GDM=GDMloc)[0]
    Uloc = Uloc[permR]
    if GDMloc.shape[1] > 1:
        GDMloc = GDMloc[:, permR]
    wmeans = wmeans[permR]

    if isinstance(Uloc[0][0][0], (list, tuple, np.ndarray)):
        Uloc[0] = generateCoPaM(Uloc[0],
                                relabel_technique=relabel_technique,
                                w=w[0],
                                X=X,
                                distCriterion=distCriterion,
                                K=K,
                                GDM=GDMloc)
    #CoPaM = np.zeros([GDMloc.shape[0], Uloc[0].shape[1]], float)
    CoPaM = np.array(Uloc[0], dtype=float)
    K = CoPaM.shape[1]
    for r in range(1, R):
        if isinstance(Uloc[r][0][0], (list, tuple, np.ndarray)):
            Uloc[r] = generateCoPaM(Uloc[r],
                                    relabel_technique=relabel_technique,
                                    w=w[r],
                                    X=X,
                                    distCriterion=distCriterion,
                                    K=K,
                                    GDM=GDMloc)
        if Uloc[r].shape[1] != K:
            raise ValueError(
                'Unequal number of clusters in partition {}.'.format(r))

        Uloc[r] = relabelClusts(CoPaM,
                                Uloc[r],
                                method=relabel_technique,
                                X=X,
                                distCriterion=distCriterion)

        dotprod = np.dot(GDMloc[:, 0:r],
                         wmeans[0:r].transpose())  # (Mxr) * (rx1) = (Mx1)
        CoPaM[dotprod > 0] = nu.multiplyaxis(CoPaM[dotprod > 0],
                                             dotprod[dotprod > 0],
                                             axis=1)
        CoPaM[dotprod > 0] += wmeans[r] * Uloc[r][dotprod > 0]
        dotprod = np.dot(GDMloc[:, 0:(r + 1)], wmeans[0:(r + 1)].transpose())
        CoPaM[dotprod > 0] = nu.divideaxis(CoPaM[dotprod > 0],
                                           dotprod[dotprod > 0],
                                           axis=1)

    return CoPaM
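At its core, the CoPaM update is a running weighted mean of the relabelled fuzzy partition matrices; a minimal sketch of that arithmetic for two already-relabelled partitions, equal weights, and an all-True GDM:

import numpy as np

U1 = np.array([[1.0, 0.0], [0.0, 1.0]])
U2 = np.array([[0.8, 0.2], [0.1, 0.9]])
w = np.array([1.0, 1.0])
CoPaM = (w[0] * U1 + w[1] * U2) / w.sum()  # [[0.9, 0.1], [0.05, 0.95]]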
Example #9
def optimise_tukey_sqrtSCG(B,
                           X,
                           GDM,
                           clustdists=None,
                           smallestClusterSize=11,
                           tails=1,
                           Q3s=2):
    Bloc = np.array(B)
    Xloc = ds.listofarrays2arrayofarrays(X)

    [Ng, K] = Bloc.shape  # Ng genes and K clusters
    L = Xloc.shape[0]  # L datasets

    # Normalise clustdists to provide weights. If not provided, make it unity for all
    if clustdists is None:
        clustdistsloc = np.ones(K)
    else:
        clustdistsloc = [c for c in clustdists]

    # Find clusters' means (Cmeans), absolute shifted clusters genes (SCG),
    # and the empirical CDF functions for them (cdfs)
    Cmeans = np.array([None] * L, dtype=object)
    SCG = np.array([None] * L, dtype=object)

    Cgood = mnplotsdistancethreshold(clustdistsloc, method='largestgap')
    for l in range(L):
        Cmeans[l] = np.zeros([K,
                              Xloc[l].shape[1]])  # K clusters x D dimensions
        SCG[l] = np.zeros(
            [np.sum(np.sum(Bloc[GDM[:, l], :], axis=0)),
             Xloc[l].shape[1]])  # M* genes x D dimensions ...
        w = np.zeros(np.sum(np.sum(Bloc[GDM[:, l], :], axis=0)))  # M* genes
        # (M* is the total count of gene-to-cluster memberships, i.e. all genes in any cluster)

        gi = 0
        for k in range(K):
            Cmeans[l][k] = np.median(Xloc[l][Bloc[GDM[:, l], k], :], axis=0)
            if k in Cgood:
                csize = np.sum(Bloc[GDM[:, l], k])
                tmpSCG = nu.subtractaxis(Xloc[l][Bloc[GDM[:, l], k], :],
                                         Cmeans[l][k],
                                         axis=0)
                SCG[l][gi:(gi + csize), :] = np.abs(tmpSCG)
                gi += csize
        SCG[l] = SCG[l][np.any(
            SCG[l], axis=1)]  # Remove all zeros genes (rows of SCG[l])

        if ds.numel(SCG[l]) > 0:
            if tails == 1:
                Q3 = np.percentile(SCG[l], q=75, axis=0)
                thresh = Q3s * Q3
                SCGouts = SCG[l] > np.array(
                    [thresh for ii in range(0, SCG[l].shape[0])])
                SCG[l][
                    SCGouts] = 0.0  # Set the outlier values to zeros so they do not affect decisions later on
            elif tails == 2:
                Q1 = np.percentile(np.sqrt(SCG[l]), q=25, axis=0)
                Q3 = np.percentile(np.sqrt(SCG[l]), q=75, axis=0)
                IQR = np.subtract(Q3, Q1)
                thresh = np.add(Q3, 1.5 * IQR)
                SCGouts = np.sqrt(SCG[l]) > np.array(
                    [thresh for ii in range(0, SCG[l].shape[0])])
                SCG[l][
                    SCGouts] = 0.0  # Set the outlier values to zeros so they do not affect decisions later on
            else:
                raise ValueError(
                    'Invalid number of tails. It should be either 1 or 2.')
        else:
            SCG[l] = np.zeros((1, SCG[l].shape[1]))

    # Clusters mins and maxes (NEW)
    Cmins = np.array([None] * L, dtype=object)
    Cmaxes = np.array([None] * L, dtype=object)
    for l in range(L):
        Cmins[l] = np.zeros([K, Xloc[l].shape[1]])  # K clusters x D dimensions
        Cmaxes[l] = np.zeros([K,
                              Xloc[l].shape[1]])  # K clusters x D dimensions
        for k in range(K):
            Cmins[l][k] = Cmeans[l][k] - np.max(SCG[l], axis=0)
            Cmaxes[l][k] = Cmeans[l][k] + np.max(SCG[l], axis=0)

    # Resolve overlaps between clusters (NEW)
    for k1 in range(K):
        for k2 in range(K):
            # Compare the pair of clusters only once, and don't compare a cluster with itself. This if statement
            # guarantees that k2 will always be a later cluster than k1.
            if (k1 >= k2):
                continue
            # v_smallestoverlap is the value of the smallest overlap between the ranges of clusters k1 and k2;
            # l_smallestoverlap and d_smallestoverlap are the dataset and the dimension at which it is found.
            # t_smallestoverlap is the type of the overlap (-1, 0, 1, or 2). Type (-1) means that the entire (min
            # to max) range of one cluster is within the range of the other cluster; this is the worst overlap.
            # Type (0) means that the max of (k1) is within the (min to max) range of (k2), and type (1) is the
            # other way around. Type (2) means there is no overlap; this is the best case, and finding it ends
            # the comparison of this pair of clusters.
            v_smallestoverlap = 0
            l_smallestoverlap = -1
            d_smallestoverlap = -1
            t_smallestoverlap = -1  # Overlap type, read above
            for l in range(L):
                Nd = len(Cmins[l][k1])  # Dimensions in this dataset
                for d in range(Nd):
                    x1 = Cmaxes[l][k1][d]
                    x2 = Cmaxes[l][k2][d]
                    n1 = Cmins[l][k1][d]
                    n2 = Cmins[l][k2][d]
                    if (x1 > n2 and x1 <= x2):
                        if (n1 < n2):
                            ov = x1 - n2
                            if (t_smallestoverlap == -1
                                    or ov < v_smallestoverlap):
                                t_smallestoverlap = 0
                                v_smallestoverlap = ov
                                l_smallestoverlap = l
                                d_smallestoverlap = d
                    elif (x2 > n1 and x2 <= x1):
                        if (n2 < n1):
                            ov = x2 - n1
                            if (t_smallestoverlap == -1
                                    or ov < v_smallestoverlap):
                                t_smallestoverlap = 1
                                v_smallestoverlap = ov
                                l_smallestoverlap = l
                                d_smallestoverlap = d
                    else:
                        t_smallestoverlap = 2
                        break  # No overlap at this dimension, so k1 and k2 are distinct; stop comparing
                if (t_smallestoverlap == 2):
                    break  # No overlap at some dimension, so k1 and k2 are distinct; stop comparing

            # Sort out the overlap if exists between k1 and k2
            if (t_smallestoverlap == -1):
                # Here one of the two clusters always swallows the other one. So effectively remove the later one (k2).
                # Cluster removal is by making its minimum larger than its maximum at a single point (at l=0, d=0),
                # so effectively no gene will ever be mapped to it!
                Cmins[0][k2][0] = 1
                Cmaxes[0][k2][0] = 0
            elif (t_smallestoverlap == 0):
                Cmins[l_smallestoverlap][k2][d_smallestoverlap] = \
                    Cmaxes[l_smallestoverlap][k1][d_smallestoverlap] + sys.float_info.epsilon
            elif (t_smallestoverlap == 1):
                Cmaxes[l_smallestoverlap][k2][d_smallestoverlap] = \
                    Cmins[l_smallestoverlap][k1][d_smallestoverlap] - sys.float_info.epsilon

    # Find who belongs (NEW)
    belongs = np.ones([Ng, K, L],
                      dtype=bool)  # Ng genes x K clusters x L datasets
    for l in range(L):
        for k in range(K):
            tmp1 = nu.largerthanaxis(Xloc[l],
                                     Cmins[l][k],
                                     axis=0,
                                     orequal=True)
            tmp2 = nu.lessthanaxis(Xloc[l], Cmaxes[l][k], axis=0, orequal=True)
            belongs[GDM[:, l], k, l] = np.all(np.logical_and(tmp1, tmp2),
                                              axis=1)

    # # Helping function (OLD - to be removed)
    # def iswithinworse(ref, x):
    #     return x <= np.max(ref)
    #
    # # Find who belongs (OLD - to be removed)
    # belongs = np.ones([Ng, K, L], dtype=bool)  # Ng genes x K clusters x L datasets
    # for l in range(L):
    #     for k in range(K):
    #         for d in range(Xloc[l].shape[1]):
    #             tmpX = np.abs(Xloc[l][:, d] - Cmeans[l][k, d])
    #             belongs[GDM[:, l], k, l] &= iswithinworse(SCG[l][:, d], tmpX)

    # Include in clusters genes which belongs everywhere (OLD - to be removed)
    B_out = np.all(belongs, axis=2)

    # Solve genes included in two clusters (OLD - should not be needed now - TO BE REMOVED)
    solution = 2
    if solution == 1:
        # Genes included in two clusters are assigned to the closest cluster in terms of the worst distance
        # (this guarantees that the worst belongingness of a gene to a cluster is optimised)
        f = np.nonzero(np.sum(B_out, axis=1) > 1)[0]
        for fi in f:
            ficlusts = np.nonzero(
                B_out[fi])[0]  # Clusters competing over gene fi
            fidatasets = np.nonzero(GDM[fi])[0]  # Datasets that have gene fi
            localdists = np.zeros([
                len(ficlusts), len(fidatasets)
            ])  # (Clusts competing) x (datasets that have fi)
            for l in range(len(fidatasets)):
                ll = fidatasets[l]  # Actual dataset index
                fi_ll = np.sum(GDM[:fi, ll])  # Index of fi in this Xloc[ll]
                localdists[:, l] = nu.dist_matrices(Cmeans[ll][ficlusts],
                                                    Xloc[ll][fi_ll]).reshape(
                                                        [len(ficlusts)])
            localdists = np.max(localdists, axis=1)  # (Clusts competing) x 1
            ficlosest = np.argmin(localdists)  # Closest cluster
            B_out[fi] = False
            B_out[fi, ficlusts[ficlosest]] = True
    elif solution == 2:
        # Genes included in two clusters, include them in the earlier cluster (smallest k)
        f = np.nonzero(np.sum(B_out, axis=1) > 1)[0]
        for fi in f:
            ficlusts = np.nonzero(
                B_out[fi])[0]  # Clusters competing over gene fi
            ficlosest = np.argmin(ficlusts)  # earliest cluster (smallest k)
            B_out[fi] = False
            B_out[fi, ficlusts[ficlosest]] = True

    # Remove clusters smaller than minimum cluster size
    ClusterSizes = np.sum(B_out, axis=0)
    B_out = B_out[:, ClusterSizes >= smallestClusterSize]

    return B_out
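The one-tailed outlier rule (tails=1) zeroes any absolute shifted value above Q3s times the column-wise third quartile; a minimal sketch of that fence on a single column of SCG values:

import numpy as np

scg = np.array([0.10, 0.15, 0.20, 3.00])  # absolute deviations from a cluster median
Q3 = np.percentile(scg, 75)               # third quartile (0.90 here)
scg[scg > 2 * Q3] = 0.0                   # Q3s defaults to 2; the 3.00 outlier is zeroed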
Example #10
def isnormal_68_95_99p7_rule(X):
    """
    Test if data is normally distributed by checking the percentages of values below different stds away from the mean
    This is not fully implemented and is not used in the current version of the method
    :param X: Dataset matrix (numpy array)
    :return:
    """
    n = ds.numel(X)
    m = np.mean(X)
    s = np.std(X)

    bins = np.linspace(np.min(X), np.max(X), 100)
    d = np.digitize(np.concatenate(X), bins)
    xd = bins[d-1]
    mode = spst.mode(xd)[0]

    # Find the fraction of elements less than each of these eight values
    m3s = percentage_less_than(X, m - 3 * s)  # mean minus 3s (theory ~= N(0.0013, s=0.0315/sqrt(n)))
    m2s = percentage_less_than(X, m - 2 * s)  # mean minus 2s (theory ~= N(0.0228, s=0.1153/sqrt(n)))
    m1s = percentage_less_than(X, m - 1 * s)  # mean minus 1s (theory ~= N(0.1587, s=0.2116/sqrt(n)))
    p0s = percentage_less_than(X, m)  # mean (theory ~= N(0.5000, s=0.3013/sqrt(n)))
    p1s = percentage_less_than(X, m + 1 * s)  # mean plus 1s (theory ~= N(0.8413, s=0.2116/sqrt(n)))
    p2s = percentage_less_than(X, m + 2 * s)  # mean plus 2s (theory ~= N(0.9772, s=0.1153/sqrt(n)))
    p3s = percentage_less_than(X, m + 3 * s)  # mean plus 3s (theory ~= N(0.9987, s=0.0315/sqrt(n)))
    md = percentage_less_than(X, mode)  # mode (theory ~= N(0.5000, s=0.3013/sqrt(n)))

    # How were these theoretical distributions calculated??
    # The distributions of these stds were found empirically by calculating them from 1000x26 randomly generated
    # normally distributed numbers ~N(0.0, 1.0). 26 different population sizes were considered "round(10.^(1:0.2:6))",
    # at each population size, 1000 random populations were generated. It was observed that at a fixed population size,
    # the percentages of elements less than (m-3*s) or (m-2*s) ... (etc.) were normally distributed with an average
    # equal to the expected CDF at (m-3*s) or (m-2*s) ... (etc.) and with a standard deviation that is inversely
    # linearly proportional to the square root of the size of the population. The empirical values were calculated from
    # this experiment and are included above. For example: the percentage of elements that are less than (m-2*s) in a
    # population of n elements is expected to be 0.0228 (2.28%) with a standard deviation of 0.1153/sqrt(n).
    # This empirical test was run in MATLAB.

    # Calculate one-tailed p-values for the eight values above based on normal distributions.
    # Each entry is (observed fraction, theoretical mean, scale coefficient of the std).
    theory = [(m3s, 0.0013, 0.0315), (m2s, 0.0228, 0.1153), (m1s, 0.1587, 0.2116),
              (p0s, 0.5000, 0.3013), (p1s, 0.8413, 0.2116), (p2s, 0.9772, 0.1153),
              (p3s, 0.9987, 0.0315), (md, 0.5000, 0.3013)]
    pv = np.zeros(len(theory))
    diff = np.zeros(len(theory))
    for i, (stat, mu, sc) in enumerate(theory):
        pv[i] = 1 - 2 * spst.norm.cdf(-abs(stat - mu), loc=0, scale=sc / math.sqrt(n))
        diff[i] = abs(stat - mu)

    return np.mean(np.log10(pv)), np.mean(diff)
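A quick empirical check of the 68-95-99.7 fractions the test is built on: for a large normal sample, the fractions below m-1s, m, and m+1s approach 0.1587, 0.5000, and 0.8413.

import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(size=100000)
m, s = x.mean(), x.std()
print(np.mean(x < m - s), np.mean(x < m), np.mean(x < m + s))
# ~0.1587  ~0.5000  ~0.8413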
Example #11
def normaliseSampleFeatureMat(X, type):
    """
    X = normalizeSampleFeatureMat(X, type)

    type: 0 (none), 1 (divide by mean), 2 (divide by the first),
        3 (take log2), 31 (take log2 after setting all values < 1.0 to 1.0, i.e. guarantee positive log),
        4 (subtract the mean and divide by the std),
        5 (divide by the sum), 6 (subtract the mean),
        7 (divide by the max), 8 (2 to the power X), 9 (subtract the min),
        10 (rank: 1 for lowest, then 2, 3, ...; average on ties),
        11 (rank, like 10 but order arbitrarly on ties),
        12 (normalise to the [0 1] range),
        13 (Genes with low values everywhere are set to zeros; bimodel distribution is fit to maxima of rows)

        101 (quantile), 102 (subtract columns (samples) means),
        103 (subtract global mean)

        1000 (Automatically detect normalisation)

    If (type) was a vector like [3 1], this means to apply normalisation
    type (3) over (X) then to apply type (1) over the result. And so on.

    :param X:
    :param type:
    :return:
    """
    Xout = np.array(X)
    if ds.numel(Xout) == 0:
        return Xout, [0]  # If array is empty
    codes = np.array(type)  # stays as input types unless auto-normalisation (type 1000) changes it

    if isinstance(type, (list, tuple, np.ndarray)):
        # Reason: if any of the types is (1000), it gets replaced with the actual codes that were applied
        j = 0
        for i in range(len(type)):
            Xout, codesi = normaliseSampleFeatureMat(Xout, type[i])
            if isinstance(codesi, (list, tuple, np.ndarray)) and np.ndim(codesi) > 0:
                codes[j] = codesi[0]
                codes = np.insert(codes, j+1, codesi[1:])
                j = j + len(codesi)
            else:
                j = j + 1
        return Xout, codes

    if type == 1:
        # 1: Divide by the mean
        Xout = nu.divideaxis(Xout, np.mean(Xout, axis=1), 1)

    if type == 2:
        # 2: Divide by the first value
        Xout = nu.divideaxis(Xout, Xout[:, 0], 1)

    if type == 3:
        # 3: Take log2
        Xout[Xout <= 0] = float('nan')
        Xout = np.log2(Xout)
        ind1 = np.any(isnan(Xout), axis=1)
        Xout[ind1] = fixnans(Xout[ind1])

    if type == 31:
        # 31: Set all values < 1 to 1 then take log (guarantee a positive log)
        Xout[Xout <= 1] = 1
        Xout = np.log2(Xout)

    if type == 4:
        # 4: Subtract the mean and divide by the std
        Xout = nu.subtractaxis(Xout, np.mean(Xout, axis=1), axis=1)
        ConstGenesIndices = np.std(Xout, axis=1) == 0
        Xout = nu.divideaxis(Xout, np.std(Xout, axis=1), axis=1)
        Xout[ConstGenesIndices] = 0

    if type == 5:
        # 5: Divide by the sum
        Xout = nu.divideaxis(Xout, np.sum(Xout, axis=1), axis=1)

    if type == 6:
        # 6: Subtract the mean
        Xout = nu.subtractaxis(Xout, np.mean(Xout, axis=1), axis=1)

    if type == 7:
        # 7: Divide by the maximum
        Xout = nu.divideaxis(Xout, np.max(Xout, axis=1), axis=1)

    if type == 8:
        # 8: (2 to the power X)
        Xout = np.power(2, Xout)

    if type == 9:
        # 9: Subtract the min
        Xout = nu.subtractaxis(Xout, np.min(Xout, axis=1), axis=1)

    if type == 10:
        # 10: Rank: 0 for lowest, then 1, 2, ...; average on ties
        Xout = spmstats.rankdata(Xout, axis=0) - 1

    if type == 11:
        # 11: Rank: 0 for lowest, then 1, 2, ...; arbitrary order on ties
        Xout = np.argsort(np.argsort(Xout, axis=0), axis=0)

    if type == 12:
        # 12: Normalise to the [0 1] range
        Xout = nu.subtractaxis(Xout, np.min(Xout, axis=1), axis=1)
        Xout = nu.divideaxis(Xout, np.max(Xout, axis=1), axis=1)

    if type == 13:
        # 13: Genes with low values everywhere are set to zeros; a bimodal distribution is fit to the maxima of rows
        Xout = filterBimodal(X)

    # 100s
    if type == 101:
        # 101: quantile
        av = np.mean(np.sort(Xout, axis=0), axis=1)
        II = np.argsort(np.argsort(Xout, axis=0), axis=0)
        Xout = av[II]

    if type == 102:
        # 102: subtract the mean of each sample (column) from it
        Xout = nu.subtractaxis(Xout, np.mean(Xout, axis=0), axis=0)

    if type == 103:
        # 103: subtract the global mean of the data
        Xout -= np.mean(Xout)

    if type == 1000:
        # 1000: automatically detect normalisation
        codes = autoNormalise(Xout)
        Xout = normaliseSampleFeatureMat(Xout, codes)[0]
        codes = np.append([101], codes)


    return Xout, codes
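A plain-NumPy sketch of the commonly composed chain [3, 4] (log2, then per-row z-scoring), mirroring what the nu.subtractaxis/nu.divideaxis helpers are assumed to do, and skipping the non-positive-value handling of type 3:

import numpy as np

X = np.array([[2.0, 4.0, 8.0],
              [1.0, 16.0, 256.0]])
Xl = np.log2(X)                                # type 3: take log2
Xz = ((Xl - Xl.mean(axis=1, keepdims=True))
      / Xl.std(axis=1, keepdims=True))         # type 4: row-wise z-score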
Example #12
def arelogs_function(X):
    I = np.bitwise_and(~isnan(X), X>0)
    return np.nansum(abs(X[I]) < 30) > 0.98 * ds.numel(X[I])  # More than 98% of values are below 30.0
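The heuristic in action (assuming ds.numel counts elements like np.size): raw expression counts routinely exceed 30, while log2 values rarely do.

import numpy as np

Xraw = np.array([10.0, 5000.0, 120000.0])  # raw counts: most values >= 30, so not logs
Xlog = np.log2(Xraw + 1.0)                 # ~3.5, ~12.3, ~16.9: all < 30, so logs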
Example #13
def mnplotsgreedy(X,
                  B,
                  type='A',
                  params=None,
                  allMSE=None,
                  tightnessweight=1,
                  setsP=None,
                  setsN=None,
                  Xtype='data',
                  mseCache=None,
                  wsets=None,
                  GDM=None,
                  msesummary='average',
                  percentageOfClustersKept=100,
                  smallestClusterSize=11,
                  Xnames=None,
                  ncores=1):
    Xloc = ds.listofarrays2arrayofarrays(X)
    Bloc = ds.reduceToArrayOfNDArraysAsObjects(B, 2)
    L = Xloc.shape[0]  # Number of datasets

    # Fix parameters
    if params is None: params = {}
    if setsP is None: setsP = [x for x in range(int(math.floor(L / 2)))]
    if setsN is None: setsN = [x for x in range(int(math.floor(L / 2)), L)]
    setsPN = np.array(np.concatenate((setsP, setsN), axis=0), dtype=int)
    Xloc = Xloc[setsPN]
    L = Xloc.shape[0]
    if wsets is None:
        wsets = np.array([1 for x in range(L)])
    if GDM is None:
        Ng = np.shape(Xloc[0])[0]
        GDMloc = np.ones([Ng, L], dtype='bool')
    else:
        Ng = np.shape(GDM)[0]
        GDMloc = GDM[:, setsPN]
    if Xnames is None:
        Xnames = ['X{0}'.format(l) for l in range(L)]

    # Put all clusters in one matrix
    N = Bloc.shape[0]  # Number of partitions
    K = [Bloc[i].shape[1]
         for i in range(N)]  # Number of clusters in each partition

    # One big matrix for all clusters
    BB = Bloc[0]
    for n in range(1, N):
        BB = np.append(BB, Bloc[n], axis=1)
    VMc = np.sum(BB, axis=0)
    NN = len(VMc)  # Total number of clusters

    # Return a basic output if there are no input clusters
    if ds.numel(GDMloc) == 0:
        params = dict(
            params, **{
                'tightnessweight': tightnessweight,
                'msesummary': msesummary,
                'percentageofclusterskept': percentageOfClustersKept,
                'smallestclustersize': smallestClusterSize
            })

        MNResults = collections.namedtuple('MNResults', [
            'B', 'I', 'allVecs', 'allDists', 'allMSE', 'mseCache', 'Ball',
            'params'
        ])
        B_out = np.empty([0, 0])
        I = np.empty([0, 0])
        allVecs = np.empty([0, 0])
        allDists = np.empty([0, 0])
        return MNResults(B_out, I, allVecs, allDists, allMSE, mseCache, BB,
                         params)

    # Fill Vmse if not provided
    if mseCache is None and allMSE is None:
        # Cache all mse values
        mseCache = np.zeros([NN, L])
        io.resetparallelprogress(NN * L)
        for l in range(L):
            if Xtype == 'files':
                # load files here
                raise NotImplementedError(
                    'Xtype "files" has not been implemented yet.')
            elif Xtype == 'data':
                Xtmp = Xloc[l]
            else:
                raise ValueError(
                    'Xtype has to be "files" or "data". The given Xtype is invalid.'
                )

            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                mseCachetmp = Parallel(n_jobs=ncores)\
                    (delayed(mseclusters)
                     (Xtmp, ds.matlablike_index2D(BB, GDMloc[:, l], nn), 0) for nn in range(NN))
                mseCachetmp = [mm[0] for mm in mseCachetmp]
                for nn in range(NN):
                    mseCache[nn, l] = mseCachetmp[nn]

                gc.collect()

                #io.updateparallelprogress(NN)
            '''
            for nn in range(NN):
                mseCache[nn, l] = mseclusters(Xtmp, ds.matlablike_index2D(BB, GDMloc[:, l], nn), 0)[0]
            io.log('Done cluster evaluation for {0} have been calculated.'.format(Xnames[l]))
            '''

    # Calculate allMSE if needed (Nx1)
    if allMSE is None:
        if type == 'A':
            wsetsloc = wsets[setsPN]
            wsetsloc = [float(n) / sum(wsetsloc) for n in wsetsloc]
            if msesummary == 'average' or msesummary == 'mean':
                allMSE = np.dot(mseCache[:, setsPN], wsetsloc)
            elif msesummary == 'worse' or msesummary == 'max':
                allMSE = np.max(np.multiply(mseCache[:, setsPN], wsetsloc),
                                axis=1)
            else:
                raise ValueError(
                    'msesummary has to be "average", "mean", "worse", or "max"; '
                    '"average" and "mean" behave identically, as do "worse" and "max".'
                )
        elif type == 'B':
            wsetsP = wsets[setsP]
            wsetsP = [n / sum(wsetsP) for n in wsetsP]
            wsetsN = wsets[setsN]
            wsetsN = [n / sum(wsetsN) for n in wsetsN]
            if msesummary == 'average' or msesummary == 'mean':
                allMSE = np.dot(mseCache[:, setsP], wsetsP) - np.dot(
                    mseCache[:, setsN], wsetsN)
            elif msesummary == 'worse' or msesummary == 'max':
                allMSE = np.max(np.multiply(mseCache[:, setsP], wsetsP), axis=1) \
                         - np.max(np.multiply(mseCache[:, setsN], wsetsN), axis=1)
            else:
                raise ValueError(
                    'msesummary has to be "average", "mean", "worse", or "max"; '
                    '"average" and "mean" behave identically, as do "worse" and "max".'
                )
        else:
            raise ValueError(
                'Type should be either A or B; given type is invalid.')

    # Find the distances
    maxx = np.max(allMSE[~np.isnan(allMSE)])
    minx = np.min(allMSE[~np.isnan(allMSE)])
    maxy = np.log10(np.max(VMc))
    miny = 0
    with np.errstate(divide='ignore'):
        allVecs = np.concatenate(
            ([(allMSE - minx) / (maxx - minx)], [(np.log10(VMc) - miny) /
                                                 (maxy - miny)]),
            axis=0).transpose()
    allVecs[:, 0] *= tightnessweight
    allDists = np.array([
        np.sqrt(1.1 + np.power(tightnessweight, 2))
        if np.any(np.isnan(n)) else sp.spatial.distance.euclidean(n, [0, 1])
        for n in allVecs
    ])
    alpha = 0.0001
    tmp, uVdsI = np.unique(allDists, return_index=True)
    while len(uVdsI) != len(allDists):
        for n in range(len(allDists)):
            if n not in uVdsI:
                allDists[n] += alpha * np.random.normal()
        tmp, uVdsI = np.unique(allDists, return_index=True)

    # Helper function for greedy solution below
    def mngreedy(Bloc, I, Vds, iter=float('inf')):
        Vdsloc = np.array(Vds)
        res = np.array([False for n in Vdsloc])
        if iter == 0 or not any(I):
            return res
        for n in range(len(I)):
            if not I[n]:
                Vdsloc[n] = float('inf')
        p = np.argmin(Vdsloc)
        res[p] = True
        #II = I
        overlaps = np.dot(
            ds.matlablike_index2D(Bloc, 'all', p).transpose(), Bloc) > 0
        I &= ~overlaps
        return res | mngreedy(Bloc, I, Vdsloc, iter - 1)

    # ** Find greedy solution **
    # Sort clusters based on distances (not important, but benefits the output)
    II = np.argsort(allDists)
    allDists = allDists[II]
    BB = ds.matlablike_index2D(BB, 'a', II)
    allVecs = ds.matlablike_index2D(allVecs, II, 'a')
    allMSE = allMSE[II]
    mseCache = ds.matlablike_index2D(mseCache, II, 'a')
    VMc = VMc[II]

    # include the top XX% of the clusters that have at least smallestClusterSize
    Ismall = VMc < smallestClusterSize
    Inans = np.isnan(allDists)
    tmpDists = [
        np.max(allDists) if Inans[n] | Ismall[n] else allDists[n]
        for n in range(len(allDists))
    ]
    percentageOfClustersKept *= float(np.sum(~Ismall)) / len(allDists)
    Iincluded = (tmpDists <= np.percentile(
        tmpDists, percentageOfClustersKept)) & (np.bitwise_not(Ismall))
    I = mngreedy(BB, Iincluded, allDists)
    B_out = ds.matlablike_index2D(BB, 'a', I)

    # Prepare and return the results:
    params = dict(
        params, **{
            'tightnessweight': tightnessweight,
            'msesummary': msesummary,
            'percentageofclusterskept': percentageOfClustersKept,
            'smallestclustersize': smallestClusterSize
        })

    MNResults = collections.namedtuple('MNResults', [
        'B', 'I', 'allVecs', 'allDists', 'allMSE', 'mseCache', 'Ball', 'params'
    ])
    return MNResults(B_out, I, allVecs, allDists, allMSE, mseCache, BB, params)
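The ranking criterion above maps each cluster to a 2-D point (scaled MSE times tightnessweight, scaled log10 of cluster size) and scores it by its Euclidean distance to the ideal corner (0, 1), i.e. zero dispersion and maximal size. In miniature:

import numpy as np

vec = np.array([0.2, 0.9])                         # (scaled MSE, scaled log10 size)
dist = np.linalg.norm(vec - np.array([0.0, 1.0]))  # ~0.224: smaller is better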