Example #1
def fuzzystretch(X, x0=None):
    Xloc = np.array(X)
    if x0 is None:
        x0 = np.array([np.mean(xrow[xrow > 0]) for xrow in Xloc])
        x0[x0 == 1] = 0.5
    elif ds.numel(x0) == 1:
        x0 = np.array([x0 for i in range(Xloc.shape[0])])
    elif ds.numel(x0) != Xloc.shape[0]:
        raise ValueError(
            'The parameter x0 should either be a single value or a vector of length equal to the number '
            'of rows in X. It can also be omitted, in which case a default value is computed.'
        )

    y = np.zeros(Xloc.shape)
    for i in range(Xloc.shape[0]):
        xrow = Xloc[i]
        # Compute the masks once: xt must be a copy, because writing through a
        # view of xrow would corrupt the masks and silently modify Xloc
        lo = xrow < x0[i]
        hi = ~lo
        xt = np.array(xrow)
        xt[lo] = (np.pi * xrow[lo]) / (2 * x0[i]) - np.pi / 2
        xt[hi] = (xrow[hi] - x0[i]) * np.pi / (2 * (1 - x0[i]))

        yt = np.zeros(len(xt))
        yt[lo] = x0[i] + x0[i] * np.sin(xt[lo])
        yt[hi] = x0[i] + (1 - x0[i]) * np.sin(xt[hi])

        y[i] = yt

    return y
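A minimal usage sketch with hypothetical data (assumes numpy is imported as np, as the function requires). With inputs in [0, 1], the stretch maps each row through a sine S-curve around its crossover point x0:

import numpy as np

# Hypothetical 2x4 matrix with values in [0, 1]; by default each row's x0 is
# the mean of its positive values
X = np.array([[0.1, 0.4, 0.5, 0.9],
              [0.2, 0.3, 0.6, 0.8]])
Y = fuzzystretch(X)  # same shape; values below x0 are pushed towards 0 and
                     # values above x0 towards 1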
Example #2
def combineReplicates(X, replicatesIDs, flipSamples):
    Xloc = np.array(X)
    L = len(Xloc)

    for l in range(L):
        Xtmp = Xloc[l]
        # Data is assumed to be in log space if more than 98% of absolute
        # values are below 30.0
        arelogs = np.sum(abs(Xtmp) < 30) > 0.98 * ds.numel(Xtmp)
        if flipSamples is not None and flipSamples[l] is not None \
                and len(flipSamples[l]) == Xtmp.shape[1]:
            # Flip the marked samples: negate in log space, take the
            # reciprocal in linear space
            if arelogs:
                Xtmp[:, flipSamples[l] == 1] = -Xtmp[:, flipSamples[l] == 1]
            else:
                Xtmp[:, flipSamples[l] == 1] = np.divide(1.0, Xtmp[:, flipSamples[l] == 1])
        uniqueSamples = np.unique(replicatesIDs[l])
        uniqueSamples = uniqueSamples[uniqueSamples != -1]  # -1 marks columns to discard
        Xloc[l] = np.zeros([Xtmp.shape[0], len(uniqueSamples)])
        for s in range(len(uniqueSamples)):
            # Combine the replicate columns of each sample by their median
            Xloc[l][:, s] = np.median(Xtmp[:, replicatesIDs[l] == uniqueSamples[s]], axis=1)

    return Xloc
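The core of the combination step is a per-sample median over replicate columns. A self-contained numpy sketch of just that step, with illustrative data (not from the source):

import numpy as np

X = np.array([[1.0, 1.2, 5.0, 5.4],
              [2.0, 2.1, 6.0, 6.2]])       # 2 genes x 4 replicate columns
replicatesIDs = np.array([0, 0, 1, 1])     # column -> sample ID (-1 = discard)
samples = np.unique(replicatesIDs[replicatesIDs != -1])
Xc = np.column_stack([np.median(X[:, replicatesIDs == s], axis=1)
                      for s in samples])   # 2 genes x 2 combined samples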
Example #3
def calculateGDMandUpdateDatasets(X,
                                  Genes,
                                  Map=None,
                                  mapheader=True,
                                  OGsFirstColMap=True,
                                  delimGenesInMap='\\W+',
                                  OGsIncludedIfAtLeastInDatasets=1):
    Xloc = ds.listofarrays2arrayofarrays(X)
    Genesloc = deepcopy(Genes)
    if Map is None:
        OGsDatasets = deepcopy(Genes)
        OGs = np.unique(ds.flattenAList(
            OGsDatasets))  # Unique list of genes (or mapped genes)
        MapNew = None
        MapSpecies = None
    else:
        (OGs, OGsDatasets, MapNew,
         MapSpecies) = mapGenesToCommonIDs(Genes, Map, mapheader,
                                           OGsFirstColMap, delimGenesInMap)

    L = len(Genesloc)  # Number of datasets
    # Ng = len(OGs)  # Number of unique genes

    GDMall = np.transpose([np.in1d(OGs, gs)
                           for gs in OGsDatasets])  # GDM: (Ng)x(L) boolean

    # Exclude OGs that do not exist in at least (OGsIncludedIfAtLeastInDatasets) datasets
    IncludedOGs = np.sum(GDMall, axis=1) >= OGsIncludedIfAtLeastInDatasets
    GDM = GDMall[IncludedOGs]
    OGs = OGs[IncludedOGs]
    if MapNew is not None:
        MapNew = MapNew[IncludedOGs]

    Ngs = np.sum(GDM, axis=0)  # Numbers of unique mapped genes in each dataset

    Xnew = np.array([None] * L, dtype=object)
    GenesDatasets = np.array([None] * L, dtype=object)
    for l in range(L):
        # Data is assumed to be in log space if more than 98% of (non-NaN)
        # absolute values are below 30.0
        Xnotnan = Xloc[l][~isnan(Xloc[l])]
        arelogs = np.nansum(abs(Xnotnan) < 30) > 0.98 * ds.numel(Xnotnan)
        d = Xloc[l].shape[1]  # Number of dimensions (samples) in this dataset
        Xnew[l] = np.zeros([Ngs[l], d], dtype=float)
        GenesDatasets[l] = np.empty(Ngs[l], dtype=object)
        OGsInThisDS = OGs[GDM[:, l]]  # Unique OGs in this dataset
        # TODO: Optimise the code below by exploiting ds.findArrayInSubArraysOfAnotherArray1D (like in line 203 above)
        for ogi in range(len(OGsInThisDS)):
            og = OGsInThisDS[ogi]
            # Rows mapping to the same OG are summed; log data is unlogged,
            # summed in linear space, then relogged
            rows = np.in1d(OGsDatasets[l], og)
            if arelogs:
                Xnew[l][ogi] = np.log2(np.sum(np.power(2.0, Xloc[l][rows]), axis=0))
            else:
                Xnew[l][ogi] = np.sum(Xloc[l][rows], axis=0)
            GenesDatasets[l][ogi] = ds.concatenateStrings(Genesloc[l][rows])

    return Xnew, GDM, GDMall, OGs, MapNew, MapSpecies
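The GDM construction at the heart of this function is a single np.in1d per dataset. A minimal self-contained sketch with hypothetical gene lists:

import numpy as np

OGs = np.array(['g1', 'g2', 'g3', 'g4'])      # unique gene IDs
OGsDatasets = [np.array(['g1', 'g3']),        # genes present in dataset 0
               np.array(['g2', 'g3', 'g4'])]  # genes present in dataset 1
GDM = np.transpose([np.in1d(OGs, gs) for gs in OGsDatasets])
# GDM is a 4x2 boolean matrix: GDM[i, l] is True iff OGs[i] is in dataset l.
# Requiring presence in at least 2 datasets keeps only g3:
IncludedOGs = np.sum(GDM, axis=1) >= 2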
Example #4
def autoNormalise(X):
    """
    Automatically normalise dataset X and filter it if needed

    :param X: Dataset matrix (numpy array)
    :return: array of normalisation codes
    """
    Xloc = np.array(X)

    # Two-sided if the number of negative values exceeds 20% of the number of
    # positive values
    twosided = np.sum(Xloc < 0) > 0.2 * np.sum(Xloc > 0)
    # Already in log space if more than 98% of absolute values are below 30.0
    alreadylogs = np.sum(abs(Xloc) < 30) > 0.98 * ds.numel(Xloc)

    if twosided:
        return np.array([6])
        #return np.array([101, 4])
    else:
        Xloc[isnan(Xloc)] = 0.0
        Xloc[Xloc < 0] = 0.0
        if alreadylogs:
            Xf = normaliseSampleFeatureMat(Xloc, [13])[0]
            if isnormal_68_95_99p7_rule(Xf)[1] < isnormal_68_95_99p7_rule(
                    Xloc)[1]:
                return np.array([13, 4])
            else:
                return np.array([4])
        else:
            # Candidate transforms; the indices refer to positions in the
            # isnormal_stats list below (Xloc, i.e. the original X, is index 0)
            Xl = normaliseSampleFeatureMat(Xloc, [3])[0]  # index 1
            Xlp = normaliseSampleFeatureMat(Xloc, [31])[0]  # index 2
            Xf = normaliseSampleFeatureMat(Xloc, [13])[0]  # index 3
            Xlf = normaliseSampleFeatureMat(Xl, [13])[0]  # index 4
            Xlpf = normaliseSampleFeatureMat(Xlp, [13])[0]  # index 5
            isnormal_stats = [
                isnormal_68_95_99p7_rule(Xloc)[1],
                isnormal_68_95_99p7_rule(Xl)[1],
                isnormal_68_95_99p7_rule(Xlp)[1],
                isnormal_68_95_99p7_rule(Xf)[1],
                isnormal_68_95_99p7_rule(Xlf)[1],
                isnormal_68_95_99p7_rule(Xlpf)[1]
            ]
            most_normal_index = np.argmin(isnormal_stats)
            if most_normal_index == 0:
                return np.array([4])
            elif most_normal_index == 1:
                return np.array([3, 4])
            elif most_normal_index == 2:
                return np.array([31, 4])
            elif most_normal_index == 3:
                return np.array([13, 4])
            elif most_normal_index == 4:
                return np.array([3, 13, 4])
            elif most_normal_index == 5:
                return np.array([31, 13, 4])
            else:
                raise ValueError(
                    'You should never reach this error. Please contact {0}'.
                    format(glob.email))
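The selection logic applies several candidate transforms and keeps the pipeline whose output deviates least from the 68-95-99.7 percentages. A self-contained sketch of that pattern, using a crude one-point stand-in for isnormal_68_95_99p7_rule(...)[1] (all names here are illustrative, not from the source):

import numpy as np

def rough_normality(M):
    # Deviation of the fraction of values below mean + 1*std from the
    # theoretical 0.8413 (a stand-in for the full 68-95-99.7 statistic)
    z = (M - np.mean(M)) / np.std(M)
    return abs(np.mean(z < 1) - 0.8413)

X = np.random.default_rng(0).gamma(shape=2.0, scale=1.0, size=(100, 5))
candidates = {'raw': X, 'log2': np.log2(X + 1.0)}  # candidate transforms
best = min(candidates, key=lambda c: rough_normality(candidates[c]))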
Example #5
def percentage_less_than(X, v):
    """
    Fraction of elements in matrix X that are less than the value v

    :param X: Matrix of numbers (numpy array)
    :param v: A value to be compared with
    :return: A fraction in the range [0.0, 1.0]
    """
    return np.sum(X < v) * 1.0 / ds.numel(X)
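For example, assuming ds.numel counts elements like np.size, percentage_less_than(np.array([1, 2, 3, 4]), 3) returns 0.5, since two of the four elements are below 3. Despite the name, the returned value is a fraction, not multiplied by 100.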
Example #6
def isnan(X):
    if ds.numel(X) == 1:
        return math.isnan(X)
    elif len(np.shape(X)) == 1:
        res = np.zeros(np.shape(X), dtype=bool)
        for i in range(len(X)):
            res[i] = math.isnan(X[i])
        return res
    elif len(np.shape(X)) == 2:
        res = np.zeros(np.shape(X), dtype=bool)
        for i in range(np.size(X, 0)):
            for j in range(np.size(X, 1)):
                res[i, j] = math.isnan(X[i, j])
        return res
    else:
        raise ValueError('isnan only supports scalars and 1D or 2D arrays.')
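For 1-D and 2-D numpy inputs this reproduces np.isnan element-wise (its extra value is accepting plain scalars). A quick check, assuming the surrounding module's imports:

import numpy as np

A = np.array([[1.0, np.nan], [np.nan, 4.0]])
assert np.array_equal(isnan(A), np.isnan(A))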
Example #7
def isnormal_68_95_99p7_rule(X):
    """
    Test if data is normally distributed by checking the percentages of values below different stds away from the mean
    This is not fully implemented and is not used in the current version of the method
    :param X: Dataset matrix (numpy array)
    :return: Tuple of (mean log10 of the central probabilities, mean absolute deviation from the theoretical percentages)
    """
    n = ds.numel(X)
    m = np.mean(X)
    s = np.std(X)

    bins = np.linspace(np.min(X), np.max(X), 100)
    d = np.digitize(np.concatenate(X), bins)
    xd = bins[d - 1]
    mode = spst.mode(xd)[0]

    # Find the fraction of elements less than each of these eight values
    m3s = percentage_less_than(X, m - 3 * s)  # mean minus 3s (theory ~= N(0.0013, s=0.0315/sqrt(n)))
    m2s = percentage_less_than(X, m - 2 * s)  # mean minus 2s (theory ~= N(0.0228, s=0.1153/sqrt(n)))
    m1s = percentage_less_than(X, m - 1 * s)  # mean minus 1s (theory ~= N(0.1587, s=0.2116/sqrt(n)))
    p0s = percentage_less_than(X, m)          # mean (theory ~= N(0.5000, s=0.3013/sqrt(n)))
    p1s = percentage_less_than(X, m + 1 * s)  # mean plus 1s (theory ~= N(0.8413, s=0.2116/sqrt(n)))
    p2s = percentage_less_than(X, m + 2 * s)  # mean plus 2s (theory ~= N(0.9772, s=0.1153/sqrt(n)))
    p3s = percentage_less_than(X, m + 3 * s)  # mean plus 3s (theory ~= N(0.9987, s=0.0315/sqrt(n)))
    md = percentage_less_than(X, mode)        # mode (theory ~= N(0.5000, s=0.3013/sqrt(n)))

    # How were these theoretical distributions calculated?
    # The distributions of these statistics were found empirically by calculating them from 1000x26 randomly generated
    # normally distributed populations ~N(0.0, 1.0). 26 different population sizes were considered, round(10.^(1:0.2:6)),
    # and at each population size 1000 random populations were generated. At a fixed population size, the fractions of
    # elements less than (m-3*s), (m-2*s), ... (etc.) were observed to be normally distributed, with an average equal to
    # the expected CDF at (m-3*s), (m-2*s), ... (etc.) and a standard deviation inversely proportional to the square
    # root of the population size. The empirical values calculated from this experiment are included above. For example:
    # the fraction of elements that are less than (m-2*s) in a population of n elements is expected to be 0.0228 (2.28%)
    # with a standard deviation of 0.1153/sqrt(n). This empirical test was run in MATLAB.

    # Calculate one-tailed p-values for the seven values above based on normal distributions
    pv = np.array([i * 1.0 for i in range(8)])
    diff = np.array([i * 1.0 for i in range(8)])

    pv[0] = 1 - 2 * spst.norm.cdf(
        -abs(m3s - 0.0013), loc=0, scale=0.0315 / math.sqrt(n))
    diff[0] = abs(m3s - 0.0013)

    pv[1] = 1 - 2 * spst.norm.cdf(
        -abs(m2s - 0.0228), loc=0, scale=0.1153 / math.sqrt(n))
    diff[1] = abs(m2s - 0.0228)

    pv[2] = 1 - 2 * spst.norm.cdf(
        -abs(m1s - 0.1587), loc=0, scale=0.2116 / math.sqrt(n))
    diff[2] = abs(m1s - 0.1587)

    pv[3] = 1 - 2 * spst.norm.cdf(
        -abs(p0s - 0.5000), loc=0, scale=0.3013 / math.sqrt(n))
    diff[3] = abs(p0s - 0.5000)

    pv[4] = 1 - 2 * spst.norm.cdf(
        -abs(p1s - 0.8413), loc=0, scale=0.2116 / math.sqrt(n))
    diff[4] = abs(p1s - 0.8413)

    pv[5] = 1 - 2 * spst.norm.cdf(
        -abs(p2s - 0.9772), loc=0, scale=0.1153 / math.sqrt(n))
    diff[5] = abs(p2s - 0.9772)

    pv[6] = 1 - 2 * spst.norm.cdf(
        -abs(p3s - 0.9987), loc=0, scale=0.0315 / math.sqrt(n))
    diff[6] = abs(p3s - 0.9987)

    pv[7] = 1 - 2 * spst.norm.cdf(
        -abs(md - 0.5000), loc=0, scale=0.3013 / math.sqrt(n))
    diff[7] = abs(md - 0.5000)

    return np.mean(np.log10(pv)), np.mean(diff)
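A usage sketch, assuming the surrounding module's imports (numpy as np, scipy.stats as spst, math, and the ds helpers): the second return value is the mean absolute deviation from the theoretical percentages, so data closer to normal should score lower.

import numpy as np

rng = np.random.default_rng(1)
normal_data = rng.normal(size=(200, 5))
skewed_data = rng.exponential(size=(200, 5))
print(isnormal_68_95_99p7_rule(normal_data)[1])   # small deviation expected
print(isnormal_68_95_99p7_rule(skewed_data)[1])   # larger deviation expected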
Example #8
def generateCoPaM(U,
                  relabel_technique='minmin',
                  w=None,
                  X=None,
                  distCriterion='direct_euc',
                  K=0,
                  GDM=None):
    # Helping functions
    def calwmeans(w):
        # Recursively average nested weight structures into one weight per entry
        wm = [np.mean(calwmeans(ww))
              if isinstance(ww, (list, tuple, np.ndarray)) else np.mean(ww)
              for ww in w]
        return np.array(wm)

    def CoPaMsdist(CoPaM1, CoPaM2):
        return np.linalg.norm(CoPaM1 - CoPaM2)

    def orderpartitions(U, method='rand', X=None, GDM=None):
        if method == 'rand':
            return np.random.permutation(range(len(U))), None
        elif method == 'mn':
            # TODO: Implement ranking partitions based on M-N plots
            raise NotImplementedError(
                'Ranking partitions based on the M-N plots logic has not been implemented yet.'
            )
        elif method == 'mse':
            R = len(U)
            mses = np.zeros(R)
            for r in range(R):
                if isinstance(U[r][0][0], (list, tuple, np.ndarray)):
                    mses[r] = np.mean(
                        orderpartitions(U[r], method=method, X=X, GDM=GDM)[1])
                else:
                    mses[r] = np.mean([
                        mn.mseclustersfuzzy(X,
                                            U[r],
                                            donormalise=False,
                                            GDM=GDM)
                    ])
            order = np.argsort(mses)
            return order, mses[order]
        else:
            raise ValueError('Unknown partition ordering method: {0}.'.format(method))

    # Fix parameters
    Uloc = ds.listofarrays2arrayofarrays(U)
    R = len(Uloc)
    if GDM is None:
        GDMloc = np.ones([Uloc[0].shape[0], R], dtype=bool)
    elif GDM.shape[1] == 1:
        if R > 1:
            GDMloc = np.tile(GDM, [1, R])
        else:
            GDMloc = np.array(GDM)
    else:
        GDMloc = np.array(GDM)
    if w is None or (isinstance(w, str) and w in ['all', 'equal']):
        w = np.ones(R)
    elif ds.numel(w) == 1:
        w = np.array([w for i in range(R)])
    wmeans = calwmeans(w)

    # Work!
    #permR = orderpartitions(Uloc, method='rand', X=X, GDM=GDM)[0]
    if GDM is None:
        permR = orderpartitions(Uloc, method='mse', X=X, GDM=None)[0]
    else:
        permR = orderpartitions(Uloc, method='mse', X=X, GDM=GDMloc)[0]
    Uloc = Uloc[permR]
    if GDMloc.shape[1] > 1:
        GDMloc = GDMloc[:, permR]
    wmeans = wmeans[permR]

    if isinstance(Uloc[0][0][0], (list, tuple, np.ndarray)):
        Uloc[0] = generateCoPaM(Uloc[0],
                                relabel_technique=relabel_technique,
                                w=w[0],
                                X=X,
                                distCriterion=distCriterion,
                                K=K,
                                GDM=GDMloc)
    #CoPaM = np.zeros([GDMloc.shape[0], Uloc[0].shape[1]], float)
    CoPaM = np.array(Uloc[0], dtype=float)
    K = CoPaM.shape[1]
    for r in range(1, R):
        if isinstance(Uloc[r][0][0], (list, tuple, np.ndarray)):
            Uloc[r] = generateCoPaM(Uloc[r],
                                    relabel_technique=relabel_technique,
                                    w=w[r],
                                    X=X,
                                    distCriterion=distCriterion,
                                    K=K,
                                    GDM=GDMloc)
        if Uloc[r].shape[1] != K:
            raise ValueError(
                'Unequal numbers of clusters in partition {}.'.format(r))

        Uloc[r] = relabelClusts(CoPaM,
                                Uloc[r],
                                method=relabel_technique,
                                X=X,
                                distCriterion=distCriterion)

        dotprod = np.dot(GDMloc[:, 0:r],
                         wmeans[0:r].transpose())  # (Mxr) * (rx1) = (Mx1)
        CoPaM[dotprod > 0] = nu.multiplyaxis(CoPaM[dotprod > 0],
                                             dotprod[dotprod > 0],
                                             axis=1)
        CoPaM[dotprod > 0] += wmeans[r] * Uloc[r][dotprod > 0]
        dotprod = np.dot(GDMloc[:, 0:(r + 1)], wmeans[0:(r + 1)].transpose())
        CoPaM[dotprod > 0] = nu.divideaxis(CoPaM[dotprod > 0],
                                           dotprod[dotprod > 0],
                                           axis=1)

    return CoPaM
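With the relabelling and GDM bookkeeping stripped away, the accumulation above reduces to a weighted average of aligned fuzzy partition matrices. A pure-numpy sketch of that core idea (illustrative data; clusters assumed already relabelled to match):

import numpy as np

U = [np.array([[0.9, 0.1], [0.2, 0.8]]),   # two fuzzy partitions of the same
     np.array([[0.8, 0.2], [0.1, 0.9]])]   # genes, columns already aligned
w = np.array([2.0, 1.0])                   # partition weights
CoPaM = np.average(U, axis=0, weights=w)   # consensus partition matrix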
Example #9
def optimise_tukey_sqrtSCG(B,
                           X,
                           GDM,
                           clustdists=None,
                           smallestClusterSize=11,
                           tails=1,
                           Q3s=2):
    Bloc = np.array(B)
    Xloc = ds.listofarrays2arrayofarrays(X)

    [Ng, K] = Bloc.shape  # Ng genes and K clusters
    L = Xloc.shape[0]  # L datasets

    # Normalise clustdists to provide weights. If not provided, make it unity for all
    if clustdists is None:
        clustdistsloc = np.ones(K)
    else:
        clustdistsloc = [c for c in clustdists]

    # Find the clusters' means (Cmeans), the absolute shifted cluster genes (SCG),
    # and the empirical CDF functions for them (cdfs)
    Cmeans = np.array([None] * L, dtype=object)
    SCG = np.array([None] * L, dtype=object)

    Cgood = mnplotsdistancethreshold(clustdistsloc, method='largestgap')
    for l in range(L):
        Cmeans[l] = np.zeros([K,
                              Xloc[l].shape[1]])  # K clusters x D dimensions
        SCG[l] = np.zeros(
            [np.sum(np.sum(Bloc[GDM[:, l], :], axis=0)),
             Xloc[l].shape[1]])  # M* genes x D dimensions
        w = np.zeros(np.sum(np.sum(Bloc[GDM[:, l], :], axis=0)))  # M* genes
        # (M* is the total number of gene-to-cluster assignments in this dataset)

        gi = 0
        for k in range(K):
            Cmeans[l][k] = np.median(Xloc[l][Bloc[GDM[:, l], k], :], axis=0)
            if k in Cgood:
                csize = np.sum(Bloc[GDM[:, l], k])
                tmpSCG = nu.subtractaxis(Xloc[l][Bloc[GDM[:, l], k], :],
                                         Cmeans[l][k],
                                         axis=0)
                SCG[l][gi:(gi + csize), :] = np.abs(tmpSCG)
                gi += csize
        SCG[l] = SCG[l][np.any(
            SCG[l], axis=1)]  # Remove all zeros genes (rows of SCG[l])

        if ds.numel(SCG[l]) > 0:
            if tails == 1:
                # One-tailed filter: values beyond Q3s * Q3 (per dimension)
                # are outliers
                Q3 = np.percentile(SCG[l], q=75, axis=0)
                SCGouts = SCG[l] > Q3s * Q3
                SCG[l][SCGouts] = 0.0  # Zero the outliers so they do not affect decisions later on
            elif tails == 2:
                # Two-tailed Tukey fence on sqrt-transformed values: beyond
                # Q3 + 1.5 * IQR
                Q1 = np.percentile(np.sqrt(SCG[l]), q=25, axis=0)
                Q3 = np.percentile(np.sqrt(SCG[l]), q=75, axis=0)
                thresh = Q3 + 1.5 * (Q3 - Q1)
                SCGouts = np.sqrt(SCG[l]) > thresh
                SCG[l][SCGouts] = 0.0  # Zero the outliers so they do not affect decisions later on
            else:
                raise ValueError('Invalid number of tails. It should be either 1 or 2.')
        else:
            SCG[l] = np.zeros((1, SCG[l].shape[1]))

    # Clusters mins and maxes (NEW)
    Cmins = np.array([None] * L, dtype=object)
    Cmaxes = np.array([None] * L, dtype=object)
    for l in range(L):
        Cmins[l] = np.zeros([K, Xloc[l].shape[1]])  # K clusters x D dimensions
        Cmaxes[l] = np.zeros([K,
                              Xloc[l].shape[1]])  # K clusters x D dimensions
        for k in range(K):
            Cmins[l][k] = Cmeans[l][k] - np.max(SCG[l], axis=0)
            Cmaxes[l][k] = Cmeans[l][k] + np.max(SCG[l], axis=0)

    # Resolve overlaps between clusters (NEW)
    for k1 in range(K):
        for k2 in range(K):
            # Compare the pair of clusters only once, and don't compare a cluster with itself. This if statement
            # guarantees that k2 will always be a later cluster than k1.
            if (k1 >= k2):
                continue
            # Value of the smallest overlap between the ranges of the clusters k1 and k2, and
            # the dataset (l) and the dimension (d) at which this overlap is found.
            # t_smallestoverlap is the type of the overlap (-1, 0, 1, or 2). Type (-1) means that the entire (min
            # to max) range of one cluster is within the range of the other cluster; this is the worst overlap.
            # Type (0) means that the max of (k1) is within the (min to max) range of (k2), and type (1) is the
            # other way around. Type (2) means there is no overlap at some dimension; this is the best case, and
            # once found, the pair is treated as distinct.
            v_smallestoverlap = 0
            l_smallestoverlap = -1
            d_smallestoverlap = -1
            t_smallestoverlap = -1  # Overlap type, read above
            for l in range(L):
                Nd = len(Cmins[l][k1])  # Dimensions in this dataset
                for d in range(Nd):
                    x1 = Cmaxes[l][k1][d]
                    x2 = Cmaxes[l][k2][d]
                    n1 = Cmins[l][k1][d]
                    n2 = Cmins[l][k2][d]
                    if (x1 > n2 and x1 <= x2):
                        if (n1 < n2):
                            ov = x1 - n2
                            if (t_smallestoverlap == -1
                                    or ov < v_smallestoverlap):
                                t_smallestoverlap = 0
                                v_smallestoverlap = ov
                                l_smallestoverlap = l
                                d_smallestoverlap = d
                    elif (x2 > n1 and x2 <= x1):
                        if (n2 < n1):
                            ov = x2 - n1
                            if (t_smallestoverlap == -1
                                    or ov < v_smallestoverlap):
                                t_smallestoverlap = 1
                                v_smallestoverlap = ov
                                l_smallestoverlap = l
                                d_smallestoverlap = d
                    else:
                        t_smallestoverlap = 2
                        continue  # Absolutely no overlap at this point, so k1 and k2 are distinct, so continue
                if (t_smallestoverlap == 2):
                    continue  # Absolutely no overlap at some point, so k1 and k2 are distinct, so continue

            # Sort out the overlap if exists between k1 and k2
            if (t_smallestoverlap == -1):
                # Here one of the two clusters always swallows the other one. So effectively remove the later one (k2).
                # Cluster removal is by making its minimum larger than its maximum at a single point (at l=0, d=0),
                # so effectively no gene will ever be mapped to it!
                Cmins[0][k2][0] = 1
                Cmaxes[0][k2][0] = 0
            elif (t_smallestoverlap == 0):
                Cmins[l_smallestoverlap][k2][d_smallestoverlap] = \
                    Cmaxes[l_smallestoverlap][k1][d_smallestoverlap] + sys.float_info.epsilon
            elif (t_smallestoverlap == 1):
                Cmaxes[l_smallestoverlap][k2][d_smallestoverlap] = \
                    Cmins[l_smallestoverlap][k1][d_smallestoverlap] - sys.float_info.epsilon

    # Find who belongs (NEW)
    belongs = np.ones([Ng, K, L],
                      dtype=bool)  # Ng genes x K clusters x L datasets
    for l in range(L):
        for k in range(K):
            tmp1 = nu.largerthanaxis(Xloc[l],
                                     Cmins[l][k],
                                     axis=0,
                                     orequal=True)
            tmp2 = nu.lessthanaxis(Xloc[l], Cmaxes[l][k], axis=0, orequal=True)
            belongs[GDM[:, l], k, l] = np.all(np.logical_and(tmp1, tmp2),
                                              axis=1)

    # # Helping function (OLD - to be removed)
    # def iswithinworse(ref, x):
    #     return x <= np.max(ref)
    #
    # # Find who belongs (OLD - to be removed)
    # belongs = np.ones([Ng, K, L], dtype=bool)  # Ng genes x K clusters x L datasets
    # for l in range(L):
    #     for k in range(K):
    #         for d in range(Xloc[l].shape[1]):
    #             tmpX = np.abs(Xloc[l][:, d] - Cmeans[l][k, d])
    #             belongs[GDM[:, l], k, l] &= iswithinworse(SCG[l][:, d], tmpX)

    # Include in clusters the genes which belong everywhere (OLD - to be removed)
    B_out = np.all(belongs, axis=2)

    # Solve genes included in two clusters (OLD - should not be needed now - TO BE REMOVED)
    solution = 2
    if solution == 1:
        # For genes included in two clusters, assign each to the closest cluster in terms of its worst distance
        # across datasets (guarantees that the worst belongingness of a gene to a cluster is optimised)
        f = np.nonzero(np.sum(B_out, axis=1) > 1)[0]
        for fi in f:
            ficlusts = np.nonzero(
                B_out[fi])[0]  # Clusters competing over gene fi
            fidatasets = np.nonzero(GDM[fi])[0]  # Datasets that have gene fi
            localdists = np.zeros([
                len(ficlusts), len(fidatasets)
            ])  # (Clusts competing) x (datasets that have fi)
            for l in range(len(fidatasets)):
                ll = fidatasets[l]  # Actual dataset index
                fi_ll = np.sum(GDM[:fi, ll])  # Index of fi in this Xloc[ll]
                localdists[:, l] = nu.dist_matrices(Cmeans[ll][ficlusts],
                                                    Xloc[ll][fi_ll]).reshape(
                                                        [len(ficlusts)])
            localdists = np.max(localdists, axis=1)  # (Clusts competing) x 1
            ficlosest = np.argmin(localdists)  # Closest cluster
            B_out[fi] = False
            B_out[fi, ficlusts[ficlosest]] = True
    elif solution == 2:
        # For genes included in two clusters, assign each to the earliest cluster (smallest k)
        f = np.nonzero(np.sum(B_out, axis=1) > 1)[0]
        for fi in f:
            ficlusts = np.nonzero(
                B_out[fi])[0]  # Clusters competing over gene fi
            ficlosest = np.argmin(ficlusts)  # earliest cluster (smallest k)
            B_out[fi] = False
            B_out[fi, ficlusts[ficlosest]] = True

    # Remove clusters smaller than minimum cluster size
    ClusterSizes = np.sum(B_out, axis=0)
    B_out = B_out[:, ClusterSizes >= smallestClusterSize]

    return B_out
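The outlier filtering above is a Tukey-style fence applied per dimension. A self-contained numpy sketch of both branches (illustrative data, not from the source):

import numpy as np

SCG = np.abs(np.random.default_rng(2).normal(size=(50, 3)))

# tails == 1 with Q3s = 2: values beyond 2 * Q3 (per column) are zeroed
one_tailed = SCG.copy()
Q3 = np.percentile(one_tailed, q=75, axis=0)
one_tailed[one_tailed > 2 * Q3] = 0.0

# tails == 2: classic Tukey fence, Q3 + 1.5 * IQR, on sqrt-transformed values
two_tailed = SCG.copy()
Q1, Q3 = np.percentile(np.sqrt(two_tailed), q=[25, 75], axis=0)
two_tailed[np.sqrt(two_tailed) > Q3 + 1.5 * (Q3 - Q1)] = 0.0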