Example #1
def readDataFromFiles(datafiles,
                      delimiter='\t| |, |; |,|;',
                      dtype=float,
                      skiprows=1,
                      data_na_filter=True,
                      skipcolumns=1,
                      returnSkipped=True,
                      comm='#'):
    L = len(datafiles)
    X = [None] * L
    skippedRows = [None] * L
    skippedCols = [None] * L
    for l in range(L):
        with open(datafiles[l]) as f:
            ncols = len(re.split(delimiter, f.readline()))
        # This now uses pandas read_csv; if np.loadtxt is reinstated, ndmin=2 MUST be set here
        X[l] = pdreadcsv_regexdelim(datafiles[l],
                                    delimiter=delimiter,
                                    dtype=dtype,
                                    skiprows=skiprows,
                                    usecols=range(skipcolumns, ncols),
                                    na_filter=data_na_filter,
                                    comments=comm)

        if skiprows > 0:
            skippedRows[l] = pdreadcsv_regexdelim(datafiles[l],
                                                  delimiter=delimiter,
                                                  dtype=str,
                                                  skiprows=0,
                                                  usecols=range(
                                                      skipcolumns, ncols),
                                                  na_filter=False,
                                                  comments=comm)[0:skiprows]
            if skiprows == 1:
                skippedRows[l] = skippedRows[l][0]
        else:
            skippedRows[l] = np.array([]).reshape([0, X[l].shape[1]])

        if skipcolumns > 0:
            skippedCols[l] = pdreadcsv_regexdelim(datafiles[l],
                                                  delimiter=delimiter,
                                                  dtype=str,
                                                  skiprows=skiprows,
                                                  usecols=range(skipcolumns),
                                                  na_filter=False,
                                                  comments=comm)
        else:
            skippedCols[l] = np.array([]).reshape([0, X[l].shape[1]])

    if returnSkipped:
        return (ds.listofarrays2arrayofarrays(X), skippedRows, skippedCols)
    else:
        return ds.listofarrays2arrayofarrays(X)
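
A minimal usage sketch (not part of the original source): the file paths are hypothetical, and it assumes readDataFromFiles and its pandas-based helper pdreadcsv_regexdelim are importable.

# Hypothetical usage: read two delimited files whose first row is a header
# and whose first column holds gene names.
X, headers, gene_names = readDataFromFiles(['dataset1.tsv', 'dataset2.tsv'],
                                           skiprows=1,     # header row -> returned as `headers`
                                           skipcolumns=1,  # gene-name column -> returned as `gene_names`
                                           returnSkipped=True)
print(X[0].shape)  # (genes, samples) of the first dataset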
Example #2
def reorderClusters(B, X, GDM, returnOrderIndices=False):
    Bloc = np.array(B)
    Xloc = ds.listofarrays2arrayofarrays(X)

    Bloc = Bloc[:, np.any(Bloc, axis=0)]  # Only keep non-empty clusters

    B_ordered = np.zeros(Bloc.shape, dtype=bool)
    K = Bloc.shape[1]  # Number of clusters
    L = Xloc.shape[0]  # Number of datasets

    if K == 0:
        return Bloc

    # Find Cmeans and distances between clusters
    Cmeans = np.array([None] * L, dtype=object)
    D = np.full([K, K, L], np.inf)  # KxKxL  (initialised with inf values)
    for l in range(L):
        Cmeans[l] = np.zeros([K, Xloc[l].shape[1]],
                             dtype=float)  # (K) x (X[l] samples)
        for k in range(K):
            Cmeans[l][k] = np.mean(Xloc[l][Bloc[GDM[:, l], k], :], axis=0)

        # For empty clusters, the distances are inf (as filled in the initialisation of D),
        # For non-empty clusters, the distances are calculated by skdists.euclidean_distances between cmeans
        I_not_empty_c = ~np.array([np.any(np.isnan(cm))
                                   for cm in Cmeans[l]])  # Boolean mask of non-empty clusters (no NaNs)
        I_not_empty_c = np.where(I_not_empty_c)[0]  # From boolean mask to integer indices
        non_empty_c_dists = skdists.euclidean_distances(
            Cmeans[l][I_not_empty_c])  # K_not_empty x K_not_empty
        for nec in range(len(I_not_empty_c)):
            D[I_not_empty_c[nec], I_not_empty_c, l] = non_empty_c_dists[nec]
        #D[:, :, l] = skdists.euclidean_distances(Cmeans[l])  # KxK
    D = np.median(D, axis=2)  # KxK

    # Keep the first cluster first, then repeatedly append the closest remaining cluster
    B_ordered[:, 0] = Bloc[:, 0]
    I = np.zeros(K, dtype=int)
    I[0] = 0
    clustersDone = np.zeros(K, dtype=bool)
    clustersDone[0] = True
    for k in range(1, K):
        relevantD = D[I[k - 1], ~clustersDone]
        clustersLeft = np.nonzero(~clustersDone)[0]
        nextCluster = np.argmin(relevantD)
        nextCluster = clustersLeft[nextCluster]
        B_ordered[:, k] = Bloc[:, nextCluster]
        I[k] = nextCluster
        clustersDone[nextCluster] = True

    if returnOrderIndices:
        return (B_ordered, I)
    else:
        return B_ordered
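
A toy sketch of the call (an illustration, not from the original source): it builds a two-cluster partition over six genes in two datasets, assuming reorderClusters and its ds/skdists dependencies are importable.

import numpy as np

Ng = 6
B = np.zeros((Ng, 2), dtype=bool)
B[:3, 0] = True  # cluster 0: first three genes
B[3:, 1] = True  # cluster 1: last three genes
X = [np.random.rand(Ng, 4), np.random.rand(Ng, 3)]  # two datasets
GDM = np.ones((Ng, 2), dtype=bool)  # every gene present in both datasets

B_ordered, order = reorderClusters(B, X, GDM, returnOrderIndices=True)
print(order)  # order[k] = original index of the cluster placed at position k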
Example #3
def calculateGDMandUpdateDatasets(X, Genes, Map=None, mapheader=True, OGsFirstColMap=True, delimGenesInMap='\\W+',
                                  OGsIncludedIfAtLeastInDatasets=1):
    Xloc = ds.listofarrays2arrayofarrays(X)
    Genesloc = deepcopy(Genes)
    if Map is None:
        OGsDatasets = deepcopy(Genes)
        OGs = np.unique(ds.flattenAList(OGsDatasets))  # Unique list of genes (or mapped genes)
        MapNew = None
        MapSpecies = None
    else:
        (OGs, OGsDatasets, MapNew, MapSpecies) = mapGenesToCommonIDs(Genes, Map, mapheader,
                                                                     OGsFirstColMap, delimGenesInMap)

    L = len(Genesloc)  # Number of datasets
    # Ng = len(OGs)  # Number of unique genes

    GDMall = np.transpose([np.in1d(OGs, gs) for gs in OGsDatasets])  # GDM: (Ng)x(L) boolean

    # Exclude OGs that do not exist in at least (OGsIncludedIfAtLeastInDatasets) datasets
    IncludedOGs = np.sum(GDMall, axis=1) >= OGsIncludedIfAtLeastInDatasets
    GDM = GDMall[IncludedOGs]
    OGs = OGs[IncludedOGs]
    if MapNew is not None:
        MapNew = MapNew[IncludedOGs]

    Ngs = np.sum(GDM, axis=0)  # Numbers of unique mapped genes in each dataset

    Xnew = np.array([None] * L, dtype=object)
    GenesDatasets = np.array([None] * L, dtype=object)
    for l in range(L):
        arelogs = arelogs_function(Xloc[l])
        #arelogs = np.nansum(abs(Xloc[l][~isnan(Xloc[l])]) < 30) > 0.98 * ds.numel(Xloc[l][~isnan(Xloc[l])])  # More than 98% of values are below 30.0
        d = Xloc[l].shape[1]  # Number of dimensions (samples) in this dataset
        Xnew[l] = np.zeros([Ngs[l], d], dtype=float)
        GenesDatasets[l] = np.empty(Ngs[l], dtype=object)
        OGsInThisDS = OGs[GDM[:, l]]  # Unique OGs in this dataset
        # TODO: Optimise the code below by exploiting ds.findArrayInSubArraysOfAnotherArray1D (like in line 203 above)
        for ogi in range(len(OGsInThisDS)):
            og = OGsInThisDS[ogi]
            if arelogs:
                Xnew[l][ogi] = np.log2(np.sum(np.power(2.0, Xloc[l][np.in1d(OGsDatasets[l], og)]), axis=0))
            else:
                Xnew[l][ogi] = np.sum(Xloc[l][np.in1d(OGsDatasets[l], og)], axis=0)
            GenesDatasets[l][ogi] = ds.concatenateStrings(Genesloc[l][np.in1d(OGsDatasets[l], og)])

    return Xnew, GDM, GDMall, OGs, MapNew, MapSpecies
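
A hypothetical sketch (assumes calculateGDMandUpdateDatasets and its ds helpers are importable): two datasets with partially overlapping gene lists and no orthologue Map, so genes are used as-is.

import numpy as np

X = [np.random.rand(3, 4), np.random.rand(2, 5)]
Genes = [np.array(['g1', 'g2', 'g3']), np.array(['g2', 'g4'])]
Xnew, GDM, GDMall, OGs, MapNew, MapSpecies = calculateGDMandUpdateDatasets(
    X, Genes, Map=None, OGsIncludedIfAtLeastInDatasets=1)
print(OGs)  # unique genes kept, e.g. ['g1' 'g2' 'g3' 'g4']
print(GDM)  # boolean (genes kept) x (2 datasets) presence matrix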
Example #4
def reorderClusters(B, X, GDM, returnOrderIndices=False):
    Bloc = np.array(B)
    Xloc = ds.listofarrays2arrayofarrays(X)

    Bloc = Bloc[:, np.any(Bloc, axis=0)]  # Only keep non-empty clusters

    B_ordered = np.zeros(Bloc.shape, dtype=bool)
    K = Bloc.shape[1]  # Number of clusters
    L = Xloc.shape[0]  # Number of datasets

    if K == 0:
        return Bloc

    # Find Cmeans and distances between clusters
    Cmeans = np.array([None] * L, dtype=object)
    D = np.zeros([K, K, L])  # KxKxL
    for l in range(L):
        Cmeans[l] = np.zeros([K, Xloc[l].shape[1]], dtype=float)  # (K) x (X[l] samples)
        for k in range(K):
            Cmeans[l][k] = np.mean(Xloc[l][Bloc[GDM[:, l], k], :], axis=0)
        D[:, :, l] = skdists.euclidean_distances(Cmeans[l])  # KxK
    D = np.median(D, axis=2)  # KxK

    # Keep the first cluster first, then repeatedly append the closest remaining cluster
    B_ordered[:, 0] = Bloc[:, 0]
    I = np.zeros(K, dtype=int)
    I[0] = 0
    clustersDone = np.zeros(K, dtype=bool)
    clustersDone[0] = True
    for k in range(1, K):
        relevantD = D[I[k - 1], ~clustersDone]
        clustersLeft = np.nonzero(~clustersDone)[0]
        nextCluster = np.argmin(relevantD)
        nextCluster = clustersLeft[nextCluster]
        B_ordered[:, k] = Bloc[:, nextCluster]
        I[k] = nextCluster
        clustersDone[nextCluster] = True

    if returnOrderIndices:
        return (B_ordered, I)
    else:
        return B_ordered
Example #5
def uncles(X,
           type='A',
           Ks=[n for n in range(4, 21, 4)],
           params=None,
           methods=None,
           methodsDetailed=None,
           U=None,
           Utype='PM',
           relabel_technique='minmin',
           setsP=None,
           setsN=None,
           dofuzzystretch=False,
           wsets=None,
           wmethods=None,
           GDM=None,
           smallestClusterSize=11,
           CoPaMfinetrials=1,
           CoPaMfinaltrials=1,
           binarise_techniqueP='DTB',
           binarise_paramP=np.arange(0.0, 1.1, 0.1, dtype='float'),
           binarise_techniqueN='DTB',
           binarise_paramN=np.concatenate(([sys.float_info.epsilon],
                                           np.arange(0.1,
                                                     1.1,
                                                     0.1,
                                                     dtype='float'))),
           Xnames=None,
           deterministic=False,
           ncores=1):
    Xloc = ds.listofarrays2arrayofarrays(X)
    L = len(Xloc)  # Number of datasets

    # Fix parameters
    if params is None: params = {}
    if setsP is None: setsP = [x for x in range(int(math.floor(L / 2)))]
    if setsN is None: setsN = [x for x in range(int(math.floor(L / 2)), L)]
    setsPN = np.array(np.concatenate((setsP, setsN), axis=0), dtype=int)
    Xloc = Xloc[setsPN]
    L = np.shape(Xloc)[0]  # Number of datasets
    if wsets is None:
        wsets = np.array([1 for x in range(L)])
    else:
        wsets = np.array(wsets)[setsPN]
    if GDM is None:
        Ng = np.shape(Xloc[0])[0]
        GDMloc = np.ones([Ng, L], dtype='bool')
    else:
        GDMloc = GDM[:, setsPN]
        Ng = GDMloc.shape[0]
    if Xnames is None:
        Xnames = ['X{0}'.format(l) for l in range(L)]

    if methods is None:
        methods = [['k-means']]
        # largest_DS = np.max([x.shape[0] for x in Xloc])
        # if (largest_DS <= maxgenesinsetforpdist):
        #    methods = [['k-means'], ['HC']]
        # else:
        #    methods = [['k-means']]
    else:
        largest_DS = np.max([x.shape[0] for x in Xloc])
        if (largest_DS > maxgenesinsetforpdist):
            methods = [
                m for m in methods
                if 'hc' not in [entry.lower() for entry in m]
            ]
            if not methods:
                io.log('No valid base clustering can be used. Please note that clust would not use HC clustering ' \
                       'on datasets with more than {0} genes. You have a dataset with {1} genes.' \
                       ''.format(maxgenesinsetforpdist, largest_DS))
                io.log('Clust will terminate here.')
                io.log(op.bottomline(), addextrastick=False)
                sys.exit()
    if methodsDetailed is None:
        methodsDetailedloc = np.array([methods for l in range(L)])
    else:
        methodsDetailedloc = methodsDetailed[setsPN]
    if wmethods is None:
        wmethods = [[1 for x in m] for m in methodsDetailedloc]
    elif not isinstance(wmethods[0], (list, tuple, np.ndarray)):
        wmethods = np.tile(wmethods, [L, 1])  # Repeat the flat weights vector over the L datasets
    else:
        wmethods = np.array(wmethods)[setsPN]

    setsPloc = [ii for ii in range(len(setsP))]
    if L > len(setsPloc):
        setsNloc = [ii for ii in range(len(setsPloc), L)]
    else:
        setsNloc = []  # Ensure setsNloc is always defined (it is used below)

    Ks = np.array(Ks)
    Ks = Ks[Ks <= Ng]  # Remove Ks that are larger than the number of genes Ng
    Ks = Ks.tolist()
    NKs = len(Ks)  # Number of K values

    # If the dataset is empty, return basic output
    if Ng == 0:
        NPp = len(binarise_paramP)  # Number of P params
        NNp = len(binarise_paramN)  # Number of N params
        if type == 'A':
            B = np.zeros([CoPaMfinaltrials, NPp, 1, NKs], dtype=object)
            Mc = np.zeros([CoPaMfinaltrials, NKs], dtype=object)
        elif type == 'B':
            B = np.zeros([CoPaMfinaltrials, NPp, NNp, NKs], dtype=object)
            Mc = np.zeros([CoPaMfinaltrials, NKs], dtype=object)

        params = dict(
            params, **{
                'methods': methods,
                'setsP': setsPloc,
                'setsN': setsNloc,
                'dofuzzystretch': dofuzzystretch,
                'type': type,
                'Ks': Ks,
                'NKs': NKs,
                'wsets': wsets,
                'wmethods': wmethods,
                'Ds': Ds,
                'L': L,
                'CoPaMs': np.array([None] * (CoPaMfinaltrials * NKs)).reshape(
                    [CoPaMfinaltrials, NKs]),
                'smallestclustersize': smallestClusterSize,
                'GDM': GDMloc
            })

        Uloc = np.array([None] * (L * NKs)).reshape([L, NKs])

        UnclesRes = collections.namedtuple('UnclesRes',
                                           ['B', 'Mc', 'params', 'X', 'U'])
        return UnclesRes(B, Mc, params, Xloc, Uloc)

    # Clustering
    if U is None:
        Utype = 'PM'
        Uloc = np.array([None] * (L * NKs)).reshape([L, NKs])
        totalparallel = np.sum(Ks) * np.sum(
            [len(meths) for meths in methodsDetailedloc])
        for meths in methodsDetailedloc:
            for meth in meths:
                if 'k-means' in meth:
                    totalparallel += np.max(Ks) * np.max(Ks)
                    continue
        io.resetparallelprogress(totalparallel)

        for l in range(L):
            # Cache kmeans initialisations for the dataset once to save time:
            cl.cache_kmeans_init(Xloc[l],
                                 Ks,
                                 methodsDetailedloc[l],
                                 datasetID=l)

            # Now go to parallel clustering
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                Utmp = Parallel(n_jobs=ncores)\
                    (delayed(clustDataset)
                     (Xloc[l], Ks[ki], methodsDetailedloc[l], GDMloc[:, l], Ng, l) for ki in range(NKs))

                Utmp = [u for u in Utmp]
                for ki in range(NKs):
                    Uloc[l, ki] = Utmp[ki]

                gc.collect()
                #io.updateparallelprogress(np.sum(Ks) * len(methodsDetailedloc))

    else:
        Uloc = ds.listofarrays2arrayofarrays(U)[setsPN]

    # Calculate a CoPaM for each dataset at each K
    CoPaMsFine = np.array([None] * (L * NKs)).reshape([L, NKs])
    for l in range(L):
        for ki in range(NKs):
            if Utype.lower() == 'pm':
                CoPaMsFineTmp = [
                    generateCoPaM(Uloc[l, ki],
                                  relabel_technique=relabel_technique,
                                  X=[Xloc[l]],
                                  w=wmethods[l],
                                  K=Ks[ki],
                                  GDM=GDMloc[:, l].reshape([-1, 1]))
                    for i in range(CoPaMfinetrials)
                ]
            elif Utype.lower() == 'idx':
                CoPaMsFineTmp = \
                    [generateCoPaMfromidx(Uloc[l, ki], relabel_technique=relabel_technique, X=Xloc,
                                          w=wmethods[l], K=Ks[ki])
                     for i in range(CoPaMfinetrials)]
            else:
                raise ValueError('Invalid Utype')
            CoPaMsFine[l, ki] = generateCoPaM(CoPaMsFineTmp,
                                              relabel_technique=relabel_technique,
                                              X=[Xloc[l]],
                                              GDM=GDMloc[:, l].reshape([-1, 1]))

            if dofuzzystretch:
                CoPaMsFine[l, ki] = fuzzystretch(CoPaMsFine[l, ki])

    # Calculate the final CoPaM for each K
    CoPaMs = np.array([None] * (CoPaMfinaltrials * NKs)).reshape(
        [CoPaMfinaltrials, NKs])
    CoPaMsP = np.array([None] * (CoPaMfinaltrials * NKs)).reshape(
        [CoPaMfinaltrials, NKs])
    CoPaMsN = np.array([None] * (CoPaMfinaltrials * NKs)).reshape(
        [CoPaMfinaltrials, NKs])
    for t in range(CoPaMfinaltrials):
        for ki in range(NKs):
            if type == 'A':
                if Utype.lower() == 'pm':
                    CoPaMs[t, ki] = generateCoPaM(
                        CoPaMsFine[:, ki],
                        relabel_technique=relabel_technique,
                        w=wsets,
                        X=Xloc,
                        GDM=GDMloc)
                elif Utype.lower() == 'idx':
                    CoPaMs[t, ki] = generateCoPaMfromidx(
                        CoPaMsFine[:, ki],
                        relabel_technique=relabel_technique,
                        X=Xloc,
                        w=wsets,
                        GDM=GDMloc)
                else:
                    raise ValueError('Invalid Utype')
            elif type == 'B':
                if Utype.lower() == 'pm':
                    CoPaMsP[t, ki] = generateCoPaM(
                        CoPaMsFine[setsPloc, ki],
                        relabel_technique=relabel_technique,
                        X=Xloc,
                        w=wsets[setsPloc],
                        GDM=GDMloc[:, setsPloc])
                    CoPaMsN[t, ki] = generateCoPaM(
                        CoPaMsFine[setsNloc, ki],
                        relabel_technique=relabel_technique,
                        X=Xloc,
                        w=wsets[setsNloc],
                        GDM=GDMloc[:, setsNloc])
                elif Utype.lower() == 'idx':
                    CoPaMsP[t, ki] = generateCoPaMfromidx(
                        CoPaMsFine[setsPloc, ki],
                        relabel_technique=relabel_technique,
                        X=Xloc,
                        w=wsets[setsPloc],
                        GDM=GDMloc[:, setsPloc])
                    CoPaMsN[t, ki] = generateCoPaMfromidx(
                        CoPaMsFine[setsNloc, ki],
                        relabel_technique=relabel_technique,
                        X=Xloc,
                        w=wsets[setsNloc],
                        GDM=GDMloc[:, setsNloc])
                else:
                    raise ValueError('Invalid Utype')
            else:
                raise ValueError(
                    'Invalid UNCLES type. It has to be either A or B')

    # Binarise
    NPp = len(binarise_paramP)  # Number of P params
    NNp = len(binarise_paramN)  # Number of N params
    if type == 'A':
        B = np.zeros([CoPaMfinaltrials, NPp, 1, NKs], dtype=object)
        Mc = np.zeros([CoPaMfinaltrials, NKs], dtype=object)
    elif type == 'B':
        B = np.zeros([CoPaMfinaltrials, NPp, NNp, NKs], dtype=object)
        Mc = np.zeros([CoPaMfinaltrials, NKs], dtype=object)

    for t in range(CoPaMfinaltrials):
        for ki in range(NKs):
            if type == 'A':
                # Pre-sorting binarisation
                for p in range(NPp):
                    B[t, p, 0, ki] = binarise(CoPaMs[t, ki],
                                              binarise_techniqueP,
                                              binarise_paramP[p])
                Mc[t, ki] = [np.sum(Bp, axis=0) for Bp in B[t, :, 0, ki]]

                # Sorting
                CoPaMs[t, ki] = sortclusters(CoPaMs[t, ki], Mc[t, ki],
                                             smallestClusterSize)

                # Post-sorting binarisation
                for p in range(NPp):
                    B[t, p, 0, ki] = binarise(CoPaMs[t, ki],
                                              binarise_techniqueP,
                                              binarise_paramP[p])
                Mc[t, ki] = [np.sum(Bp, axis=0) for Bp in B[t, :, 0, ki]]
            elif type == 'B':
                # Pre-sorting binarisation
                BP = [
                    binarise(CoPaMsP[t, ki], binarise_techniqueP,
                             binarise_paramP[p]) for p in range(NPp)
                ]
                McP = [np.sum(BPp, axis=0) for BPp in BP]

                BN = [
                    binarise(CoPaMsN[t, ki], binarise_techniqueN,
                             binarise_paramN[p]) for p in range(NNp)
                ]
                McN = [np.sum(BNp, axis=0) for BNp in BN]

                # Sorting
                CoPaMsP[t, ki] = sortclusters(CoPaMsP[t, ki], McP,
                                              smallestClusterSize)
                CoPaMsN[t, ki] = sortclusters(CoPaMsN[t, ki], McN,
                                              smallestClusterSize)

                # Post-sorting binarisation
                BP = [
                    binarise(CoPaMsP[t, ki], binarise_techniqueP,
                             binarise_paramP[p]) for p in range(NPp)
                ]
                McP = [np.sum(BPp, axis=0) for BPp in BP]

                BN = [
                    binarise(CoPaMsN[t, ki], binarise_techniqueN,
                             binarise_paramN[p]) for p in range(NNp)
                ]
                McN = [np.sum(BNp, axis=0) for BNp in BN]

                # UNCLES B logic
                for pp in range(NPp):
                    for pn in range(NNp):
                        B[t, pp, pn, ki] = np.array(BP[pp])  # Copy, so zeroing below does not mutate BP[pp]
                        B[t, pp, pn, ki][np.any(BN[pn], axis=1)] = False

                # Fill Mc
                Mc[t, ki] = [None] * Ks[ki]
                for k in range(Ks[ki]):
                    Mc[t, ki][k] = np.zeros([NPp, NNp])
                    for pp in range(NPp):
                        for pn in range(NNp):
                            Mc[t, ki][k][pp, pn] = np.sum(
                                B[t, pp, pn, ki][:, k])

    # Prepare and return the results:
    params = dict(
        params, **{
            'methods': methods,
            'setsP': setsPloc,
            'setsN': setsNloc,
            'dofuzzystretch': dofuzzystretch,
            'type': type,
            'Ks': Ks,
            'NKs': NKs,
            'wsets': wsets,
            'wmethods': wmethods,
            'L': L,
            'CoPaMs': CoPaMs,
            'smallestclustersize': smallestClusterSize,
            'GDM': GDMloc
        })

    UnclesRes = collections.namedtuple('UnclesRes',
                                       ['B', 'Mc', 'params', 'X', 'U'])
    return UnclesRes(B, Mc, params, Xloc, Uloc)
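
An end-to-end toy run (a sketch under the assumption that uncles and the clust internals it calls -- ds, io, cl, generateCoPaM, binarise, sortclusters -- are importable; the numbers are illustrative only).

import numpy as np

np.random.seed(0)
X = [np.random.rand(50, 6), np.random.rand(50, 4)]  # 50 genes, 2 datasets
res = uncles(X, type='A', Ks=[4, 8], ncores=1)
print(res.B.shape)  # (CoPaMfinaltrials, len(binarise_paramP), 1, number of Ks)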
Example #6
def generateCoPaM(U,
                  relabel_technique='minmin',
                  w=None,
                  X=None,
                  distCriterion='direct_euc',
                  K=0,
                  GDM=None):
    # Helping functions
    def calwmeans(w):
        wm = [np.mean(calwmeans(ww))
              if isinstance(ww, (list, tuple, np.ndarray))
              else np.mean(ww)
              for ww in w]
        return np.array(wm)

    def CoPaMsdist(CoPaM1, CoPaM2):
        return np.linalg.norm(CoPaM1 - CoPaM2)

    def orderpartitions(U, method='rand', X=None, GDM=None):
        if method == 'rand':
            return np.random.permutation(range(len(U))), None
        elif method == 'mn':
            # TODO: Implement ranking partitions based on M-N plots
            raise NotImplementedError(
                'Ranking partitions based on the M-N plots logic has not been implemented yet.'
            )
        elif method == 'mse':
            R = len(U)
            mses = np.zeros(R)
            for r in range(R):
                if isinstance(U[r][0][0], (list, tuple, np.ndarray)):
                    mses[r] = np.mean(
                        orderpartitions(U[r], method=method, X=X, GDM=GDM)[1])
                else:
                    mses[r] = np.mean([
                        mn.mseclustersfuzzy(X,
                                            U[r],
                                            donormalise=False,
                                            GDM=GDM)
                    ])
            order = np.argsort(mses)
            return order, mses[order]
        else:
            raise ValueError('Invalid partition ordering method: {0}'.format(method))

    # Fix parameters
    Uloc = ds.listofarrays2arrayofarrays(U)
    R = len(Uloc)
    if GDM is None:
        GDMloc = np.ones([Uloc[0].shape[0], R], dtype=bool)
    elif GDM.shape[1] == 1:
        if R > 1:
            GDMloc = np.tile(GDM, [1, R])
        else:
            GDMloc = np.array(GDM)
    else:
        GDMloc = np.array(GDM)
    if w is None or (isinstance(w, str) and w in ['all', 'equal']):
        w = np.ones(R)
    elif ds.numel(w) == 1:
        w = np.array([w for i in range(R)])
    wmeans = calwmeans(w)

    # Work!
    #permR = orderpartitions(Uloc, method='rand', X=X, GDM=GDM)[0]
    if GDM is None:
        permR = orderpartitions(Uloc, method='mse', X=X, GDM=None)[0]
    else:
        permR = orderpartitions(Uloc, method='mse', X=X, GDM=GDMloc)[0]
    Uloc = Uloc[permR]
    if GDMloc.shape[1] > 1:
        GDMloc = GDMloc[:, permR]
    wmeans = wmeans[permR]

    if isinstance(Uloc[0][0][0], (list, tuple, np.ndarray)):
        Uloc[0] = generateCoPaM(Uloc[0],
                                relabel_technique=relabel_technique,
                                w=w[0],
                                X=X,
                                distCriterion=distCriterion,
                                K=K,
                                GDM=GDMloc)
    #CoPaM = np.zeros([GDMloc.shape[0], Uloc[0].shape[1]], float)
    CoPaM = np.array(Uloc[0], dtype=float)
    K = CoPaM.shape[1]
    for r in range(1, R):
        if isinstance(Uloc[r][0][0], (list, tuple, np.ndarray)):
            Uloc[r] = generateCoPaM(Uloc[r],
                                    relabel_technique=relabel_technique,
                                    w=w[r],
                                    X=X,
                                    distCriterion=distCriterion,
                                    K=K,
                                    GDM=GDMloc)
        if Uloc[r].shape[1] != K:
            raise ValueError(
                'Unequal numbers of clusters in partition {}.'.format(r))

        Uloc[r] = relabelClusts(CoPaM,
                                Uloc[r],
                                method=relabel_technique,
                                X=X,
                                distCriterion=distCriterion)

        dotprod = np.dot(GDMloc[:, 0:r],
                         wmeans[0:r].transpose())  # (Mxr) * (rx1) = (Mx1)
        CoPaM[dotprod > 0] = nu.multiplyaxis(CoPaM[dotprod > 0],
                                             dotprod[dotprod > 0],
                                             axis=1)
        CoPaM[dotprod > 0] += wmeans[r] * Uloc[r][dotprod > 0]
        dotprod = np.dot(GDMloc[:, 0:(r + 1)], wmeans[0:(r + 1)].transpose())
        CoPaM[dotprod > 0] = nu.divideaxis(CoPaM[dotprod > 0],
                                           dotprod[dotprod > 0],
                                           axis=1)

    return CoPaM
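
A small sketch (assumes generateCoPaM and its ds/nu/mn helpers are importable): it fuses two fuzzy partitions of the same six objects, where the second is the first with its two cluster labels swapped, so after relabelling the consensus should closely match the first.

import numpy as np

U1 = np.array([[0.9, 0.1], [0.8, 0.2], [0.2, 0.8],
               [0.1, 0.9], [0.7, 0.3], [0.3, 0.7]])
U2 = U1[:, ::-1]            # same clustering, labels permuted
X = [np.random.rand(6, 3)]  # dataset backing the MSE-based partition ordering
CoPaM = generateCoPaM([U1, U2], relabel_technique='minmin', X=X)
print(CoPaM.shape)          # (6, 2) consensus membership matrix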
Example #7
def correcterrors_withinworse(B, X, GDM, falsepositivestrimmed=0.01):
    Bloc = np.array(B)
    Xloc = ds.listofarrays2arrayofarrays(X)

    [Ng, K] = Bloc.shape  # Ng genes and K clusters
    L = Xloc.shape[0]  # L datasets

    # Find clusters' means (Cmeans), absolute shifted cluster genes (SCG),
    # and the empirical CDF functions for them (cdfs)
    Cmeans = np.array([None] * L, dtype=object)
    SCG = np.array([None] * L, dtype=object)
    for l in range(L):
        Cmeans[l] = np.zeros([K,
                              Xloc[l].shape[1]])  # K clusters x D dimensions
        SCG[l] = np.zeros(
            [np.sum(np.sum(Bloc[GDM[:, l], :], axis=0)),
             Xloc[l].shape[1]])  # M* genes x D dimensions ...
        # (M* is the total number of gene-cluster memberships)

        gi = 0
        for k in range(K):
            Cmeans[l][k] = np.median(Xloc[l][Bloc[GDM[:, l], k], :], axis=0)
            csize = np.sum(Bloc[GDM[:, l], k])
            tmpSCG = nu.subtractaxis(Xloc[l][Bloc[GDM[:, l], k], :],
                                     Cmeans[l][k],
                                     axis=0)
            SCG[l][gi:(gi + csize), :] = np.abs(tmpSCG)
            gi += csize
        SCG[l] = SCG[l][np.any(
            SCG[l], axis=1)]  # Remove all zeros genes (rows of SCG[l])
        SCG[l] = np.sort(SCG[l], axis=0)
        if falsepositivestrimmed > 0:
            trimmed = int(falsepositivestrimmed * SCG[l].shape[0])
            if trimmed > 0:
                SCG[l] = SCG[l][
                    0:-trimmed]  # trim the lowest (trimmed) rows in SCG

    # Helping function
    def iswithinworse(ref, x):
        return x <= np.max(ref)

    # Find who belongs
    belongs = np.ones([Ng, K, L],
                      dtype=bool)  # Ng genes x K clusters x L datasets
    for l in range(L):
        for k in range(K):
            for d in range(Xloc[l].shape[1]):
                tmpX = np.abs(Xloc[l][:, d] - Cmeans[l][k, d])
                belongs[GDM[:, l], k, l] &= iswithinworse(SCG[l][:, d], tmpX)

    # Include in clusters genes which belong everywhere
    B_out = np.all(belongs, axis=2)

    # Genes included in two clusters: include each in the cluster with the smallest worst-case distance across datasets
    # (guarantees that the worst belongingness of a gene to a cluster is optimised)
    f = np.nonzero(np.sum(B_out, axis=1) > 1)[0]
    for fi in f:
        ficlusts = np.nonzero(B_out[fi])[0]  # Clusters competing over gene fi
        fidatasets = np.nonzero(GDM[fi])[0]  # Datasets that have gene fi
        localdists = np.zeros(
            [len(ficlusts),
             len(fidatasets)])  # (Clusts competing) x (datasets that have fi)
        for l in range(len(fidatasets)):
            ll = fidatasets[l]  # Actual dataset index
            fi_ll = np.sum(GDM[:fi, ll])  # Index of fi in this Xloc[ll]
            localdists[:, l] = nu.dist_matrices(
                Cmeans[ll][ficlusts], Xloc[ll][fi_ll]).reshape([len(ficlusts)])
        localdists = np.max(localdists, axis=1)  # (Clusts competing) x 1
        ficlosest = np.argmin(localdists)  # Closest cluster
        B_out[fi] = False
        B_out[fi, ficlusts[ficlosest]] = True

    return B_out
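
A toy sketch of this error-correction step (assumes the function and its nu/ds helpers are importable). B would normally come from an earlier binarisation; the output keeps a gene in a cluster only where it "belongs" in every dataset.

import numpy as np

Ng, K = 20, 2
B = np.zeros((Ng, K), dtype=bool)
B[:10, 0] = True
B[10:, 1] = True
X = [np.random.rand(Ng, 5), np.random.rand(Ng, 3)]
GDM = np.ones((Ng, 2), dtype=bool)
B_corrected = correcterrors_withinworse(B, X, GDM, falsepositivestrimmed=0.01)
print(B_corrected.sum(axis=0))  # cluster sizes after correction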
Example #8
def optimise_tukey_sqrtSCG(B,
                           X,
                           GDM,
                           clustdists=None,
                           smallestClusterSize=11,
                           tails=1,
                           Q3s=2):
    Bloc = np.array(B)
    Xloc = ds.listofarrays2arrayofarrays(X)

    [Ng, K] = Bloc.shape  # Ng genes and K clusters
    L = Xloc.shape[0]  # L datasets

    # Normalise clustdists to provide weights. If not provided, make it unity for all
    if clustdists is None:
        clustdistsloc = np.ones(K)
    else:
        clustdistsloc = [c for c in clustdists]

    # Find clusters' means (Cmeans), absolute shifted cluster genes (SCG),
    # and the empirical CDF functions for them (cdfs)
    Cmeans = np.array([None] * L, dtype=object)
    SCG = np.array([None] * L, dtype=object)

    Cgood = mnplotsdistancethreshold(clustdistsloc, method='largestgap')
    for l in range(L):
        Cmeans[l] = np.zeros([K,
                              Xloc[l].shape[1]])  # K clusters x D dimensions
        SCG[l] = np.zeros(
            [np.sum(np.sum(Bloc[GDM[:, l], :], axis=0)),
             Xloc[l].shape[1]])  # M* genes x D dimensions ...
        w = np.zeros(np.sum(np.sum(Bloc[GDM[:, l], :], axis=0)))  # M* genes
        # (M* is the total number of gene-cluster memberships)

        gi = 0
        for k in range(K):
            Cmeans[l][k] = np.median(Xloc[l][Bloc[GDM[:, l], k], :], axis=0)
            if k in Cgood:
                csize = np.sum(Bloc[GDM[:, l], k])
                tmpSCG = nu.subtractaxis(Xloc[l][Bloc[GDM[:, l], k], :],
                                         Cmeans[l][k],
                                         axis=0)
                SCG[l][gi:(gi + csize), :] = np.abs(tmpSCG)
                gi += csize
        SCG[l] = SCG[l][np.any(
            SCG[l], axis=1)]  # Remove all zeros genes (rows of SCG[l])

        if ds.numel(SCG[l]) > 0:
            if tails == 1:
                Q3 = np.percentile(SCG[l], q=75, axis=0)
                thresh = Q3s * Q3
                SCGouts = SCG[l] > np.array(
                    [thresh for ii in range(0, SCG[l].shape[0])])
                # Set the outlier values to zeros so they do not affect decisions later on
                SCG[l][SCGouts] = 0.0
            elif tails == 2:
                Q1 = np.percentile(np.sqrt(SCG[l]), q=25, axis=0)
                Q3 = np.percentile(np.sqrt(SCG[l]), q=75, axis=0)
                IQR = np.subtract(Q3, Q1)
                thresh = np.add(Q3, 1.5 * IQR)
                SCGouts = np.sqrt(SCG[l]) > np.array(
                    [thresh for ii in range(0, SCG[l].shape[0])])
                # Set the outlier values to zeros so they do not affect decisions later on
                SCG[l][SCGouts] = 0.0
            else:
                raise ValueError(
                    'Invalid number of tails. It should be either 1 or 2.')
        else:
            SCG[l] = np.zeros((1, SCG[l].shape[1]))

    # Clusters mins and maxes (NEW)
    Cmins = np.array([None] * L, dtype=object)
    Cmaxes = np.array([None] * L, dtype=object)
    for l in range(L):
        Cmins[l] = np.zeros([K, Xloc[l].shape[1]])  # K clusters x D dimensions
        Cmaxes[l] = np.zeros([K,
                              Xloc[l].shape[1]])  # K clusters x D dimensions
        for k in range(K):
            Cmins[l][k] = Cmeans[l][k] - np.max(SCG[l], axis=0)
            Cmaxes[l][k] = Cmeans[l][k] + np.max(SCG[l], axis=0)

    # Resolve overlaps between clusters (NEW)
    for k1 in range(K):
        for k2 in range(K):
            # Compare the pair of clusters only once, and don't compare a cluster with itself. This if statement
            # guarantees that k2 will always be a later cluster than k1.
            if (k1 >= k2):
                continue
            # Value of the smallest overlap between the ranges of the clusters k1 and k2, and ...
            # the dataset (l) and the dimension (d) at which this overlap is found.
            # t_smallestoverlap is the type of the overlap (-1, 0, 1, or 2). Type (-1) means that the entire (min
            # to max) range of one cluster is within the range of the other cluster; this is the worst overlap.
            # Type (0) means that the max of (k1) is within the (min to max) range of (k2), and type (1) is the other
            # way around. Type (2) means there is no overlap; this is the best case, and finding it ends the comparison.
            v_smallestoverlap = 0
            l_smallestoverlap = -1
            d_smallestoverlap = -1
            t_smallestoverlap = -1  # Overlap type, read above
            for l in range(L):
                Nd = len(Cmins[l][k1])  # Dimensions in this dataset
                for d in range(Nd):
                    x1 = Cmaxes[l][k1][d]
                    x2 = Cmaxes[l][k2][d]
                    n1 = Cmins[l][k1][d]
                    n2 = Cmins[l][k2][d]
                    if (x1 > n2 and x1 <= x2):
                        if (n1 < n2):
                            ov = x1 - n2
                            if (t_smallestoverlap == -1
                                    or ov < v_smallestoverlap):
                                t_smallestoverlap = 0
                                v_smallestoverlap = ov
                                l_smallestoverlap = l
                                d_smallestoverlap = d
                    elif (x2 > n1 and x2 <= x1):
                        if (n2 < n1):
                            ov = x2 - n1
                            if (t_smallestoverlap == -1
                                    or ov < v_smallestoverlap):
                                t_smallestoverlap = 1
                                v_smallestoverlap = ov
                                l_smallestoverlap = l
                                d_smallestoverlap = d
                    else:
                        t_smallestoverlap = 2
                        continue  # Absolutely no overlap at this point, so k1 and k2 are distinct, so continue
                if (t_smallestoverlap == 2):
                    continue  # Absolutely no overlap at some point, so k1 and k2 are distinct, so continue

            # Sort out the overlap if exists between k1 and k2
            if (t_smallestoverlap == -1):
                # Here one of the two clusters always swallows the other one. So effectively remove the later one (k2).
                # Cluster removal is by making its minimum larger than its maximum at a single point (at l=0, d=0),
                # so effectively no gene will ever be mapped to it!
                Cmins[0][k2][0] = 1
                Cmaxes[0][k2][0] = 0
            elif (t_smallestoverlap == 0):
                Cmins[l_smallestoverlap][k2][d_smallestoverlap] = \
                    Cmaxes[l_smallestoverlap][k1][d_smallestoverlap] + sys.float_info.epsilon
            elif (t_smallestoverlap == 1):
                Cmaxes[l_smallestoverlap][k2][d_smallestoverlap] = \
                    Cmins[l_smallestoverlap][k1][d_smallestoverlap] - sys.float_info.epsilon

    # Find who belongs (NEW)
    belongs = np.ones([Ng, K, L],
                      dtype=bool)  # Ng genes x K clusters x L datasets
    for l in range(L):
        for k in range(K):
            tmp1 = nu.largerthanaxis(Xloc[l],
                                     Cmins[l][k],
                                     axis=0,
                                     orequal=True)
            tmp2 = nu.lessthanaxis(Xloc[l], Cmaxes[l][k], axis=0, orequal=True)
            belongs[GDM[:, l], k, l] = np.all(np.logical_and(tmp1, tmp2),
                                              axis=1)

    # # Helping function (OLD - to be removed)
    # def iswithinworse(ref, x):
    #     return x <= np.max(ref)
    #
    # # Find who belongs (OLD - to be removed)
    # belongs = np.ones([Ng, K, L], dtype=bool)  # Ng genes x K clusters x L datasets
    # for l in range(L):
    #     for k in range(K):
    #         for d in range(Xloc[l].shape[1]):
    #             tmpX = np.abs(Xloc[l][:, d] - Cmeans[l][k, d])
    #             belongs[GDM[:, l], k, l] &= iswithinworse(SCG[l][:, d], tmpX)

    # Include in clusters genes which belong everywhere
    B_out = np.all(belongs, axis=2)

    # Solve genes included in two clusters (OLD - should not be needed now - TO BE REMOVED)
    solution = 2
    if solution == 1:
        # Genes included in two clusters: include each in the cluster with the smallest worst-case distance
        # (guarantees that the worst belongingness of a gene to a cluster is optimised)
        f = np.nonzero(np.sum(B_out, axis=1) > 1)[0]
        for fi in f:
            ficlusts = np.nonzero(
                B_out[fi])[0]  # Clusters competing over gene fi
            fidatasets = np.nonzero(GDM[fi])[0]  # Datasets that have gene fi
            localdists = np.zeros([
                len(ficlusts), len(fidatasets)
            ])  # (Clusts competing) x (datasets that have fi)
            for l in range(len(fidatasets)):
                ll = fidatasets[l]  # Actual dataset index
                fi_ll = np.sum(GDM[:fi, ll])  # Index of fi in this Xloc[ll]
                localdists[:, l] = nu.dist_matrices(Cmeans[ll][ficlusts],
                                                    Xloc[ll][fi_ll]).reshape(
                                                        [len(ficlusts)])
            localdists = np.max(localdists, axis=1)  # (Clusts competing) x 1
            ficlosest = np.argmin(localdists)  # Closest cluster
            B_out[fi] = False
            B_out[fi, ficlusts[ficlosest]] = True
    elif solution == 2:
        # Genes included in two clusters, include them in the earlier cluster (smallest k)
        f = np.nonzero(np.sum(B_out, axis=1) > 1)[0]
        for fi in f:
            ficlusts = np.nonzero(
                B_out[fi])[0]  # Clusters competing over gene fi
            ficlosest = np.argmin(ficlusts)  # earliest cluster (smallest k)
            B_out[fi] = False
            B_out[fi, ficlusts[ficlosest]] = True

    # Remove clusters smaller than minimum cluster size
    ClusterSizes = np.sum(B_out, axis=0)
    B_out = B_out[:, ClusterSizes >= smallestClusterSize]

    return B_out
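
A toy sketch (assumes optimise_tukey_sqrtSCG, mnplotsdistancethreshold, and the nu/ds helpers are importable). Leaving clustdists as None treats all clusters equally; smallestClusterSize is lowered here because the toy clusters are small.

import numpy as np

Ng, K = 30, 3
B = np.zeros((Ng, K), dtype=bool)
B[:12, 0] = True
B[12:22, 1] = True
B[22:, 2] = True
X = [np.random.rand(Ng, 4)]
GDM = np.ones((Ng, 1), dtype=bool)
B_opt = optimise_tukey_sqrtSCG(B, X, GDM, smallestClusterSize=5)
print(B_opt.shape)  # (Ng, clusters surviving the size filter)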
Example #9
def correcterrors_weighted_outliers2(B,
                                     X,
                                     GDM,
                                     clustdists=None,
                                     stds=3,
                                     smallestClusterSize=11):
    Bloc = np.array(B)
    Xloc = ds.listofarrays2arrayofarrays(X)

    [Ng, K] = Bloc.shape  # Ng genes and K clusters
    L = Xloc.shape[0]  # L datasets

    # Normalise clustdists to provide weights. If not provided, make it unity for all
    if clustdists is None:
        clustweights = np.ones(K)
    else:
        clustweights = np.min(clustdists) / clustdists

    # Find clusters' means (Cmeans), absolute shifted cluster genes (SCG),
    # and the empirical CDF functions for them (cdfs)
    Cmeans = np.array([None] * L, dtype=object)
    SCG = np.array([None] * L, dtype=object)
    for l in range(L):
        Cmeans[l] = np.zeros([K,
                              Xloc[l].shape[1]])  # K clusters x D dimensions
        SCG[l] = np.zeros(
            [np.sum(np.sum(Bloc[GDM[:, l], :], axis=0)),
             Xloc[l].shape[1]])  # M* genes x D dimensions ...
        w = np.zeros(np.sum(np.sum(Bloc[GDM[:, l], :], axis=0)))  # M* genes
        # (M* is the total number of gene-cluster memberships)

        gi = 0
        for k in range(K):
            Cmeans[l][k] = np.median(Xloc[l][Bloc[GDM[:, l], k], :], axis=0)
            csize = np.sum(Bloc[GDM[:, l], k])
            tmpSCG = nu.subtractaxis(Xloc[l][Bloc[GDM[:, l], k], :],
                                     Cmeans[l][k],
                                     axis=0)
            SCG[l][gi:(gi + csize), :] = np.abs(tmpSCG)
            # Added this in this version
            w[gi:(gi + csize)] = clustweights[k]
            gi += csize
        SCG[l] = SCG[l][np.any(
            SCG[l], axis=1)]  # Remove all zeros genes (rows of SCG[l])
        SCG[l] = np.sort(SCG[l], axis=0)
        SCGmeans = np.average(SCG[l], weights=w, axis=0)
        SCGstds = st.weighted_std_axis(SCG[l], weights=w, axis=0)
        SCGouts = nu.divideaxis(nu.subtractaxis(SCG[l], SCGmeans, axis=0),
                                SCGstds,
                                axis=0)  # No. of stds away
        SCGouts = SCGouts > stds  # TRUE for outliers and FALSE for others (bool: M* genes x D dimensions)
        # Set the outlier values to zeros so they do not affect decisions later on
        SCG[l][SCGouts] = 0.0

    # Helping function
    def iswithinworse(ref, x):
        return x <= np.max(ref)

    # Find who belongs
    belongs = np.ones([Ng, K, L],
                      dtype=bool)  # Ng genes x K clusters x L datasets
    for l in range(L):
        for k in range(K):
            for d in range(Xloc[l].shape[1]):
                tmpX = np.abs(Xloc[l][:, d] - Cmeans[l][k, d])
                belongs[GDM[:, l], k, l] &= iswithinworse(SCG[l][:, d], tmpX)

    # Include in clusters genes which belong everywhere
    B_out = np.all(belongs, axis=2)

    # Solve genes included in two clusters:
    solution = 2
    if solution == 1:
        # Genes included in two clusters: include each in the cluster with the smallest worst-case distance
        # (guarantees that the worst belongingness of a gene to a cluster is optimised)
        f = np.nonzero(np.sum(B_out, axis=1) > 1)[0]
        for fi in f:
            ficlusts = np.nonzero(
                B_out[fi])[0]  # Clusters competing over gene fi
            fidatasets = np.nonzero(GDM[fi])[0]  # Datasets that have gene fi
            localdists = np.zeros([
                len(ficlusts), len(fidatasets)
            ])  # (Clusts competing) x (datasets that have fi)
            for l in range(len(fidatasets)):
                ll = fidatasets[l]  # Actual dataset index
                fi_ll = np.sum(GDM[:fi, ll])  # Index of fi in this Xloc[ll]
                localdists[:, l] = nu.dist_matrices(Cmeans[ll][ficlusts],
                                                    Xloc[ll][fi_ll]).reshape(
                                                        [len(ficlusts)])
            localdists = np.max(localdists, axis=1)  # (Clusts competing) x 1
            ficlosest = np.argmin(localdists)  # Closest cluster
            B_out[fi] = False
            B_out[fi, ficlusts[ficlosest]] = True
    elif solution == 2:
        # Genes included in two clusters, include them in the earlier cluster (smallest k)
        f = np.nonzero(np.sum(B_out, axis=1) > 1)[0]
        for fi in f:
            ficlusts = np.nonzero(
                B_out[fi])[0]  # Clusters competing over gene fi
            ficlosest = np.argmin(ficlusts)  # earliest cluster (smallest k)
            B_out[fi] = False
            B_out[fi, ficlusts[ficlosest]] = True

    # Remove clusters smaller than minimum cluster size
    ClusterSizes = np.sum(B_out, axis=0)
    B_out = B_out[:, ClusterSizes >= smallestClusterSize]

    return B_out
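
A toy sketch (assumes the function and the nu/ds/st helpers are importable). Passing clustdists would weight tighter clusters more strongly; omitting it weights all clusters equally.

import numpy as np

Ng, K = 20, 2
B = np.zeros((Ng, K), dtype=bool)
B[:10, 0] = True
B[10:, 1] = True
X = [np.random.rand(Ng, 5)]
GDM = np.ones((Ng, 1), dtype=bool)
B_corrected = correcterrors_weighted_outliers2(B, X, GDM, stds=3,
                                               smallestClusterSize=5)
print(B_corrected.sum(axis=0))  # cluster sizes after correction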
Example #10
def preprocess(X, GDM, normalise=1000, replicatesIDs=None, flipSamples=None, expressionValueThreshold=10.0,
               replacementVal=0.0, atleastinconditions=1, atleastindatasets=1, absvalue=False, usereplacementval=False,
               filteringtype='raw', filterflat=True, params=None, datafiles=None):
    # Fixing parameters
    Xloc = ds.listofarrays2arrayofarrays(X)
    L = len(Xloc)
    if datafiles is None:
        if L == 1:
            datafiles = ['X']
        else:
            datafiles = np.array([], dtype=str)
            for i in range(L):
                datafiles = np.append(datafiles, 'X{0}'.format(i+1))
    if params is None:
        params = {}
    if replicatesIDs is None:
        replicatesIDsloc = [np.array([ii for ii in range(x.shape[1])]) for x in Xloc]
    else:
        replicatesIDsloc = ds.listofarrays2arrayofarrays(replicatesIDs)
        replicatesIDsloc = [np.array(x) for x in replicatesIDsloc]
    if flipSamples is None:
        flipSamplesloc = None
    else:
        flipSamplesloc = ds.listofarrays2arrayofarrays(flipSamples)
        flipSamplesloc = [np.array(x) for x in flipSamplesloc]
    # A single normalisation code (non-list) applies to all L datasets
    if not isinstance(normalise, (list, tuple, np.ndarray)):
        normaliseloc = [[normalise] for i in range(L)]
        normaliseloc = ds.listofarrays2arrayofarrays(normaliseloc)
    else:
        normaliseloc = [nor if isinstance(nor, (list, tuple, np.ndarray)) else [nor] for nor in normalise]
        normaliseloc = ds.listofarrays2arrayofarrays(normaliseloc)

    # Get rid of nans by fixing
    Xproc = Xloc
    for l in range(L):
        Xproc[l] = fixnans(Xproc[l])

    # Prepare applied_norm dictionary before any normalisation takes place
    applied_norm = collec.OrderedDict(zip(datafiles, deepcopy(normaliseloc)))

    # Tell the user if any automatic normalisation is taking place
    allare1000 = True
    anyis1000 = False
    for l in range(L):
        if 1000 in normaliseloc[l]:
            anyis1000 = True
        else:
            allare1000 = False
    if allare1000:
        io.log(' - Automatic normalisation mode (default in v1.7.0+).')
        io.log('   Clust automatically normalises your dataset(s).')
        io.log('   To switch it off, use the `-n 0` option (not recommended).')
        io.log('   Check https://github.com/BaselAbujamous/clust for details.')
    elif anyis1000:
        io.log(' - Some datasets are not assigned normalisation codes in the provided')
        io.log('   normalisation file. Clust automatically identifies and applies the')
        io.log('   most suitable normalisation to them (default in v1.7.0+).')
        io.log("   If you don't want clust to normalise them, assign each of them a")
        io.log('   normalisation code of 0 in the normalisation file.')
        io.log('   Check https://github.com/BaselAbujamous/clust for details.')

    # Quantile normalisation
    for l in range(L):
        if 101 in normaliseloc[l] or 1000 in normaliseloc[l]:
            Xproc[l] = normaliseSampleFeatureMat(Xproc[l], 101)[0]
            if 101 in normaliseloc[l]:
                i = np.argwhere(np.array(normaliseloc[l]) == 101)
                i = i[0][0]
                normaliseloc[l][i] = 0

    # Combine replicates and sort out flipped samples
    Xproc = combineReplicates(Xproc, replicatesIDsloc, flipSamplesloc)

    # Filter genes not exceeding the threshold
    (Xproc, GDMnew, Iincluded) = filterlowgenes(Xproc, GDM, expressionValueThreshold, replacementVal,
                                                atleastinconditions, atleastindatasets, absvalue,
                                                usereplacementval, filteringtype)

    # Normalise
    for l in range(L):
        (Xproc[l], codes) = normaliseSampleFeatureMat(Xproc[l], normaliseloc[l])
        if np.all(codes == normaliseloc[l]):
            applied_norm[datafiles[l]] = op.arraytostring(applied_norm[datafiles[l]], delim=' ', openbrac='',
                                                          closebrac='')
        else:
            applied_norm[datafiles[l]] = op.arraytostring(codes, delim=' ', openbrac='', closebrac='')

    if filterflat:
        io.log(' - Flat expression profiles filtered out (default in v1.7.0+).')
        io.log('   To switch it off, use the --no-fil-flat option (not recommended).')
        io.log('   Check https://github.com/BaselAbujamous/clust for details.')
        (Xproc, GDMnew, Iincluded) = filterFlat(Xproc, GDMnew, Iincluded)

    # Prepare params for the output
    params = dict(params, **{
        'normalise': normaliseloc,
        'replicatesIDs': replicatesIDs,
        'flipSamples': flipSamplesloc,
        'L': L
    })

    return Xproc, GDMnew, Iincluded, params, applied_norm
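
A toy sketch of preprocessing two raw datasets (assumes preprocess and the io/op/ds helpers it calls are importable). normalise=1000 triggers the automatic normalisation mode described in the log messages above.

import numpy as np

X = [np.random.rand(100, 6) * 100, np.random.rand(100, 4) * 100]
GDM = np.ones((100, 2), dtype=bool)
Xproc, GDMnew, Iincluded, params, applied_norm = preprocess(
    X, GDM, normalise=1000, expressionValueThreshold=10.0)
print(applied_norm)  # normalisation codes actually applied per dataset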
Example #11
def mnplotsgreedy(X, B, type='A', params=None, allMSE=None, tightnessweight=1, setsP=None, setsN=None, Xtype='data',
                  mseCache=None, wsets=None, GDM=None, msesummary='average', percentageOfClustersKept=100,
                  smallestClusterSize=11, Xnames=None, ncores=1):
    Xloc = ds.listofarrays2arrayofarrays(X)
    Bloc = ds.reduceToArrayOfNDArraysAsObjects(B, 2)
    L = Xloc.shape[0]  # Number of datasets

    # Fix parameters
    if params is None: params = {}
    if setsP is None: setsP = [x for x in range(int(math.floor(L / 2)))]
    if setsN is None: setsN = [x for x in range(int(math.floor(L / 2)), L)]
    setsPN = np.array(np.concatenate((setsP, setsN), axis=0), dtype=int)
    Xloc = Xloc[setsPN]
    L = Xloc.shape[0]
    if wsets is None:
        wsets = np.array([1 for x in range(L)])
    if GDM is None:
        Ng = np.shape(Xloc[0])[0]
        GDMloc = np.ones([Ng, L], dtype='bool')
    else:
        Ng = np.shape(GDM)[0]
        GDMloc = GDM[:, setsPN]
    if Xnames is None:
        Xnames = ['X{0}'.format(l) for l in range(L)]

    # Put all clusters in one matrix
    N = Bloc.shape[0]  # Number of partitions
    K = [Bloc[i].shape[1] for i in range(N)]  # Number of clusters in each partition

    # One big matrix for all clusters
    BB = Bloc[0]
    for n in range(1, N):
        BB = np.append(BB, Bloc[n], axis=1)
    VMc = np.sum(BB, axis=0)
    NN = len(VMc)  # Total number of clusters

    # Fill Vmse if not provided
    if mseCache is None and allMSE is None:
        # Cache all mse values
        mseCache = np.zeros([NN, L])
        io.resetparallelprogress(NN * L)
        for l in range(L):
            if Xtype == 'files':
                # load files here
                raise NotImplementedError('Xtype "files" has not been implemented yet.')
            elif Xtype == 'data':
                Xtmp = Xloc[l]
            else:
                raise ValueError('Xtype has to be "files" or "data". The given Xtype is invalid.')

            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                mseCachetmp = Parallel(n_jobs=ncores)\
                    (delayed(mseclusters)
                     (Xtmp, ds.matlablike_index2D(BB, GDMloc[:, l], nn), 0) for nn in range(NN))
                mseCachetmp = [mm[0] for mm in mseCachetmp]
                for nn in range(NN):
                    mseCache[nn, l] = mseCachetmp[nn]

                gc.collect()

                #io.updateparallelprogress(NN)

            '''
            for nn in range(NN):
                mseCache[nn, l] = mseclusters(Xtmp, ds.matlablike_index2D(BB, GDMloc[:, l], nn), 0)[0]
            io.log('Done cluster evaluation for {0} have been calculated.'.format(Xnames[l]))
            '''

    # Calculate allMSE if needed (Nx1)
    if allMSE is None:
        if type == 'A':
            wsetsloc = wsets[setsPN]
            wsetsloc = [float(n) / sum(wsetsloc) for n in wsetsloc]  # Normalised weights
            if msesummary == 'average' or msesummary == 'mean':
                allMSE = np.dot(mseCache[:, setsPN], wsetsloc)
            elif msesummary == 'worse' or msesummary == 'max':
                allMSE = np.max(np.multiply(mseCache[:, setsPN], wsetsloc), axis=1)
            else:
                raise ValueError('msesummary value has to be "average", "mean", "worse", or "max".',
                                 ' "average" and "mean" behave similarly, and "worse" and "max" behave similarly.')
        elif type == 'B':
            wsetsP = wsets[setsP]
            wsetsP = [n/sum(wsetsP) for n in wsetsP]
            wsetsN = wsets[setsN]
            wsetsN = [n / sum(wsetsN) for n in wsetsN]
            if msesummary == 'average' or msesummary == 'mean':
                allMSE = np.dot(mseCache[:, setsP] , wsetsP) - np.dot(mseCache[:, setsN] , wsetsN)
            elif msesummary == 'worse' or msesummary == 'max':
                allMSE = np.max(np.multiply(mseCache[:, setsP], wsetsP), axis=1) \
                         - np.max(np.multiply(mseCache[:, setsN], wsetsN), axis=1)
            else:
                raise ValueError('msesummary value has to be "average", "mean", "worse", or "max".',
                                 ' "average" and "mean" behave similarly, and "worse" and "max" behave similarly.')
        else:
            raise ValueError('Type should be either A or B; given type is invalid.')

    # Find the distances
    maxx = np.max(allMSE[~np.isnan(allMSE)])
    minx = np.min(allMSE[~np.isnan(allMSE)])
    maxy = np.log10(np.max(VMc))
    miny = 0
    with np.errstate(divide='ignore'):
        allVecs = np.concatenate(([(allMSE - minx) / (maxx - minx)],
                                  [(np.log10(VMc) - miny) / (maxy - miny)]), axis=0).transpose()
    allVecs[:, 0] *= tightnessweight
    allDists = np.array([np.sqrt(1.1 + np.power(tightnessweight, 2)) if np.any(np.isnan(n))
                         else sp.spatial.distance.euclidean(n, [0, 1]) for n in allVecs])
    alpha = 0.0001
    tmp, uVdsI = np.unique(allDists, return_index=True)
    while len(uVdsI) != len(allDists):
        for n in range(len(allDists)):
            if n not in uVdsI:
                allDists[n] += alpha * np.random.normal()
        tmp, uVdsI = np.unique(allDists, return_index=True)

    # Helper function for greedy solution below
    def mngreedy(Bloc, I, Vds, iter=float('inf')):
        Vdsloc = np.array(Vds)
        res = np.array([False for n in Vdsloc])
        if iter == 0 or not any(I):
            return res
        for n in range(len(I)):
            if not I[n]:
                Vdsloc[n] = float('inf')
        p = np.argmin(Vdsloc)
        res[p] = True
        #II = I
        overlaps = np.dot(ds.matlablike_index2D(Bloc, 'all', p).transpose(), Bloc) > 0
        I &= ~overlaps
        return res | mngreedy(Bloc, I, Vdsloc, iter-1)

    # ** Find greedy solution **
    # Sort clusters based on distances (not important, but benefits the output)
    II = np.argsort(allDists)
    allDists = allDists[II]
    BB = ds.matlablike_index2D(BB, 'a', II)
    allVecs = ds.matlablike_index2D(allVecs, II, 'a')
    allMSE = allMSE[II]
    mseCache = ds.matlablike_index2D(mseCache, II, 'a')
    VMc = VMc[II]

    # Include the top (percentageOfClustersKept)% of the clusters that have at least smallestClusterSize genes
    Ismall = VMc < smallestClusterSize
    Inans = np.isnan(allDists)
    tmpDists = [np.max(allDists) if Inans[n] | Ismall[n] else allDists[n] for n in range(len(allDists))]
    percentageOfClustersKept *= float(np.sum(~Ismall)) / len(allDists)
    Iincluded = (tmpDists <= np.percentile(tmpDists, percentageOfClustersKept)) & (np.bitwise_not(Ismall))
    I = mngreedy(BB, Iincluded, allDists)
    B_out = ds.matlablike_index2D(BB, 'a', I)

    # Prepare and return the results:
    params = dict(params, **{
        'tightnessweight': tightnessweight,
        'msesummary': msesummary,
        'percentageofclusterskept': percentageOfClustersKept,
        'smallestclustersize': smallestClusterSize
    })

    MNResults = collections.namedtuple('MNResults',
                                       ['B', 'I', 'allVecs', 'allDists', 'allMSE', 'mseCache', 'Ball', 'params'])
    return MNResults(B_out, I, allVecs, allDists, allMSE, mseCache, BB, params)
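
A toy sketch of the greedy M-N plots selection (assumes mnplotsgreedy and its ds/io helpers are importable). B holds one boolean gene-by-cluster matrix per base clustering result; the greedy pass picks non-overlapping clusters closest to the ideal corner of the M-N plot.

import numpy as np

np.random.seed(0)
Ng = 40
X = [np.random.rand(Ng, 5)]
B = [np.random.rand(Ng, 4) > 0.7, np.random.rand(Ng, 6) > 0.8]  # two partitions
res = mnplotsgreedy(X, B, type='A', GDM=np.ones((Ng, 1), dtype=bool),
                    smallestClusterSize=3)
print(res.B.shape)  # (genes) x (non-overlapping clusters selected)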