Esempio n. 1
0
def processed_X(Xprocessed, conditions, GDM, OGs, Map, MapSpecies):
    L = len(Xprocessed)
    res = np.array([None] * L, dtype=object)
    resData = np.array([None] * L, dtype=object)
    resHeader = np.array([None] * L, dtype=object)
    resGeneNames = np.array([None] * L, dtype=object)

    for l in range(L):
        # Header (Samples)
        #samploc = np.array(Samples[l])
        #uniqueSamploc = np.unique(SamplesIDs[l])
        #uniqueSamploc = uniqueSamploc[uniqueSamploc >= 0]
        #resHeader[l] = [samploc[np.array(SamplesIDs[l]) == s][0] for s in uniqueSamploc]
        resHeader[l] = conditions[l]
        if Map is None:
            resHeader[l] = np.array([['Genes'] + resHeader[l]])
        else:
            resHeader[l] = np.array([['OGs'] + MapSpecies.tolist() + resHeader[l]])

        # Gene names
        if Map is None:
            resGeneNames[l] = OGs[GDM[:, l]].reshape(-1,1)
        else:
            genenames = [[ds.concatenateStrings(gs) for gs in Map[GDM[:, l]][:, sp]] for sp in range(Map.shape[1])]
            resGeneNames[l] = np.concatenate((OGs[GDM[:, l]].reshape(-1,1), np.transpose(genenames)), axis=1)

        # Data
        resData[l] = np.array(Xprocessed[l])

        # concatenate them
        res[l] = np.concatenate((resGeneNames[l], resData[l]), axis=1)
        res[l] = np.concatenate((resHeader[l], res[l]), axis=0)
        res[l] = np.array(res[l], dtype=str)

    return res
Esempio n. 2
0
def calculateGDMandUpdateDatasets(X,
                                  Genes,
                                  Map=None,
                                  mapheader=True,
                                  OGsFirstColMap=True,
                                  delimGenesInMap='\\W+',
                                  OGsIncludedIfAtLeastInDatasets=1):
    Xloc = ds.listofarrays2arrayofarrays(X)
    Genesloc = deepcopy(Genes)
    if Map is None:
        OGsDatasets = deepcopy(Genes)
        OGs = np.unique(ds.flattenAList(
            OGsDatasets))  # Unique list of genes (or mapped genes)
        MapNew = None
        MapSpecies = None
    else:
        (OGs, OGsDatasets, MapNew,
         MapSpecies) = mapGenesToCommonIDs(Genes, Map, mapheader,
                                           OGsFirstColMap, delimGenesInMap)

    L = len(Genesloc)  # Number of datasets
    # Ng = len(OGs)  # Number of unique genes

    GDMall = np.transpose([np.in1d(OGs, gs)
                           for gs in OGsDatasets])  # GDM: (Ng)x(L) boolean

    # Exclude OGs that do not exist in at least (OGsIncludedIfAtLeastInDatasets) datasets
    IncludedOGs = np.sum(GDMall, axis=1) >= OGsIncludedIfAtLeastInDatasets
    GDM = GDMall[IncludedOGs]
    OGs = OGs[IncludedOGs]
    if MapNew is not None:
        MapNew = MapNew[IncludedOGs]

    Ngs = np.sum(GDM, axis=0)  # Numbers of unique mapped genes in each dataset

    Xnew = np.array([None] * L, dtype=object)
    GenesDatasets = np.array([None] * L, dtype=object)
    for l in range(L):
        arelogs = np.nansum(
            abs(Xloc[l][~isnan(Xloc[l])]) < 30
        ) > 0.98 * ds.numel(
            Xloc[l][~isnan(Xloc[l])])  # More than 98% of values are below 30.0
        d = Xloc[l].shape[1]  # Number of dimensions (samples) in this dataset
        Xnew[l] = np.zeros([Ngs[l], d], dtype=float)
        GenesDatasets[l] = np.empty(Ngs[l], dtype=object)
        OGsInThisDS = OGs[GDM[:, l]]  # Unique OGs in this dataset
        # TODO: Optimise the code below by exploiting ds.findArrayInSubArraysOfAnotherArray1D (like in line 203 above)
        for ogi in range(len(OGsInThisDS)):
            og = OGsInThisDS[ogi]
            if arelogs:
                Xnew[l][ogi] = np.log2(
                    np.sum(np.power(2.0, Xloc[l][np.in1d(OGsDatasets[l], og)]),
                           axis=0))
            else:
                Xnew[l][ogi] = np.sum(Xloc[l][np.in1d(OGsDatasets[l], og)],
                                      axis=0)
            GenesDatasets[l][ogi] = ds.concatenateStrings(Genesloc[l][np.in1d(
                OGsDatasets[l], og)])

    return Xnew, GDM, GDMall, OGs, MapNew, MapSpecies
Esempio n. 3
0
def clusters_genes_OGs(B, OGs, Map, MapSpecies, delim='; '):
    if Map is None:
        Nsp = 0
    else:
        Nsp = len(MapSpecies)  # Number of species
    K = B.shape[1]  # Number of clusters
    if K == 0:
        return np.array(np.empty([1, 1]), dtype=object)
    Csizes = np.sum(B, axis=0)  # Clusters' sizes
    maxCsize = np.max(Csizes)  # Largest cluster size
    res = np.array(np.empty([maxCsize, (Nsp + 1) * K], dtype=str), dtype=object)
    header = np.array([None] * ((Nsp + 1) * K * 2), dtype=object).reshape([2, ((Nsp + 1) * K)])
    for k in range(K):
        col = k * (Nsp + 1)
        res[0:Csizes[k], col] = OGs[B[:, k]]
        res[Csizes[k]:, col] = ''
        header[0, col] = 'C{0} ({1} {2})'.format(k, Csizes[k], 'genes' if Map is None else 'OGs')
        header[1, col] = 'Genes' if Map is None else 'OGs'
        for sp in range(Nsp):  # Will not get into this if Map is None, as Nsp will be zero in that case
            col = k * (Nsp + 1) + sp + 1
            res[0:Csizes[k], col] = [ds.concatenateStrings(gs, delim) for gs in Map[B[:, k], sp]]
            res[Csizes[k]:, col] = ''
            header[0, col] = ''
            header[1, col] = MapSpecies[sp]


    return np.array(np.concatenate((header, res), axis=0), dtype=str)