예제 #1
0
def mapGenesToCommonIDs(Genes, Map, mapheader=True, OGsFirstColMap=True, delimGenesInMap='\\W+'):
    """Translate the gene names of every dataset into the common (OG) IDs of a mapping table.

    Parameters:
        Genes: sequence of gene-name collections, one per dataset.
        Map: 2-D table whose rows are OGs and whose columns are species. Each
            cell is a string holding zero or more gene names separated by
            ``delimGenesInMap``.
        mapheader: if True, the first row of ``Map`` is a header with the
            species names and is removed from the data rows.
        OGsFirstColMap: if True, the first column of ``Map`` holds the OG IDs;
            otherwise IDs of the form ``OG0000000``, ``OG0000001``, ... are
            generated (one per data row).
        delimGenesInMap: regular expression used to split a map cell into
            gene names. The characters '.', '-' and '/' are protected from
            the split so that they may appear inside gene names.

    Returns:
        A tuple ``(OGsFiltered, OGsDatasets, Maploc, MapSpecies)``:
        * OGsFiltered: sorted unique OG IDs that were actually matched by at
          least one gene of any dataset (the empty ID '' is excluded).
        * OGsDatasets: object array with one entry per dataset; each entry is
          an object array giving the OG ID of every gene ('' if unmapped).
        * Maploc: the map restricted to the rows of ``OGsFiltered``, indexed
          as ``Maploc[OG][species]``, each cell being a list of gene names.
        * MapSpecies: species names. When the map has no header these are
          generated as ``Species0``, ``Species1``, ... regardless of whether
          the OG column is present.
    """
    L = len(Genes)  # Number of datasets (i.e. lists of gene names)
    Maploc = np.array(Map, dtype=object)
    if mapheader:
        MapSpecies = Maploc[0]
        Maploc = Maploc[1:]
    else:
        MapSpecies = None

    # If the OG IDs are given in the Map, use them; otherwise generate them as OG0000000 to OGxxxxxxx
    if OGsFirstColMap:
        OGs = Maploc[:, 0].flatten()
        Maploc = Maploc[:, 1:]
        if MapSpecies is not None:
            MapSpecies = MapSpecies[1:]  # Drop the header cell that sits above the OG column
    else:
        OGs = np.array(['OG%07d' % i for i in range(Maploc.shape[0])])

    # Without a header, fall back to generic species names. This is done for
    # both map layouts so that callers always receive one name per species.
    if MapSpecies is None:
        MapSpecies = np.array(['Species{}'.format(i) for i in range(Maploc.shape[1])])

    # Transpose the map so that it is indexed as Maploc[species][OG]
    Maploc = Maploc.transpose()

    # Split every map cell into its gene names. '.', '-' and '/' are swapped for
    # word-like placeholders before splitting (the default delimiter '\W+' would
    # otherwise split on them) and restored afterwards.
    splitter = re.compile(delimGenesInMap)
    for i in range(Maploc.shape[0]):
        for j in range(Maploc.shape[1]):
            protected = Maploc[i, j].replace('.', 'thisisadot').replace('-', 'thisisadash') \
                .replace('/', 'thisisaslash')
            Maploc[i, j] = [gg.replace('thisisadot', '.').replace('thisisadash', '-').replace('thisisaslash', '/')
                            for gg in splitter.split(protected)]

    # FlattenedMap[s] is a 1-D array of all genes of the (s)th species; positions
    # in FlattenedMap[s1] and FlattenedMap[s2] do not correspond to each other.
    FlattenedMap = [np.array(ds.flattenAList(ms.tolist())) for ms in Maploc]

    OGsDatasets = np.array([None] * L, dtype=object)
    for l in range(L):
        Ng = len(Genes[l])  # Number of genes in this dataset
        # The species of this dataset is taken to be the one sharing most gene names with it
        s = np.argmax([len(np.intersect1d(Genes[l], speciesgenes))
                       for speciesgenes in FlattenedMap])

        OGsDatasets[l] = np.array(['' for i in range(Ng)], dtype=object)  # Default ID for unmapped genes is ''
        # Index of the map cell containing each gene (Ng x 1); entries that are
        # not > -1 are treated as "gene not found in the map".
        findGenesInMap = ds.findArrayInSubArraysOfAnotherArray1D(Genes[l], Maploc[s])
        findGenesInMap = findGenesInMap[:, 0]  # Make it flat (length Ng)
        found = findGenesInMap > -1
        OGsDatasets[l][found] = OGs[findGenesInMap[found]]

    OGsFiltered = np.unique(ds.flattenAList(OGsDatasets.flatten().tolist()))  # Sorted unique and *USED* OGs
    OGsFiltered = OGsFiltered[OGsFiltered != '']
    I = ds.findArrayInAnotherArray1D(OGsFiltered, OGs)
    Maploc = Maploc.transpose()[I]  # Back to Maploc[OG][species], used OGs only

    return OGsFiltered, OGsDatasets, Maploc, MapSpecies
예제 #2
0
def clusters_genes_Species(B, OGs, Map, MapSpecies):
    """Build, for every species, a table listing the genes found in each cluster.

    Parameters:
        B: 2-D array with one column per cluster; column ``k`` is used to
            index the rows of ``Map`` (presumably a boolean membership
            matrix - confirm against the caller).
        OGs: not used by this function.
        Map: 2-D array indexed as ``Map[rows, species]`` whose selected cells
            are flattened with ``ds.flattenAList`` into one gene list.
        MapSpecies: sequence of species names; only its length is used.

    Returns:
        Object array with one ``pandas.DataFrame`` of strings per species.
        The first row of each frame holds the column titles
        ``'C<k> (<n> genes)'``; below it, column ``k`` lists the genes of
        cluster ``k``, padded with empty strings up to the largest cluster.
    """
    n_species = len(MapSpecies)
    n_clusters = B.shape[1]

    # genes_by_species[sp][k] is the flat gene list of species sp in cluster k
    genes_by_species = []
    for sp in range(n_species):
        per_cluster = []
        for k in range(n_clusters):
            per_cluster.append(ds.flattenAList(Map[B[:, k], sp]))
        genes_by_species.append(per_cluster)

    frames = np.array([None] * n_species, dtype=object)
    for sp, per_cluster in enumerate(genes_by_species):
        sizes = [len(cluster_genes) for cluster_genes in per_cluster]
        n_rows = np.max(sizes)  # The tallest column defines the table height

        # Start from an all-empty table and write each cluster's genes at the top of its column
        body = np.full([n_rows, n_clusters], '', dtype=object)
        for k, cluster_genes in enumerate(per_cluster):
            body[0:sizes[k], k] = cluster_genes

        titles = ['C{0} ({1} genes)'.format(k, sizes[k]) for k in range(n_clusters)]
        header = np.array(titles, dtype=object).reshape([1, n_clusters])

        table = np.array(np.concatenate((header, body), axis=0), dtype=str)
        frames[sp] = pd.DataFrame(data=table,
                                  columns=None,
                                  index=None,
                                  dtype=str)

    return frames
예제 #3
0
def calculateGDMandUpdateDatasets(X, Genes, Map=None, mapheader=True, OGsFirstColMap=True, delimGenesInMap='\\W+',
                                  OGsIncludedIfAtLeastInDatasets=1):
    """Compute the gene-dataset presence matrix (GDM) and collapse each dataset to one row per OG.

    Parameters:
        X: the datasets; converted with ``ds.listofarrays2arrayofarrays``.
            Each dataset is indexed as a 2-D array (rows are genes, columns
            are samples).
        Genes: gene names of every dataset, one collection per dataset.
        Map: optional gene-to-OG mapping table. If None, the gene names
            themselves serve as the common IDs; otherwise the IDs are
            obtained from ``mapGenesToCommonIDs``.
        mapheader, OGsFirstColMap, delimGenesInMap: forwarded unchanged to
            ``mapGenesToCommonIDs`` when ``Map`` is given.
        OGsIncludedIfAtLeastInDatasets: minimum number of datasets in which
            an OG must be present to be kept.

    Returns:
        A tuple ``(Xnew, GDM, GDMall, OGs, MapNew, MapSpecies)``:
        * Xnew: object array with one float matrix per dataset, holding one
          row per kept OG present in that dataset.
        * GDM: boolean matrix (kept OGs x datasets) of OG presence.
        * GDMall: the same matrix before the inclusion filter was applied.
        * OGs: IDs of the kept OGs.
        * MapNew: the map rows of the kept OGs, or None when no map is used.
        * MapSpecies: species names from the map, or None when no map is used.
    """
    Xloc = ds.listofarrays2arrayofarrays(X)
    if Map is None:
        OGsDatasets = deepcopy(Genes)
        OGs = np.unique(ds.flattenAList(OGsDatasets))  # Unique list of genes (or mapped genes)
        MapNew = None
        MapSpecies = None
    else:
        (OGs, OGsDatasets, MapNew, MapSpecies) = mapGenesToCommonIDs(Genes, Map, mapheader,
                                                                     OGsFirstColMap, delimGenesInMap)

    L = len(Genes)  # Number of datasets

    GDMall = np.transpose([np.in1d(OGs, gs) for gs in OGsDatasets])  # GDM: (Ng)x(L) boolean

    # Exclude OGs that do not exist in at least (OGsIncludedIfAtLeastInDatasets) datasets
    IncludedOGs = np.sum(GDMall, axis=1) >= OGsIncludedIfAtLeastInDatasets
    GDM = GDMall[IncludedOGs]
    OGs = OGs[IncludedOGs]
    if MapNew is not None:
        MapNew = MapNew[IncludedOGs]

    Ngs = np.sum(GDM, axis=0)  # Numbers of unique mapped genes in each dataset

    Xnew = np.array([None] * L, dtype=object)
    for l in range(L):
        # arelogs_function is defined elsewhere; its result selects how rows are merged below
        arelogs = arelogs_function(Xloc[l])
        d = Xloc[l].shape[1]  # Number of dimensions (samples) in this dataset
        merged = np.zeros([Ngs[l], d], dtype=float)
        OGsInThisDS = OGs[GDM[:, l]]  # Unique OGs in this dataset
        for ogi, og in enumerate(OGsInThisDS):
            # All rows of this dataset whose gene maps to the current OG
            rows = Xloc[l][np.in1d(OGsDatasets[l], og)]
            if arelogs:
                # Values are treated as log2: sum in linear space, then go back to log2
                merged[ogi] = np.log2(np.sum(np.power(2.0, rows), axis=0))
            else:
                merged[ogi] = np.sum(rows, axis=0)
        Xnew[l] = merged

    return Xnew, GDM, GDMall, OGs, MapNew, MapSpecies