import re
from copy import deepcopy

import numpy as np
import pandas as pd

# NOTE: `ds` is this package's data-structure helper module (providing flattenAList,
# listofarrays2arrayofarrays, findArrayInSubArraysOfAnotherArray1D, findArrayInAnotherArray1D,
# numel and concatenateStrings); the import path below is an assumption, adjust it to the actual
# package layout. `arelogs_function` is likewise assumed to be defined elsewhere in the package.
import datastructures as ds


def mapGenesToCommonIDs(Genes, Map, mapheader=True, OGsFirstColMap=True, delimGenesInMap='\\W+'):
    """Map the gene names of each dataset to common (orthogroup) IDs using the Map table.

    Returns (OGsFiltered, OGsDatasets, Map, MapSpecies): the used OG IDs, the OG ID of each gene
    in each dataset ('' for unmapped genes), the filtered Map rows, and the species names.
    """
    L = len(Genes)  # Number of datasets (i.e. lists of gene names)
    Maploc = np.array(Map, dtype=object)
    if mapheader:
        MapSpecies = Maploc[0]
        Maploc = Maploc[1:]
    else:
        MapSpecies = None

    # If the OG IDs are given in the Map, use them; otherwise generate them as OG0000000 to OGxxxxxxx
    if OGsFirstColMap:
        OGs = Maploc[:, 0].flatten()
        Maploc = Maploc[:, 1:]
        if MapSpecies is None:
            MapSpecies = np.array(['Species{}'.format(i) for i in range(Maploc.shape[1])])
        else:
            MapSpecies = MapSpecies[1:]
    else:
        OGs = np.array(['OG%07d' % i for i in range(Maploc.shape[0])])

    # !!!!!!!!TRANSPOSE MAP!!!!!!!!
    Maploc = Maploc.transpose()  # Now this is: Maploc[species][gene]

    # Split Map entries by the delim (dots, dashes and slashes are protected so they survive the split)
    for i in range(Maploc.shape[0]):
        for j in range(Maploc.shape[1]):
            Maploc[i, j] = re.split(delimGenesInMap, Maploc[i, j].replace('.', 'thisisadot')
                                    .replace('-', 'thisisadash').replace('/', 'thisisaslash'))
            Maploc[i, j] = [gg.replace('thisisadot', '.').replace('thisisadash', '-')
                            .replace('thisisaslash', '/') for gg in Maploc[i, j]]

    # Generate a flattened version of the Map: FlattenedMap[s] is a 1d list of all genes in the (s)th Map row,
    # i.e. in the (s)th species. Note that FlattenedMap[s1][n] does not necessarily correspond to
    # FlattenedMap[s2][n].
    # S = Maploc.shape[0]  # Number of species
    FlattenedMap = [np.array(ds.flattenAList(ms.tolist())) for ms in Maploc]

    OGsDatasets = np.array([None] * L, dtype=object)
    for l in range(L):
        Ng = len(Genes[l])  # Number of genes in this dataset
        s = np.argmax([len(np.intersect1d(Genes[l], speciesgenes))
                       for speciesgenes in FlattenedMap])  # The best matching species
        OGsDatasets[l] = np.array(['' for i in range(Ng)], dtype=object)  # Default OG for unmapped genes is ''
        findGenesInMap = ds.findArrayInSubArraysOfAnotherArray1D(Genes[l], Maploc[s])  # Indices of Genes in Map (Ngx1)
        findGenesInMap = findGenesInMap[:, 0]  # Make it flat (length of Ng instead of an Ngx1 array)
        OGsDatasets[l][findGenesInMap > -1] = OGs[findGenesInMap[findGenesInMap > -1]]

    OGsFiltered = np.unique(ds.flattenAList(OGsDatasets.flatten().tolist()))  # Get sorted unique and *USED* OGs
    OGsFiltered = OGsFiltered[OGsFiltered != '']
    I = ds.findArrayInAnotherArray1D(OGsFiltered, OGs)
    Maploc = Maploc.transpose()[I]

    # Return
    return OGsFiltered, OGsDatasets, Maploc, MapSpecies
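

# Hedged usage sketch (not part of the original module; the data below are hypothetical).
# The Map table is a list of rows: an optional header row of species names, then one row
# per orthogroup with the OG ID in the first column (when OGsFirstColMap=True) and the
# delimiter-separated gene names of each species in the remaining columns.
#
#   Map_example = [
#       ['OG_ID', 'SpeciesA', 'SpeciesB'],
#       ['OG0000000', 'geneA1, geneA2', 'geneB1'],
#       ['OG0000001', 'geneA3', 'geneB2, geneB3'],
#   ]
#   Genes_example = [np.array(['geneA1', 'geneA3'], dtype=object),   # dataset 1 (SpeciesA)
#                    np.array(['geneB1', 'geneB2'], dtype=object)]   # dataset 2 (SpeciesB)
#   OGsFiltered, OGsDatasets, MapNew, MapSpecies = mapGenesToCommonIDs(
#       Genes_example, Map_example, mapheader=True, OGsFirstColMap=True)
#   # OGsDatasets[l][g] is the OG ID of gene g in dataset l ('' if the gene is unmapped).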


def clusters_genes_Species(B, OGs, Map, MapSpecies):
    Nsp = len(MapSpecies)  # Number of species
    K = B.shape[1]  # Number of clusters

    # Find flattened genes in species
    flatGenesInSpecies = [[ds.flattenAList(Map[B[:, k], sp]) for k in range(K)]
                          for sp in range(Nsp)]  # Nsp x K lists

    # Prepare the results object
    Csizes = [[len(sp_k_genes) for sp_k_genes in sp_genes] for sp_genes in flatGenesInSpecies]  # Nsp x K
    maxCsizes = [np.max(csizes_sp) for csizes_sp in Csizes]  # Nsp x 1
    res = np.array([None] * Nsp, dtype=object)
    resFrames = np.array([None] * Nsp, dtype=object)

    # Fill the results object, species by species
    for sp in range(Nsp):
        restmp = np.array(np.empty([maxCsizes[sp], K], dtype=str), dtype=object)
        header = np.array([None] * K, dtype=object).reshape([1, K])
        for k in range(K):
            restmp[0:Csizes[sp][k], k] = flatGenesInSpecies[sp][k]
            restmp[Csizes[sp][k]:, k] = ''
            header[0, k] = 'C{0} ({1} genes)'.format(k, Csizes[sp][k])
        res[sp] = np.array(np.concatenate((header, restmp), axis=0), dtype=str)
        resFrames[sp] = pd.DataFrame(data=res[sp], columns=None, index=None, dtype=str)

    return resFrames
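

# Hedged usage sketch (illustrative only; B_example is hypothetical). B is a boolean
# (number of OGs) x (number of clusters) membership matrix over the OGs, and MapNew and
# MapSpecies are the outputs of mapGenesToCommonIDs / calculateGDMandUpdateDatasets:
#
#   K = 2
#   B_example = np.zeros((len(OGsFiltered), K), dtype=bool)
#   B_example[0, 0] = True    # first OG assigned to cluster C0
#   B_example[1, 1] = True    # second OG assigned to cluster C1
#   frames = clusters_genes_Species(B_example, OGsFiltered, MapNew, MapSpecies)
#   frames[0].to_csv('clusters_SpeciesA.tsv', sep='\t', index=False, header=False)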


def calculateGDMandUpdateDatasets(X, Genes, Map=None, mapheader=True, OGsFirstColMap=True,
                                  delimGenesInMap='\\W+', OGsIncludedIfAtLeastInDatasets=1):
    """Map genes to common IDs, build the gene-dataset membership (GDM) matrix, and collapse each
    dataset so that its rows correspond to the common IDs.

    Returns (Xnew, GDM, GDMall, OGs, MapNew, MapSpecies).
    """
    Xloc = ds.listofarrays2arrayofarrays(X)
    Genesloc = deepcopy(Genes)
    if Map is None:
        OGsDatasets = deepcopy(Genes)
        OGs = np.unique(ds.flattenAList(OGsDatasets))  # Unique list of genes (or mapped genes)
        MapNew = None
        MapSpecies = None
    else:
        (OGs, OGsDatasets, MapNew, MapSpecies) = mapGenesToCommonIDs(Genes, Map, mapheader,
                                                                     OGsFirstColMap, delimGenesInMap)

    L = len(Genesloc)  # Number of datasets
    # Ng = len(OGs)  # Number of unique genes

    GDMall = np.transpose([np.in1d(OGs, gs) for gs in OGsDatasets])  # GDM: (Ng)x(L) boolean

    # Exclude OGs that do not exist in at least (OGsIncludedIfAtLeastInDatasets) datasets
    IncludedOGs = np.sum(GDMall, axis=1) >= OGsIncludedIfAtLeastInDatasets
    GDM = GDMall[IncludedOGs]
    OGs = OGs[IncludedOGs]
    if MapNew is not None:
        MapNew = MapNew[IncludedOGs]

    Ngs = np.sum(GDM, axis=0)  # Numbers of unique mapped genes in each dataset

    Xnew = np.array([None] * L, dtype=object)
    GenesDatasets = np.array([None] * L, dtype=object)
    for l in range(L):
        arelogs = arelogs_function(Xloc[l])
        # arelogs = np.nansum(abs(Xloc[l][~isnan(Xloc[l])]) < 30) > 0.98 * ds.numel(Xloc[l][~isnan(Xloc[l])])  # More than 98% of values are below 30.0
        d = Xloc[l].shape[1]  # Number of dimensions (samples) in this dataset
        Xnew[l] = np.zeros([Ngs[l], d], dtype=float)
        GenesDatasets[l] = np.empty(Ngs[l], dtype=object)
        OGsInThisDS = OGs[GDM[:, l]]  # Unique OGs in this dataset
        # TODO: Optimise the code below by exploiting ds.findArrayInSubArraysOfAnotherArray1D
        # (as in mapGenesToCommonIDs above)
        for ogi in range(len(OGsInThisDS)):
            og = OGsInThisDS[ogi]
            if arelogs:
                # Log-scale data: sum the genes mapped to this OG in linear space, then re-log
                Xnew[l][ogi] = np.log2(np.sum(np.power(2.0, Xloc[l][np.in1d(OGsDatasets[l], og)]), axis=0))
            else:
                Xnew[l][ogi] = np.sum(Xloc[l][np.in1d(OGsDatasets[l], og)], axis=0)
            GenesDatasets[l][ogi] = ds.concatenateStrings(Genesloc[l][np.in1d(OGsDatasets[l], og)])

    return Xnew, GDM, GDMall, OGs, MapNew, MapSpecies
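

# Hedged usage sketch (illustrative only; the variable names are hypothetical). With no Map,
# genes shared between datasets by name define the GDM directly:
#
#   X_example = [np.array([[1.0, 2.0], [3.0, 4.0]]),   # dataset 1: 2 genes x 2 samples
#                np.array([[5.0, 6.0, 7.0]])]          # dataset 2: 1 gene x 3 samples
#   Genes_example = [np.array(['geneA', 'geneB'], dtype=object),
#                    np.array(['geneB'], dtype=object)]
#   Xnew, GDM, GDMall, OGs, MapNew, MapSpecies = calculateGDMandUpdateDatasets(
#       X_example, Genes_example, Map=None, OGsIncludedIfAtLeastInDatasets=1)
#   # GDM is a (number of unique gene IDs) x (number of datasets) boolean matrix, and
#   # Xnew[l] holds the values of dataset l with rows matching GDM[:, l].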