def _sortData(self):
    """Sort read pairs by (chrms1, cuts1, chrms2, cuts2), first swapping the
    two sides of each pair so that side 1 never sorts after side 2."""
    if not hasattr(self, "dataSorted"):
        tmpfil = self.make_tempfile()
        mydict = h5dict(tmpfil, 'w')
        data = mydict.add_empty_dataset("sortedData", (self.N,), mydtype)
        tmp = mydict.add_empty_dataset("trash", (self.N,), mydtype)
        code = dedent("""
        a = np.empty(len(chrms1), dtype=mydtype)
        # Canonicalize each pair: swap sides whenever side 1 sorts after
        # side 2 by (chromosome, position).
        mask = (chrms1 > chrms2) | ((chrms1 == chrms2) & (cuts1 > cuts2))
        chrms2[mask], chrms1[mask] = chrms1[mask].copy(), chrms2[mask].copy()
        cuts1[mask], cuts2[mask] = cuts2[mask].copy(), cuts1[mask].copy()
        strands1[mask], strands2[mask] = strands2[mask].copy(), strands1[mask].copy()
        a["chrms1"] = chrms1
        a["pos1"] = cuts1
        a["chrms2"] = chrms2
        a["pos2"] = cuts2
        a["strands1"] = strands1
        a["strands2"] = strands2
        """)
        self.evaluate(expression=code,
                      internalVariables=["chrms1", "chrms2", "cuts1", "cuts2",
                                         "strands1", "strands2"],
                      constants={"np": np, "mydtype": mydtype},
                      outVariable=("a", data))
        # Out-of-core sort of the packed records; mydtypeSorter and
        # searchsorted are the record-aware helpers defined at module level.
        externalMergeSort(data, tmp, sorter=mydtypeSorter,
                          searchsorted=searchsorted,
                          chunkSize=max(150000000, self.chunksize))
        sdata = mydict.get_dataset("sortedData")

        # Write the sorted records back into the per-field datasets.
        c1 = self.h5dict.get_dataset("chrms1")
        c2 = self.h5dict.get_dataset("chrms2")
        p1 = self.h5dict.get_dataset("cuts1")
        p2 = self.h5dict.get_dataset("cuts2")
        s1 = self.h5dict.get_dataset("strands1")
        s2 = self.h5dict.get_dataset("strands2")
        for start, end in self._getChunks():
            chunk = sdata[start:end]
            c1[start:end] = chunk["chrms1"]
            c2[start:end] = chunk["chrms2"]
            p1[start:end] = chunk["pos1"]
            p2[start:end] = chunk["pos2"]
            s1[start:end] = chunk["strands1"]
            s2[start:end] = chunk["strands2"]

        self.dataSorted = True
        del mydict
        os.remove(tmpfil)
        gc.collect()
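
# A minimal, self-contained sketch of the pair-canonicalization step used in
# _sortData above, runnable on plain numpy arrays. The function name and all
# data in it are illustrative only and are not part of this module.
def _demo_canonicalize_pairs():
    import numpy as np
    chrms1 = np.array([2, 1, 1], dtype=np.int8)
    chrms2 = np.array([1, 1, 3], dtype=np.int8)
    cuts1 = np.array([500, 900, 100], dtype=np.int64)
    cuts2 = np.array([300, 200, 400], dtype=np.int64)
    # A pair is flipped when side 1 sorts after side 2 by
    # (chromosome, position), exactly as in the dedent() block above.
    mask = (chrms1 > chrms2) | ((chrms1 == chrms2) & (cuts1 > cuts2))
    chrms2[mask], chrms1[mask] = chrms1[mask].copy(), chrms2[mask].copy()
    cuts1[mask], cuts2[mask] = cuts2[mask].copy(), cuts1[mask].copy()
    # Every pair now satisfies (chrms1, cuts1) <= (chrms2, cuts2), so a
    # single sort on side 1 groups both orientations of the same contact.
    assert ((chrms1 < chrms2) | ((chrms1 == chrms2) & (cuts1 <= cuts2))).all()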
def filterDuplicates(self, chunkSize=50000000):
    """Remove duplicate read pairs: two pairs are duplicates when both sides
    map to the same (chromosome, cut site), in either orientation."""
    if self.N > 200000000:
        mode = "hdd"
    else:
        mode = "ram"
    if mode == "ram":
        # Pack each side into one int64 key: chrm * fragIDmult + cut.
        dups = np.zeros((self.N, 2), dtype="int64", order="C")
        dups[:, 0] = self.chrms1
        dups[:, 0] *= self.fragIDmult
        dups[:, 0] += self.cuts1
        dups[:, 1] = self.chrms2
        dups[:, 1] *= self.fragIDmult
        dups[:, 1] += self.cuts2
        dups.sort(axis=1)  # make the key pair orientation-independent
        dups.shape = (self.N * 2,)
        # Convert each pair of int64 keys to a single 16-byte string so
        # uniqueness is checked on whole pairs at once.
        strings = dups.view("|S16")
        uids = uniqueIndex(strings)
        del strings, dups
        stay = np.zeros(self.N, bool)
        stay[uids] = True  # indices of unique DS elements
        del uids
    else:
        # Out-of-core variant: build 24-byte records (key1, key2, index),
        # external-merge-sort them, then scan sorted chunks for runs of
        # equal keys.
        tmpfil = self.make_tempfile()
        a = h5dict(tmpfil)
        a.add_empty_dataset("duplicates", (self.N,), dtype="|S24")
        a.add_empty_dataset("temp", (self.N,), dtype="|S24")
        dset = a.get_dataset("duplicates")
        tempdset = a.get_dataset("temp")
        code = dedent("""
        tmp = np.array(chrms1, dtype=np.int64) * fragIDmult + cuts1
        tmp2 = np.array(chrms2, dtype=np.int64) * fragIDmult + cuts2
        newarray = np.zeros((len(tmp), 3), dtype=np.int64)
        newarray[:, 0] = tmp
        newarray[:, 1] = tmp2
        newarray[:, :2].sort(axis=1)
        newarray[:, 2] = np.arange(start, end, dtype=np.int64)
        newarray.shape = (3 * len(tmp),)
        a = np.array(newarray.view("|S24"))
        assert len(a) == len(chrms1)
        """)
        self.evaluate(code, ["chrms1", "cuts1", "chrms2", "cuts2"],
                      constants={"np": np, "fragIDmult": self.fragIDmult},
                      outVariable=("a", dset))
        stay = np.zeros(self.N, bool)
        numutils.externalMergeSort(dset, tempdset, chunkSize=chunkSize)
        # Overlap consecutive chunks by one record so duplicate runs that
        # span a chunk boundary are still compared.
        bins = list(range(0, self.N - 1000, self.chunksize)) + [self.N - 1]
        for start, end in zip(bins[:-1], bins[1:]):
            curset = dset[start:end + 1]
            curset = curset.view(dtype=np.int64)
            curset.shape = (len(curset) // 3, 3)
            # A record is the last of its duplicate group when it differs
            # from the record that follows it.
            unique = ((curset[:-1, 0] != curset[1:, 0])
                      | (curset[:-1, 1] != curset[1:, 1]))
            # Was curset[:, 2][unique]: a boolean mask shorter than the
            # indexed axis is rejected by modern numpy, so index exactly
            # the rows the mask covers.
            stay[curset[:-1, 2][unique]] = True
            if end == self.N - 1:
                stay[curset[-1, 2]] = True  # the final record always stays
        del a
        os.remove(tmpfil)
    self.metadata["310_DuplicatedRemoved"] = len(stay) - stay.sum()
    self.maskFilter(stay)
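
# A minimal sketch of the duplicate-key construction used in both branches of
# filterDuplicates above. The function name and the toy fragIDmult value are
# illustrative; the real multiplier comes from self.fragIDmult and must exceed
# every cut position so the packed keys cannot collide.
def _demo_duplicate_keys():
    import numpy as np
    fragIDmult = 10 ** 9  # hypothetical value, for the demo only
    chrms1 = np.array([0, 1, 0], dtype=np.int64)
    cuts1 = np.array([100, 200, 100], dtype=np.int64)
    chrms2 = np.array([1, 0, 1], dtype=np.int64)
    cuts2 = np.array([300, 50, 300], dtype=np.int64)
    keys = np.empty((len(chrms1), 2), dtype=np.int64)
    keys[:, 0] = chrms1 * fragIDmult + cuts1  # pack (chrm, cut) into one int64
    keys[:, 1] = chrms2 * fragIDmult + cuts2
    keys.sort(axis=1)  # make the key pair orientation-independent
    # View each key pair as one 16-byte string so np.unique compares rows.
    strings = np.ascontiguousarray(keys).view("|S16").ravel()
    _, uids = np.unique(strings, return_index=True)
    stay = np.zeros(len(chrms1), bool)
    stay[uids] = True
    # Read 2 duplicates read 0 (same two sides), so only it is dropped.
    assert stay.tolist() == [True, True, False]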