Example #1
0
    def _sortData(self):
        """Sort all read pairs in place using an out-of-core merge sort.

        Each pair is first normalised so that side 1 is the "smaller" end
        (compared by chromosome, then by cut position); the packed records
        are sorted through a temporary HDF5 file and the sorted fields are
        written back into ``self.h5dict`` chunk by chunk.  A ``dataSorted``
        attribute marks completion, so repeated calls are no-ops.
        """
        if not hasattr(self, "dataSorted"):
            # Scratch HDF5 file: "sortedData" holds the packed records and
            # "trash" is the same-sized spare buffer the external merge
            # sort needs.
            tmpfil = self.make_tempfile()
            mydict = h5dict(tmpfil, 'w')
            data = mydict.add_empty_dataset("sortedData", (self.N, ), mydtype)
            tmp = mydict.add_empty_dataset("trash", (self.N, ), mydtype)
            # Executed chunk-wise by self.evaluate: wherever side 1 compares
            # greater than side 2 (chromosome first, then cut position), the
            # two sides are swapped; everything is then packed into one
            # structured array `a` of dtype `mydtype`.
            code = dedent("""
            a = np.empty(len(chrms1), dtype = mydtype)
            mask = (chrms1 > chrms2) | ( (chrms1 == chrms2) & (cuts1 > cuts2))

            chrms2[mask],chrms1[mask] = chrms1[mask].copy(), chrms2[mask].copy()
            cuts1[mask],cuts2[mask] = cuts2[mask].copy(), cuts1[mask].copy()
            strands1[mask],strands2[mask] = strands2[mask].copy(),strands1[mask].copy()

            a["chrms1"] = chrms1
            a["pos1"] = cuts1
            a["chrms2"] = chrms2
            a["pos2"] = cuts2
            a["strands1"] = strands1
            a["strands2"] = strands2
            """)
            self.evaluate(expression=code,
                          internalVariables=[
                              "chrms1", "chrms2", "cuts1", "cuts2", "strands1",
                              "strands2"
                          ],
                          constants={
                              "np": np,
                              "mydtype": mydtype
                          },
                          outVariable=("a", data))

            # Out-of-core merge sort of the packed records; mydtypeSorter and
            # searchsorted are presumably module-level helpers specialised
            # for `mydtype` -- they are defined outside this view.
            externalMergeSort(data,
                              tmp,
                              sorter=mydtypeSorter,
                              searchsorted=searchsorted,
                              chunkSize=max(150000000, self.chunksize))
            sdata = mydict.get_dataset("sortedData")

            # Destination datasets in the main storage.
            c1 = self.h5dict.get_dataset("chrms1")
            c2 = self.h5dict.get_dataset("chrms2")
            p1 = self.h5dict.get_dataset("cuts1")
            p2 = self.h5dict.get_dataset("cuts2")
            s1 = self.h5dict.get_dataset("strands1")
            s2 = self.h5dict.get_dataset("strands2")

            # Unpack the sorted records back into the per-field datasets,
            # one chunk at a time to bound memory use.
            for start, end in self._getChunks():
                data = sdata[start:end]
                c1[start:end] = data["chrms1"]
                c2[start:end] = data["chrms2"]
                p1[start:end] = data["pos1"]
                p2[start:end] = data["pos2"]
                s1[start:end] = data["strands1"]
                s2[start:end] = data["strands2"]
            self.dataSorted = True
            # Drop the h5dict handle before deleting its backing file.
            del mydict
            os.remove(tmpfil)
            gc.collect()
Example #2
0
    def filterDuplicates(self, chunkSize: int = 50000000):
        """Remove duplicate read pairs, keeping one representative of each
        group of pairs whose two ends encode to the same (chromosome, cut
        position) id pair, irrespective of side order.

        Parameters
        ----------
        chunkSize : int
            Chunk size handed to ``numutils.externalMergeSort`` in the
            out-of-core ("hdd") path; ignored in the in-RAM path.
        """
        # Small datasets are deduplicated entirely in RAM; very large ones
        # (> 2e8 pairs) go through a temporary HDF5 file instead.
        if self.N > 200000000:
            mode = "hdd"
        else:
            mode = "ram"

        if mode == "ram":
            # Encode each end as a single int64 fragment id
            # (chrms * fragIDmult + cuts), then sort the two ids within each
            # row so a pair and its mirror image get the same encoding.
            dups = np.zeros((self.N, 2), dtype="int64", order="C")
            dups[:, 0] = self.chrms1
            dups[:, 0] *= self.fragIDmult
            dups[:, 0] += self.cuts1
            dups[:, 1] = self.chrms2
            dups[:, 1] *= self.fragIDmult
            dups[:, 1] += self.cuts2
            dups.sort(axis=1)
            # View each 16-byte row (two int64s) as one fixed-width string so
            # duplicates can be found in a single unique pass.
            dups.shape = (self.N * 2)
            strings = dups.view("|S16")
            # Converting two indices to a single string to run unique
            uids = uniqueIndex(strings)
            del strings, dups
            stay = np.zeros(self.N, bool)
            stay[uids] = True  # indexes of unique DS elements
            del uids
        else:
            # Out-of-core path: pack (id1, id2, original row index) into one
            # 24-byte string per pair, sort externally, then mark rows that
            # differ from their successor as unique.
            tmpfil = self.make_tempfile()
            a = h5dict(tmpfil)
            a.add_empty_dataset("duplicates", (self.N, ), dtype="|S24")
            a.add_empty_dataset("temp", (self.N, ), dtype="|S24")
            dset = a.get_dataset("duplicates")
            tempdset = a.get_dataset("temp")
            # Executed chunk-wise by self.evaluate; `start` and `end` are
            # presumably injected by evaluate for the current chunk -- the
            # third column records each row's global index so the sorted
            # records can be mapped back to `stay`.
            code = dedent("""
            tmp = np.array(chrms1, dtype=np.int64) * fragIDmult + cuts1
            tmp2 = np.array(chrms2, dtype=np.int64) * fragIDmult + cuts2
            newarray = np.zeros((len(tmp),3), dtype = np.int64)
            newarray[:,0] = tmp
            newarray[:,1] = tmp2
            newarray[:,:2].sort(axis=1)
            newarray[:,2] = np.arange(start, end, dtype=np.int64)
            newarray.shape = (3*len(tmp))
            a = np.array(newarray.view("|S24"))
            assert len(a) == len(chrms1)
            """)
            self.evaluate(code, ["chrms1", "cuts1", "chrms2", "cuts2"],
                          constants={
                              "np": np,
                              "fragIDmult": self.fragIDmult
                          },
                          outVariable=("a", dset))
            stay = np.zeros(self.N, bool)
            numutils.externalMergeSort(dset, tempdset, chunkSize=chunkSize)
            # Scan the sorted records in chunks that overlap by one element
            # (dset[start:end + 1]) so neighbours across a chunk boundary are
            # still compared.
            bins = list(range(0, self.N - 1000, self.chunksize)) + [self.N - 1]
            for start, end in zip(bins[:-1], bins[1:]):
                curset = dset[start:end + 1]
                # Reinterpret the 24-byte strings as (id1, id2, index) rows.
                curset = curset.view(dtype=np.int64)
                curset.shape = (len(curset) // 3, 3)
                # `+` on boolean arrays is logical OR: a row is "unique" when
                # it differs from the NEXT row in either id, i.e. the last
                # entry of every duplicate run is the one kept.
                # NOTE(review): `unique` has one element fewer than
                # curset[:, 2]; indexing with a shorter boolean mask relies
                # on deprecated NumPy behaviour (an error in modern NumPy) --
                # confirm against the NumPy version this code targets.
                unique = (curset[:-1, 0] != curset[1:, 0]) + (curset[:-1, 1] !=
                                                              curset[1:, 1])
                stay[curset[:, 2][unique]] = True
                if end == self.N - 1:
                    # The final record has no successor; keep it explicitly.
                    stay[curset[-1, 2]] = True
            # Drop the h5dict handle before deleting its backing file.
            del a
            os.remove(tmpfil)

        self.metadata["310_DuplicatedRemoved"] = len(stay) - stay.sum()
        self.maskFilter(stay)