Beispiel #1
0
    def buildAllHeatmap(self, resolution, countDiagonalReads = 'Once'):
        """
        Build the symmetric all-by-all heatmap at the given resolution.

        Each read pair is mapped to a flat cell index
        ``bin1 * numBins + bin2`` where ``binX`` is the genome-wide bin of
        side X (chromosome start bin + midpoint // resolution); the cells
        are counted with ``np.bincount`` and the matrix is symmetrized.

        Parameters
        ----------
        resolution : int
            Bin size; passed to ``self.genome.setResolution`` and used to
            turn read midpoints into bin offsets.
        countDiagonalReads : "once" or "twice" (case-insensitive)
            Symmetrization doubles the main diagonal. "once" halves it
            again, "twice" keeps the doubled values.

        Returns
        -------
        numpy.ndarray
            ``(numBins, numBins)`` array of contact counts.

        Raises
        ------
        ValueError
            If ``countDiagonalReads`` is not "once"/"twice", or if a read
            maps past the last bin of the genome.
        """
        # Validate up front so a typo does not cost a full heatmap build.
        diagonalMode = countDiagonalReads.lower()
        if diagonalMode not in ("once", "twice"):
            raise ValueError("Bad value for countDiagonalReads")

        # 8 bytes per record + heatmap
        self.genome.setResolution(resolution)
        numBins = self.genome.numBins
        binStarts = self.genome.chrmStartsBinCont

        label = np.asarray(binStarts[self.chrms1], dtype="int64")
        # Explicit floor division: same result as the Python 2 integer "/"
        # and still an integer under Python 3.
        label += np.asarray(self.mids1 // resolution, dtype="int64")
        label *= numBins
        label += binStarts[self.chrms2]
        label += np.asarray(self.mids2 // resolution, dtype="int64")

        counts = np.bincount(label, minlength=numBins ** 2)
        if len(counts) > numBins ** 2:
            # ValueError exists on both Python 2 and 3 (StandardError does
            # not) and is still caught by Python 2 "except StandardError".
            raise ValueError("\nHeatMap exceed length of the genome!")

        counts = counts.reshape(numBins, numBins)
        # In-place symmetrization (avoids allocating counts + counts.T).
        # Note that this doubles every diagonal element.
        for i in range(len(counts)):
            counts[i, i:] += counts[i:, i]
            counts[i:, i] = counts[i, i:]

        if diagonalMode == "once":
            # Diagonal entries are even after doubling, so // is exact.
            fillDiagonal(counts, np.diag(counts) // 2)

        return counts
Beispiel #2
0
    def buildAllHeatmap(self, resolution):
        """
        Build the symmetric all-by-all heatmap, processing reads in chunks.

        Reads are fetched chunk by chunk via ``self._getChunks`` /
        ``self._getVector``; each chunk is binned into a flat
        ``bin1 * numBins + bin2`` index, counted with ``np.bincount`` and
        accumulated. The sum is then symmetrized and the (doubled) main
        diagonal is halved so diagonal reads are counted once.

        Parameters
        ----------
        resolution : int
            Bin size; passed to ``self.genome.setResolution`` and used to
            turn read midpoints into bin offsets.

        Returns
        -------
        numpy.ndarray
            ``(numBins, numBins)`` array of contact counts (all zeros when
            there are no chunks to process).

        Raises
        ------
        ValueError
            If a read maps past the last bin of the genome.
        """
        # Resolution does not change between chunks; set it once.
        self.genome.setResolution(resolution)
        numBins = self.genome.numBins
        binStarts = self.genome.chrmStartsBinCont

        # Explicit sentinel instead of "try: heatmap += ... except:" --
        # the bare except would also swallow genuine errors and silently
        # replace the accumulated heatmap with the current chunk.
        heatmap = None
        for start, end in self._getChunks(30000000):
            # 8 bytes per record + heatmap
            label = np.asarray(
                binStarts[self._getVector("chrms1", start, end)],
                dtype="int64")
            label += (self._getVector("mids1", start, end) //
                      resolution).astype(np.int64)
            label *= numBins
            label += binStarts[self._getVector("chrms2", start, end)]
            label += (self._getVector("mids2", start, end) //
                      resolution).astype(np.int64)

            counts = np.bincount(label, minlength=numBins**2)
            if len(counts) > numBins**2:
                # StandardError does not exist on Python 3; ValueError is
                # available everywhere and is a StandardError on Python 2.
                raise ValueError("\nHeatMap exceed length of the genome!")
            counts = counts.reshape(numBins, numBins)

            if heatmap is None:
                heatmap = counts
            else:
                heatmap += counts

        if heatmap is None:
            # No chunks at all: return an empty map instead of failing on
            # an unbound local.
            heatmap = np.zeros((numBins, numBins), dtype=np.intp)

        # In-place symmetrization; this doubles the main diagonal.
        for i in range(len(heatmap)):
            heatmap[i, i:] += heatmap[i:, i]
            heatmap[i:, i] = heatmap[i, i:]
        # Diagonal entries are even after doubling, so // is exact and
        # keeps the values integral under Python 3.
        fillDiagonal(heatmap, np.diag(heatmap) // 2)

        return heatmap
Beispiel #3
0
def setMatrilocScaling(inMatriloc, alpha=0):
    """
    Return a rescaled, symmetrized copy of ``inMatriloc``.

    For every diagonal offset ``i`` the diagonal is divided by ``Pc[i]``
    (first value returned by ``getLogBinnedScaling(..., isCircular=True)``)
    and by ``i ** (-alpha)``. The result is rebuilt from its upper
    triangle as ``triu + triu.T``; the input array is not modified.

    NOTE(review): ``triu + triu.T`` adds the main diagonal to itself, and
    for offset 0 with ``alpha > 0`` the term ``0 ** (-alpha)`` raises
    ZeroDivisionError -- confirm both are intended.
    """
    rescaled = inMatriloc.copy()
    size = len(rescaled)
    expected, _bin_mids = getLogBinnedScaling(rescaled, isCircular=True)

    for offset in range(size):
        normalized = np.diagonal(rescaled, offset) / expected[offset]
        normalized = normalized / (offset**(-alpha))
        fillDiagonal(rescaled, normalized, offset)

    upper = np.triu(rescaled)
    return upper + upper.T
Beispiel #4
0
    def saveByChromosomeHeatmap(self,
                                filename,
                                resolution,
                                gInfo,
                                includeTrans=False):
        """
        Save chromosome-by-chromosome heatmaps into an h5dict.

        For every chromosome the reads whose first side lies on it are
        located with ``h5dictBinarySearch`` on the "chrms1"/"cuts1"
        datasets, binned at ``resolution`` and counted per partner
        chromosome. Maps are stored under keys ``"<chrom> <chrom2>"``;
        cis maps are symmetrized with their diagonal halved afterwards.
        ``resolution`` and ``gInfo`` are stored under the keys
        ``'resolution'`` and ``'genomeInformation'``.

        Parameters
        ----------
        filename : str
            Passed to ``h5dict`` to open the output container.
        resolution : int
            Bin size used for the heatmaps.
        gInfo : object
            Stored unchanged under ``'genomeInformation'``.
        includeTrans : bool, optional
            When equal to False only cis maps are written.
        """
        self.genome.setResolution(resolution)

        output = h5dict(filename)

        for chrom in range(self.genome.chrmCount):
            chromDataset = self.h5dict.get_dataset("chrms1")
            cutDataset = self.h5dict.get_dataset("cuts1")
            # Index range of all records whose first side is on `chrom`.
            first = h5dictBinarySearch(chromDataset, cutDataset,
                                       (chrom, -1), "left")
            last = h5dictBinarySearch(chromDataset, cutDataset,
                                      (chrom, 999999999), "right")

            chroms1 = self._getVector("chrms1", first, last)
            chroms2 = self._getVector("chrms2", first, last)
            bins1 = np.array(
                self._getVector("mids1", first, last) // resolution,
                dtype=np.int32)
            bins2 = np.array(
                self._getVector("mids2", first, last) // resolution,
                dtype=np.int32)

            # Sanity check on the binary-search window.
            assert (chroms1 == chrom).all()

            # Group records by partner chromosome for searchsorted below.
            order = np.argsort(chroms2)
            chroms2, bins1, bins2 = chroms2[order], bins1[order], bins2[order]

            for chrom2 in range(chrom, self.genome.chrmCount):
                isCis = chrom2 == chrom
                if (includeTrans == False) and not isCis:
                    continue

                lo = np.searchsorted(chroms2, chrom2, "left")
                hi = np.searchsorted(chroms2, chrom2, "right")

                rows = self.genome.chrmLensBin[chrom]
                cols = self.genome.chrmLensBin[chrom2]

                # Flat row-major cell index: bin1 * cols + bin2.
                flat = np.array(bins1[lo:hi], "int64")
                flat *= cols
                flat += bins2[lo:hi]

                counts = np.bincount(flat, minlength=rows * cols)
                matrix = counts.reshape((rows, -1))
                if isCis:
                    matrix = matrix + matrix.T
                    fillDiagonal(matrix, np.diag(matrix).copy() / 2)
                output["%d %d" % (chrom, chrom2)] = matrix

        output['resolution'] = resolution
        output['genomeInformation'] = gInfo
def saveAllDatasets():
    """
    An example which saves the heatmap in different colormaps.
    It was used to choose the colormap out of the ones we created.

    Reads the module-level ``names`` and ``datasets`` sequences; for each
    (name, colormap) pair, in reverse order, and each dataset it loads
    ``data/dumped/<dataset>-10k_overlap.hm_corrected``, overwrites the
    three central diagonals and saves the plot as a PDF.

    NOTE(review): the "savedHeatmaps" directory is created but the PDFs
    are written to a hard-coded absolute path -- confirm which is wanted.
    """
    if not os.path.exists("savedHeatmaps"):
        os.mkdir("savedHeatmaps")
    # The comma after "blues" was missing, which silently merged the last
    # two entries into the single name "bluesacidblues".
    heatmaps = ["jet", "fall", "blues", "acidblues"]

    # zip() is not subscriptable on Python 3; materialize before reversing.
    for name, heatmap in reversed(list(zip(names, heatmaps))):
        for dataset in datasets:
            hm = "data/dumped/%s-10k_overlap.hm_corrected" % dataset
            plt.figure()
            data = np.loadtxt(hm)
            fillDiagonal(data, np.mean(np.diag(data, 2)) * 1.1, 1)
            fillDiagonal(data, np.mean(np.diag(data, 2)) * 1.1, -1)
            fillDiagonal(data, np.mean(np.diag(data, 2)) * 1.2, 0)
            plt.imshow(data, origin="lower", cmap=heatmap,
                       interpolation="none", vmin=0, vmax=0.035)
            tickPositions = [0, 100, 200, 300, 400]
            tickLabels = ["0", "1Mb", "2Mb", "3Mb", "4Mb"]
            plt.xticks(tickPositions, tickLabels)
            plt.yticks(tickPositions, tickLabels)
            plt.colorbar(orientation="vertical", ticks=[0, 0.01, 0.02, 0.03])
            ax = plt.gca()
            tickLines = ax.get_xticklines() + ax.get_yticklines()
            for i, line in enumerate(tickLines):
                if i % 2 == 1:  # odd indices
                    line.set_visible(False)
            plt.savefig("/home/magus/Dropbox/Caulobacter-chromosome/heatmapsAllFromMax/%s_%s.pdf" % (dataset, name))
            # One figure per (colormap, dataset); release it once saved so
            # open figures do not pile up across the loops.
            plt.close()
Beispiel #6
0
def saveAllDatasets():
    """
    An example which saves the heatmap in different colormaps.
    It was used to choose the colormap out of the ones we created.

    Reads the module-level ``names`` and ``datasets`` sequences; for each
    (name, colormap) pair, in reverse order, and each dataset it loads
    ``data/dumped/<dataset>-10k_overlap.hm_corrected``, overwrites the
    three central diagonals and saves the plot as a PDF.

    NOTE(review): the "savedHeatmaps" directory is created but the PDFs
    are written to a hard-coded absolute path -- confirm which is wanted.
    """
    if not os.path.exists("savedHeatmaps"):
        os.mkdir("savedHeatmaps")
    # The comma after "blues" was missing, which silently merged the last
    # two entries into the single name "bluesacidblues".
    heatmaps = ["jet", "fall", "blues", "acidblues"]

    # zip() is not subscriptable on Python 3; materialize before reversing.
    for name, heatmap in reversed(list(zip(names, heatmaps))):
        for dataset in datasets:
            hm = "data/dumped/%s-10k_overlap.hm_corrected" % dataset
            plt.figure()
            data = np.loadtxt(hm)
            fillDiagonal(data, np.mean(np.diag(data, 2)) * 1.1, 1)
            fillDiagonal(data, np.mean(np.diag(data, 2)) * 1.1, -1)
            fillDiagonal(data, np.mean(np.diag(data, 2)) * 1.2, 0)
            plt.imshow(data,
                       origin="lower",
                       cmap=heatmap,
                       interpolation="none",
                       vmin=0,
                       vmax=0.035)
            tickPositions = [0, 100, 200, 300, 400]
            tickLabels = ["0", "1Mb", "2Mb", "3Mb", "4Mb"]
            plt.xticks(tickPositions, tickLabels)
            plt.yticks(tickPositions, tickLabels)
            plt.colorbar(orientation="vertical", ticks=[0, 0.01, 0.02, 0.03])
            ax = plt.gca()
            tickLines = ax.get_xticklines() + ax.get_yticklines()
            for i, line in enumerate(tickLines):
                if i % 2 == 1:  # odd indices
                    line.set_visible(False)
            plt.savefig(
                "/home/magus/Dropbox/Caulobacter-chromosome/heatmapsAllFromMax/%s_%s.pdf"
                % (dataset, name))
            # One figure per (colormap, dataset); release it once saved so
            # open figures do not pile up across the loops.
            plt.close()
Beispiel #7
0
    def saveByChromosomeHeatmap(self, filename, resolution = 40000,
                                includeTrans = False,
                                countDiagonalReads = "Once"):
        """
        Write per-chromosome-pair heatmaps into an h5dict.

        Building one map per chromosome pair needs less memory than a
        single all-by-all heatmap. Each map is stored under the key
        ``"<chrom> <chrom2>"`` (zero-based chromosome indices separated by
        one space); the resolution is stored under ``'resolution'``.

        Parameters
        ----------
        filename : str
            Filename of the h5dict with the output
        resolution : int
            Resolution to save heatmaps
        includeTrans : bool, optional
            Build inter-chromosomal heatmaps (default: False)
        countDiagonalReads : "once" or "twice"
            How many times to count reads in the diagonal bin

        Raises
        ------
        ValueError
            If ``countDiagonalReads`` is neither "once" nor "twice"
            (case-insensitive).
        """
        diagonalMode = countDiagonalReads.lower()
        if diagonalMode not in ["once", "twice"]:
            raise ValueError("Bad value for countDiagonalReads")

        self.genome.setResolution(resolution)

        binExpression = "a = np.array({0} / {res}, dtype = 'int32')"
        pos1 = self.evaluate(binExpression.format("mids1", res=resolution),
                             "mids1")
        pos2 = self.evaluate(binExpression.format("mids2", res=resolution),
                             "mids2")

        chr1 = self.chrms1
        chr2 = self.chrms2

        output = h5dict(filename)

        # Kept as two separate comparisons on purpose: a value that is
        # neither == True nor == False must behave as in the original.
        withTrans = (includeTrans == True)
        cisOnly = (includeTrans == False)

        for chrom in xrange(self.genome.chrmCount):
            onFirst = (chr1 == chrom)
            onSecond = (chr2 == chrom)
            if withTrans:
                selected = onFirst | onSecond
            else:
                selected = onFirst & onSecond

            # Chromosomes and bin positions of the selected read pairs.
            c1 = chr1[selected]
            c2 = chr2[selected]
            p1 = pos1[selected]
            p2 = pos2[selected]

            if withTrans:
                # Swap sides where only the second side is on `chrom`, so
                # the first side is always on `chrom` afterwards.
                flip = (c2 == chrom) & (c1 != chrom)
                otherChroms, otherBins = c1[flip].copy(), p1[flip].copy()
                c1[flip], p1[flip] = c2[flip], p2[flip]
                c2[flip], p2[flip] = otherChroms, otherBins
                del c1  # every remaining first side is on `chrom`

                # Group by partner chromosome for searchsorted below.
                order = np.argsort(c2)
                c2, p1, p2 = c2[order], p1[order], p2[order]

            for chrom2 in xrange(chrom, self.genome.chrmCount):
                sameChrom = (chrom2 == chrom)
                if cisOnly and not sameChrom:
                    continue

                lo = np.searchsorted(c2, chrom2, "left")
                hi = np.searchsorted(c2, chrom2, "right")

                rows = self.genome.chrmLensBin[chrom]
                cols = self.genome.chrmLensBin[chrom2]

                # Flat row-major cell index: bin1 * cols + bin2.
                flat = np.asarray(p1[lo:hi], "int64")
                flat *= cols
                flat += p2[lo:hi]

                expectedCells = rows * cols
                counts = np.bincount(flat, minlength = expectedCells)
                assert len(counts) == expectedCells

                matrix = counts.reshape((rows, -1))
                if sameChrom:
                    matrix = matrix + matrix.T
                    if diagonalMode == "once":
                        fillDiagonal(matrix, np.diag(matrix).copy() / 2)
                output["%d %d" % (chrom, chrom2)] = matrix

        output['resolution'] = resolution
Beispiel #8
0
    def saveHiResHeatmapWithOverlaps(self,
                                     filename,
                                     resolution=10000,
                                     countDiagonalReads="Once",
                                     maxBinSpawn=10,
                                     chromosomes="all"):
        """
        Creates within-chromosome heatmaps at very high resolution,
        assigning each fragment to all the bins it overlaps with,
        proportional to the area of overlaps.

        One heatmap per chromosome is stored in the h5dict under the key
        "<chrom> <chrom>"; the resolution is stored under 'resolution'.
        Chromosomes without any cis read pair are skipped.

        Parameters
        ----------
        filename : str
            Passed to ``h5dict`` to open the output container.

        resolution : int or str
            Resolution of a heatmap.

        countDiagonalReads : "once" or "twice"
            How many times to count reads in the diagonal bin

        maxBinSpawn : int, optional, not more than 10
            Discard read if it spawns more than maxBinSpawn bins

        chromosomes : "all" or iterable of chromosome indices
            Chromosomes to process; "all" means every chromosome of
            ``self.genome``.

        Raises
        ------
        ValueError
            If countDiagonalReads is neither "once" nor "twice"
            (case-insensitive). NOTE(review): this is only checked after
            the first chromosome's heatmap has been computed.
        """
        # NOTE(review): scipy.weave is a Python-2-era module that, as far
        # as I know, is no longer shipped with SciPy -- confirm the target
        # environment still provides it.
        from scipy import weave

        tosave = h5dict(filename)

        self.genome.setResolution(resolution)

        if chromosomes == "all":
            chromosomes = range(self.genome.chrmCount)

        for chrom in chromosomes:
            # Read pairs with both sides on this chromosome.
            mask = (self.chrms1 == chrom) * (self.chrms2 == chrom)

            if mask.sum() == 0:
                continue

            # Fragment extents (midpoint -/+ half the fragment length)
            # expressed in fractional bin units.
            low1 = (self.mids1[mask] -
                    self.fraglens1[mask] / 2) / float(resolution)

            high1 = (self.mids1[mask] +
                     self.fraglens1[mask] / 2) / float(resolution)

            low2 = (self.mids2[mask] -
                    self.fraglens2[mask] / 2) / float(resolution)

            high2 = (self.mids2[mask] +
                     self.fraglens2[mask] / 2) / float(resolution)

            del mask

            N = len(low1)

            heatmapSize = int(self.genome.chrmLensBin[chrom])

            heatmap = np.zeros((heatmapSize, heatmapSize),
                               dtype="float64",
                               order="C")

            # C kernel, per read: skip it if either fragment is longer
            # than maxBinSpawn bins; otherwise compute the share of each
            # fragment falling into each bin it touches (vector1/vector2)
            # and add the product of the shares to the matching cells.
            # NOTE(review): no check that b1 + i / b2 + j stay inside
            # [0, heatmapSize) is visible here -- confirm fragments cannot
            # extend past the chromosome ends.
            # NOTE(review): the work vectors hold 100 entries but only the
            # first 10 are zeroed per read; every entry that is read
            # appears to be assigned first -- confirm.
            code = """
            double vector1[100];
            double vector2[100];

            for (int readNum = 0;  readNum < N; readNum++)
            {
                for (int i=0; i<10; i++)
                {
                    vector1[i] = 0;
                    vector2[i] = 0;
                }

                double l1 = low1[readNum];
                double l2 = low2[readNum];
                double h1 = high1[readNum];
                double h2 = high2[readNum];


                if ((h1 - l1) > maxBinSpawn) continue;
                if ((h2 - l2) > maxBinSpawn) continue;

                int binNum1 = ceil(h1) - floor(l1);
                int binNum2 = ceil(h2) - floor(l2);
                double binLen1 = h1 - l1;
                double binLen2 = h2 - l2;

                int b1 = floor(l1);
                int b2 = floor(l2);

                if (binNum1 == 1)
                    vector1[0] = 1.;
                else
                    {
                    vector1[0] = (ceil(l1 + 0.00001) - l1) / binLen1;
                    for (int t = 1; t< binNum1 - 1; t++)
                        {vector1[t] = 1. / binLen1;}
                    vector1[binNum1 - 1] = (h1 - floor(h1)) / binLen1;
                    }

                if (binNum2 == 1) vector2[0] = 1.;

                else
                    {
                    vector2[0] = (ceil(l2 + 0.0001) - l2) / binLen2;
                    for (int t = 1; t< binNum2 - 1; t++)
                        {vector2[t] = 1. / binLen2;}
                    vector2[binNum2 - 1] = (h2 - floor(h2)) / binLen2;
                    }

                for (int i = 0; i< binNum1; i++)
                    {
                    for (int j = 0; j < binNum2; j++)
                        {
                        heatmap[(b1 + i) * heatmapSize +  b2 + j] += vector1[i] * vector2[j];
                        }
                    }
                }
                
            """
            weave.inline(code, [
                'low1',
                "high1",
                "low2",
                "high2",
                "N",
                "heatmap",
                "maxBinSpawn",
                "heatmapSize",
            ],
                         extra_compile_args=['-march=native  -O3 '],
                         support_code=r"""
                        #include <stdio.h>
                        #include <math.h>""")
            del high1, low1, high2, low2

            # In-place symmetrization: add the lower triangle onto the
            # upper one and mirror it back. This doubles the diagonal.
            for i in xrange(len(heatmap)):
                heatmap[i, i:] += heatmap[i:, i]
                heatmap[i:, i] = heatmap[i, i:]

            if countDiagonalReads.lower() == "once":
                # Undo the doubling of the diagonal.
                diag = np.diag(heatmap).copy()
                fillDiagonal(heatmap, diag / 2)
                del diag
            elif countDiagonalReads.lower() == "twice":
                pass
            else:
                raise ValueError("Bad value for countDiagonalReads")
            tosave["{0} {0}".format(chrom)] = heatmap
            tosave.flush()
            # Drop the large per-chromosome array before the next one is
            # allocated.
            del heatmap
            weave.inline("")  # to release all buffers of weave.inline
            import gc
            gc.collect()

        tosave['resolution'] = resolution
Beispiel #9
0
def cis_eig(A, k=3, robust=True, gc=None, classic=False):
    """
    Compute compartment eigenvector on a cis matrix

    Parameters
    ----------
    A : 2D array
        balanced cis contact matrix; non-finite entries are treated as 0
    k : int
        number of eigenvectors to compute; default = 3
    robust : bool
        Clip top 0.1 percentile and smooth first two diagonals
    gc : 1D array, optional
        GC content per bin for choosing and orienting the primary compartment
        eigenvector; not performed if no array is provided
    classic : bool
        Do it old-school

    Returns
    -------
    eigenvalues, eigenvectors
        ``eigenvalues`` has length ``k`` and ``eigenvectors`` has shape
        ``(k, A.shape[0])``. Bins whose column sum is not positive are NaN
        in the eigenvectors. Both outputs are all-NaN when the matrix has
        at most 5 bins (or at most 5 bins with a positive sum), or when
        the ``classic`` iterative correction yields non-finite values.
    """
    A = np.array(A)
    A[~np.isfinite(A)] = 0

    # Bins with no signal are excluded from the decomposition.
    mask = A.sum(axis=0) > 0
    n_bins = A.shape[0]

    def _nan_result():
        # Single place defining the "no result" shape, always in
        # (eigenvalues, eigenvectors) order. The classic branch used to
        # return these two swapped.
        return np.full(k, np.nan), np.full((k, n_bins), np.nan)

    if n_bins <= 5 or mask.sum() <= 5:
        return _nan_result()

    if robust:
        A = np.clip(A, 0, np.percentile(A, 99.9))
        fill_value = np.mean(np.diag(A, 2) * 2)
        for d in [-1, 0, 1]:
            numutils.fillDiagonal(A, fill_value, d)
        # Re-blank the excluded bins once, after all three fills.
        A[~mask, :] = 0
        A[:, ~mask] = 0

    OE = numutils.observedOverExpected(A[mask, :][:, mask])

    if robust:
        OE = np.clip(OE, 0, np.percentile(OE, 99.9))

    if classic:
        OE = numutils.iterativeCorrection(OE)[0]
        if not np.isfinite(OE).all():
            return _nan_result()
        # mean-centered (subtract mean)
        eigvecs_compressed, eigvals = numutils.EIG(OE, k)
    else:
        eigvecs_compressed, eigvals = numutils.EIG((OE - 1.0),
                                                   k,
                                                   subtractMean=False,
                                                   divideByMean=False)

    # Restore full-length eigenvectors: excluded bins stay NaN.
    eigvecs = np.full((k, mask.shape[0]), np.nan)
    for i in range(k):
        eigvecs[i, mask] = eigvecs_compressed[i]

    # Orient and reorder
    eigvals, eigvecs = _orient_eigs(eigvals, eigvecs, gc)

    return eigvals, eigvecs
Beispiel #10
0
def Generate_one_chromosome_file(chrNumb):
    """
    Write the fitHiC contact and fragment tables for one chromosome.

    Loads (or creates and filters) the fragment-level dataset for the
    chromosome, keeps only read pairs with both sides on ``chrNumb``,
    counts contacts between restriction fragments and writes two
    tab-separated text files next to ``o_file``:

    * ``..._chr<N>.contacts.zip``: one line per fragment pair (i < j) with
      a non-zero count: chr, mid_i, chr, mid_j, count.
    * ``..._chr<N>.fragments.zip``: one line per fragment that occurs in
      the contacts file: chr, 0, mid, row sum of counts, 1.

    Despite the ".zip" suffix both files are written as plain text.

    Uses the module-level ``base_out_folder``, ``base_filename``,
    ``genome_db`` and ``maped_reads_filepath``.

    Parameters
    ----------
    chrNumb : int
        Index of the chromosome to export.

    Raises
    ------
    ValueError
        If the per-chromosome fragment indices are inconsistent (length
        mismatch, negative index, or index beyond the number of
        restriction sites).
    """
    o_file = base_out_folder + "fitHiC/i_files/" + base_filename + ".fithic"
    fragment_dataset_filename = (base_out_folder + "fitHiC/i_files/" +
                                 'fragment_dataset_' + base_filename +
                                 '_chr' + str(chrNumb) + '.hdf5')

    if not os.path.isfile(fragment_dataset_filename):
        fragments = fragmentHiC.HiCdataset(filename=fragment_dataset_filename,
                                           genome=genome_db,
                                           maximumMoleculeLength=500,
                                           mode='w')
        fragments.parseInputData(dictLike=maped_reads_filepath, removeSS=True)

        fragments.filterRsiteStart(offset=5)
        fragments.filterDuplicates()
        fragments.filterLarge()
        fragments.filterExtreme(cutH=0.005, cutL=0)
    else:
        fragments = fragmentHiC.HiCdataset(filename=fragment_dataset_filename,
                                           genome=genome_db,
                                           maximumMoleculeLength=500,
                                           mode='a')

    # All messages are printed as one pre-formatted string so the output
    # is identical under the Python 2 print statement and the Python 3
    # print function (hence the doubled spaces).
    chrLabel = genome_db.idx2label[chrNumb]
    print("Filtering, leaving only chr  %s" % (chrLabel,))
    # Leave only read pairs with both sides on chrNumb.
    fragments.maskFilter((fragments.chrms1 == chrNumb))
    fragments.maskFilter((fragments.chrms2 == chrNumb))

    print("Seting RE")
    # Setting info about restriction enzyme, calculating absolute indexes
    fragments.setRfragAbsIdxs('HindIII')

    numBins = len(fragments.genome.rsites[chrNumb])
    print("Total numBins (RSites) on chr  %s  =  %s" % (chrLabel, numBins))

    # Fragment indices relative to the first fragment of this chromosome.
    chromOffset = fragments.genome.chrmStartsRfragCont[chrNumb]
    rfragAbsIdxs1 = fragments.rfragAbsIdxs1 - chromOffset
    rfragAbsIdxs2 = fragments.rfragAbsIdxs2 - chromOffset
    print("Total number of fragments =  %s" % (len(rfragAbsIdxs1),))

    # The original used `raise "<text>"`; string exceptions are not valid
    # and surface as an unrelated TypeError, so raise a real exception.
    if len(rfragAbsIdxs1) != len(rfragAbsIdxs2):
        print("rfragAbsIdxs1= %s" % (rfragAbsIdxs1,))
        print("rfragAbsIdxs2= %s" % (rfragAbsIdxs2,))
        print("len(rfragAbsIdxs1)= %s" % (len(rfragAbsIdxs1),))
        print("len(rfragAbsIdxs2)= %s" % (len(rfragAbsIdxs2),))
        raise ValueError("FRAGMENT INDEXING ERROR 1!!!")

    lowest1, lowest2 = np.min(rfragAbsIdxs1), np.min(rfragAbsIdxs2)
    if lowest1 < 0 or lowest2 < 0:
        print("min(rfragAbsIdxs1)= %s" % (lowest1,))
        print("min(rfragAbsIdxs2)= %s" % (lowest2,))
        raise ValueError("FRAGMENT INDEXING ERROR 2!!!")

    highest1, highest2 = np.max(rfragAbsIdxs1), np.max(rfragAbsIdxs2)
    if highest1 > numBins - 1 or highest2 > numBins - 1:
        print("max (rfragAbsIdxs1)= %s" % (highest1,))
        print("max (rfragAbsIdxs2)= %s" % (highest2,))
        print("numBins= %s" % (numBins,))
        raise ValueError("FRAGMENT INDEXING ERROR 3!!!")

    print("FRAGMENT INDEXING - passed")

    # Flat row-major cell index: fragment1 * numBins + fragment2.
    label = np.array(rfragAbsIdxs1, dtype='int64')
    label *= numBins
    label += rfragAbsIdxs2

    counts = np.bincount(label, minlength=numBins**2)
    counts = counts.reshape(numBins, numBins)

    # In-place symmetrization (no second numBins x numBins allocation).
    for i in range(len(counts)):
        counts[i, i:] += counts[i:, i]
        counts[i:, i] = counts[i, i:]

    # Contacts of a fragment with itself are dropped.
    fillDiagonal(counts, 0)

    mids = fragments.genome.rfragMids[chrNumb]
    chrField = str(chrNumb)
    # Fragments that appear in the contacts file and therefore must be
    # listed in the fragments file.
    BinsToDescribe = np.zeros(numBins, dtype=bool)

    contactsPath = o_file + "_chr" + str(chrNumb) + ".contacts.zip"
    with open(contactsPath, "w") as f_out:
        print("Writing file  %s" % (contactsPath,))
        for i in range(numBins - 1):
            # Non-zero partners j > i of row i, in increasing j. This
            # keeps the line order of the original nested loop without
            # visiting every empty cell from Python.
            partners = np.nonzero(counts[i, i + 1:])[0] + (i + 1)
            if len(partners) == 0:
                continue
            midI = str(mids[i])
            for j in partners:
                f_out.write("\t".join((chrField, midI, chrField,
                                       str(mids[j]),
                                       str(counts[i, j]))) + "\n")
            BinsToDescribe[i] = True
            BinsToDescribe[partners] = True

    fragmentsPath = o_file + "_chr" + str(chrNumb) + ".fragments.zip"
    with open(fragmentsPath, "w") as f_out:
        print("Writing file  %s" % (fragmentsPath,))
        for ind in np.nonzero(BinsToDescribe)[0]:
            f_out.write("\t".join((chrField, "0", str(mids[ind]),
                                   str(counts[ind].sum()), "1")) + "\n")