Python binnedData Exemples, hiclib.binnedData.binnedData Python Exemples

Exemple #1

0

Afficher le fichier

def plotCrossValidation():
    """Plot the main-figure subplot with the cross-validation scatter.

    Loads the "GM-all-10p" and "GM-all-90p" binned datasets, filters and
    iteratively corrects them, stores the two resulting bias vectors in the
    pickle file "CrossValidatioN", reads them back and scatter-plots one
    against the other. The value of ``cr(b1, b2)`` is printed on the way.
    """
    matplotlib.rcParams['font.sans-serif'] = 'Arial'
    plt.figure(figsize=(1, 1))
    FG = HiCdataset(workingFile1, myGenome)
    FG.load(GMFrag)

    Tanay = binnedData(1000000)
    # Both datasets have to be created beforehand using fragment-level
    # analysis.
    Tanay.simpleLoad("GM-all-10p", "GM-1")
    Tanay.simpleLoad("GM-all-90p", "GM-9")
    Tanay.removePoorRegions()
    Tanay.iterativeCorrectWithSS()
    Tanay.removeZeros()
    b1, b2 = (Tanay.biasDict["GM-1"], Tanay.biasDict["GM-9"])

    # Close the cache explicitly before it is read back: relying on the
    # garbage collector to flush the write handle is not portable.
    with open("CrossValidatioN", 'wb') as cacheFile:
        cPickle.dump((b1, b2), cacheFile)
    ax = plt.gca()
    with open("CrossValidatioN", 'rb') as cacheFile:
        b1, b2 = cPickle.load(cacheFile)

    print(cr(b1, b2))
    plt.scatter(b1, b2, s=.7, color="k", linewidth=0)
    plt.xlabel(r"10% reads", fontsize=8)
    plt.ylabel(r"90% reads", fontsize=8)
    plt.xlim((0, 1.5))
    plt.ylim((0, 1.5))
    plt.xticks([0, 0.5, 1, 1.5])
    plt.yticks([0, 0.5, 1, 1.5])
    removeAxes(shift=0)

    # Shrink the tick labels on both axes.
    fs = 6
    for tickLabel in ax.get_xticklabels() + ax.get_yticklabels():
        tickLabel.set_fontsize(fs)
    plt.show()

Exemple #2

0

Afficher le fichier

def CreateMatrixFile():
    """Write the loaded heatmap to ``heatmap_filepath + '.matrix'`` as text.

    One tab-separated line is written per bin: chromosome index, bin start,
    bin end (start plus the bin size in bp), followed by every value of the
    corresponding heatmap row. Uses the module-level ``domain_res``,
    ``genome_db`` and ``heatmap_filepath``.
    """
    BD = binnedData.binnedData(domain_res, genome_db)
    BD.simpleLoad(heatmap_filepath, 'heatmap')
    matrix_path = heatmap_filepath + '.matrix'
    print("Writing file %s with the information \n" % matrix_path)
    print("Format:\nChrIndex \t StartBin(Nucleotyde) \t EndBin(Nucl) \t Values\n")

    heatmap = BD.dataDict['heatmap']
    n_bins = len(heatmap)
    # The context manager closes the file even if a row fails to format.
    with open(matrix_path, 'w') as f:
        for i in range(n_bins):
            curChrmIdx = genome_db.chrmIdxBinCont[i]
            # Convert the genome-wide bin number into a bin number relative
            # to the start of its chromosome.
            if curChrmIdx == 0:
                curRelativeBinNumb = i
            else:
                curRelativeBinNumb = i - genome_db.chrmLensBin[0:curChrmIdx].sum()

            binStart = genome_db.posBinCont[i]
            binEnd = binStart + genome_db.binSizesBp[curChrmIdx][curRelativeBinNumb]
            fields = [str(curChrmIdx), str(binStart), str(binEnd)]
            fields.extend(str(heatmap[i][j]) for j in range(n_bins))
            f.write("\t".join(fields) + "\n")

Exemple #3

0

Afficher le fichier

Fichier : hiclibCorrelate.py Projet : yanding/ngsane

def calculateTanayCorrelation(
    resolution, filename1, filename2, experiment1, experiment2, genome, outfile, mouse=False, **kwargs
):
    """Write the Spearman correlation of two smoothed trans heatmaps to *outfile*.

    Both heatmaps are loaded into one binnedData object, poor regions and cis
    contacts are removed and the data are iteratively corrected. Each
    chromosome-pair block is then smoothed with a 19x19 kernel whose weights
    are 1 / (1 + |dx| + |dy|), normalised by the equally smoothed coverage
    mask, and the inter-chromosomal bins valid in both datasets are
    correlated. ``mouse`` and ``kwargs`` are accepted but not used here.
    """
    global pp
    if options.verbose:
        message = "calculateTanayCorrelation: res: %d file1: %s file2: %s exp1:%s exp2:%s gen:%s" % (
            resolution,
            filename1,
            filename2,
            experiment1,
            experiment2,
            genome,
        )
        print >> sys.stdout, message

    BD = binnedData(resolution, genome)
    for heatmapFile, label in ((filename1, experiment1), (filename2, experiment2)):
        BD.simpleLoad(heatmapFile, label)

    # The smoothing kernel does not depend on the data, so build it once.
    offsets = numpy.arange(-9, 10)
    kernel = 1 / (1.0 + numpy.abs(offsets[:, None]) + numpy.abs(offsets[None, :]))

    def tanaySmooth(matrix):
        # Convolve a block with the kernel, using constant padding at the edges.
        asDouble = numpy.array(matrix, dtype="double")
        return scipy.ndimage.filters.convolve(input=asDouble, weights=kernel, mode="constant")

    def propagateSmooth(data):
        # Smooth every chromosome-pair block separately so that signal never
        # bleeds across chromosome boundaries.
        covered = numpy.sum(data, axis=0) > 0
        pairMask = covered[:, None] * covered[None, :]
        smoothed = numpy.zeros_like(data, dtype=float)
        for i in xrange(BD.genome.chrmCount):
            rows = slice(BD.chromosomeStarts[i], BD.chromosomeEnds[i])
            for j in xrange(BD.genome.chrmCount):
                cols = slice(BD.chromosomeStarts[j], BD.chromosomeEnds[j])
                blockMask = pairMask[rows, cols]
                block = tanaySmooth(data[rows, cols]) / tanaySmooth(blockMask)
                # Bins without coverage must stay empty after normalisation.
                block[blockMask == 0] = 0
                smoothed[rows, cols] = block
        return smoothed

    BD.removePoorRegions(cutoff=2)
    BD.removeCis()
    BD.iterativeCorrectWithoutSS()

    data1 = BD.dataDict[experiment1]
    data2 = BD.dataDict[experiment2]

    # Keep only bins that carry signal in both datasets ...
    sharedBins = (numpy.sum(data1, axis=0) > 0) * (numpy.sum(data2, axis=0) > 0)
    validMask = sharedBins[:, None] * sharedBins[None, :]
    # ... and only pairs of bins lying on different chromosomes.
    chromOfBin = BD.chromosomeIndex
    transmask = chromOfBin[:, None] != chromOfBin[None, :]
    cormask = transmask * validMask

    d1 = propagateSmooth(data1)
    d2 = propagateSmooth(data2)
    scorr, pvalue = scipy.stats.spearmanr(d1[cormask], d2[cormask])
    outfile.write("Spearman corrleation	%s	%s %.4f	%.4f" % (filename1, filename2, scorr, pvalue))

Exemple #4

0

Afficher le fichier

Fichier : run_hiclib.py Projet : Gurado/pipelines

def iterativeFiltering(genome_db, fragments):
    """Filter the 1M heatmap at the binned level and iteratively correct it.

    Reads ``heatmap-res-1M.hdf5`` from ``options.outputDir``, writes the
    corrected heatmap to ``IC-heatmap-res-1M.hdf5`` in the same folder and
    plots the log of the corrected matrix. ``fragments`` is not used.
    """
    rawPath = options.outputDir + 'heatmap-res-1M.hdf5'

    # The bin size is stored inside the dataset itself.
    resolution = int(h5dict.h5dict(rawPath, mode='r')['resolution'])

    BD = binnedData.binnedData(resolution, genome_db)
    BD.simpleLoad(rawPath, options.experiment)

    BD.removeDiagonal()             # contacts between loci within the same bin
    BD.removeBySequencedCount(0.5)  # bins with less than half of a bin sequenced
    BD.removePoorRegions(cutoff=1)  # 1% of regions with low coverage
    BD.truncTrans(high=0.0005)      # top 0.05% of interchromosomal counts (possibly PCR blowouts)
    BD.iterativeCorrectWithoutSS()

    correctedPath = options.outputDir + 'IC-heatmap-res-1M.hdf5'
    BD.export(options.experiment, correctedPath)

    corrected = BD.dataDict[options.experiment]
    plotting.plot_matrix(np.log(corrected))

Exemple #5

0

Afficher le fichier

Fichier : imakaev2012nm.py Projet : huxihao/BNMF

def step3(hiclib_path, sraid, res=1000000):
    ''' 3. Filter and iteratively correct heatmaps.
    http://mirnylab.bitbucket.org/hiclib/tutorial/03_heatmap_processing.html

    Reads ``<sraid>_map-res<res/1000>k.hdf5`` and writes, with the same
    prefix, a PDF of the raw heatmap (``.pdf``), the iteratively corrected
    heatmap (``-ic.hdf5``), its PDF (``-ic.pdf``) and a tab-separated per-bin
    bias table (``-ic-bias.txt``).

    hiclib_path -- folder that contains ``fasta/hg19``
    sraid       -- prefix of all input and output file names
    res         -- bin size in bp; used for the file names and for the end
                   coordinate written to the bias table
    '''
    import matplotlib.pyplot as plt
    import numpy as np

    from mirnylib import genome
    from mirnylib import h5dict
    from mirnylib import plotting
    from hiclib import binnedData

    genome_db = genome.Genome(hiclib_path+'/fasta/hg19', readChrms=['#', 'X'])

    # Every file of this step shares the same name prefix.
    prefix = sraid + '_map-res%sk' % (res/1000)

    # Read resolution from the dataset.
    raw_heatmap = h5dict.h5dict(prefix + '.hdf5', mode='r')
    resolution = int(raw_heatmap['resolution'])

    # Create a binnedData object, load the data.
    BD = binnedData.binnedData(resolution, genome_db)
    BD.simpleLoad(prefix + '.hdf5', 'DataName')

    # Plot the raw heatmap.
    plotting.plot_matrix(np.log(BD.dataDict['DataName']))
    plt.savefig(prefix + '.pdf')
    plt.clf()

    # Remove the contacts between loci located within the same bin.
    BD.removeDiagonal()

    # Remove bins with less than half of a bin sequenced.
    BD.removeBySequencedCount(0.5)

    # Remove 1% of regions with low coverage.
    BD.removePoorRegions(cutoff=1)

    # Truncate top 0.05% of interchromosomal counts (possibly, PCR blowouts).
    BD.truncTrans(high=0.0005)

    # Perform iterative correction.
    BD.iterativeCorrectWithoutSS()

    # Save the iteratively corrected heatmap.
    BD.export('DataName', prefix + '-ic.hdf5')

    # Plot the corrected heatmap.
    plotting.plot_matrix(np.log(BD.dataDict['DataName']))
    plt.savefig(prefix + '-ic.pdf')
    plt.clf()

    # Save the bias vector; the context manager closes the file even when a
    # row cannot be written.
    with open(prefix + '-ic-bias.txt', "w") as outfile:
        for i in xrange(len(BD.chromosomeIndex)):
            chro = BD.genome.idx2label[BD.chromosomeIndex[i]]
            posi = BD.positionIndex[i]
            outfile.write("chr%s\t%s\t%s\t%s\n" % (
                chro, posi, posi+res, BD.biasDict['DataName'][i]))

Exemple #6

0

Afficher le fichier

Fichier : getIntraChrHeatmaps.py Projet : labdevgen/FishHiC

def get_chromosomes(hm_file, genome_db, resolution, chrNumb=None):
    if extractResolutionFromFileName(hm_file) != resolution:
        print "WARNING! Provided resolution ", resolution, "does not match ", extractResolutionFromFileName(
            hm_file), "extracted from file name ", hm_file
    if "hiRes.hm" in hm_file:
        type = "HiRes"
    elif "bychr.hm" in hm_file:
        type = "bychr"
    else:
        print "Warning: cannot resolve type of data from filename"
        try:
            print "Warning: trying hires hic"
            raw_heatmap = h5dict.h5dict(fname, mode='r')  #open heatmap
            if "0 0" in raw_heatmap.keys():
                type = "HiRes"
            else:
                print "HiRes hic Failed! Assuming bychr type"
                type = "bychr"
        except:
            print "HiRes hic Failed! Assuming bychr type"
            type = "bychr"
    if type == "HiRes":
        from hiclib import highResBinnedData
        # Create a  object, load the data.
        print "creating an object"
        hmap = highResBinnedData.HiResHiC(genome_db, resolution)
        print "loading data"
        hmap.loadData(hm_file, mode="cis")
        print "Data loaded"
        if chrNumb != None:
            return hmap.data[(chrNumb, chrNumb)].getData()
        return [
            hmap.data[(i, i)].getData() for i in xrange(genome_db.chrmCount)
        ]
        #cisKeys are tuples like (N,N) where N is 0..Number_of_chrms-1
    elif type == "bychr":
        from hiclib import binnedData
        print "creating an object"
        hmap = binnedData.binnedData(resolution, genome_db)

        print "loading data"
        hmap.simpleLoad(hm_file, "heatmap")
        data = hmap.dataDict["heatmap"]
        assert len(data) == genome_db.numBins
        print "Data loaded"
        if chrNumb != None:
            return data[genome_db.chrmStartsBinCont[chrNumb]:genome_db.
                        chrmEndsBinCont[chrNumb],
                        genome_db.chrmStartsBinCont[chrNumb]:genome_db.
                        chrmEndsBinCont[chrNumb]]
        return [
            data[genome_db.chrmStartsBinCont[i]:genome_db.chrmEndsBinCont[i],
                 genome_db.chrmStartsBinCont[i]:genome_db.chrmEndsBinCont[i]]
            for i in xrange(genome_db.chrmCount)
        ]
    else:
        raise "Error: can not recognize heatmap format from file name"

Exemple #7

0

Afficher le fichier

def step4(hiclib_path, sraid, res=1000000):
    ''' 4. Eigen vector decomposition
    /examples/iterativeCorrectionEigenvectorExpansion/eigenvectorAnalysis.py

    Reads ``<sraid>_map-res<res/1000>k.hdf5``, filters it, computes the first
    30 eigenvectors and writes, with the same prefix, a PDF of the matrix
    rebuilt from them (``-eig.pdf``) and a tab-separated table holding one
    line per bin with all eigenvector values (``-ic-eig.txt``).

    hiclib_path -- folder that contains ``fasta/hg19``
    sraid       -- prefix of all input and output file names
    res         -- bin size in bp; used for the file names and for the end
                   coordinate written to the table
    '''
    import matplotlib.pyplot as plt
    import numpy as np
    from mirnylib import genome
    from mirnylib import h5dict
    from mirnylib import plotting
    from hiclib import binnedData

    genome_db = genome.Genome(hiclib_path + '/fasta/hg19',
                              readChrms=['#', 'X'])

    # Every file of this step shares the same name prefix.
    prefix = sraid + '_map-res%sk' % (res / 1000)

    # Read resolution from the dataset.
    raw_heatmap = h5dict.h5dict(prefix + '.hdf5', mode='r')
    resolution = int(raw_heatmap['resolution'])

    # Create a binnedData object, load the data.
    BD = binnedData.binnedData(resolution, genome_db)
    BD.simpleLoad(prefix + '.hdf5', 'DataName')

    # Do eigen decomposition
    BD.removeDiagonal()
    BD.removeBySequencedCount(0.5)
    BD.removeCis()
    BD.truncTrans(high=0.0005)
    BD.removePoorRegions(cutoff=1)
    BD.fakeCis()
    BD.removeZeros()
    BD.doEig(numPCs=30, force=True)  ## First 30 EIGs
    BD.restoreZeros(value=0)

    eig = BD.eigEigenvalueDict['DataName']
    eig_v = BD.EigDict['DataName']

    # Plot the matrix rebuilt from the eigenvectors and eigenvalues.
    plotting.plot_matrix(np.log(np.dot(np.dot(eig_v.T, np.diag(eig)), eig_v)))
    plt.savefig(prefix + '-eig.pdf')
    plt.clf()

    # The context manager closes the table even when a row cannot be written.
    with open(prefix + '-ic-eig.txt', "w") as outfile:
        for i in xrange(len(BD.chromosomeIndex)):
            chro = BD.genome.idx2label[BD.chromosomeIndex[i]]
            posi = BD.positionIndex[i]
            columns = ["chr%s\t%s\t%s" % (chro, posi, posi + res)]
            columns.extend("%s" % eigenvector[i] for eigenvector in eig_v)
            outfile.write("\t".join(columns) + "\n")

Exemple #8

0

Afficher le fichier

Fichier : imakaev2012nm.py Projet : huxihao/BNMF

def step4(hiclib_path, sraid, res=1000000):
    ''' 4. Eigen vector decomposition
    /examples/iterativeCorrectionEigenvectorExpansion/eigenvectorAnalysis.py

    Reads ``<sraid>_map-res<res/1000>k.hdf5``, filters it, computes the first
    30 eigenvectors and writes, with the same prefix, a PDF of the matrix
    rebuilt from them (``-eig.pdf``) and a tab-separated table holding one
    line per bin with all eigenvector values (``-ic-eig.txt``).

    hiclib_path -- folder that contains ``fasta/hg19``
    sraid       -- prefix of all input and output file names
    res         -- bin size in bp; used for the file names and for the end
                   coordinate written to the table
    '''
    import matplotlib.pyplot as plt
    import numpy as np
    from mirnylib import genome
    from mirnylib import h5dict
    from mirnylib import plotting
    from hiclib import binnedData

    genome_db = genome.Genome(hiclib_path+'/fasta/hg19', readChrms=['#', 'X'])

    # Build the shared file-name prefix once instead of at every use.
    prefix = sraid + '_map-res%sk' % (res/1000)
    heatmapPath = prefix + '.hdf5'

    # Read resolution from the dataset.
    raw_heatmap = h5dict.h5dict(heatmapPath, mode='r')
    resolution = int(raw_heatmap['resolution'])

    # Create a binnedData object, load the data.
    BD = binnedData.binnedData(resolution, genome_db)
    BD.simpleLoad(heatmapPath, 'DataName')

    # Do eigen decomposition
    BD.removeDiagonal()
    BD.removeBySequencedCount(0.5)
    BD.removeCis()
    BD.truncTrans(high=0.0005)
    BD.removePoorRegions(cutoff=1)
    BD.fakeCis()
    BD.removeZeros()
    BD.doEig(numPCs=30, force=True) ## First 30 EIGs
    BD.restoreZeros(value=0)

    eig = BD.eigEigenvalueDict['DataName']
    eig_v = BD.EigDict['DataName']

    # Plot the matrix rebuilt from the eigenvectors and eigenvalues.
    plotting.plot_matrix(np.log(np.dot(np.dot(eig_v.T, np.diag(eig)), eig_v)))
    plt.savefig(prefix + '-eig.pdf')
    plt.clf()

    # `with` guarantees the table is closed even if writing a row fails.
    with open(prefix + '-ic-eig.txt', "w") as outfile:
        for i in xrange(len(BD.chromosomeIndex)):
            chro = BD.genome.idx2label[BD.chromosomeIndex[i]]
            posi = BD.positionIndex[i]
            row = "chr%s\t%s\t%s" % (chro, posi, posi+res)
            row += "".join("\t%s" % eigenvector[i] for eigenvector in eig_v)
            outfile.write(row + "\n")

Exemple #9

0

Afficher le fichier

def calculateTanayCorrelation():
    """Print the Spearman correlation of the smoothed HindIII and NcoI trans maps.

    Both 1M heatmaps are loaded into one binnedData object, poor regions and
    cis contacts are removed and the data are iteratively corrected. Each
    chromosome-pair block is smoothed with a 19x19 kernel whose weights are
    1 / (1 + |dx| + |dy|), normalised by the equally smoothed coverage mask,
    and the inter-chromosomal bins valid in both datasets are correlated.
    """
    BD = binnedData(1000000, "../../../data/hg18")
    BD.simpleLoad("../../../ErezPaperData/hg18/GM-HindIII-hg18-1M.hm", "HindIII")
    BD.simpleLoad("../../../ErezPaperData/hg18/GM-NcoI-hg18-1M.hm", "NcoI")

    # The smoothing kernel does not depend on the data, so build it once.
    offsets = numpy.arange(-9, 10)
    kernel = 1 / (1. + numpy.abs(offsets[:, None]) + numpy.abs(offsets[None, :]))

    def tanaySmooth(matrix):
        # Convolve a block with the kernel, using constant padding at the edges.
        asDouble = numpy.array(matrix, dtype="double")
        return scipy.ndimage.filters.convolve(input=asDouble, weights=kernel, mode="constant")

    def propagateSmooth(data):
        # Smooth each chromosome-pair block on its own so nothing leaks
        # across chromosome boundaries.
        covered = numpy.sum(data, axis=0) > 0
        pairMask = covered[:, None] * covered[None, :]
        smoothed = numpy.zeros_like(data, dtype=float)
        for i in xrange(BD.genome.chrmCount):
            rows = slice(BD.chromosomeStarts[i], BD.chromosomeEnds[i])
            for j in xrange(BD.genome.chrmCount):
                cols = slice(BD.chromosomeStarts[j], BD.chromosomeEnds[j])
                blockMask = pairMask[rows, cols]
                block = tanaySmooth(data[rows, cols]) / tanaySmooth(blockMask)
                # Bins without coverage must stay empty after normalisation.
                block[blockMask == 0] = 0
                smoothed[rows, cols] = block
        return smoothed

    BD.removePoorRegions(cutoff=2)
    BD.removeCis()
    BD.iterativeCorrectWithoutSS()

    data1 = BD.dataDict["HindIII"]
    data2 = BD.dataDict["NcoI"]

    # Bins with signal in both datasets, restricted to pairs of bins that lie
    # on different chromosomes.
    sharedBins = (numpy.sum(data1, axis=0) > 0) * (numpy.sum(data2, axis=0) > 0)
    validMask = sharedBins[:, None] * sharedBins[None, :]
    chromOfBin = BD.chromosomeIndex
    transmask = chromOfBin[:, None] != chromOfBin[None, :]
    cormask = transmask * validMask

    d1 = propagateSmooth(data1)
    d2 = propagateSmooth(data2)
    print(scipy.stats.spearmanr(d1[cormask], d2[cormask]))

Exemple #10

0

Afficher le fichier

def saddlePlot():
    """Plot the heatmap with bins reordered by the first eigenvector.

    Loads the "GM-all" 1M dataset, filters and iteratively corrects it,
    computes eigenvectors, sorts rows and columns of the corrected heatmap by
    the first column of ``Tanay.EIG["GM-all"]`` and shows the log of the
    coarse-grained, mean-normalised result with a colorbar.
    """

    #plt.figure(figsize = (1.5,1.5))
    plt.figure(figsize=(3, 3))
    Tanay = binnedData(1000000)
    Tanay.simpleLoad("../data/GM-all-hg18-1M", "GM-all")
    Tanay.removeDiagonal(1)
    Tanay.removePoorRegions()
    Tanay.removeZeros()
    Tanay.fakeCis()
    Tanay.iterativeCorrectWithoutSS()
    Tanay.doEig()
    PC = Tanay.EIG["GM-all"][:, 0]
    # Fix the arbitrary sign of the eigenvector: its first entry is made
    # non-positive.
    if PC[0] > 0:
        PC = -PC

    def reorder(data, array=PC):
        # Permute rows, then columns, into ascending order of `array`.
        inds = numpy.argsort(array)
        ndata = data[inds, :]
        return ndata[:, inds]
    # NOTE(review): coarsegrain is defined elsewhere; presumably 60 is the
    # block size used to pool bins - confirm.
    toplot = (coarsegrain(reorder(Tanay.dataDict["GM-all"]), 60))
    toplot /= toplot.mean()
    toplot = numpy.log(toplot)
    sh = toplot.shape
    toplot = toplot.reshape((-1))
    # Clip the single largest value: zero it, then set it to the maximum of
    # what is left (which is 0 if every other value is negative).
    ag = numpy.argmax(toplot)
    toplot[ag] = 0
    toplot[ag] = numpy.max(toplot)
    toplot.shape = sh
    # Overwrite the two off-diagonal corner cells with their neighbours.
    toplot[0, -1] = toplot[0, -2]
    toplot[-1, 0] = toplot[-2, 0]
    plt.imshow(toplot, vmin=toplot.min(), vmax=toplot.max(),
               interpolation="nearest")
    cbar = plt.colorbar(orientation="vertical")
    #labels = ["10","100","1000","10000"]
    #cbar.ax.set_xticklabels(labels)
    cbar.ax.set_xlabel("Log(relative contact probability)", fontsize=6)
    for xlabel_i in cbar.ax.get_xticklabels():
        xlabel_i.set_fontsize(6)
    cbar.set_ticks([-0.5, 0, 0.5, 1])
    removeBorder()
    mirnylib.plotting.niceShow()
Exemple #11

0

Afficher le fichier

def iterativeFiltering(genome_db, filesuffix):
    """Filter a heatmap at the binned level and iteratively correct it.

    The input is ``options.outputDir + options.experiment + filesuffix``; the
    corrected heatmap is exported next to it with ``-IC`` inserted before the
    suffix. The log of the corrected matrix is plotted on a new figure and
    saved through the module-level ``pp``.
    """
    rawPath = options.outputDir + options.experiment + filesuffix

    # The bin size is stored inside the dataset itself.
    resolution = int(h5dict.h5dict(rawPath, mode='r')['resolution'])

    BD = binnedData.binnedData(resolution, genome_db)
    BD.simpleLoad(rawPath, options.experiment)

    BD.removeDiagonal()             # contacts between loci within the same bin
    BD.removeBySequencedCount(0.5)  # bins with less than half of a bin sequenced
    BD.removePoorRegions(cutoff=1)  # 1% of regions with low coverage
    BD.truncTrans(high=0.0005)      # top 0.05% of interchromosomal counts (possibly PCR blowouts)
    BD.removeZeros()                # empty bins
    BD.iterativeCorrectWithoutSS()

    correctedPath = options.outputDir + options.experiment + '-IC' + filesuffix
    BD.export(options.experiment, correctedPath)

    plt.figure()
    corrected = BD.dataDict[options.experiment]
    plotting.plot_matrix(np.log(corrected))
    pp.savefig()

Exemple #12

0

Afficher le fichier

Fichier : singleShared.py Projet : llity/hiclib-legacy

def doEigenvector(filename, genome):
    """Return a per-bin track for *filename* on the given genome.

    The special name "GC" returns the concatenated ``GCBin`` arrays of the
    genome at 1 Mb resolution. Any other name is loaded as a heatmap,
    filtered, and the first entry of its ``EigDict`` is returned.
    """
    chromosomes = ["#", "X"]

    if filename == "GC":
        gen = Genome("/home/magus/HiC2011/data/" + genome, readChrms=chromosomes)
        gen.setResolution(1000000)
        return np.concatenate(gen.GCBin)

    BD = binnedData.binnedData(
        getResolution(filename), "/home/magus/HiC2011/data/" + genome, chromosomes)
    BD.simpleLoad(filename, "bla")

    # Filtering pipeline that precedes the eigenvector expansion.
    BD.removeDiagonal()
    BD.removeBySequencedCount(0.5)
    BD.removeCis()
    BD.truncTrans(high=0.0005)
    BD.removePoorRegions(cutoff=1)
    BD.fakeCis()
    BD.removeZeros()

    BD.doEig(numPCs=2)
    BD.restoreZeros(value=0)

    eigenvectors = BD.EigDict["bla"]
    return eigenvectors[0]

Exemple #13

0

Afficher le fichier

def CorrectHeatMap():
    """Filter and iteratively correct the raw heatmap, then export it.

    Reads ``heatmap_filepath + '-raw'`` and writes the corrected heatmap to
    ``heatmap_filepath``; both the path and ``genome_db`` are module-level
    names.
    """
    print("Loading raw heatmap\n")
    rawPath = heatmap_filepath + '-raw'
    datasetName = 'HindIII_GM_1'

    # The bin size is stored inside the dataset itself.
    resolution = int(h5dict.h5dict(rawPath, mode='r')['resolution'])

    ####### Set resolution for genome
    #genome_db.setResolution(resolution)

    BD = binnedData.binnedData(resolution, genome_db)
    BD.simpleLoad(rawPath, datasetName)

    BD.removeDiagonal()             # contacts between loci within the same bin
    BD.removeBySequencedCount(0.5)  # bins with less than half of a bin sequenced
    BD.removePoorRegions(cutoff=1)  # 1% of regions with low coverage
    BD.truncTrans(high=0.0005)      # top 0.05% of interchromosomal counts (possibly PCR blowouts)
    BD.iterativeCorrectWithoutSS()

    BD.export(datasetName, heatmap_filepath)

Exemple #14

0

Afficher le fichier

Fichier : iterativeCorrection.py Projet : bxlab/HiFive_Paper

    if not os.path.exists(inDataset):
        raise IOError("Raw heatmap file does not exist: {}".format(inDataset))

# Fail early when the genome folder is missing.
if not os.path.isdir(genomeFolder):
    raise IOError("Genome folder does not exist")

# When you do this, be sure that readChrms used to save heatmap matches
# readChrms that you define here!
genome = Genome(genomeFolder, readChrms=readChrms)

# Read resolution from one of the datasets
# NOTE: indexing .values() directly only works on Python 2, where it
# returns a list.
sampleDataset = h5dict(inDatasets.values()[0], mode="r")  # arbitrary dataset
resolution = int(sampleDataset["resolution"])

# Define the binnedData object, load every dataset under its own name
BD = binnedData(resolution, genome, readChrms)
for name, filename in inDatasets.items():
    BD.simpleLoad(filename, name)

BD.removeDiagonal()

# Remove bins with less than half of a bin sequenced
BD.removeBySequencedCount(0.5)

# Remove 1% of regions with low coverage
BD.removePoorRegions(cutoff=1)

# Truncate top 0.05% of interchromosomal counts (possibly, PCR blowouts)
BD.truncTrans(high=0.0005)

# Actually perform iterative correction (the call itself is not part of this excerpt)

Exemple #15

0

Afficher le fichier

Fichier : identify-compartments.py Projet : v587dexinxin/Analysis-scripts

# Command line: argv[1] is the binned heatmap inside HiCFolder, argv[2] is
# the output file that compartment lines are appended to.
source = os.path.join(HiCFolder,
                      sys.argv[1])  # file after calling runHiC binning
outfile = sys.argv[2]

# params from source
lib = h5dict(source, 'r')
res = lib['resolution']
gInfo = lib['genomeInformation']
genomeFolder = os.path.join(gInfo['dataFolder'], gInfo['genomeName'])

# Load binned data and perform PCA ...
genome_db = myGenome(genomeFolder,
                     readChrms=gInfo['chroms'],
                     chrmFileTemplate=gInfo['template'],
                     gapFile=gInfo['gapFile'])
BD = binnedData.binnedData(res, genome_db)
# The dataset name is the part of the file name before the first '-'.
name = os.path.split(source)[1].split('-')[0]
BD.simpleLoad(source, name)
# NOTE(review): presumably 3 is the number of principal components - confirm
# against binnedData.doCisPCADomains.
BD.doCisPCADomains(3)
# Identify compartments ...
# Transposed so that rows can be selected with a per-bin mask.
pcas = BD.PCDict[name].T
idx2label = gInfo['idx2label']
for i in idx2label:
    label = idx2label[i]
    # Select the bins that belong to chromosome index i.
    mask = BD.chromosomeIndex == i
    tmp = pcas[mask]
    # Only the first column is passed on.
    compartments = compartmentFromPCA(tmp[:, 0], res, label)
    # Append mode: the file is reopened for every chromosome and results from
    # earlier runs are kept.
    with open(outfile, 'a') as output:
        for c in compartments:
            line = '\t'.join(map(str, c)) + '\n'
            output.write(line)

Exemple #16

0

Afficher le fichier

base_folder = '/mnt/storage/home/vsfishman/HiC/data/'
base_filename = 'ESC_full'

# All paths are derived from the base folder, the bin size in kb and the
# dataset name.
heatmap_filepath = base_folder + 'heatmap-res-' + str(
    domain_res / 1000) + 'KB_' + base_filename + '.hdf5'
# NOTE(review): this is currently the very same path as heatmap_filepath, so
# "raw" and processed data are read from one file - confirm that is intended.
raw_heatmap_filepath = base_folder + 'heatmap-res-' + str(
    domain_res / 1000) + 'KB_' + base_filename + '.hdf5'
maped_reads_filepath = base_folder + 'mapped_reads_' + base_filename + '.hdf5'
figure_path = base_folder + base_filename + "_" + str(
    domain_res / 1000) + 'kb-Xist.png'

genome_fai_filepath = '../../fasta/' + genome_name + '/' + genome_name + '.fai'

print("Loading file " + heatmap_filepath)
BD = binnedData.binnedData(domain_res, genome_db)
BD.simpleLoad(heatmap_filepath, 'HindIII_GM_1')
# Load the raw copy through its own path variable (it used to go through
# heatmap_filepath, leaving raw_heatmap_filepath unused); the two paths are
# equal today, so the loaded data do not change.
BD_raw = binnedData.binnedData(domain_res, genome_db)
BD_raw.simpleLoad(raw_heatmap_filepath, 'HindIII_GM_1')

q = BD.dataDict['HindIII_GM_1']
q_raw = BD_raw.dataDict['HindIII_GM_1']

X_values = []
Y_values = []
Y_errors = []

binnumber = 100
dist = -1

# Number of bins in the first 19 chromosomes.
start = sum(genome_db.chrmLensBin[0:19])

Exemple #17

0

Afficher le fichier

Fichier : hiclibCorrelate.py Projet : yanding/ngsane

def plotDiagonalCorrelation(resolution, filename1, filename2, experiment1, experiment2, genome, mouse=False, **kwargs):
    """Correlation of diagonal bins - paper figure.

    Loads two heatmaps, removes the diagonal, poor regions and empty bins,
    and correlates (with ``cr``) matching diagonals of the two matrices at
    offsets 2..49. This is done on the raw data, after one round of
    iterative correction and after a further 20 rounds; the three curves are
    plotted, saved through the module-level ``pp`` and shown.

    ``mouse`` and ``kwargs`` are accepted but not used here.
    """
    global pp

    if options.verbose:
        print >> sys.stdout, "plotDiagonalCorrelation: res: %d file1: %s file2: %s exp1:%s exp2:%s gen:%s" % (
            resolution,
            filename1,
            filename2,
            experiment1,
            experiment2,
            genome,
        )

    S = 50
    x = numpy.arange(2, S)
    Tanay = binnedData(resolution, genome)
    Tanay.simpleLoad(filename1, experiment1)
    Tanay.simpleLoad(filename2, experiment2)
    Tanay.removeDiagonal(1)
    Tanay.removePoorRegions()
    Tanay.removeZeros()

    pairs = [(experiment1, experiment2)]

    def diagonalCorrelations():
        # One list per dataset pair: the correlation of the i-th diagonals of
        # both matrices, for every offset i in x, on the current data.
        return [
            [
                cr(numpy.diagonal(Tanay.dataDict[first], i), numpy.diagonal(Tanay.dataDict[second], i))[0]
                for i in x
            ]
            for first, second in pairs
        ]

    cors = diagonalCorrelations()           # raw data
    Tanay.iterativeCorrectWithoutSS(M=1)
    cors2 = diagonalCorrelations()          # after a single correction round
    Tanay.iterativeCorrectWithoutSS(M=20)
    cors3 = diagonalCorrelations()          # after 20 more rounds

    matplotlib.rcParams["font.sans-serif"] = "Arial"

    print("Eigenvectors")
    print(cors)
    print(cors2)
    print(cors3)
    plt.figure(figsize=(8, 4))
    for j, pair in enumerate(pairs):
        # Subplot numbers start at 1. Style the axes that was just created,
        # not one fetched before any subplot existed.
        ax = plt.subplot(1, len(pairs), j + 1)
        fs = 8
        for tickLabel in ax.get_xticklabels() + ax.get_yticklabels():
            tickLabel.set_fontsize(fs)
        plt.title("%s vs %s" % pair)
        # NOTE(review): x / 5.0 is megabases only for 200 kb bins, while
        # `resolution` is a parameter - confirm.
        plt.plot(x / 5.0, cors3[j], color="#E5A826", label="Iterative")
        plt.plot(x / 5.0, cors2[j], color="#28459A", label="Single")
        plt.plot(x / 5.0, cors[j], color="#E55726", label="Raw")
        plt.xlabel("Genomic Separation, MB", fontsize=8)
        plt.ylabel("Spearman correlation", fontsize=8)

        legend = plt.legend(prop={"size": 6}, loc=9, handlelength=2)
        legend.draw_frame(False)
        plt.ylim((0, 1))
        removeAxes(shift=0)

    # Save before showing: once an interactive window is closed the figure
    # may no longer be the current one.
    pp.savefig()
    plt.show()

Exemple #18

0

Afficher le fichier

from mirnylib.genome import Genome
from hiclib.binnedData import binnedData
# HiResHiC is used below but was never imported.
from hiclib.highResBinnedData import HiResHiC
from mirnylib.h5dict import h5dict
import numpy as np
import sys
import os

# argv[1] is the genome folder; only chromosomes 1-5 are read.
genome = Genome(sys.argv[1], readChrms=["1", "2", "3", "4", "5"])

# High-resolution (by-chromosome) object: load, filter, correct.
a = HiResHiC(genome, 1000000, "hiResDict", mode='w')
a.loadData(dictLike="../fragmentHiC/test-1M-byChr.hm")
a.removeDiagonal()
a.removePoorRegions(2)
a.iterativeCorrection(1e-10)

# Genome-wide binnedData object run through the same steps.
b = binnedData(1000000, genome)

# Crop the stored heatmap to the bins covered by the chromosomes read above.
data = {"heatmap": h5dict("../fragmentHiC/test-1M.hm")["heatmap"]}
lim = b.genome.chrmEndsBinCont[-1]
data["heatmap"] = data["heatmap"][:lim, :lim]

b.simpleLoad(data, "data")
b.removeDiagonal()
b.removePoorRegions(cutoff=2)
b.iterativeCorrectWithoutSS(tolerance=1e-10)
a.export("testExport")

def compareData():
    # Fetch the combined matrix of the high-resolution object `a` and the
    # "data" matrix of the binnedData object `b`.
    # NOTE(review): nothing is compared or returned in the visible lines -
    # the function looks truncated in this excerpt.
    dataHigh = a.getCombinedMatrix()
    dataLow = b.dataDict["data"]

Exemple #19

0

Afficher le fichier

# Per-file profile of cis contacts by diagonal offset, in percent of the total.
diags = {}

for i in filenames.keys():
    print("Reading file " + i)
    # (A file-extension check used to guard this body; it was disabled.)
    if resolution == 0:  # if we do not know resolution
        raw_heatmap = h5dict.h5dict(i, mode='r')  # open heatmap
        resolution = int(raw_heatmap['resolution'])  # get the resolution
        del raw_heatmap  # close heatmap
    if genome_db is None:  # if we have not initialized genome before
        genome_db = genome.Genome("/mnt/storage/home/vsfishman/HiC/fasta/GalGal5/GCF_000002315.4_Gallus_gallus-5.0_assembly_structure/Primary_Assembly/galGal5_all_contigs.filtered/",
                                  readChrms=[],
                                  chrmFileTemplate="N%s.fa")
    # Now we can initialize the heatmap with the defined resolution and genome.
    BD = binnedData.binnedData(resolution, genome_db)
    BD.simpleLoad(i, 'heatmap')
    number_of_bins = len(BD.dataDict['heatmap'])

    diags[i] = np.zeros(max(genome_db.chrmLensBin))
    for chr in xrange(genome_db.chrmCount):
        # Slice the chromosome's cis block once per chromosome; it does not
        # depend on the diagonal offset j.
        chr_start = genome_db.chrmStartsBinCont[chr]
        chr_end = genome_db.chrmEndsBinCont[chr]
        cur_chr_matrix = BD.dataDict['heatmap'][chr_start:chr_end, chr_start:chr_end]
        for j in xrange(genome_db.chrmLensBin[chr]):
            # Twice the sum of the j-th diagonal of the cis block.
            diags[i][j] += sum(np.diag(cur_chr_matrix, j)) * 2
    print(np.sum(diags[i]))
    diags[i] = (diags[i] / np.sum(diags[i])) * 100.0
    print((diags[i][0:10]) / 100.0)
    del BD


def contact_freq_total():

Exemple #20

0

Afficher le fichier

Fichier : hiclibCorrelate.py Projet : wyim-pgl/ngsane

def calculateTanayCorrelation(resolution,
                              filename1,
                              filename2,
                              experiment1,
                              experiment2,
                              genome,
                              outfile,
                              mouse=False,
                              **kwargs):
    """Write the Spearman correlation of two smoothed trans heatmaps to *outfile*.

    The two heatmaps are loaded into one binnedData object, stripped of poor
    regions and cis contacts, and iteratively corrected. Every
    chromosome-pair block is smoothed with a 19x19 kernel of weights
    1 / (1 + |dx| + |dy|) and divided by the smoothed coverage mask; the
    inter-chromosomal bins valid in both datasets are then correlated.
    ``mouse`` and ``kwargs`` are accepted but not used here.
    """
    global pp
    if options.verbose:
        logLine = "calculateTanayCorrelation: res: %d file1: %s file2: %s exp1:%s exp2:%s gen:%s" % (
            resolution, filename1, filename2, experiment1, experiment2, genome)
        print >> sys.stdout, logLine

    BD = binnedData(resolution, genome)
    for heatmapFile, label in ((filename1, experiment1),
                               (filename2, experiment2)):
        BD.simpleLoad(heatmapFile, label)

    # Data-independent smoothing kernel, built once.
    steps = numpy.arange(-9, 10)
    weights = 1 / (1. + numpy.abs(steps[:, None]) + numpy.abs(steps[None, :]))

    def tanaySmooth(matrix):
        # Constant-padded convolution of one block with the kernel.
        return scipy.ndimage.filters.convolve(
            input=numpy.array(matrix, dtype="double"),
            weights=weights,
            mode="constant")

    def propagateSmooth(data):
        # Blocks are smoothed one chromosome pair at a time so that signal
        # cannot cross a chromosome boundary.
        hasSignal = numpy.sum(data, axis=0) > 0
        coverage = hasSignal[:, None] * hasSignal[None, :]
        result = numpy.zeros_like(data, dtype=float)
        for i in xrange(BD.genome.chrmCount):
            rowSpan = slice(BD.chromosomeStarts[i], BD.chromosomeEnds[i])
            for j in xrange(BD.genome.chrmCount):
                colSpan = slice(BD.chromosomeStarts[j], BD.chromosomeEnds[j])
                blockCoverage = coverage[rowSpan, colSpan]
                block = (tanaySmooth(data[rowSpan, colSpan]) /
                         tanaySmooth(blockCoverage))
                # Uncovered bins stay empty after normalisation.
                block[blockCoverage == 0] = 0
                result[rowSpan, colSpan] = block
        return result

    BD.removePoorRegions(cutoff=2)
    BD.removeCis()
    BD.iterativeCorrectWithoutSS()

    data1 = BD.dataDict[experiment1]
    data2 = BD.dataDict[experiment2]

    # Bins with signal in both datasets, restricted to inter-chromosomal pairs.
    bothCovered = (numpy.sum(data1, axis=0) > 0) * (numpy.sum(data2, axis=0) > 0)
    validMask = bothCovered[:, None] * bothCovered[None, :]
    chromOfBin = BD.chromosomeIndex
    transmask = chromOfBin[:, None] != chromOfBin[None, :]
    cormask = transmask * validMask

    d1 = propagateSmooth(data1)
    d2 = propagateSmooth(data2)
    scorr, pvalue = scipy.stats.spearmanr(d1[cormask], d2[cormask])
    outfile.write("Spearman corrleation	%s	%s %.4f	%.4f" %
                  (filename1, filename2, scorr, pvalue))

Exemple #21

0

Afficher le fichier



#IMPORTANT: use iter-corrected heatmaps here. Otherwise, take care about adjustment of total reads number when calculating mask_hugeDifference
# The two heatmap files to load are given on the command line.
heatmap_filepath1=sys.argv[1]
heatmap_filepath2=sys.argv[2]

#out_heatmap_filepath2 = base_folder+'heatmap-res-'+str(domain_res/1000)+'KB_'+base_filename2+'_compressed_as_'+base_filename1+'.hdf5'
#figure_path = base_folder+'heatmap-res-'+str(domain_res/1000)+'KB_'+base_filename2+'_compressed_as_'+base_filename1+'.png'

print "Loading file "+heatmap_filepath1
# The resolution is read from the first heatmap only.
raw_heatmap = h5dict.h5dict(heatmap_filepath1, mode='r') 
res = int(raw_heatmap['resolution'])
print "resolution defined by heatmap: ",res

BD1 = binnedData.binnedData(res, genome_db1)
BD1.simpleLoad(heatmap_filepath1, 'heatmap')

print "Loading file "+heatmap_filepath2
# NOTE(review): the second heatmap reuses the resolution of the first one;
# presumably both files share it - confirm.
BD2 = binnedData.binnedData(res, genome_db2)
BD2.simpleLoad(heatmap_filepath2, 'heatmap')

# Matrices of the two datasets.
q1=BD1.dataDict['heatmap']
q2=BD2.dataDict['heatmap']


#-----DEBUG------
#print "Plotting contact matrix"
#plotting.plot_matrix(np.log(q2))
#plt.subplots_adjust(bottom=0.15)
#print "Saving figure "+figure_path+'tmp.png'

Exemple #22

0

Afficher le fichier

def filter_bychr_heatmap(hm_file):

    resolution = extractResolutionFromFileName(hm_file)
    if resolution == None:
        raise
    from hiclib import binnedData
    # Create a  object, load the data.
    print "creating an object"
    hmap = binnedData.binnedData(resolution, genome_db)

    print "loading data"
    hmap.simpleLoad(hm_file, "heatmap")

    print "saving pict of heatmap"
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    from mirnylib import plotting

    maxlen = min(10000, len(hmap.dataDict["heatmap"]))

    a = hmap.dataDict["heatmap"][0:maxlen, 0:maxlen]
    figure_path = hm_file + "stage1.png"
    print "saving ", figure_path
    plotting.plot_matrix(np.log(a))
    plt.subplots_adjust(bottom=0.15)
    f = open(figure_path, "wb")
    plt.savefig(figure_path, dpi=600)
    f.close()
    plt.clf()

    # Remove the contacts between loci located within the same bin +/- 1 bin.
    hmap.removeDiagonal(m=1)

    hmap.removeBySequencedCount(
    )  # new filter: omit all bins with less than 0.5 coverage by sequenced bases (i.e. bases present in the genome)

    hmap.removePoorRegions(
        cutoff=0.5, coverage=True
    )  # remove .5% bins with the lowest number of records (i.e. non-zero entrees in the matrix)
    # This filter was updated to remove bins which have zero contacts and one PCR blowout. Those bins would have many reads, but all reads will be with one or few other bins.

    hmap.truncTrans()  # remove PCR blowouts from trans data

    a = hmap.dataDict["heatmap"][0:maxlen, 0:maxlen]
    figure_path = hm_file + "stage2.png"
    print "saving ", figure_path
    plotting.plot_matrix(np.log(a))
    plt.subplots_adjust(bottom=0.15)
    f = open(figure_path, "wb")
    plt.savefig(figure_path, dpi=200)
    f.close()
    plt.clf()

    hmap.iterativeCorrectWithoutSS(force=True)  #do iterative correction

    a = hmap.dataDict["heatmap"][0:maxlen, 0:maxlen]
    figure_path = hm_file + "stage3.png"
    print "saving ", figure_path
    plotting.plot_matrix(np.log(a))
    plt.subplots_adjust(bottom=0.15)
    f = open(figure_path, "wb")
    plt.savefig(figure_path, dpi=600)
    f.close()
    plt.clf()

    # Save the iteratively corrected heatmap.
    hmap.export("heatmap", hm_file + ".IC.hdf5", False)

Exemple #23

0

Afficher le fichier

def plotDiagonalCorrelation():
    "Correlation of diagonal bins - paper figure"
    S = 50
    x = numpy.arange(2, S)
    Tanay = binnedData(200000, myGenome)
    Tanay.simpleLoad(GM200k, "GM-HindIII")
    Tanay.simpleLoad(GM200kNcoI, "GM-NcoI")
    Tanay.simpleLoad(tcc200k, "TCC")
    Tanay.removeDiagonal(1)
    Tanay.removePoorRegions()
    Tanay.removeZeros()
    pairs = [("GM-HindIII", "GM-NcoI"), ("GM-HindIII", "TCC"), (
        "GM-NcoI", "TCC")]
    cors = [[] for _ in pairs]
    for i in x:
        for j, pair in enumerate(pairs):
            cors[j].append(cr(
                           numpy.diagonal(Tanay.dataDict[pair[0]], i),
                           numpy.diagonal(Tanay.dataDict[pair[1]], i)
                           )[0])

    Tanay.iterativeCorrectWithoutSS(M=1)
    cors2 = [[] for _ in pairs]
    for i in x:
        for j, pair in enumerate(pairs):
            cors2[j].append(cr(
                            numpy.diagonal(Tanay.dataDict[pair[0]], i),
                            numpy.diagonal(Tanay.dataDict[pair[1]], i)
                            )[0])
    Tanay.iterativeCorrectWithoutSS(M=20)
    cors3 = [[] for _ in pairs]
    for i in x:
        for j, pair in enumerate(pairs):
            cors3[j].append(cr(
                            numpy.diagonal(Tanay.dataDict[pair[0]], i),
                            numpy.diagonal(Tanay.dataDict[pair[1]], i)
                            )[0])

    matplotlib.rcParams['font.sans-serif'] = 'Arial'

    #plt.figure(figsize = (2.3,1.8))
    print cors
    print cors2
    print cors3
    plt.figure(figsize=(10, 3))
    ax = plt.gca()
    for j, pair in enumerate(pairs):
        plt.subplot(1, len(pairs), j)
        fs = 8
        for xlabel_i in ax.get_xticklabels():
            xlabel_i.set_fontsize(fs)
        for xlabel_i in ax.get_yticklabels():
            xlabel_i.set_fontsize(fs)
        plt.title("%s vs %s" % pair)
        plt.plot(x / 5., cors3[j], color="#E5A826", label="Iterative")
        plt.plot(x / 5., cors2[j], color="#28459A", label="Single")
        plt.plot(x / 5., cors[j], color="#E55726", label="Raw")
        plt.xlabel("Genomic Separation, MB", fontsize=8)
        plt.ylabel("Spearman correlation", fontsize=8)
        plt.legend()

        legend = plt.legend(prop={"size": 6}, loc=9, handlelength=2)
        legend.draw_frame(False)
        plt.ylim((0, 1))
        removeAxes(shift=0)

    plt.show()

Exemple #24

0

Afficher le fichier

def plot_Contact_drop_depending_on_distance_to_border(
        domains,
        distance=1000000,
        bands_binned=[2, 4, 6, 8, 10],
        colors=["red", "green", "blue", "black", "yellow"]):
    #Contact drop depending on distance to border
    if bstrap:
        print "This function is not designed for bstrap mode"
        print "Skipping function"
        return

    print "Contact drop depending on distance to border"

    raw_heatmap = h5dict.h5dict(hmap, mode='r')
    res = int(raw_heatmap['resolution'])
    print "resolution defined by heatmap: ", res

    BD = binnedData.binnedData(res, genome_db)
    print datetime.datetime.now(), " loading hmap"
    BD.simpleLoad(hmap, 'heatmap')
    data = BD.dataDict["heatmap"]

    distance_binned = distance / res
    result = {}

    for ind, band_binned in enumerate(bands_binned):
        result[band_binned] = {}
        for distance in range(-distance_binned, distance_binned + 1):
            result[band_binned][distance] = {}
            result[band_binned][distance]["left"] = []
            result[band_binned][distance]["right"] = []
            result[band_binned][distance]["center"] = []
            for domain in domains:
                chrm = genome_db.label2idx[domain["chrm"]]
                start = int(round(domain["start"] /
                                  float(res))) + distance - (band_binned / 2)
                if (start >= 0
                    ) and (start + band_binned) < genome_db.chrmLensBin[chrm]:
                    start = sum(genome_db.chrmLensBin[0:chrm]) + start
                    end = start + band_binned
                    result[band_binned][distance]["left"].append(data[start,
                                                                      end])

                start = int(round(
                    domain["end"] / float(res))) + distance - (band_binned / 2)
                if (start >= 0
                    ) and (start + band_binned) < genome_db.chrmLensBin[chrm]:
                    start = sum(genome_db.chrmLensBin[0:chrm]) + start
                    end = start + band_binned
                    result[band_binned][distance]["right"].append(data[start,
                                                                       end])

                domain_length = domain["end"] - domain["start"]
                assert domain_length > 0
                start = int(
                    round((domain["start"] + domain_length / 2.) /
                          float(res))) + distance - (band_binned / 2)
                if (start >= 0
                    ) and (start + band_binned) < genome_db.chrmLensBin[chrm]:
                    start = sum(genome_db.chrmLensBin[0:chrm]) + start
                    end = start + band_binned
                    result[band_binned][distance]["center"].append(data[start,
                                                                        end])

            result[band_binned][distance]["left"] = np.average(
                result[band_binned][distance]["left"])
            result[band_binned][distance]["right"] = np.average(
                result[band_binned][distance]["right"])
            result[band_binned][distance]["center"] = np.average(
                result[band_binned][distance]["center"])

    print datetime.datetime.now(), " Saving pictures"
    for pos in ["left", "right", "center"]:
        for ind, band_binned in enumerate(bands_binned):
            X = [x * res for x in sorted(result[band_binned].keys())]
            Y = [
                result[band_binned][x][pos]
                for x in sorted(result[band_binned].keys())
            ]
            plt.plot(X,
                     Y,
                     label="band=" + str(band_binned * res),
                     color=colors[ind],
                     marker="o")
        plt.ylim(ymin=0, ymax=250)
        plt.legend(fontsize="xx-small")
        plt.savefig(hmap + "_" + domains_file.split("/")[-1] +
                    ".contact_drop_on_domains_border_" + pos + ".png",
                    dpi=300)
        plt.clf()

    print datetime.datetime.now(), " Done"

Exemple #25

0

Afficher le fichier

Fichier : correctLowResolutionData.py Projet : bxlab/HiFive_Paper

fnames = ["fname1","fname2"]
names = ["dataset 1", "dataset 2"]
exportnames = ["fname1_corrected","fname2_corrected"] 
resolution = 500000
genFolder = "/folder/to/the/genome/files/and/gap.txt/file/according/to/the/mirnylib.genome/class"

#for one file it would be 
fnames = ["myfile.hm"]
resolution = 500000 
names = ["whatever"]
exportnames = ["filename_corrected"]
genFolder = "genomeFolder"


a = binnedData.binnedData(resolution,genFolder)    #folder should be openable by mirnylib.genome

for name,fname,exportname in zip(names,fnames,exportnames):
    a.simpleLoad(fname, name)

a.removeDiagonal()   #we never ever use diagonal

a.removeBySequencedCount()  # new filter: omit all bins with less than 0.5 coverage by sequenced bases (i.e. bases present in the genome)

a.removePoorRegions(cutoff = 0.5, coverage=True)  # remove .5% bins with the lowest number of records (i.e. non-zero entrees in the matrix)
# This filter was updated to remove bins which have zero contacts and one PCR blowout. Those bins would have many reads, but all reads will be with one or few other bins. 

a.removePoorRegions(cutoff = 0.5, coverage=False)  # standart filter. Cutoff reduced to 0.5 from 2. 
a.truncTrans() # remove PCR blowouts from trans data
a.iterativeCorrectWithoutSS()             #do iterative correction 
for name, exportname in names, exportnames:

Exemple #26

0

Afficher le fichier

Fichier : hiclibCorrelate.py Projet : wyim-pgl/ngsane

def plotDiagonalCorrelation(resolution,
                            filename1,
                            filename2,
                            experiment1,
                            experiment2,
                            genome,
                            mouse=False,
                            **kwargs):
    "Correlation of diagonal bins - paper figure"
    global pp

    if (options.verbose):
        print >> sys.stdout, "plotDiagonalCorrelation: res: %d file1: %s file2: %s exp1:%s exp2:%s gen:%s" % (
            resolution, filename1, filename2, experiment1, experiment2, genome)

    S = 50
    x = numpy.arange(2, S)
    Tanay = binnedData(resolution, genome)
    Tanay.simpleLoad(filename1, experiment1)
    Tanay.simpleLoad(filename2, experiment2)
    Tanay.removeDiagonal(1)
    Tanay.removePoorRegions()
    Tanay.removeZeros()

    pairs = [(experiment1, experiment2)]

    cors = [[] for _ in pairs]
    for i in x:
        for j, pair in enumerate(pairs):
            cors[j].append(
                cr(numpy.diagonal(Tanay.dataDict[pair[0]], i),
                   numpy.diagonal(Tanay.dataDict[pair[1]], i))[0])

    Tanay.iterativeCorrectWithoutSS(M=1)
    cors2 = [[] for _ in pairs]
    for i in x:
        for j, pair in enumerate(pairs):
            cors2[j].append(
                cr(numpy.diagonal(Tanay.dataDict[pair[0]], i),
                   numpy.diagonal(Tanay.dataDict[pair[1]], i))[0])
    Tanay.iterativeCorrectWithoutSS(M=20)
    cors3 = [[] for _ in pairs]
    for i in x:
        for j, pair in enumerate(pairs):
            cors3[j].append(
                cr(numpy.diagonal(Tanay.dataDict[pair[0]], i),
                   numpy.diagonal(Tanay.dataDict[pair[1]], i))[0])

    matplotlib.rcParams['font.sans-serif'] = 'Arial'

    print "Eigenvectors"
    print cors
    print cors2
    print cors3
    plt.figure(figsize=(8, 4))
    ax = plt.gca()
    for j, pair in enumerate(pairs):
        plt.subplot(1, len(pairs), j)
        fs = 8
        for xlabel_i in ax.get_xticklabels():
            xlabel_i.set_fontsize(fs)
        for xlabel_i in ax.get_yticklabels():
            xlabel_i.set_fontsize(fs)
        plt.title("%s vs %s" % pair)
        plt.plot(x / 5., cors3[j], color="#E5A826", label="Iterative")
        plt.plot(x / 5., cors2[j], color="#28459A", label="Single")
        plt.plot(x / 5., cors[j], color="#E55726", label="Raw")
        plt.xlabel("Genomic Separation, MB", fontsize=8)
        plt.ylabel("Spearman correlation", fontsize=8)
        plt.legend()

        legend = plt.legend(prop={"size": 6}, loc=9, handlelength=2)
        legend.draw_frame(False)
        plt.ylim((0, 1))
        removeAxes(shift=0)

    plt.show()
    pp.savefig()

Exemple #27

0

Afficher le fichier

# Build the heatmap paths for two datasets at domain_res resolution and load
# each into its own binnedData object; q1 and q2 are the loaded matrices.
# IMPORTANT: use iteratively corrected heatmaps here. Otherwise the total
# number of reads has to be adjusted when calculating mask_hugeDifference.
heatmap_filepath1 = base_folder + 'heatmap-res-' + str(
    domain_res / 1000) + 'KB_' + base_filename1 + '.hdf5'
heatmap_filepath2 = base_folder + 'heatmap-res-' + str(
    domain_res / 1000) + 'KB_' + base_filename2 + '.hdf5'

# Output locations derived from both base file names.
out_heatmap_filepath2 = base_folder + 'heatmap-res-' + str(
    domain_res / 1000
) + 'KB_' + base_filename2 + '_compressed_as_' + base_filename1 + '.hdf5'
figure_path = base_folder + 'heatmap-res-' + str(
    domain_res / 1000
) + 'KB_' + base_filename2 + '_compressed_as_' + base_filename1 + '.png'

print "Loading file " + heatmap_filepath1
BD1 = binnedData.binnedData(domain_res, genome_db1)
BD1.simpleLoad(heatmap_filepath1, 'HindIII_GM_1')

print "Loading file " + heatmap_filepath2
BD2 = binnedData.binnedData(domain_res, genome_db2)
# Both datasets are stored under the same key, each in its own object.
BD2.simpleLoad(heatmap_filepath2, 'HindIII_GM_1')

q1 = BD1.dataDict['HindIII_GM_1']
q2 = BD2.dataDict['HindIII_GM_1']

#-----DEBUG------
#print "Plotting contact matrix"
#plotting.plot_matrix(np.log(q2))
#plt.subplots_adjust(bottom=0.15)
#print "Saving figure "+figure_path+'tmp.png'
#f = open(figure_path+'tmp.png', "wb")

Exemple #28

0

Afficher le fichier

from hiclib import binnedData

# Example: filter one or more low-resolution heatmaps.  The first group of
# assignments shows the multi-dataset form; it is overridden by the
# single-dataset form right below it.
fnames = ["fname1", "fname2"]
names = ["dataset 1", "dataset 2"]
exportnames = ["fname1_corrected", "fname2_corrected"]
resolution = 500000
genFolder = "/folder/to/the/genome/files/and/gap.txt/file/according/to/the/mirnylib.genome/class"

# For a single file it would be:
fnames = ["myfile.hm"]
resolution = 500000
names = ["whatever"]
exportnames = ["filename_corrected"]
genFolder = "genomeFolder"

a = binnedData.binnedData(
    resolution, genFolder)  # folder should be openable by mirnylib.genome

# Load every file under its dataset name (exportname is not used here).
for name, fname, exportname in zip(names, fnames, exportnames):
    a.simpleLoad(fname, name)

a.removeDiagonal()  # we never ever use the diagonal

a.removeBySequencedCount(
)  # new filter: omit all bins with less than 0.5 coverage by sequenced bases (i.e. bases present in the genome)

a.removePoorRegions(
    cutoff=0.5, coverage=True
)  # remove .5% bins with the lowest number of records (i.e. non-zero entries in the matrix)
# This filter was updated to remove bins which have zero contacts and one PCR blowout. Those bins would have many reads, but all reads will be with one or few other bins.

a.removePoorRegions(

Exemple #29

0

Afficher le fichier

Fichier : 12_eigenvectorAnalysis.001.py Projet : labdevgen/FishHiC

# Prepare a heatmap for eigenvector analysis: load it, then filter it down to
# trans (interchromosomal) data with PCR blowouts truncated.
genome_db_chrmLevel = genome.Genome(
    "/mnt/storage/home/vsfishman/HiC/fasta/GalGal5/GCF_000002315.4_Gallus_gallus-5.0_assembly_structure/Primary_Assembly/assembled_chromosomes/FASTA/",
    readChrms=[],
    chrmFileTemplate="%s.fna")

hm_file = "/mnt/storage/home/vsfishman/HiC/tutorial_Fishman/chick/mapped-GalGal5filtered/GalGal5filteredChrmLevel/ChEF-all-HindIII-100k.hm"

# Output path: the heatmap path with ".eig" appended.
f_out_path = hm_file + '.eig'

NumEigenvectors = 1  # number of eigenvectors to compute

# Read resolution from the heatmap file name
resolution = extractResolutionFromFileName(hm_file)

# Define the binnedData object, load data
BD = binnedData(resolution, genome_db_chrmLevel)
BD.simpleLoad(hm_file, 'heatmap')

BD.removeDiagonal()

# Remove bins with less than half of a bin sequenced
BD.removeBySequencedCount(0.5)

# We'll do iterative correction and Eigenvector expansion on trans data only!
# We want to remove cis, because later we want to remove poor regions in trans
BD.removeCis()

# Truncate top 0.05% of interchromosomal counts (possibly, PCR blowouts)
# Do this before removing poor regions, because single blowouts may give
# lots of contacts to a region which does not have many contacts otherwise.
BD.truncTrans(high=0.0005)

Exemple #30

0

Afficher le fichier

Fichier : iterativeCorrection.py Projet : bxlab/HiFive_Paper

    if not os.path.exists(inDataset):
        raise IOError("Raw heatmap file does not exist: {}".format(inDataset))

# Validate the genome folder, then load every input dataset into one
# binnedData object and apply the standard filters.
if not os.path.isdir(genomeFolder):
    raise IOError("Genome folder does not exist")

# When you do this, be sure that readChrms used to save heatmap matches
# readChrms that you define here!
genome = Genome(genomeFolder, readChrms=readChrms)

# Read resolution from one of the datasets
# NOTE(review): indexing .values() assumes inDatasets is a dict under
# Python 2, where values() returns a list -- confirm.
sampleDataset = h5dict(inDatasets.values()[0], mode="r")  # random dataset
resolution = int(sampleDataset["resolution"])

# Define the binnedData object, load data
BD = binnedData(resolution, genome, readChrms)
# Each dataset is stored under its key from inDatasets.
for name, filename in inDatasets.items():
    BD.simpleLoad(filename, name)

BD.removeDiagonal()

# Remove bins with less than half of a bin sequenced
BD.removeBySequencedCount(0.5)

# Remove 1% of regions with low coverage
BD.removePoorRegions(cutoff=1)

# Truncate top 0.05% of interchromosomal counts (possibly, PCR blowouts)
BD.truncTrans(high=0.0005)

# Actually perform iterative correction

Exemple #31

0

Afficher le fichier

def get_by_chr_E1(genome_db, resolution):
    if heatmap_filepath.endswith(".IC"):
        raw = heatmap_filepath[:-3]
    else:
        raw = heatmap_filepath

    print "Using raw heatmap ", raw
    global BD_raw
    BD_raw = binnedData.binnedData(resolution, genome_db)
    BD_raw.simpleLoad(raw, 'heatmap')
    BD_raw.removeDiagonal()

    # Remove bins with less than half of a bin sequenced
    BD_raw.removeBySequencedCount(0.5)
    # We'll do iterative correction and Eigenvector expansion on trans data only!
    # We want to remove cis, because later we want to remove poor regions in trans
    BD_raw.removeCis()
    # Truncate top 0.05% of interchromosomal counts (possibly, PCR blowouts)
    # Do this before removing poor regions, because single blowouts may give
    # lots of contacts to a region which does not have much contacts otehrwise.
    BD_raw.truncTrans(high=0.0005)
    # Remove 1% of regions with low coverage
    BD_raw.removePoorRegions(cutoff=1)
    # Fake cis counts. Data gets iteratively corrected during this process...
    BD_raw.fakeCis()
    # Remove bins with zero counts for eigenvector analysis --> This will be done for each chromosome in for loop
    #	BD.removeZeros()

    # Perform eigenvector expansion.

    result = {"OE": {}, "Classic": {}, "genome_wide_Classic": {}}
    genom_wide_E1 = np.genfromtxt(raw + ".eig", dtype=None)['f2']
    for chrom in range(genome_db.chrmCount):
        st = genome_db.chrmStartsBinCont[chrom]
        end = genome_db.chrmEndsBinCont[chrom]
        cur = BD_raw.dataDict['heatmap'][st:end, st:end]
        mask = np.sum(cur, axis=0) > 0
        if sum(mask) > 5:
            cur = cur[mask]
            cur = cur[:, mask]
            currentEIG, eigenvalues = EIG(cur, numPCs=1)
            if spearmanr(currentEIG[0],
                         BD_raw.trackDict["GC"][st:end][mask])[0] < 0:
                currentEIG[0] = -currentEIG[0]
            E1 = np.empty(shape=(len(mask), )) * np.nan
            E1[mask] = currentEIG[0]
            result["Classic"][chrom] = E1

            cur = observedOverExpected(cur)
            mask = np.sum(cur, axis=0) > 0
            if sum(mask) > 5:
                cur = cur[mask]
                cur = cur[:, mask]
                currentEIG, eigenvalues = EIG(cur, numPCs=1)
                if spearmanr(currentEIG[0],
                             BD_raw.trackDict["GC"][st:end][mask])[0] < 0:
                    currentEIG[0] = -currentEIG[0]
                E1 = np.empty(shape=(len(mask), )) * np.nan
                E1[mask] = currentEIG[0]
                result["OE"][chrom] = E1

        result["genome_wide_Classic"][chrom] = genom_wide_E1[st:end]
    return result

Exemple #32

0

Afficher le fichier

Fichier : prepGenomeLinkMatrix.py Projet : labdevgen/FishHiC

###########################
# Load a heatmap and walk over every pair of distinct chromosomes, taking the
# block of the matrix that holds their mutual contacts.

#1. Parse contig_names_to_id_file
# Each line is split on whitespace; the second field is mapped to the first.
LACHES_index_converter = {}
with open(contig_names_to_id_file) as f:
    for line in f:
        line = line.strip().split()
        LACHES_index_converter[line[1]] = line[0]

# NOTE: from here on out_file is the open file object, not the path.
out_file = open(out_file, "w")
out_file.write(header_string)

# The bin size is read from the heatmap file itself.
raw_heatmap = h5dict.h5dict(basefolder + filename, mode='r')
resolution = int(raw_heatmap['resolution'])
BD = binnedData.binnedData(resolution, genome_db)
BD.simpleLoad(basefolder + filename, 'HindIII')
q = BD.dataDict['HindIII']
# One cell per chromosome pair, initialised to zero.
interchr_contacts = np.zeros(shape=(genome_db.chrmCount, genome_db.chrmCount))

############################
#genome_db.chrmCount=100
############################

zero_number_of_contacts, nonzero_number_of_contacts = [], []

# Visit every unordered pair of distinct chromosomes once (chr2 > chr1).
for chr1 in xrange(genome_db.chrmCount):
    for chr2 in xrange(chr1 + 1, genome_db.chrmCount):
        # Sub-matrix with chr1 bins as rows and chr2 bins as columns.
        Ncontacts = q[
            genome_db.chrmStartsBinCont[chr1]:genome_db.chrmEndsBinCont[chr1],
            genome_db.chrmStartsBinCont[chr2]:genome_db.chrmEndsBinCont[chr2]]

Exemple #33

0

Afficher le fichier

def step3(hiclib_path, sraid, res=1000000):
    ''' 3. Filter and iteratively correct heatmaps.
        http://mirnylab.bitbucket.org/hiclib/tutorial/03_heatmap_processing.html

        Reads ``<sraid>_map-res<res/1000>k.hdf5``, plots it, filters and
        iteratively corrects it, and writes next to it (same prefix):
        ``.pdf`` (raw plot), ``-ic.hdf5`` (corrected heatmap), ``-ic.pdf``
        (corrected plot) and ``-ic-bias.txt`` (one tab-separated line per
        bin: chromosome, start, start + res, bias).

        hiclib_path -- folder containing ``fasta/hg19``.
        sraid       -- prefix of the input and output files.
        res         -- resolution in base pairs used in the file names.
    '''
    import matplotlib.pyplot as plt
    import numpy as np

    from mirnylib import genome
    from mirnylib import h5dict
    from mirnylib import plotting
    from hiclib import binnedData

    genome_db = genome.Genome(hiclib_path + '/fasta/hg19',
                              readChrms=['#', 'X'])

    # Every file handled here shares this prefix.  Floor division keeps the
    # "<N>k" label free of a fractional part.
    prefix = sraid + '_map-res%sk' % (res // 1000)
    heatmap_path = prefix + '.hdf5'

    # Read resolution from the dataset.
    raw_heatmap = h5dict.h5dict(heatmap_path, mode='r')
    resolution = int(raw_heatmap['resolution'])

    # Create a binnedData object, load the data.
    BD = binnedData.binnedData(resolution, genome_db)
    BD.simpleLoad(heatmap_path, 'DataName')

    # Plot the heatmap directly.
    plotting.plot_matrix(np.log(BD.dataDict['DataName']))
    plt.savefig(prefix + '.pdf')
    plt.clf()

    # Remove the contacts between loci located within the same bin.
    BD.removeDiagonal()

    # Remove bins with less than half of a bin sequenced.
    BD.removeBySequencedCount(0.5)

    # Remove 1% of regions with low coverage.
    BD.removePoorRegions(cutoff=1)

    # Truncate top 0.05% of interchromosomal counts (possibly, PCR blowouts).
    BD.truncTrans(high=0.0005)

    # Perform iterative correction.
    BD.iterativeCorrectWithoutSS()

    # Save the iteratively corrected heatmap.
    BD.export('DataName', prefix + '-ic.hdf5')

    # Plot the heatmap directly.
    plotting.plot_matrix(np.log(BD.dataDict['DataName']))
    plt.savefig(prefix + '-ic.pdf')
    plt.clf()

    # Save Bias.  The ``with`` block closes the file even if a write fails.
    with open(prefix + '-ic-bias.txt', "w") as outfile:
        for i, chrmIdx in enumerate(BD.chromosomeIndex):
            chro = BD.genome.idx2label[chrmIdx]
            posi = BD.positionIndex[i]
            outfile.write("chr%s\t%s\t%s\t%s\n" %
                          (chro, posi, posi + res,
                           BD.biasDict['DataName'][i]))