Exemple #1
0
def plotCrossValidation():
    """Main figure subplot with cross-validation.

    Loads the "GM-all-10p" and "GM-all-90p" datasets into one binnedData
    object, runs ``iterativeCorrectWithSS`` and scatter-plots the two
    resulting bias vectors (``biasDict["GM-1"]`` vs ``biasDict["GM-9"]``)
    against each other.  The pair of bias vectors is also pickled to the file
    "CrossValidatioN" in the current directory and read back before plotting.
    """
    matplotlib.rcParams['font.sans-serif'] = 'Arial'
    plt.figure(figsize=(1, 1))
    FG = HiCdataset(workingFile1, myGenome)
    FG.load(GMFrag)

    # NOTE: the 10% / 90% datasets need to be created beforehand using
    # fragment-level analysis.
    Tanay = binnedData(1000000)
    Tanay.simpleLoad("GM-all-10p", "GM-1")
    Tanay.simpleLoad("GM-all-90p", "GM-9")
    Tanay.removePoorRegions()
    Tanay.iterativeCorrectWithSS()
    Tanay.removeZeros()
    b1, b2 = (Tanay.biasDict["GM-1"], Tanay.biasDict["GM-9"])

    # Close the cache file explicitly so the data is guaranteed to be flushed
    # to disk before it is read back below.
    with open("CrossValidatioN", 'wb') as cache:
        cPickle.dump((b1, b2), cache)
    ax = plt.gca()
    with open("CrossValidatioN", 'rb') as cache:
        b1, b2 = cPickle.load(cache)

    print(cr(b1, b2))
    plt.scatter(b1, b2, s=.7, color="k", linewidth=0)
    plt.xlabel(r"10% reads", fontsize=8)
    plt.ylabel(r"90% reads", fontsize=8)
    plt.xlim((0, 1.5))
    plt.ylim((0, 1.5))
    plt.xticks([0, 0.5, 1, 1.5])
    plt.yticks([0, 0.5, 1, 1.5])
    removeAxes(shift=0)

    # Shrink the tick labels on both axes.
    fs = 6
    for tick_label in list(ax.get_xticklabels()) + list(ax.get_yticklabels()):
        tick_label.set_fontsize(fs)
    plt.show()
Exemple #2
0
def CreateMatrixFile():
    """Dump the heatmap stored in ``heatmap_filepath`` as a tab-separated text file.

    The output goes to ``heatmap_filepath + '.matrix'``.  Every line describes
    one bin: chromosome index, bin start, bin end (start plus the bin size
    taken from ``genome_db.binSizesBp``), followed by that bin's row of the
    heatmap.  ``domain_res``, ``genome_db`` and ``heatmap_filepath`` are read
    from the enclosing module scope.
    """
    BD = binnedData.binnedData(domain_res, genome_db)
    BD.simpleLoad(heatmap_filepath, 'heatmap')

    matrix_path = heatmap_filepath + '.matrix'
    print("Writing file %s with the information \n" % matrix_path)
    print("Format:\nChrIndex \t StartBin(Nucleotyde) \t EndBin(Nucl) \t Values\n")

    heatmap = BD.dataDict['heatmap']
    n_bins = len(heatmap)

    # ``with`` guarantees the file is closed even if a lookup below fails.
    with open(matrix_path, 'w') as f:
        for i in range(n_bins):
            curChrmIdx = genome_db.chrmIdxBinCont[i]
            # Index of bin i relative to the start of its chromosome.
            if curChrmIdx == 0:
                curRelativeBinNumb = i
            else:
                curRelativeBinNumb = (
                    i - genome_db.chrmLensBin[0:curChrmIdx].sum())

            bin_start = genome_db.posBinCont[i]
            bin_end = (bin_start +
                       genome_db.binSizesBp[curChrmIdx][curRelativeBinNumb])

            row = heatmap[i]
            fields = [str(curChrmIdx), str(bin_start), str(bin_end)]
            fields.extend(str(row[j]) for j in range(n_bins))
            f.write("\t".join(fields) + "\n")
Exemple #3
0
def calculateTanayCorrelation(
    resolution, filename1, filename2, experiment1, experiment2, genome, outfile, mouse=False, **kwargs
):
    """Calculates correlation between datasets, smoothed in a Tanay way.

    Both datasets are loaded into one binnedData object; poor regions and cis
    contacts are removed and iterative correction is applied.  Each heatmap is
    then smoothed one chromosome-pair block at a time with a 19x19 kernel of
    weights 1 / (1 + |dx| + |dy|), normalised by the identically smoothed
    validity mask.  The Spearman correlation of the smoothed values over
    valid inter-chromosomal bin pairs is written to ``outfile``.

    Args:
        resolution, genome: forwarded to ``binnedData``.
        filename1, filename2: heatmap files passed to ``simpleLoad``.
        experiment1, experiment2: names under which the heatmaps are stored.
        outfile: object with a ``write`` method that receives the result line.
        mouse, **kwargs: accepted for call compatibility; not used here.
    """
    if options.verbose:
        # Same text and trailing newline the former print statement produced.
        sys.stdout.write(
            "calculateTanayCorrelation: res: %d file1: %s file2: %s exp1:%s exp2:%s gen:%s\n"
            % (resolution, filename1, filename2, experiment1, experiment2, genome)
        )

    BD = binnedData(resolution, genome)
    BD.simpleLoad(filename1, experiment1)
    BD.simpleLoad(filename2, experiment2)

    # The smoothing kernel never changes, so build it once.
    offsets = numpy.arange(-9, 10)
    kernel = 1 / (1.0 + numpy.abs(offsets[:, None]) + numpy.abs(offsets[None, :]))

    def tanaySmooth(matrix):
        matrix = numpy.array(matrix, dtype="double")
        # scipy.ndimage.convolve is the same function formerly reached through
        # the deprecated scipy.ndimage.filters namespace.
        return scipy.ndimage.convolve(input=matrix, weights=kernel, mode="constant")

    def propagateSmooth(data):
        # A bin is valid when its column sum is positive.
        mask1 = numpy.sum(data, axis=0) > 0
        mask = mask1[:, None] * mask1[None, :]
        ret = numpy.zeros_like(data, dtype=float)
        for i in range(BD.genome.chrmCount):
            for j in range(BD.genome.chrmCount):
                beg1 = BD.chromosomeStarts[i]
                beg2 = BD.chromosomeStarts[j]
                end1 = BD.chromosomeEnds[i]
                end2 = BD.chromosomeEnds[j]
                mymask = mask[beg1:end1, beg2:end2]
                d = data[beg1:end1, beg2:end2]
                # Normalise by the smoothed mask so that invalid neighbours
                # do not drag the local average down.
                toret = tanaySmooth(d) / tanaySmooth(mymask)
                toret[mymask == 0] = 0
                ret[beg1:end1, beg2:end2] = toret
        return ret

    BD.removePoorRegions(cutoff=2)

    BD.removeCis()

    BD.iterativeCorrectWithoutSS()
    data1 = BD.dataDict[experiment1]
    data2 = BD.dataDict[experiment2]

    # Correlate only bin pairs that are valid in both datasets and lie on
    # different chromosomes.
    mask = (numpy.sum(data1, axis=0) > 0) * (numpy.sum(data2, axis=0) > 0)
    validMask = mask[:, None] * mask[None, :]
    transmask = BD.chromosomeIndex[:, None] != BD.chromosomeIndex[None, :]
    cormask = transmask * validMask

    d1 = propagateSmooth(data1)
    d2 = propagateSmooth(data2)
    (scorr, pvalue) = scipy.stats.spearmanr(d1[cormask], d2[cormask])
    # NOTE(review): the record has no trailing newline and the label is
    # misspelled; both are kept because consumers of this output may rely on
    # the exact text.
    outfile.write("Spearman corrleation\t%s\t%s %.4f\t%.4f" % (filename1, filename2, scorr, pvalue))
Exemple #4
0
def iterativeFiltering(genome_db, fragments):
    '''
    Run the bin-level filters on the 1M heatmap, iteratively correct it,
    export the corrected heatmap and plot its logarithm.
    '''
    raw_path = options.outputDir+'heatmap-res-1M.hdf5'

    # The bin size is stored inside the heatmap file itself.
    raw_heatmap = h5dict.h5dict(raw_path, mode='r')
    bin_size = int(raw_heatmap['resolution'])

    # Load the heatmap under the experiment name.
    BD = binnedData.binnedData(bin_size, genome_db)
    BD.simpleLoad(raw_path, options.experiment)

    # Filters, in order: contacts within the same bin; bins with less than
    # half of a bin sequenced; the 1% of regions with low coverage; the top
    # 0.05% of interchromosomal counts (possibly, PCR blowouts).
    BD.removeDiagonal()
    BD.removeBySequencedCount(0.5)
    BD.removePoorRegions(cutoff=1)
    BD.truncTrans(high=0.0005)

    BD.iterativeCorrectWithoutSS()

    # Store the corrected heatmap next to the raw one.
    corrected_path = options.outputDir+'IC-heatmap-res-1M.hdf5'
    BD.export(options.experiment, corrected_path)

    corrected = BD.dataDict[options.experiment]
    plotting.plot_matrix(np.log(corrected))
Exemple #5
0
def step3(hiclib_path, sraid, res=1000000):
    ''' 3. Filter and iteratively correct heatmaps.
        http://mirnylab.bitbucket.org/hiclib/tutorial/03_heatmap_processing.html

        Reads ``<sraid>_map-res<res/1000>k.hdf5`` and writes, with the same
        stem: ``.pdf`` (plot of the loaded heatmap), ``-ic.hdf5`` (corrected
        heatmap), ``-ic.pdf`` (plot of the corrected heatmap) and
        ``-ic-bias.txt`` (one tab-separated line per bin: chromosome label,
        position, position + res, bias).

        hiclib_path -- folder that contains ``fasta/hg19``.
        sraid       -- prefix of the input file name and of every output.
        res         -- bin size used for the file names and for the end
                       coordinate written to the bias table.
    '''
    import matplotlib.pyplot as plt
    import numpy as np

    from mirnylib import genome
    from mirnylib import h5dict
    from mirnylib import plotting
    from hiclib import binnedData

    genome_db = genome.Genome(hiclib_path+'/fasta/hg19', readChrms=['#', 'X'])

    # Common stem of every file touched below.  Floor division keeps the
    # label an integer ("1000k") regardless of the interpreter's "/" rules.
    stem = sraid + '_map-res%sk' % (res // 1000)

    # Read resolution from the dataset.
    raw_heatmap = h5dict.h5dict(stem + '.hdf5', mode='r')
    resolution = int(raw_heatmap['resolution'])

    # Create a binnedData object, load the data.
    BD = binnedData.binnedData(resolution, genome_db)
    BD.simpleLoad(stem + '.hdf5', 'DataName')

    # Plot the heatmap directly.
    plotting.plot_matrix(np.log(BD.dataDict['DataName']))
    plt.savefig(stem + '.pdf')
    plt.clf()

    # Remove the contacts between loci located within the same bin.
    BD.removeDiagonal()

    # Remove bins with less than half of a bin sequenced.
    BD.removeBySequencedCount(0.5)

    # Remove 1% of regions with low coverage.
    BD.removePoorRegions(cutoff=1)

    # Truncate top 0.05% of interchromosomal counts (possibly, PCR blowouts).
    BD.truncTrans(high=0.0005)

    # Perform iterative correction.
    BD.iterativeCorrectWithoutSS()

    # Save the iteratively corrected heatmap.
    BD.export('DataName', stem + '-ic.hdf5')

    # Plot the corrected heatmap.
    plotting.plot_matrix(np.log(BD.dataDict['DataName']))
    plt.savefig(stem + '-ic.pdf')
    plt.clf()

    # Save the per-bin bias; ``with`` closes the file even if a lookup fails.
    with open(stem + '-ic-bias.txt', 'w') as outfile:
        for chrm_idx, posi, bias in zip(BD.chromosomeIndex,
                                        BD.positionIndex,
                                        BD.biasDict['DataName']):
            chro = BD.genome.idx2label[chrm_idx]
            outfile.write("chr%s\t%s\t%s\t%s\n" % (chro, posi, posi+res, bias))
def get_chromosomes(hm_file, genome_db, resolution, chrNumb=None):
    if extractResolutionFromFileName(hm_file) != resolution:
        print "WARNING! Provided resolution ", resolution, "does not match ", extractResolutionFromFileName(
            hm_file), "extracted from file name ", hm_file
    if "hiRes.hm" in hm_file:
        type = "HiRes"
    elif "bychr.hm" in hm_file:
        type = "bychr"
    else:
        print "Warning: cannot resolve type of data from filename"
        try:
            print "Warning: trying hires hic"
            raw_heatmap = h5dict.h5dict(fname, mode='r')  #open heatmap
            if "0 0" in raw_heatmap.keys():
                type = "HiRes"
            else:
                print "HiRes hic Failed! Assuming bychr type"
                type = "bychr"
        except:
            print "HiRes hic Failed! Assuming bychr type"
            type = "bychr"
    if type == "HiRes":
        from hiclib import highResBinnedData
        # Create a  object, load the data.
        print "creating an object"
        hmap = highResBinnedData.HiResHiC(genome_db, resolution)
        print "loading data"
        hmap.loadData(hm_file, mode="cis")
        print "Data loaded"
        if chrNumb != None:
            return hmap.data[(chrNumb, chrNumb)].getData()
        return [
            hmap.data[(i, i)].getData() for i in xrange(genome_db.chrmCount)
        ]
        #cisKeys are tuples like (N,N) where N is 0..Number_of_chrms-1
    elif type == "bychr":
        from hiclib import binnedData
        print "creating an object"
        hmap = binnedData.binnedData(resolution, genome_db)

        print "loading data"
        hmap.simpleLoad(hm_file, "heatmap")
        data = hmap.dataDict["heatmap"]
        assert len(data) == genome_db.numBins
        print "Data loaded"
        if chrNumb != None:
            return data[genome_db.chrmStartsBinCont[chrNumb]:genome_db.
                        chrmEndsBinCont[chrNumb],
                        genome_db.chrmStartsBinCont[chrNumb]:genome_db.
                        chrmEndsBinCont[chrNumb]]
        return [
            data[genome_db.chrmStartsBinCont[i]:genome_db.chrmEndsBinCont[i],
                 genome_db.chrmStartsBinCont[i]:genome_db.chrmEndsBinCont[i]]
            for i in xrange(genome_db.chrmCount)
        ]
    else:
        raise "Error: can not recognize heatmap format from file name"
Exemple #7
0
def step4(hiclib_path, sraid, res=1000000):
    ''' 4. Eigen vector decomposition
    /examples/iterativeCorrectionEigenvectorExpansion/eigenvectorAnalysis.py

    Reads ``<sraid>_map-res<res/1000>k.hdf5``, filters the heatmap, computes
    30 eigenvectors with ``doEig`` and writes, with the same stem,
    ``-eig.pdf`` (log of the matrix rebuilt from the eigenvectors and
    eigenvalues) and ``-ic-eig.txt`` (one tab-separated line per bin:
    chromosome label, position, position + res, then one column per
    eigenvector).

    hiclib_path -- folder that contains ``fasta/hg19``.
    sraid       -- prefix of the input file name and of every output.
    res         -- bin size used for the file names and for the end
                   coordinate written to the table.
    '''
    import matplotlib.pyplot as plt
    import numpy as np
    from mirnylib import genome
    from mirnylib import h5dict
    from mirnylib import plotting
    from hiclib import binnedData

    genome_db = genome.Genome(hiclib_path + '/fasta/hg19',
                              readChrms=['#', 'X'])

    # Common stem of every file touched below.  Floor division keeps the
    # label an integer ("1000k") regardless of the interpreter's "/" rules.
    stem = sraid + '_map-res%sk' % (res // 1000)

    # Read resolution from the dataset.
    raw_heatmap = h5dict.h5dict(stem + '.hdf5', mode='r')
    resolution = int(raw_heatmap['resolution'])

    # Create a binnedData object, load the data.
    BD = binnedData.binnedData(resolution, genome_db)
    BD.simpleLoad(stem + '.hdf5', 'DataName')

    # Do eigen decomposition
    BD.removeDiagonal()
    BD.removeBySequencedCount(0.5)
    BD.removeCis()
    BD.truncTrans(high=0.0005)
    BD.removePoorRegions(cutoff=1)
    BD.fakeCis()
    BD.removeZeros()
    BD.doEig(numPCs=30, force=True)  ## First 30 EIGs
    BD.restoreZeros(value=0)

    eig = BD.eigEigenvalueDict['DataName']
    eig_v = BD.EigDict['DataName']

    # Plot the matrix rebuilt from the eigen decomposition.
    plotting.plot_matrix(np.log(np.dot(np.dot(eig_v.T, np.diag(eig)), eig_v)))
    plt.savefig(stem + '-eig.pdf')
    plt.clf()

    # ``with`` closes the table even if a lookup fails half-way through.
    with open(stem + '-ic-eig.txt', 'w') as outfile:
        for i, (chrm_idx, posi) in enumerate(zip(BD.chromosomeIndex,
                                                 BD.positionIndex)):
            chro = BD.genome.idx2label[chrm_idx]
            columns = "".join("\t%s" % eigenvector[i] for eigenvector in eig_v)
            outfile.write("chr%s\t%s\t%s" % (chro, posi, posi + res)
                          + columns + "\n")
Exemple #8
0
def step4(hiclib_path, sraid, res=1000000):
    ''' 4. Eigen vector decomposition
    /examples/iterativeCorrectionEigenvectorExpansion/eigenvectorAnalysis.py

    Input:  ``<sraid>_map-res<res/1000>k.hdf5``.
    Output: ``<sraid>_map-res<res/1000>k-eig.pdf`` (log of the matrix rebuilt
    from the 30 eigenvectors and their eigenvalues) and
    ``<sraid>_map-res<res/1000>k-ic-eig.txt`` (per bin: chromosome label,
    position, position + res, then one tab-separated value per eigenvector).
    '''
    import matplotlib.pyplot as plt
    import numpy as np
    from mirnylib import genome
    from mirnylib import h5dict
    from mirnylib import plotting
    from hiclib import binnedData

    genome_db = genome.Genome(hiclib_path+'/fasta/hg19', readChrms=['#', 'X'])

    # Build the shared file-name stem once.  "//" yields the same integer
    # label as the old "/" did for integer ``res`` under Python 2.
    base = sraid+'_map-res%sk'%(res//1000)
    heatmap_file = base+'.hdf5'

    # Read resolution from the dataset.
    raw_heatmap = h5dict.h5dict(heatmap_file, mode='r')
    resolution = int(raw_heatmap['resolution'])

    # Create a binnedData object, load the data.
    BD = binnedData.binnedData(resolution, genome_db)
    BD.simpleLoad(heatmap_file, 'DataName')

    # Do eigen decomposition
    BD.removeDiagonal()
    BD.removeBySequencedCount(0.5)
    BD.removeCis()
    BD.truncTrans(high=0.0005)
    BD.removePoorRegions(cutoff=1)
    BD.fakeCis()
    BD.removeZeros()
    BD.doEig(numPCs=30, force=True) ## First 30 EIGs
    BD.restoreZeros(value=0)

    eig = BD.eigEigenvalueDict['DataName']
    eig_v = BD.EigDict['DataName']

    # Plot the matrix rebuilt from the eigen decomposition.
    rebuilt = np.dot(np.dot(eig_v.T, np.diag(eig)), eig_v)
    plotting.plot_matrix(np.log(rebuilt))
    plt.savefig(base+'-eig.pdf')
    plt.clf()

    # The context manager closes the file even when writing a row fails.
    with open(base+'-ic-eig.txt', 'w') as outfile:
        for i in range(len(BD.chromosomeIndex)):
            chro = BD.genome.idx2label[BD.chromosomeIndex[i]]
            posi = BD.positionIndex[i]
            row = ["chr%s\t%s\t%s"%(chro, posi, posi+res)]
            row.extend("%s"%eigenvector[i] for eigenvector in eig_v)
            outfile.write("\t".join(row)+"\n")
Exemple #9
0
def calculateTanayCorrelation():
    """Calculates correlation between datasets, smoothed in a Tanay way.

    Loads the hard-coded HindIII and NcoI 1M heatmaps into one binnedData
    object, removes poor regions and cis contacts, applies iterative
    correction, smooths each heatmap one chromosome-pair block at a time with
    a 19x19 kernel of weights 1 / (1 + |dx| + |dy|) (normalised by the
    identically smoothed validity mask) and prints the Spearman correlation
    of the smoothed values over valid inter-chromosomal bin pairs.
    """
    BD = binnedData(1000000, "../../../data/hg18")
    BD.simpleLoad("../../../ErezPaperData/hg18/GM-HindIII-hg18-1M.hm",
                  "HindIII")
    BD.simpleLoad("../../../ErezPaperData/hg18/GM-NcoI-hg18-1M.hm", "NcoI")

    # The smoothing kernel never changes, so build it once.
    offsets = numpy.arange(-9, 10)
    kernel = 1 / (1. + numpy.abs(offsets[:, None]) +
                  numpy.abs(offsets[None, :]))

    def tanaySmooth(matrix):
        matrix = numpy.array(matrix, dtype="double")
        # scipy.ndimage.convolve is the same function formerly reached through
        # the deprecated scipy.ndimage.filters namespace.
        return scipy.ndimage.convolve(input=matrix,
                                      weights=kernel,
                                      mode="constant")

    def propagateSmooth(data):
        # A bin is valid when its column sum is positive.
        mask1 = numpy.sum(data, axis=0) > 0
        mask = mask1[:, None] * mask1[None, :]
        ret = numpy.zeros_like(data, dtype=float)
        for i in range(BD.genome.chrmCount):
            for j in range(BD.genome.chrmCount):
                beg1 = BD.chromosomeStarts[i]
                beg2 = BD.chromosomeStarts[j]
                end1 = BD.chromosomeEnds[i]
                end2 = BD.chromosomeEnds[j]
                mymask = mask[beg1:end1, beg2:end2]
                d = data[beg1:end1, beg2:end2]
                # Normalise by the smoothed mask so that invalid neighbours
                # do not drag the local average down.
                toret = tanaySmooth(d) / tanaySmooth(mymask)
                toret[mymask == 0] = 0
                ret[beg1:end1, beg2:end2] = toret
        return ret

    BD.removePoorRegions(cutoff=2)

    BD.removeCis()

    BD.iterativeCorrectWithoutSS()
    data1 = BD.dataDict["HindIII"]
    data2 = BD.dataDict["NcoI"]

    # Correlate only bin pairs that are valid in both datasets and lie on
    # different chromosomes.
    mask = (numpy.sum(data1, axis=0) > 0) * (numpy.sum(data2, axis=0) > 0)
    validMask = mask[:, None] * mask[None, :]
    transmask = BD.chromosomeIndex[:, None] != BD.chromosomeIndex[None, :]
    cormask = transmask * validMask

    d1 = propagateSmooth(data1)
    d2 = propagateSmooth(data2)
    print(scipy.stats.spearmanr(d1[cormask], d2[cormask]))
Exemple #10
0
def saddlePlot():
    "heatmap with bins reordered by the first column of the EIG matrix"

    plt.figure(figsize=(3, 3))

    Tanay = binnedData(1000000)
    Tanay.simpleLoad("../data/GM-all-hg18-1M", "GM-all")
    Tanay.removeDiagonal(1)
    Tanay.removePoorRegions()
    Tanay.removeZeros()
    Tanay.fakeCis()
    Tanay.iterativeCorrectWithoutSS()
    Tanay.doEig()

    # Fix the sign so that the first entry of the component is not positive.
    component = Tanay.EIG["GM-all"][:, 0]
    if component[0] > 0:
        component = -component

    # Sort rows and columns of the heatmap by the component, then coarse-grain.
    order = numpy.argsort(component)
    heatmap = Tanay.dataDict["GM-all"]
    sorted_rows = heatmap[order, :]
    saddle = coarsegrain(sorted_rows[:, order], 60)
    saddle /= saddle.mean()
    saddle = numpy.log(saddle)

    # Replace the single largest entry by the maximum of what remains once
    # that entry has been zeroed.
    full_shape = saddle.shape
    saddle = saddle.reshape((-1))
    peak = numpy.argmax(saddle)
    saddle[peak] = 0
    saddle[peak] = numpy.max(saddle)
    saddle.shape = full_shape

    # Overwrite the two off-diagonal corners with their neighbours.
    saddle[0, -1] = saddle[0, -2]
    saddle[-1, 0] = saddle[-2, 0]

    lowest = saddle.min()
    highest = saddle.max()
    plt.imshow(saddle, vmin=lowest, vmax=highest,
               interpolation="nearest")

    cbar = plt.colorbar(orientation="vertical")
    cbar.ax.set_xlabel("Log(relative contact probability)", fontsize=6)
    for tick_label in cbar.ax.get_xticklabels():
        tick_label.set_fontsize(6)
    cbar.set_ticks([-0.5, 0, 0.5, 1])
    removeBorder()
    mirnylib.plotting.niceShow()
Exemple #11
0
def iterativeFiltering(genome_db, filesuffix):
    '''
    Run the bin-level filters on the experiment's heatmap, iteratively
    correct it, export the corrected heatmap and add a log-scale plot of it
    to ``pp``.
    '''
    raw_path = options.outputDir + options.experiment + filesuffix

    # The bin size is stored inside the heatmap file itself.
    raw_heatmap = h5dict.h5dict(raw_path, mode='r')
    bin_size = int(raw_heatmap['resolution'])

    # Load the heatmap under the experiment name.
    BD = binnedData.binnedData(bin_size, genome_db)
    BD.simpleLoad(raw_path, options.experiment)

    # Filters, in order: contacts within the same bin; bins with less than
    # half of a bin sequenced; the 1% of regions with low coverage; the top
    # 0.05% of interchromosomal counts (possibly, PCR blowouts); empty bins.
    BD.removeDiagonal()
    BD.removeBySequencedCount(0.5)
    BD.removePoorRegions(cutoff=1)
    BD.truncTrans(high=0.0005)
    BD.removeZeros()

    BD.iterativeCorrectWithoutSS()

    # Store the corrected heatmap under the same name with an '-IC' infix.
    corrected_path = (options.outputDir + options.experiment + '-IC' +
                      filesuffix)
    BD.export(options.experiment, corrected_path)

    plt.figure()
    corrected = BD.dataDict[options.experiment]
    plotting.plot_matrix(np.log(corrected))
    pp.savefig()
Exemple #12
0
def doEigenvector(filename, genome):
    """Return the first entry of EigDict for a heatmap file, or binned GC content.

    With ``filename == "GC"`` the concatenated ``GCBin`` arrays of the genome
    at 1,000,000 resolution are returned instead of an eigenvector.
    """
    genome_path = "/home/magus/HiC2011/data/" + genome
    chromosomes = ["#","X"]

    if filename == "GC":
        gen = Genome(genome_path, readChrms=chromosomes)
        gen.setResolution(1000000)
        return np.concatenate(gen.GCBin)

    BD = binnedData.binnedData(getResolution(filename), genome_path,
                               chromosomes)
    BD.simpleLoad(filename, "bla")

    # Filter, leave only (faked-cis) trans data, then decompose.
    BD.removeDiagonal()
    BD.removeBySequencedCount(0.5)
    BD.removeCis()
    BD.truncTrans(high=0.0005)
    BD.removePoorRegions(cutoff=1)
    BD.fakeCis()
    BD.removeZeros()
    BD.doEig(numPCs=2)
    BD.restoreZeros(value=0)

    eigenvectors = BD.EigDict["bla"]
    return eigenvectors[0]
Exemple #13
0
def CorrectHeatMap():
    """Filter the raw heatmap, iteratively correct it and export the result.

    Reads ``heatmap_filepath + '-raw'`` and exports the corrected heatmap to
    ``heatmap_filepath``.  ``heatmap_filepath`` and ``genome_db`` are taken
    from the enclosing module scope.
    """
    # Read resolution from the dataset.
    print "Loading raw heatmap\n"
    raw_heatmap = h5dict.h5dict(heatmap_filepath + '-raw', mode='r')
    resolution = int(raw_heatmap['resolution'])

    ####### Set resolution for genome
    #genome_db.setResolution(resolution)

    # Create a binnedData object, load the data.
    BD = binnedData.binnedData(resolution, genome_db)
    BD.simpleLoad(heatmap_filepath + '-raw', 'HindIII_GM_1')
    # Remove the contacts between loci located within the same bin.
    BD.removeDiagonal()
    # Remove bins with less than half of a bin sequenced.
    BD.removeBySequencedCount(0.5)
    # Remove 1% of regions with low coverage.
    BD.removePoorRegions(cutoff=1)
    # Truncate top 0.05% of interchromosomal counts (possibly, PCR blowouts).
    BD.truncTrans(high=0.0005)
    # Perform iterative correction.
    BD.iterativeCorrectWithoutSS()
    # Save the iteratively corrected heatmap.
    BD.export('HindIII_GM_1', heatmap_filepath)
    # NOTE(review): ``inDataset`` is not defined in this function and the
    # check runs only after the export; these two lines look like a fragment
    # of a different script that ended up inside this body -- confirm.
    if not os.path.exists(inDataset):
        raise IOError("Raw heatmap file does not exist: {}".format(inDataset))

# Fail early when the genome folder is missing.
if not os.path.isdir(genomeFolder):
    raise IOError("Genome folder does not exist")

# When you do this, be sure that readChrms used to save heatmap matches
# readChrms that you define here!
genome = Genome(genomeFolder, readChrms=readChrms)

# Read resolution from one of the datasets.
# NOTE(review): indexing ``.values()`` only works where it returns a list
# (Python 2); presumably every dataset shares the same resolution -- confirm.
sampleDataset = h5dict(inDatasets.values()[0], mode="r")  # random dataset
resolution = int(sampleDataset["resolution"])

# Define the binnedData object, load every dataset under its own name
BD = binnedData(resolution, genome, readChrms)
for name, filename in inDatasets.items():
    BD.simpleLoad(filename, name)

# Remove the main diagonal (removeDiagonal called with its defaults)
BD.removeDiagonal()

# Remove bins with less than half of a bin sequenced
BD.removeBySequencedCount(0.5)

# Remove 1% of regions with low coverage
BD.removePoorRegions(cutoff=1)

# Truncate top 0.05% of interchromosomal counts (possibly, PCR blowouts)
BD.truncTrans(high=0.0005)

# Actually perform iterative correction
# Input: file produced by the runHiC binning step; output: compartment table.
source = os.path.join(HiCFolder, sys.argv[1])
outfile = sys.argv[2]

# Everything needed to rebuild the genome is stored in the source file.
lib = h5dict(source, 'r')
res = lib['resolution']
gInfo = lib['genomeInformation']
genomeFolder = os.path.join(gInfo['dataFolder'], gInfo['genomeName'])

# Load the binned data and run the cis PCA.
genome_db = myGenome(
    genomeFolder,
    readChrms=gInfo['chroms'],
    chrmFileTemplate=gInfo['template'],
    gapFile=gInfo['gapFile'],
)
BD = binnedData.binnedData(res, genome_db)
name = os.path.basename(source).split('-')[0]
BD.simpleLoad(source, name)
BD.doCisPCADomains(3)

# Call compartments chromosome by chromosome from the first component and
# append them to the output file as tab-separated lines.
pcas = BD.PCDict[name].T
idx2label = gInfo['idx2label']
for i in idx2label:
    label = idx2label[i]
    mask = BD.chromosomeIndex == i
    tmp = pcas[mask]
    compartments = compartmentFromPCA(tmp[:, 0], res, label)
    with open(outfile, 'a') as output:
        output.writelines(
            '\t'.join(str(field) for field in c) + '\n'
            for c in compartments)
Exemple #16
0
base_folder = '/mnt/storage/home/vsfishman/HiC/data/'
base_filename = 'ESC_full'

# Resolution label shared by all file names below (domain_res in thousands).
_res_kb = str(domain_res // 1000)

heatmap_filepath = (base_folder + 'heatmap-res-' + _res_kb + 'KB_' +
                    base_filename + '.hdf5')
# NOTE(review): this is currently the very same path as heatmap_filepath;
# presumably the raw heatmap should live in a separate file -- confirm.
raw_heatmap_filepath = (base_folder + 'heatmap-res-' + _res_kb + 'KB_' +
                        base_filename + '.hdf5')
maped_reads_filepath = base_folder + 'mapped_reads_' + base_filename + '.hdf5'
figure_path = base_folder + base_filename + "_" + _res_kb + 'kb-Xist.png'

genome_fai_filepath = '../../fasta/' + genome_name + '/' + genome_name + '.fai'

print("Loading file " + heatmap_filepath)
BD = binnedData.binnedData(domain_res, genome_db)
BD.simpleLoad(heatmap_filepath, 'HindIII_GM_1')
# Load the raw copy from its own path variable, which was previously defined
# but never used.
BD_raw = binnedData.binnedData(domain_res, genome_db)
BD_raw.simpleLoad(raw_heatmap_filepath, 'HindIII_GM_1')

q = BD.dataDict['HindIII_GM_1']
q_raw = BD_raw.dataDict['HindIII_GM_1']

X_values = []
Y_values = []
Y_errors = []

binnumber = 100
dist = -1

# Total number of bins in the first 19 entries of chrmLensBin, i.e. the index
# of the first bin that follows them.
start = sum(genome_db.chrmLensBin[0:19])
Exemple #17
0
def plotDiagonalCorrelation(resolution, filename1, filename2, experiment1, experiment2, genome, mouse=False, **kwargs):
    """Correlation of diagonal bins - paper figure.

    For every diagonal offset from 2 to 49 the two heatmaps are compared
    with ``cr`` three times: on the filtered raw data, after
    ``iterativeCorrectWithoutSS(M=1)`` and after a further
    ``iterativeCorrectWithoutSS(M=20)``.  The three curves are printed,
    plotted, saved through the module-level ``pp`` and shown.

    Args:
        resolution, genome: forwarded to ``binnedData``.
        filename1, filename2: heatmap files passed to ``simpleLoad``.
        experiment1, experiment2: names under which the heatmaps are stored.
        mouse, **kwargs: accepted for call compatibility; not used here.
    """
    global pp

    if options.verbose:
        # Same text and trailing newline the former print statement produced.
        sys.stdout.write(
            "plotDiagonalCorrelation: res: %d file1: %s file2: %s exp1:%s exp2:%s gen:%s\n"
            % (resolution, filename1, filename2, experiment1, experiment2, genome)
        )

    S = 50
    x = numpy.arange(2, S)
    Tanay = binnedData(resolution, genome)
    Tanay.simpleLoad(filename1, experiment1)
    Tanay.simpleLoad(filename2, experiment2)
    Tanay.removeDiagonal(1)
    Tanay.removePoorRegions()
    Tanay.removeZeros()

    pairs = [(experiment1, experiment2)]

    def diagonalCorrelations():
        # One list per pair: cr(...)[0] of the i-th diagonals for every i in x,
        # computed on whatever state Tanay.dataDict is in right now.
        return [
            [
                cr(numpy.diagonal(Tanay.dataDict[first], i), numpy.diagonal(Tanay.dataDict[second], i))[0]
                for i in x
            ]
            for first, second in pairs
        ]

    cors = diagonalCorrelations()

    Tanay.iterativeCorrectWithoutSS(M=1)
    cors2 = diagonalCorrelations()

    Tanay.iterativeCorrectWithoutSS(M=20)
    cors3 = diagonalCorrelations()

    matplotlib.rcParams["font.sans-serif"] = "Arial"

    print("Eigenvectors")
    print(cors)
    print(cors2)
    print(cors3)
    plt.figure(figsize=(8, 4))
    fs = 8
    for j, pair in enumerate(pairs):
        # Subplot indices are 1-based; take the axes from the subplot itself
        # so that the tick settings below apply to the panel being drawn.
        ax = plt.subplot(1, len(pairs), j + 1)
        plt.title("%s vs %s" % pair)
        # NOTE(review): x / 5.0 presumably assumes 200 kb bins -- confirm
        # against ``resolution``.
        plt.plot(x / 5.0, cors3[j], color="#E5A826", label="Iterative")
        plt.plot(x / 5.0, cors2[j], color="#28459A", label="Single")
        plt.plot(x / 5.0, cors[j], color="#E55726", label="Raw")
        plt.xlabel("Genomic Separation, MB", fontsize=8)
        plt.ylabel("Spearman correlation", fontsize=8)

        legend = plt.legend(prop={"size": 6}, loc=9, handlelength=2)
        legend.draw_frame(False)
        plt.ylim((0, 1))
        removeAxes(shift=0)
        ax.tick_params(axis="both", labelsize=fs)

    # Save before showing: once an interactive window has been closed the
    # current figure may no longer be the one drawn above.
    pp.savefig()
    plt.show()
Exemple #18
0
from mirnylib.genome import Genome
from hiclib.binnedData import binnedData
from mirnylib.h5dict import h5dict
import numpy as np
import sys
import os

# Genome folder comes from the command line; only chromosomes 1-5 are read.
genome = Genome(sys.argv[1], readChrms=["1", "2", "3", "4", "5"])

# High-resolution object: load the by-chromosome heatmap, filter, correct.
# NOTE(review): HiResHiC is not among the imports visible above this script
# -- confirm it is brought into scope elsewhere.
a = HiResHiC(genome, 1000000, "hiResDict", mode='w')
a.loadData(dictLike="../fragmentHiC/test-1M-byChr.hm")
a.removeDiagonal()
a.removePoorRegions(2)
a.iterativeCorrection(1e-10)

# binnedData object that goes through the same steps on the genome-wide
# heatmap.
b = binnedData(1000000, genome)

# Crop the heatmap to the bins of the chromosomes that were actually read.
data = {"heatmap": h5dict("../fragmentHiC/test-1M.hm")["heatmap"]}
lim = b.genome.chrmEndsBinCont[-1]
data["heatmap"] = data["heatmap"][:lim, :lim]

b.simpleLoad(data, "data")
b.removeDiagonal()
b.removePoorRegions(cutoff=2)
b.iterativeCorrectWithoutSS(tolerance=1e-10)
a.export("testExport")

def compareData():
    """Fetch the two matrices to compare: ``a``'s combined matrix and ``b``'s heatmap.

    NOTE(review): as visible here the function only binds two locals and
    returns None; the comparison itself is presumably missing -- confirm.
    """
    dataHigh = a.getCombinedMatrix()
    dataLow = b.dataDict["data"]
Exemple #19
0
# For every input heatmap: percentage of (doubled) cis contacts found at each
# diagonal offset, summed over all chromosomes.
diags = {}

for i in filenames:
    print("Reading file " + i)
    if resolution == 0:  # if we do not know resolution
        raw_heatmap = h5dict.h5dict(i, mode='r')  # open heatmap
        resolution = int(raw_heatmap['resolution'])  # get the resolution
        del raw_heatmap  # close heatmap
    if genome_db is None:  # if we have not initialized genome before
        genome_db = genome.Genome("/mnt/storage/home/vsfishman/HiC/fasta/GalGal5/GCF_000002315.4_Gallus_gallus-5.0_assembly_structure/Primary_Assembly/galGal5_all_contigs.filtered/",
                                  readChrms=[],
                                  chrmFileTemplate="N%s.fa")
    # Now we can initialize the heatmap with defined resolution and genome.
    BD = binnedData.binnedData(resolution, genome_db)
    BD.simpleLoad(i, 'heatmap')
    number_of_bins = len(BD.dataDict['heatmap'])

    diags[i] = np.zeros(max(genome_db.chrmLensBin))
    # ``chrm`` rather than ``chr`` so the builtin is not clobbered for the
    # rest of the module.
    for chrm in range(genome_db.chrmCount):
        chrm_start = genome_db.chrmStartsBinCont[chrm]
        chrm_end = genome_db.chrmEndsBinCont[chrm]
        # The cis block does not depend on j: slice it once per chromosome
        # instead of once per diagonal.
        cur_chr_matrix = BD.dataDict['heatmap'][chrm_start:chrm_end,
                                                chrm_start:chrm_end]
        for j in range(genome_db.chrmLensBin[chrm]):
            diags[i][j] += sum(np.diag(cur_chr_matrix, j)) * 2
    print(np.sum(diags[i]))
    diags[i] = (diags[i] / np.sum(diags[i])) * 100.0
    # Parenthesised as one expression so the division applies to the data
    # under both print-statement and print-function semantics.
    print(diags[i][0:10] / 100.0)
    del BD


def contact_freq_total():
Exemple #20
0
def calculateTanayCorrelation(resolution,
                              filename1,
                              filename2,
                              experiment1,
                              experiment2,
                              genome,
                              outfile,
                              mouse=False,
                              **kwargs):
    """Calculates correlation between datasets, smoothed in a Tanay way.

    The two datasets share one binnedData object.  After removing poor
    regions and cis contacts and running iterative correction, every
    chromosome-pair block of each heatmap is convolved with a 19x19 kernel
    (weight 1 / (1 + |dx| + |dy|)) and divided by the equally convolved
    validity mask.  The Spearman correlation between the two smoothed
    heatmaps, restricted to valid inter-chromosomal bin pairs, is written to
    ``outfile``.

    Args:
        resolution, genome: forwarded to ``binnedData``.
        filename1, filename2: heatmap files passed to ``simpleLoad``.
        experiment1, experiment2: names under which the heatmaps are stored.
        outfile: object with a ``write`` method that receives the result line.
        mouse, **kwargs: accepted for call compatibility; not used here.
    """
    if options.verbose:
        # Same text and trailing newline the former print statement produced.
        sys.stdout.write(
            "calculateTanayCorrelation: res: %d file1: %s file2: %s exp1:%s exp2:%s gen:%s\n" %
            (resolution, filename1, filename2, experiment1, experiment2,
             genome))

    BD = binnedData(resolution, genome)
    BD.simpleLoad(filename1, experiment1)
    BD.simpleLoad(filename2, experiment2)

    # Constant smoothing kernel, shared by every convolution below.
    a = numpy.arange(-9, 10)
    mat = 1 / (1. + numpy.abs(a[:, None]) + numpy.abs(a[None, :]))

    def tanaySmooth(matrix):
        matrix = numpy.array(matrix, dtype="double")
        # Public entry point; the scipy.ndimage.filters namespace used before
        # is deprecated.
        return scipy.ndimage.convolve(input=matrix,
                                      weights=mat,
                                      mode="constant")

    def propagateSmooth(data):
        # Bins whose column sum is positive count as valid.
        mask1 = numpy.sum(data, axis=0) > 0
        mask = mask1[:, None] * mask1[None, :]
        ret = numpy.zeros_like(data, dtype=float)
        chromosomes = range(BD.genome.chrmCount)
        for i in chromosomes:
            beg1 = BD.chromosomeStarts[i]
            end1 = BD.chromosomeEnds[i]
            for j in chromosomes:
                beg2 = BD.chromosomeStarts[j]
                end2 = BD.chromosomeEnds[j]
                mymask = mask[beg1:end1, beg2:end2]
                d = data[beg1:end1, beg2:end2]
                # Dividing by the smoothed mask compensates for invalid
                # neighbours inside the kernel window.
                toret = tanaySmooth(d) / tanaySmooth(mymask)
                toret[mymask == 0] = 0
                ret[beg1:end1, beg2:end2] = toret
        return ret

    BD.removePoorRegions(cutoff=2)

    BD.removeCis()

    BD.iterativeCorrectWithoutSS()
    data1 = BD.dataDict[experiment1]
    data2 = BD.dataDict[experiment2]

    # Keep bin pairs that are valid in both datasets and inter-chromosomal.
    mask = (numpy.sum(data1, axis=0) > 0) * (numpy.sum(data2, axis=0) > 0)
    validMask = mask[:, None] * mask[None, :]
    transmask = BD.chromosomeIndex[:, None] != BD.chromosomeIndex[None, :]
    cormask = transmask * validMask

    d1 = propagateSmooth(data1)
    d2 = propagateSmooth(data2)
    (scorr, pvalue) = scipy.stats.spearmanr(d1[cormask], d2[cormask])
    # NOTE(review): no trailing newline and a misspelled label; both kept
    # because consumers of this output may rely on the exact text.
    outfile.write("Spearman corrleation\t%s\t%s %.4f\t%.4f" %
                  (filename1, filename2, scorr, pvalue))
Exemple #21
0


# IMPORTANT: use iteratively corrected heatmaps here. Otherwise, take care of
# adjusting for the total number of reads when calculating mask_hugeDifference.
# The two heatmap files to compare are given on the command line.
heatmap_filepath1=sys.argv[1]
heatmap_filepath2=sys.argv[2]

#out_heatmap_filepath2 = base_folder+'heatmap-res-'+str(domain_res/1000)+'KB_'+base_filename2+'_compressed_as_'+base_filename1+'.hdf5'
#figure_path = base_folder+'heatmap-res-'+str(domain_res/1000)+'KB_'+base_filename2+'_compressed_as_'+base_filename1+'.png'

# The resolution is read from the first heatmap only and reused for the second.
print "Loading file "+heatmap_filepath1
raw_heatmap = h5dict.h5dict(heatmap_filepath1, mode='r') 
res = int(raw_heatmap['resolution'])
print "resolution defined by heatmap: ",res

BD1 = binnedData.binnedData(res, genome_db1)
BD1.simpleLoad(heatmap_filepath1, 'heatmap')

# NOTE(review): the second heatmap is loaded against its own genome object
# (genome_db2) but with the first file's resolution -- confirm both files
# share the same bin size.
print "Loading file "+heatmap_filepath2
BD2 = binnedData.binnedData(res, genome_db2)
BD2.simpleLoad(heatmap_filepath2, 'heatmap')

# Matrices used by the comparison that follows.
q1=BD1.dataDict['heatmap']
q2=BD2.dataDict['heatmap']


#-----DEBUG------
#print "Plotting contact matrix"
#plotting.plot_matrix(np.log(q2))
#plt.subplots_adjust(bottom=0.15)
#print "Saving figure "+figure_path+'tmp.png'
Exemple #22
0
def filter_bychr_heatmap(hm_file):

    resolution = extractResolutionFromFileName(hm_file)
    if resolution == None:
        raise
    from hiclib import binnedData
    # Create a  object, load the data.
    print "creating an object"
    hmap = binnedData.binnedData(resolution, genome_db)

    print "loading data"
    hmap.simpleLoad(hm_file, "heatmap")

    print "saving pict of heatmap"
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    from mirnylib import plotting

    maxlen = min(10000, len(hmap.dataDict["heatmap"]))

    a = hmap.dataDict["heatmap"][0:maxlen, 0:maxlen]
    figure_path = hm_file + "stage1.png"
    print "saving ", figure_path
    plotting.plot_matrix(np.log(a))
    plt.subplots_adjust(bottom=0.15)
    f = open(figure_path, "wb")
    plt.savefig(figure_path, dpi=600)
    f.close()
    plt.clf()

    # Remove the contacts between loci located within the same bin +/- 1 bin.
    hmap.removeDiagonal(m=1)

    hmap.removeBySequencedCount(
    )  # new filter: omit all bins with less than 0.5 coverage by sequenced bases (i.e. bases present in the genome)

    hmap.removePoorRegions(
        cutoff=0.5, coverage=True
    )  # remove .5% bins with the lowest number of records (i.e. non-zero entrees in the matrix)
    # This filter was updated to remove bins which have zero contacts and one PCR blowout. Those bins would have many reads, but all reads will be with one or few other bins.

    hmap.truncTrans()  # remove PCR blowouts from trans data

    a = hmap.dataDict["heatmap"][0:maxlen, 0:maxlen]
    figure_path = hm_file + "stage2.png"
    print "saving ", figure_path
    plotting.plot_matrix(np.log(a))
    plt.subplots_adjust(bottom=0.15)
    f = open(figure_path, "wb")
    plt.savefig(figure_path, dpi=200)
    f.close()
    plt.clf()

    hmap.iterativeCorrectWithoutSS(force=True)  #do iterative correction

    a = hmap.dataDict["heatmap"][0:maxlen, 0:maxlen]
    figure_path = hm_file + "stage3.png"
    print "saving ", figure_path
    plotting.plot_matrix(np.log(a))
    plt.subplots_adjust(bottom=0.15)
    f = open(figure_path, "wb")
    plt.savefig(figure_path, dpi=600)
    f.close()
    plt.clf()

    # Save the iteratively corrected heatmap.
    hmap.export("heatmap", hm_file + ".IC.hdf5", False)
Exemple #23
0
def plotDiagonalCorrelation():
    "Correlation of diagonal bins - paper figure"
    S = 50
    x = numpy.arange(2, S)
    Tanay = binnedData(200000, myGenome)
    Tanay.simpleLoad(GM200k, "GM-HindIII")
    Tanay.simpleLoad(GM200kNcoI, "GM-NcoI")
    Tanay.simpleLoad(tcc200k, "TCC")
    Tanay.removeDiagonal(1)
    Tanay.removePoorRegions()
    Tanay.removeZeros()
    pairs = [("GM-HindIII", "GM-NcoI"), ("GM-HindIII", "TCC"), (
        "GM-NcoI", "TCC")]
    cors = [[] for _ in pairs]
    for i in x:
        for j, pair in enumerate(pairs):
            cors[j].append(cr(
                           numpy.diagonal(Tanay.dataDict[pair[0]], i),
                           numpy.diagonal(Tanay.dataDict[pair[1]], i)
                           )[0])

    Tanay.iterativeCorrectWithoutSS(M=1)
    cors2 = [[] for _ in pairs]
    for i in x:
        for j, pair in enumerate(pairs):
            cors2[j].append(cr(
                            numpy.diagonal(Tanay.dataDict[pair[0]], i),
                            numpy.diagonal(Tanay.dataDict[pair[1]], i)
                            )[0])
    Tanay.iterativeCorrectWithoutSS(M=20)
    cors3 = [[] for _ in pairs]
    for i in x:
        for j, pair in enumerate(pairs):
            cors3[j].append(cr(
                            numpy.diagonal(Tanay.dataDict[pair[0]], i),
                            numpy.diagonal(Tanay.dataDict[pair[1]], i)
                            )[0])

    matplotlib.rcParams['font.sans-serif'] = 'Arial'

    #plt.figure(figsize = (2.3,1.8))
    print cors
    print cors2
    print cors3
    plt.figure(figsize=(10, 3))
    ax = plt.gca()
    for j, pair in enumerate(pairs):
        plt.subplot(1, len(pairs), j)
        fs = 8
        for xlabel_i in ax.get_xticklabels():
            xlabel_i.set_fontsize(fs)
        for xlabel_i in ax.get_yticklabels():
            xlabel_i.set_fontsize(fs)
        plt.title("%s vs %s" % pair)
        plt.plot(x / 5., cors3[j], color="#E5A826", label="Iterative")
        plt.plot(x / 5., cors2[j], color="#28459A", label="Single")
        plt.plot(x / 5., cors[j], color="#E55726", label="Raw")
        plt.xlabel("Genomic Separation, MB", fontsize=8)
        plt.ylabel("Spearman correlation", fontsize=8)
        plt.legend()

        legend = plt.legend(prop={"size": 6}, loc=9, handlelength=2)
        legend.draw_frame(False)
        plt.ylim((0, 1))
        removeAxes(shift=0)

    plt.show()
Exemple #24
0
def plot_Contact_drop_depending_on_distance_to_border(
        domains,
        distance=1000000,
        bands_binned=[2, 4, 6, 8, 10],
        colors=["red", "green", "blue", "black", "yellow"]):
    #Contact drop depending on distance to border
    if bstrap:
        print "This function is not designed for bstrap mode"
        print "Skipping function"
        return

    print "Contact drop depending on distance to border"

    raw_heatmap = h5dict.h5dict(hmap, mode='r')
    res = int(raw_heatmap['resolution'])
    print "resolution defined by heatmap: ", res

    BD = binnedData.binnedData(res, genome_db)
    print datetime.datetime.now(), " loading hmap"
    BD.simpleLoad(hmap, 'heatmap')
    data = BD.dataDict["heatmap"]

    distance_binned = distance / res
    result = {}

    for ind, band_binned in enumerate(bands_binned):
        result[band_binned] = {}
        for distance in range(-distance_binned, distance_binned + 1):
            result[band_binned][distance] = {}
            result[band_binned][distance]["left"] = []
            result[band_binned][distance]["right"] = []
            result[band_binned][distance]["center"] = []
            for domain in domains:
                chrm = genome_db.label2idx[domain["chrm"]]
                start = int(round(domain["start"] /
                                  float(res))) + distance - (band_binned / 2)
                if (start >= 0
                    ) and (start + band_binned) < genome_db.chrmLensBin[chrm]:
                    start = sum(genome_db.chrmLensBin[0:chrm]) + start
                    end = start + band_binned
                    result[band_binned][distance]["left"].append(data[start,
                                                                      end])

                start = int(round(
                    domain["end"] / float(res))) + distance - (band_binned / 2)
                if (start >= 0
                    ) and (start + band_binned) < genome_db.chrmLensBin[chrm]:
                    start = sum(genome_db.chrmLensBin[0:chrm]) + start
                    end = start + band_binned
                    result[band_binned][distance]["right"].append(data[start,
                                                                       end])

                domain_length = domain["end"] - domain["start"]
                assert domain_length > 0
                start = int(
                    round((domain["start"] + domain_length / 2.) /
                          float(res))) + distance - (band_binned / 2)
                if (start >= 0
                    ) and (start + band_binned) < genome_db.chrmLensBin[chrm]:
                    start = sum(genome_db.chrmLensBin[0:chrm]) + start
                    end = start + band_binned
                    result[band_binned][distance]["center"].append(data[start,
                                                                        end])

            result[band_binned][distance]["left"] = np.average(
                result[band_binned][distance]["left"])
            result[band_binned][distance]["right"] = np.average(
                result[band_binned][distance]["right"])
            result[band_binned][distance]["center"] = np.average(
                result[band_binned][distance]["center"])

    print datetime.datetime.now(), " Saving pictures"
    for pos in ["left", "right", "center"]:
        for ind, band_binned in enumerate(bands_binned):
            X = [x * res for x in sorted(result[band_binned].keys())]
            Y = [
                result[band_binned][x][pos]
                for x in sorted(result[band_binned].keys())
            ]
            plt.plot(X,
                     Y,
                     label="band=" + str(band_binned * res),
                     color=colors[ind],
                     marker="o")
        plt.ylim(ymin=0, ymax=250)
        plt.legend(fontsize="xx-small")
        plt.savefig(hmap + "_" + domains_file.split("/")[-1] +
                    ".contact_drop_on_domains_border_" + pos + ".png",
                    dpi=300)
        plt.clf()

    print datetime.datetime.now(), " Done"
fnames = ["fname1","fname2"]
names = ["dataset 1", "dataset 2"]
exportnames = ["fname1_corrected","fname2_corrected"] 
resolution = 500000
genFolder = "/folder/to/the/genome/files/and/gap.txt/file/according/to/the/mirnylib.genome/class"

#for one file it would be 
fnames = ["myfile.hm"]
resolution = 500000 
names = ["whatever"]
exportnames = ["filename_corrected"]
genFolder = "genomeFolder"


a = binnedData.binnedData(resolution,genFolder)    #folder should be openable by mirnylib.genome

for name,fname,exportname in zip(names,fnames,exportnames):
    a.simpleLoad(fname, name)

a.removeDiagonal()   #we never ever use diagonal

a.removeBySequencedCount()  # new filter: omit all bins with less than 0.5 coverage by sequenced bases (i.e. bases present in the genome)

a.removePoorRegions(cutoff = 0.5, coverage=True)  # remove .5% bins with the lowest number of records (i.e. non-zero entrees in the matrix)
# This filter was updated to remove bins which have zero contacts and one PCR blowout. Those bins would have many reads, but all reads will be with one or few other bins. 

a.removePoorRegions(cutoff = 0.5, coverage=False)  # standart filter. Cutoff reduced to 0.5 from 2. 
a.truncTrans() # remove PCR blowouts from trans data
a.iterativeCorrectWithoutSS()             #do iterative correction 
for name, exportname in names, exportnames: 
Exemple #26
0
def plotDiagonalCorrelation(resolution,
                            filename1,
                            filename2,
                            experiment1,
                            experiment2,
                            genome,
                            mouse=False,
                            **kwargs):
    "Correlation of diagonal bins - paper figure"
    global pp

    if (options.verbose):
        print >> sys.stdout, "plotDiagonalCorrelation: res: %d file1: %s file2: %s exp1:%s exp2:%s gen:%s" % (
            resolution, filename1, filename2, experiment1, experiment2, genome)

    S = 50
    x = numpy.arange(2, S)
    Tanay = binnedData(resolution, genome)
    Tanay.simpleLoad(filename1, experiment1)
    Tanay.simpleLoad(filename2, experiment2)
    Tanay.removeDiagonal(1)
    Tanay.removePoorRegions()
    Tanay.removeZeros()

    pairs = [(experiment1, experiment2)]

    cors = [[] for _ in pairs]
    for i in x:
        for j, pair in enumerate(pairs):
            cors[j].append(
                cr(numpy.diagonal(Tanay.dataDict[pair[0]], i),
                   numpy.diagonal(Tanay.dataDict[pair[1]], i))[0])

    Tanay.iterativeCorrectWithoutSS(M=1)
    cors2 = [[] for _ in pairs]
    for i in x:
        for j, pair in enumerate(pairs):
            cors2[j].append(
                cr(numpy.diagonal(Tanay.dataDict[pair[0]], i),
                   numpy.diagonal(Tanay.dataDict[pair[1]], i))[0])
    Tanay.iterativeCorrectWithoutSS(M=20)
    cors3 = [[] for _ in pairs]
    for i in x:
        for j, pair in enumerate(pairs):
            cors3[j].append(
                cr(numpy.diagonal(Tanay.dataDict[pair[0]], i),
                   numpy.diagonal(Tanay.dataDict[pair[1]], i))[0])

    matplotlib.rcParams['font.sans-serif'] = 'Arial'

    print "Eigenvectors"
    print cors
    print cors2
    print cors3
    plt.figure(figsize=(8, 4))
    ax = plt.gca()
    for j, pair in enumerate(pairs):
        plt.subplot(1, len(pairs), j)
        fs = 8
        for xlabel_i in ax.get_xticklabels():
            xlabel_i.set_fontsize(fs)
        for xlabel_i in ax.get_yticklabels():
            xlabel_i.set_fontsize(fs)
        plt.title("%s vs %s" % pair)
        plt.plot(x / 5., cors3[j], color="#E5A826", label="Iterative")
        plt.plot(x / 5., cors2[j], color="#28459A", label="Single")
        plt.plot(x / 5., cors[j], color="#E55726", label="Raw")
        plt.xlabel("Genomic Separation, MB", fontsize=8)
        plt.ylabel("Spearman correlation", fontsize=8)
        plt.legend()

        legend = plt.legend(prop={"size": 6}, loc=9, handlelength=2)
        legend.draw_frame(False)
        plt.ylim((0, 1))
        removeAxes(shift=0)

    plt.show()
    pp.savefig()
Exemple #27
0
#IMPORTANT: use iter-corrected heatmaps here. Otherwise, take care about adjustment of total reads number when calculating mask_hugeDifference
heatmap_filepath1 = base_folder + 'heatmap-res-' + str(
    domain_res / 1000) + 'KB_' + base_filename1 + '.hdf5'
heatmap_filepath2 = base_folder + 'heatmap-res-' + str(
    domain_res / 1000) + 'KB_' + base_filename2 + '.hdf5'

out_heatmap_filepath2 = base_folder + 'heatmap-res-' + str(
    domain_res / 1000
) + 'KB_' + base_filename2 + '_compressed_as_' + base_filename1 + '.hdf5'
figure_path = base_folder + 'heatmap-res-' + str(
    domain_res / 1000
) + 'KB_' + base_filename2 + '_compressed_as_' + base_filename1 + '.png'

print "Loading file " + heatmap_filepath1
BD1 = binnedData.binnedData(domain_res, genome_db1)
BD1.simpleLoad(heatmap_filepath1, 'HindIII_GM_1')

print "Loading file " + heatmap_filepath2
BD2 = binnedData.binnedData(domain_res, genome_db2)
BD2.simpleLoad(heatmap_filepath2, 'HindIII_GM_1')

q1 = BD1.dataDict['HindIII_GM_1']
q2 = BD2.dataDict['HindIII_GM_1']

#-----DEBUG------
#print "Plotting contact matrix"
#plotting.plot_matrix(np.log(q2))
#plt.subplots_adjust(bottom=0.15)
#print "Saving figure "+figure_path+'tmp.png'
#f = open(figure_path+'tmp.png', "wb")
Exemple #28
0
from hiclib import binnedData

# Tutorial script: load one or more heatmaps into a single binnedData object
# and filter them before iterative correction.

# Example configuration for several datasets at once.
fnames = ["fname1", "fname2"]
names = ["dataset 1", "dataset 2"]
exportnames = ["fname1_corrected", "fname2_corrected"]
resolution = 500000
genFolder = "/folder/to/the/genome/files/and/gap.txt/file/according/to/the/mirnylib.genome/class"

# For one file it would be (these assignments override the ones above):
fnames = ["myfile.hm"]
resolution = 500000
names = ["whatever"]
exportnames = ["filename_corrected"]
genFolder = "genomeFolder"

a = binnedData.binnedData(
    resolution, genFolder)  # folder should be openable by mirnylib.genome

# Each file is loaded under its dataset name; exportname is not used here.
for name, fname, exportname in zip(names, fnames, exportnames):
    a.simpleLoad(fname, name)

a.removeDiagonal()  # we never ever use diagonal

a.removeBySequencedCount(
)  # new filter: omit all bins with less than 0.5 coverage by sequenced bases (i.e. bases present in the genome)

a.removePoorRegions(
    cutoff=0.5, coverage=True
)  # remove .5% bins with the lowest number of records (i.e. non-zero entries in the matrix)
# This filter was updated to remove bins which have zero contacts and one PCR blowout. Those bins would have many reads, but all reads will be with one or few other bins.

a.removePoorRegions(
# Script fragment: prepare a trans-only heatmap for eigenvector analysis.

# Genome object built from per-chromosome "<name>.fna" files in this folder.
genome_db_chrmLevel = genome.Genome(
    "/mnt/storage/home/vsfishman/HiC/fasta/GalGal5/GCF_000002315.4_Gallus_gallus-5.0_assembly_structure/Primary_Assembly/assembled_chromosomes/FASTA/",
    readChrms=[],
    chrmFileTemplate="%s.fna")

hm_file = "/mnt/storage/home/vsfishman/HiC/tutorial_Fishman/chick/mapped-GalGal5filtered/GalGal5filteredChrmLevel/ChEF-all-HindIII-100k.hm"

# Output path: the heatmap path with ".eig" appended.
f_out_path = hm_file + '.eig'

NumEigenvectors = 1  # number of eigenvectors to compute

# Derive the resolution from the heatmap file name
resolution = extractResolutionFromFileName(hm_file)

# Define the binnedData object, load data
BD = binnedData(resolution, genome_db_chrmLevel)
BD.simpleLoad(hm_file, 'heatmap')

BD.removeDiagonal()

# Remove bins with less than half of a bin sequenced
BD.removeBySequencedCount(0.5)

# We'll do iterative correction and Eigenvector expansion on trans data only!
# We want to remove cis, because later we want to remove poor regions in trans
BD.removeCis()

# Truncate top 0.05% of interchromosomal counts (possibly, PCR blowouts)
# Do this before removing poor regions, because single blowouts may give
# lots of contacts to a region which does not have many contacts otherwise.
BD.truncTrans(high=0.0005)
    if not os.path.exists(inDataset):
        raise IOError("Raw heatmap file does not exist: {}".format(inDataset))

# Script fragment: validate the genome folder, load every dataset listed in
# inDatasets into one binnedData object and filter it.
if not os.path.isdir(genomeFolder):
    raise IOError("Genome folder does not exist")

# When you do this, be sure that readChrms used to save heatmap matches
# readChrms that you define here!
genome = Genome(genomeFolder, readChrms=readChrms)

# Read resolution from one of the datasets.
# inDatasets maps dataset name -> file name (see the loading loop below);
# which entry values()[0] picks depends on the dict's ordering.
sampleDataset = h5dict(inDatasets.values()[0], mode="r")  # random dataset
resolution = int(sampleDataset["resolution"])

# Define the binnedData object, load data
BD = binnedData(resolution, genome, readChrms)
for name, filename in inDatasets.items():
    BD.simpleLoad(filename, name)

BD.removeDiagonal()

# Remove bins with less than half of a bin sequenced
BD.removeBySequencedCount(0.5)

# Remove 1% of regions with low coverage
BD.removePoorRegions(cutoff=1)

# Truncate top 0.05% of interchromosomal counts (possibly, PCR blowouts)
BD.truncTrans(high=0.0005)

# Actually perform iterative correction
Exemple #31
0
def get_by_chr_E1(genome_db, resolution):
    if heatmap_filepath.endswith(".IC"):
        raw = heatmap_filepath[:-3]
    else:
        raw = heatmap_filepath

    print "Using raw heatmap ", raw
    global BD_raw
    BD_raw = binnedData.binnedData(resolution, genome_db)
    BD_raw.simpleLoad(raw, 'heatmap')
    BD_raw.removeDiagonal()

    # Remove bins with less than half of a bin sequenced
    BD_raw.removeBySequencedCount(0.5)
    # We'll do iterative correction and Eigenvector expansion on trans data only!
    # We want to remove cis, because later we want to remove poor regions in trans
    BD_raw.removeCis()
    # Truncate top 0.05% of interchromosomal counts (possibly, PCR blowouts)
    # Do this before removing poor regions, because single blowouts may give
    # lots of contacts to a region which does not have much contacts otehrwise.
    BD_raw.truncTrans(high=0.0005)
    # Remove 1% of regions with low coverage
    BD_raw.removePoorRegions(cutoff=1)
    # Fake cis counts. Data gets iteratively corrected during this process...
    BD_raw.fakeCis()
    # Remove bins with zero counts for eigenvector analysis --> This will be done for each chromosome in for loop
    #	BD.removeZeros()

    # Perform eigenvector expansion.

    result = {"OE": {}, "Classic": {}, "genome_wide_Classic": {}}
    genom_wide_E1 = np.genfromtxt(raw + ".eig", dtype=None)['f2']
    for chrom in range(genome_db.chrmCount):
        st = genome_db.chrmStartsBinCont[chrom]
        end = genome_db.chrmEndsBinCont[chrom]
        cur = BD_raw.dataDict['heatmap'][st:end, st:end]
        mask = np.sum(cur, axis=0) > 0
        if sum(mask) > 5:
            cur = cur[mask]
            cur = cur[:, mask]
            currentEIG, eigenvalues = EIG(cur, numPCs=1)
            if spearmanr(currentEIG[0],
                         BD_raw.trackDict["GC"][st:end][mask])[0] < 0:
                currentEIG[0] = -currentEIG[0]
            E1 = np.empty(shape=(len(mask), )) * np.nan
            E1[mask] = currentEIG[0]
            result["Classic"][chrom] = E1

            cur = observedOverExpected(cur)
            mask = np.sum(cur, axis=0) > 0
            if sum(mask) > 5:
                cur = cur[mask]
                cur = cur[:, mask]
                currentEIG, eigenvalues = EIG(cur, numPCs=1)
                if spearmanr(currentEIG[0],
                             BD_raw.trackDict["GC"][st:end][mask])[0] < 0:
                    currentEIG[0] = -currentEIG[0]
                E1 = np.empty(shape=(len(mask), )) * np.nan
                E1[mask] = currentEIG[0]
                result["OE"][chrom] = E1

        result["genome_wide_Classic"][chrom] = genom_wide_E1[st:end]
    return result
###########################

# Script fragment: set up the output file, load the heatmap and start
# walking over all chromosome pairs.

#1. Parse contig_names_to_id_file
# Each line is split on whitespace; the second field becomes the key and the
# first field the value.
LACHES_index_converter = {}
with open(contig_names_to_id_file) as f:
    for line in f:
        line = line.strip().split()
        LACHES_index_converter[line[1]] = line[0]

# NOTE: out_file is rebound here from the output path to the open file object.
out_file = open(out_file, "w")
out_file.write(header_string)

# The resolution is read from the heatmap file itself.
raw_heatmap = h5dict.h5dict(basefolder + filename, mode='r')
resolution = int(raw_heatmap['resolution'])
BD = binnedData.binnedData(resolution, genome_db)
BD.simpleLoad(basefolder + filename, 'HindIII')
q = BD.dataDict['HindIII']
# Square chromosome-by-chromosome matrix, initialised to zero.
interchr_contacts = np.zeros(shape=(genome_db.chrmCount, genome_db.chrmCount))

############################
# Disabled override of the chromosome count.
#genome_db.chrmCount=100
############################

zero_number_of_contacts, nonzero_number_of_contacts = [], []

# Visit every unordered pair of distinct chromosomes (chr1 < chr2).
for chr1 in xrange(genome_db.chrmCount):
    for chr2 in xrange(chr1 + 1, genome_db.chrmCount):
        # Heatmap block: rows are chr1's bins, columns are chr2's bins.
        Ncontacts = q[
            genome_db.chrmStartsBinCont[chr1]:genome_db.chrmEndsBinCont[chr1],
            genome_db.chrmStartsBinCont[chr2]:genome_db.chrmEndsBinCont[chr2]]
Exemple #33
0
def step3(hiclib_path, sraid, res=1000000):
    ''' 3. Filter and iteratively correct heatmaps.
        http://mirnylab.bitbucket.org/hiclib/tutorial/03_heatmap_processing.html

        Reads ``<sraid>_map-res<res/1000>k.hdf5`` and writes, with the same
        stem: ``.pdf`` (heatmap as loaded), ``-ic.hdf5`` (corrected
        heatmap), ``-ic.pdf`` (corrected heatmap plot) and ``-ic-bias.txt``
        (one tab-separated line per bin: chromosome, start, start + res,
        bias).

        hiclib_path -- folder containing ``fasta/hg19``.
        sraid       -- prefix of all input and output file names.
        res         -- resolution used in the file names and for the bin end
                       coordinate in the bias file.
    '''
    import matplotlib.pyplot as plt
    import numpy as np

    from mirnylib import genome
    from mirnylib import h5dict
    from mirnylib import plotting
    from hiclib import binnedData

    genome_db = genome.Genome(hiclib_path + '/fasta/hg19',
                              readChrms=['#', 'X'])

    # All files share this stem. Floor division keeps the integer "<n>k"
    # label regardless of how "/" treats integers.
    stem = sraid + '_map-res%sk' % (res // 1000)
    heatmap_path = stem + '.hdf5'

    # Read resolution from the dataset.
    raw_heatmap = h5dict.h5dict(heatmap_path, mode='r')
    resolution = int(raw_heatmap['resolution'])

    # Create a binnedData object, load the data.
    BD = binnedData.binnedData(resolution, genome_db)
    BD.simpleLoad(heatmap_path, 'DataName')

    # Plot the heatmap directly.
    plotting.plot_matrix(np.log(BD.dataDict['DataName']))
    plt.savefig(stem + '.pdf')
    plt.clf()

    # Remove the contacts between loci located within the same bin.
    BD.removeDiagonal()

    # Remove bins with less than half of a bin sequenced.
    BD.removeBySequencedCount(0.5)

    # Remove 1% of regions with low coverage.
    BD.removePoorRegions(cutoff=1)

    # Truncate top 0.05% of interchromosomal counts (possibly, PCR blowouts).
    BD.truncTrans(high=0.0005)

    # Perform iterative correction.
    BD.iterativeCorrectWithoutSS()

    # Save the iteratively corrected heatmap.
    BD.export('DataName', stem + '-ic.hdf5')

    # Plot the heatmap directly.
    plotting.plot_matrix(np.log(BD.dataDict['DataName']))
    plt.savefig(stem + '-ic.pdf')
    plt.clf()

    # Save Bias. The context manager closes the file even if a write fails.
    # NOTE(review): the bin end uses the ``res`` argument, not the
    # ``resolution`` read from the dataset - confirm they always agree.
    bias = BD.biasDict['DataName']
    with open(stem + '-ic-bias.txt', 'w') as outfile:
        for i, chrom_idx in enumerate(BD.chromosomeIndex):
            chro = BD.genome.idx2label[chrom_idx]
            posi = BD.positionIndex[i]
            outfile.write("chr%s\t%s\t%s\t%s\n" %
                          (chro, posi, posi + res, bias[i]))