Ejemplo n.º 1
0
def plotCrossValidation():
    """Plot the main-figure subplot with the cross-validation of biases.

    Loads the "GM-all-10p" and "GM-all-90p" datasets into a ``binnedData``
    object, runs the iterative correction, caches the two resulting bias
    vectors in the pickle file "CrossValidatioN", prints ``cr(b1, b2)`` and
    shows a scatter plot of one bias vector against the other.
    """
    matplotlib.rcParams['font.sans-serif'] = 'Arial'
    plt.figure(figsize=(1, 1))
    # NOTE(review): FG is not used below; kept because loading may have
    # side effects on workingFile1 - confirm before removing.
    FG = HiCdataset(workingFile1, myGenome)
    FG.load(GMFrag)

    Tanay = binnedData(1000000)
    # These datasets need to be created beforehand using fragment-level
    # analysis.
    Tanay.simpleLoad("GM-all-10p", "GM-1")
    Tanay.simpleLoad("GM-all-90p", "GM-9")
    Tanay.removePoorRegions()
    Tanay.iterativeCorrectWithSS()
    Tanay.removeZeros()
    b1, b2 = (Tanay.biasDict["GM-1"], Tanay.biasDict["GM-9"])

    # Cache the biases; ``with`` guarantees the handle is flushed and closed
    # before the file is read back.
    with open("CrossValidatioN", 'wb') as cacheFile:
        cPickle.dump((b1, b2), cacheFile)
    ax = plt.gca()
    with open("CrossValidatioN", 'rb') as cacheFile:
        b1, b2 = cPickle.load(cacheFile)
    print(cr(b1, b2))

    plt.scatter(b1, b2, s=.7, color="k", linewidth=0)
    plt.xlabel(r"10% reads", fontsize=8)
    plt.ylabel(r"90% reads", fontsize=8)
    plt.xlim((0, 1.5))
    plt.ylim((0, 1.5))
    plt.xticks([0, 0.5, 1, 1.5])
    plt.yticks([0, 0.5, 1, 1.5])
    removeAxes(shift=0)

    # Shrink the tick labels on both axes.
    fs = 6
    for getLabels in (ax.get_xticklabels, ax.get_yticklabels):
        for tickLabel in getLabels():
            tickLabel.set_fontsize(fs)
    plt.show()
Ejemplo n.º 2
0
 def getScaling(self):
     """Return the scaling computed by ``HiCdataset.plotScaling``.

     Opens the refined dataset (``self.refined``) read-only and returns
     whatever ``plotScaling`` yields for the whole dataset.
     """
     HD = HiCdataset(self.refined,
                     self.getGenomeObject(),
                     self.getEnzyme(),
                     1000,
                     mode='r',
                     # was "\tmp": "\t" is a TAB escape, so the folder name
                     # was literally "<TAB>mp"
                     tmpFolder="/tmp",
                     dictToStoreIDs="h5dict")
     return HD.plotScaling(excludeNeighbors=2, normalize=True, mindist=2000)
Ejemplo n.º 3
0
def getFrags():
    """
    Calculate which restriction fragments are covered in the hg19 and in the
    mm9 reference datasets. Used for the correct calculation of scalings.

    Returns a pair (humanFrags, mouseFrags) holding ``fragmentSum() > 0`` for
    the human and the mouse dataset respectively.
    """
    # (genome folder, reference .frag file) - human first, mouse second
    sources = (
        ("../../data/hg19",
         "/home/magus/HiC2011/DrosophilaSingleCell2015/alternativeFiltering/hg19/K562_combined_refined.frag"),
        ("../../data/mm9",
         "/home/magus/HiC2011/DrosophilaSingleCell2015/alternativeFiltering/mm9/oocyte_combined_refined.frag"),
    )
    covered = []
    for genomePath, fragFile in sources:
        dataset = HiCdataset("bla", genomePath, enzymeName=1000, inMemory=True)
        dataset.load(fragFile)
        covered.append(dataset.fragmentSum() > 0)
    humanFrags, mouseFrags = covered
    return humanFrags, mouseFrags
Ejemplo n.º 4
0
    def getByChromosomeScaling(self):
        """Return ``plotScaling`` results for every chromosome arm.

        The refined dataset is opened read-only and ``plotScaling`` is called
        once per arm, where arm 0 spans from 0 to ``cntrMids[chrom]`` and
        arm 1 spans from ``cntrMids[chrom]`` to ``chrmLens[chrom]``.

        Returns a dict mapping ``(chrom, arm)`` to the value returned by
        ``plotScaling`` for that region.
        """
        HD = HiCdataset(self.refined,
                        self.getGenomeObject(),
                        self.getEnzyme(),
                        1000,
                        mode='r',
                        # was "\tmp": "\t" is a TAB escape, so the folder
                        # name was literally "<TAB>mp"
                        tmpFolder="/tmp",
                        dictToStoreIDs="h5dict")
        scals = {}
        for chrom in range(self.getGenomeObject().chrmCount):
            centromere = self.genomeObject.cntrMids[chrom]
            arms = [(chrom, 0, centromere),
                    (chrom, centromere, self.genomeObject.chrmLens[chrom])]
            for arm, region in enumerate(arms):
                scals[(chrom, arm)] = HD.plotScaling(excludeNeighbors=2,
                                                     normalize=True,
                                                     mindist=2000,
                                                     regions=[region])
        return scals
Ejemplo n.º 5
0
def doScaling(dataset):
    """
    Compute scalings for one dataset on three sets of chromosome arms.

    The genome name is taken from the name of the folder containing
    `dataset`. Only read pairs on the same chromosome and with equal strands
    are kept. Returns (sc1, sc2, sc3): plotScaling results for the arms of
    even-indexed chromosomes, of odd-indexed chromosomes, and of both.
    """
    genome = os.path.split(os.path.split(dataset)[0])[-1]
    a = HiCdataset("bla", os.path.join("../../data/", genome), enzymeName=1000, inMemory=True)
    a.load(dataset)
    a.maskFilter((a.chrms1 == a.chrms2) * (a.strands1 == a.strands2))

    evenArms = []
    oddArms = []
    for chromosome in range(a.genome.chrmCount):
        centromere = a.genome.cntrMids[chromosome]
        arms = [(chromosome, 0, centromere),
                (chromosome, centromere, a.genome.chrmLens[chromosome])]
        target = evenArms if chromosome % 2 == 0 else oddArms
        target.extend(arms)

    frags = fragdict[genome]
    common = dict(excludeNeighbors=2, mindist=6000, plot=False,
                  fragids1=frags, fragids2=frags)
    regionSets = (evenArms, oddArms, evenArms + oddArms)
    return tuple(a.plotScaling(regions=regionSet, **common)
                 for regionSet in regionSets)
Ejemplo n.º 6
0
#filenames = [j for j in os.listdir(path) if j.endswith("SC35-R1-HaeIII_merged.frag")]
# All refined fragment files found in `path`.
filenames = [j for j in os.listdir(path) if j.endswith("_refined.frag")]

genomeName = "hg19"
genome_db = getGenome(genomeName)

for fl in sorted(filenames):
    print(fl)

    # Output prefix: the input name with "_refined.frag" replaced.
    out_file = path + fl.split("_refined.frag")[0] + "_refined_wo_man_dupl"

    # Create the output dataset and copy the input data into it.
    f1 = HiCdataset(out_file + ".frag",
                    enzymeName=enzyme,
                    genome=getGenome(workingGenome),
                    tmpFolder="tmp",
                    dictToStoreIDs="h5dict",
                    mode='w')
    f1.load(path + fl)

    print(len(f1.chrms1))

    # Materialise the per-read vectors as plain Python lists.
    cuts1 = list(f1.cuts1)
    cuts2 = list(f1.cuts2)

    chrms1 = list(f1.chrms1)
Ejemplo n.º 7
0
import subprocess
import argparse
import sys

from mirnylib.genome import Genome
from hiclib.fragmentHiC import HiCdataset


if __name__ == "__main__":
    # Command-line entry point: sort a hiclib read-pairs file, either in
    # place or into a separate output dataset.
    parser = argparse.ArgumentParser(
        description="Sort contacts by position and order the reads of each pair so that all "
        "contacts are upper triangular with respect to the chromosome ordering "
        "given by the chromsizes file."
    )
    parser.add_argument("genome", help="hiclib genome path", metavar="GENOME_PATH")
    parser.add_argument("pairs", help="HDF5 hiclib read pairs file", metavar="PAIRS_PATH")
    # The code below has always read args["out"], but the option was never
    # declared, so every run ended in a KeyError.
    parser.add_argument(
        "--out", "-o",
        default=None,
        metavar="OUT_PATH",
        help="output dataset path; when omitted, the input file is sorted in place",
    )

    args = vars(parser.parse_args())

    genome_db = Genome(args["genome"])
    infile = args["pairs"]
    outfile = args["out"]
    if outfile is not None:
        # Copy the pairs into the output dataset and sort the copy.
        ds = HiCdataset(outfile, genome_db, "HindIII")
        ds.load(infile)
    else:
        # No output given: open the input dataset itself and sort it.
        ds = HiCdataset(infile, genome_db, "HindIII")
    ds._sortData()
Ejemplo n.º 8
0
def plotFigure2c():
    """
    Plot total coverage along chromosome 1 in two stacked panels.

    The top panel shows the raw heatmap, its single correction and its
    iterative correction; the bottom panel shows the same three curves for
    the heatmap built with (uniform) fragment weights. Every curve is
    normalised by its own mean.
    """
    TR = HiCdataset()
    TR.load("GM-all.refined")
    hm = TR.buildHeatmap(1, 1, 1000000, False, False)
    TR.calculateWeights()
    # Uniform weights: correct just by fragment density, not by length
    # dependence.
    TR.weights = np.ones(len(TR.weights), float)
    hm2 = TR.buildHeatmap(1, 1, 1000000, False, weights=True)
    hm2[np.isnan(hm2)] = 0
    mask = np.sum(hm, axis=0) > 0

    def coverage(matrix):
        # column sums restricted to the bins that have raw coverage
        return np.sum(matrix, axis=0)[mask]

    def setTickFontSize(axes, size):
        # apply one font size to the tick labels of both axes
        for getLabels in (axes.get_xticklabels, axes.get_yticklabels):
            for tickLabel in getLabels():
                tickLabel.set_fontsize(size)

    # p1-p6 are the six lines to be plotted; everything below is plotting.
    p1 = coverage(hm)
    p3 = coverage(correct(hm))
    p5 = coverage(ultracorrect(hm, 40))
    p4 = coverage(correct(hm2))
    p2 = coverage(hm2)
    p6 = coverage(ultracorrect(hm2, 40))

    matplotlib.rcParams['font.sans-serif'] = 'Arial'
    dashstyle = (3, 3)
    plt.figure(figsize=(4, 4))

    # ---- top panel: heatmap built without weights ----
    ax = plt.subplot(2, 1, 1)
    plt.xlim((0, 80))
    plt.ylim((0, 2))
    plt.ylabel("Total coverage", fontsize=8)

    line21 = plt.plot(p1 / p1.mean(), "-", linewidth=1, color="#e5a826")[0]
    line22 = plt.plot(p3 / p3.mean(), "--", linewidth=1, color="#e5a826")[0]
    line22.set_dashes(dashstyle)
    line23 = plt.plot(p5 / p5.mean(), linewidth=1, color="grey")[0]

    setTickFontSize(ax, 8)
    legend = plt.legend(
        [line21, line22, line23],
        ["Raw data", "Single correction", "Iterative correction"],
        prop={"size": 6}, loc=1, handlelength=2)
    legend.draw_frame(False)
    removeAxes(shift=0, ax=ax)

    # Hide the spines and draw plain black axis lines instead.
    for spine in ax.spines.values():
        spine.set_color('none')
    ax.axhline(linewidth=1, color='black')
    ax.axvline(linewidth=1, color='black')

    # ---- bottom panel: heatmap built with weights ----
    ax2 = plt.subplot(2, 1, 2, sharex=ax)
    plt.xlim((0, 80))
    plt.ylim((0, 2))
    plt.xlabel("Position on chom 1 (MB)", fontsize=8)
    plt.ylabel("Total coverage", fontsize=8)

    line1 = plt.plot(p4 / p4.mean(), "--", color="#9b3811", linewidth=1)[0]
    line1.set_dashes(dashstyle)
    line2 = plt.plot(p2 / p2.mean(), "-", color="#9b3811", linewidth=1)[0]
    line3 = plt.plot(p6 / p6.mean(), linewidth=1, color="grey")[0]

    setTickFontSize(ax2, 8)

    legend = plt.legend(
        [line2, line1, line3],
        ["HindIII corrected", "Single correction", "Iterative correction"],
        prop={"size": 6}, loc=1, handlelength=2)
    legend.draw_frame(False)
    removeAxes(shift=0, ax=ax2)
    plotting.niceShow()
Ejemplo n.º 9
0
import numpy
np = numpy
from mirnylib import genome
from mirnylib.numutils import adaptiveSmoothing, trunc, ultracorrect
from mirnylib.h5dict import h5dict

genomeDb = genome.Genome('../data/caul', chrmFileTemplate="%s.fa", readChrms=[])

# The "caul" directory contains one entry per experiment name; the filtered
# data themselves are read from (and heatmaps written to) the "data" folder.
for expName in os.listdir("caul"):
    prefix = "data/" + expName

    # In-memory dataset, as the data are small (<1e8 reads).
    TR = HiCdataset("bla", genome=genomeDb, inMemory=True)
    TR.load(prefix + "_refined.frag")  # load filtered data

    # Save heatmaps at several resolutions; the finer ones use fragment
    # overlap.
    for kb in (5, 10, 20):
        TR.saveHeatmap(prefix + "-%dk_overlap.hm" % kb,
                       kb * 1000,
                       useFragmentOverlap=True)
    TR.saveHeatmap(prefix + "-50k.hm", 50000)
Ejemplo n.º 10
0
from mirnylib.systemutils import fmap,setExceptionHook
from mirnylib.genome import Genome 
import numpy as np 
import os
import sys

genomeName = "GalGal5filtered"
genome_db = Genome(
    "/mnt/storage/home/vsfishman/HiC/fasta/GalGal5/GCF_000002315.4_Gallus_gallus-5.0_assembly_structure/Primary_Assembly/galGal5_all_contigs.filtered/",
    readChrms=[],
    chrmFileTemplate="N%s.fa",
)

basefolder = "/mnt/storage/home/vsfishman/HiC/data/chick/mapped-GalGal5filtered/B1_TTAGGC_L001_/"
filename = "chunk0001.hdf5"
# Common prefix of the input chunk and of every output derived from it.
chunkPath = basefolder + filename

# Parse the mapped chunk into a new on-disk dataset.
TR = HiCdataset(chunkPath + ".HiCdataset",
                genome=genome_db,
                maximumMoleculeLength=500,
                enzymeName="HindIII",
                tmpFolder="tmp",
                mode='w')
TR.parseInputData(dictLike=chunkPath)

# Filters, followed by their statistics.
TR.filterDuplicates()
TR.filterLarge(10000, 10)
TR.filterExtreme(cutH=0.001, cutL=0)
TR.writeFilteringStats()
TR.printMetadata(saveTo=chunkPath + ".stat")

# Whole-genome heatmap with 1,000,000 bp bins.
TR.saveHeatmap(chunkPath + ".hm-res-1000kb", 1000000)
comment ="""
        #------------------------End set of filters applied----------

    print("----->Building Raw heatmap at different resolutions")
    TR.printStats()
    for res in wholeGenomeResolutionsKb:    
        TR.saveHeatmap(out_file + "-{0}k.hm".format(res), res*1000)
Ejemplo n.º 11
0
# Now merge the different experiments all together.
# Note that the first column is not used here, as it is a replica.
experiments = set([(i[0], i[2], i[3]) for i in combinedExperimentNames])
print experiments

for experiment in experiments:
    workingGenome = experiment[1]
    myExperimentNames = [
        i[1] + "_refined.frag" for i in combinedExperimentNames
        if (i[0], i[2], i[3]) == (experiment[0], experiment[1], experiment[2])
    ]
    assert len(myExperimentNames) > 0
    if len(myExperimentNames) > 1:
        #If we have more than one experiment (replica) for the same data, we can combine.
        TR = HiCdataset(os.path.join(
            workingGenome,
            "%s-all-%s_refined.frag" % (experiment[0], experiment[2])),
                        genome=genomeFolder(workingGenome))
        statSaveName = os.path.join(
            "statistics", workingGenome,
            "%s-all-%s_refined.stat" % (experiment[0], experiment[2]))

        TR.merge(myExperimentNames)
        TR.printMetadata(saveTo=statSaveName)
        for res in wholeGenomeResolutionsKb:
            TR.saveHeatmap(
                os.path.join(
                    workingGenome, "%s-all-%s-{0}k.hm" %
                    (experiment[0], experiment[2])).format(res), res * 1000)
        for res in byChromosomeResolutionsKb:
            TR.saveByChromosomeHeatmap(
                os.path.join(
Ejemplo n.º 12
0
from mirnylib.genome import Genome

import os

resolution = 10000

genomeName = "mm10"
genome_db = Genome("/mnt/storage/home/vsfishman/HiC/fasta/" + genomeName,
                   readChrms=["#", "X", "Y"])

data_folder = "/mnt/storage/home/vsfishman/HiC/data/mESC/mapped-mm10/mm10/"
fdataset_fname = "mESC-all-NcoI_refined.frag"
setExceptionHook()

print("Loading HiCdataset")
TR = HiCdataset(data_folder + fdataset_fname,
                enzymeName="HindIII",
                mode='r',
                genome=genome_db)

# Only the high-resolution heatmaps with overlaps are produced here. The
# plain saveHeatmap calls (1000k, 40k, 10k with fragment overlap) and the
# by-chromosome heatmap that this script used to contain are disabled.
for resolution in [10000, 25000]:
    print("Saving heatmap")
    hiResName = "%s%s_res%sk_hiRes.hm" % (data_folder, fdataset_fname,
                                          resolution / 1000)
    TR.saveHiResHeatmapWithOverlaps(hiResName, resolution)
Ejemplo n.º 13
0
            out.write(f.split(".")[0] + "\t" + str(chr_len) + "\n")
else:
    for i in xrange(genome_db.chrmCount):
        if genome_db.chrmCount > 100:
            out.write("N" + genome_db.idx2label[i] + "\t" +
                      str(genome_db.chrmLens[i]) + "\n")
        else:
            out.write(genome_db.idx2label[i] + "\t" +
                      str(genome_db.chrmLens[i]) + "\n")
out.close()

# Step 2. Load the fragments and save them in Juicebox format.

# Copy the fragment dataset into a temporary in-memory dataset whose name
# carries a random suffix.
Fr = HiCdataset(fr_daraset + ".tmp" + str(random.randint(0, 10000)),
                enzymeName=enzyme,
                mode='w',
                genome=genome_db,
                inMemory=True)
Fr.load(fr_daraset)

out = open(out_file_prefix + ".contacts", "w")
print "Reporting ", Fr.N, " contacts"
# Number of contacts per progress dot (1/500 of all contacts).
dotsize = Fr.N / 500
print "one dot is ", dotsize, " contacts"

# Per-read vectors for side 1 of every contact, cast to fixed-width dtypes.
strands1 = np.array(Fr._getVector("strands1"), dtype=np.int8)
cuts1 = np.array(Fr._getVector("cuts1"), dtype=np.uint64)
chrms1 = np.array(Fr._getVector("chrms1"), dtype=np.uint16)
fragids1 = np.array(Fr._getVector("rfragAbsIdxs1"), dtype=np.uint32)

# Side 2 of every contact.
strands2 = np.array(Fr._getVector("strands2"), dtype=np.int8)
def refine_dataset(filenames, niceval, delete=True, parse_in_memory=True):
    """
    Parse the mapped reads of each replicate (in parallel), merge the parsed
    files, filter the merged data and save heatmaps.

    Parameters
    ----------
    filenames : sequence
        filenames[0] is a list of filenames of incoming files
        filenames[1] is the path prefix of the outgoing files
        filenames[2] is a working genome (passed to ``genomeFolder``)
        filenames[3] is an enzyme for a given experiment
    niceval : int
        Niceness value passed to every ``parse_mapped_reads`` job.
    delete : bool, optional
        If True, delete parsed files after merging.
        Man, these files may be huge... if you don't have a 10TB RAID, this may be useful.
    parse_in_memory : bool, optional
        Perform parsing input files in memory.
    """

    in_files = filenames[0]
    out_file = filenames[1]

    stat_folder = os.path.join("statistics", out_file)

    working_genome = filenames[2]
    enzyme = filenames[3]

    parsed_files = [i + "_parsed.frag" for i in in_files]

    # Parse every input file in its own job; all jobs share the same
    # settings, only the input file differs.
    Parallel(n_jobs=20)(
        delayed(parse_mapped_reads)(infile, niceval, working_genome, enzyme,
                                    stat_folder, parse_in_memory)
        for infile in in_files)

    # Merge in all parsed files from one experiment
    print("Merging files all together, applying filters...")
    TR = HiCdataset(ensure(out_file + "_merged.frag"),
                    genome=genomeFolder(working_genome), enzymeName=enzyme, tmpFolder="tmp", dictToStoreIDs="h5dict",
                    mode="w")
    TR.merge(parsed_files)

    if delete:  # cleaning up parsed files
        for delFile in parsed_files:
            os.remove(delFile)
    print("done!")
    print("Filtering merged data...")
    # Filtering is done on a fresh "_refined" dataset, so the merged file
    # stays untouched.
    TR = HiCdataset(out_file + "_refined.frag", enzymeName=enzyme,
                    genome=genomeFolder(working_genome), tmpFolder="tmp", dictToStoreIDs="h5dict",
                    mode='w')
    TR.load(out_file + "_merged.frag")

    # ----------------------------Set of filters applied -------------
    TR.filterDuplicates()
    TR.filterLarge(10000, 10)
    TR.filterExtreme(cutH=0.001, cutL=0)
    TR.writeFilteringStats()
    TR.printMetadata(saveTo=stat_folder + ".stat")
    print("done!")
    # ------------------------End set of filters applied----------

    print("Building heatmaps at specified resolutions...")
    TR.printStats()
    for res in whole_genome_resolutions_Kb:
        TR.saveHeatmap(out_file + "-{0}k.hm".format(res), res * 1000)

    for res in by_chromosome_resolutions_Kb:
        TR.saveByChromosomeHeatmap(out_file + "-{0}k.byChr".format(res), res * 1000)

    for res in hi_res_with_overlap_resolutions_Kb:
        TR.saveHiResHeatmapWithOverlaps(out_file + "-{0}k_HighRes.byChr".format(res), res * 1000)

    # NOTE(review): `skip` is defined outside this function; if it is 0 the
    # slice [:-skip] is empty and no super-high-resolution map is written.
    # These maps also use the same file-name pattern as the loop above, so a
    # resolution present in both lists would overwrite its file - confirm.
    for res in super_hi_res_with_overlap_resolutions_Kb[:-skip]:
        TR.saveSuperHighResMapWithOverlaps(out_file + "-{0}k_HighRes.byChr".format(res), res * 1000)
    print("done!")
def parse_mapped_reads(onename, niceval, working_genome, enzyme, stat_folder, in_memory):
    """
    Parse the given h5 mapped reads, output to a partial fragment file.

    :param onename: (string) the name of this fragment
    :param niceval: (int) positive int to rank the priority of this job
    :param working_genome: (string) name of the genome against which we mapped the reads
    :param enzyme: (string) name of the restriction enzyme used to cut fragments
    :param stat_folder: (string) folder in which to write fragment mapping stats
    :param in_memory: (bool) if true, parse in an in-memory dataset and save it
        afterwards; otherwise parse directly into the destination file
    :return: none explicit, h5 file is saved to disk
    """

    # set the niceness of this sub-process:
    os.nice(niceval)

    # Reseed the generator; the in-memory branch draws a random dataset name.
    np.random.seed()
    finalname = onename + "_parsed.frag"

    # Parsing individual files, either in memory or on disk
    if in_memory:
        TR = HiCdataset("bla" + str(np.random.randint(100000000000)), genome=genomeFolder(working_genome),
                        maximumMoleculeLength=500, enzymeName=enzyme, tmpFolder="tmp",
                        inMemory=True)  # remove inMemory if you don't have enough RAM

        TR.parseInputData(dictLike=onename)
        print(onename)
        TR.save(ensure(finalname))

        # Statistics go to stat_folder/<folder of onename>/<file name>.stat
        folder, fname = os.path.split(onename)
        statSubFolder = os.path.join(stat_folder, folder)

        TR.printMetadata(saveTo=ensure(os.path.join(statSubFolder, fname + ".stat")))
    else:
        # Create dataset at destination, parse on HDD, then no need to save.
        TR = HiCdataset(ensure(finalname),
                        genome=genomeFolder(working_genome), enzymeName=enzyme, tmpFolder="tmp",
                        maximumMoleculeLength=500, mode='w')
        TR.parseInputData(dictLike=onename, enzymeToFillRsites=enzyme)
        TR.printMetadata(saveTo=ensure(os.path.join(stat_folder, onename + ".stat")))
Ejemplo n.º 16
0
def refineDataset(filenames, create=True, delete=True, parseInMemory=True):
    """
    Parse, merge and filter the files of one experiment, then save raw and
    "breaks" heatmaps at 200k, 500k, 1M and 2M resolution.

    Parameters
    ----------

    filenames[0] is a list of filenames of incoming files
    filenames[1] is the path prefix of the outgoing files
    filenames[2] is a working genome name, which is also the name of output directory

    create : bool, optional
        If True, parse each file.
        If False, assume that files were already parsed
        (e.g. if you are just playing around with filtering parameters)
    delete : bool, optional
        If True, delete parsed files after merging.
        Man, these files may be huge... if you don't have a 10TB RAID, this may be useful.
    parseInMemory : bool, optional
        Perform parsing input files in memory.

    """
    in_files = filenames[0]
    out_file = filenames[1]
    workingGenome = filenames[2]

    # (file-name label, resolution in bp) of every heatmap saved below
    heatmapResolutions = (("200k", 200000), ("500k", 500000),
                          ("1M", 1000000), ("2M", 2000000))

    if not os.path.exists(workingGenome):
        try:
            os.mkdir(workingGenome)
        except OSError:
            # Only a failure to create the directory is expected here.
            print("Cannot create working directory")
            exit()

    if create:  # if we need to parse the input files (.hdf5 from mapping).
        for onename in in_files:
            # Parsing individual files
            if not os.path.exists(source(onename)):
                raise StandardError("path not found: %s" % onename)
            if parseInMemory:
                # create dataset in memory, parse and then save to destination
                TR = HiCdataset("bla", genome=genomeFolder(workingGenome),
                                maximumMoleculeLength=500, override=True,
                                inMemory=True)  # remove inMemory if you don't have enough RAM

                TR.parseInputData(dictLike=source(onename))
                TR.save(onename + "_parsed.frag")
            else:
                # Create dataset at destination, parse on HDD, then no need to save.
                TR = HiCdataset(onename + "_parsed.frag",
                                genome=genomeFolder(workingGenome),
                                maximumMoleculeLength=500, override=True)
                TR.parseInputData(dictLike=source(onename))

        # Merging files all together, applying filters
        TR = HiCdataset(out_file + "_merged.frag",
                        genome=genomeFolder(workingGenome),
                        override=True)

        # Merge in all parsed files from one experiment
        parsedFiles = [i + "_parsed.frag" for i in in_files]
        TR.merge(parsedFiles)

        if delete:  # cleaning up parsed files
            for delFile in parsedFiles:
                os.remove(delFile)
        TR.flush()

        # Now opening new dataset for refined data, and performing all the
        # filtering
        TR = HiCdataset(out_file + "_refined.frag",
                        genome=genomeFolder(workingGenome),
                        override=True)
        TR.load(out_file + "_merged.frag")
        #----------------------------Set of filters applied -------------
        TR.filterRsiteStart(offset=5)
        TR.filterDuplicates()
        TR.filterLarge()
        TR.filterExtreme(cutH=0.005, cutL=0)
        #------------------------End set of filters applied----------

    else:
        # If merging & filters has already been done, just load files
        TR = HiCdataset(out_file + "_working.frag",
                        override=True, genome=genomeFolder(workingGenome))
        TR.load(out_file + "_refined.frag")

    print("----->Building Raw heatmap at two resolutions")
    TR.printStats()
    for label, resolution in heatmapResolutions:
        TR.saveHeatmap(out_file + "-%s.hm" % label, resolution)

    print("----->Building RB heatmap")
    TR = HiCdataset(out_file + "_breaks.frag", genome=genomeFolder(
        workingGenome), override=True)
    TR.load(out_file + "_refined.frag")
    # NOTE(review): `*` binds tighter than `+`, so this mask is
    # (dists1 > max) + ((dists2 > max) * DS) - confirm that
    # ((dists1 > max) + (dists2 > max)) * DS was not intended.
    TR.maskFilter((TR.dists1 > TR.maximumMoleculeLength) + (TR.dists2 >
                                                            TR.maximumMoleculeLength) * TR.DS)
    TR.printStats()
    for label, resolution in heatmapResolutions:
        TR.saveHeatmap(out_file + "-%s-breaks.hm" % label, resolution)
Ejemplo n.º 17
0

# Now run refineDataset for each experiment.
for i in byExperiment:
    refineDataset(i, create=True, delete=True)

# Now merge the replicas of every (experiment name, genome) pair together.
experiments = set([(i[0], i[2]) for i in newExperimentNames])


for experiment in experiments:
    workingGenome = experiment[1]
    # Match on both the experiment name and the genome: experiments are keyed
    # by that pair, so matching on the name alone would mix files of
    # same-named experiments mapped to different genomes.
    myExperimentNames = [i[1] + "_refined.frag" for i in newExperimentNames
                         if (i[0], i[2]) == experiment]
    assert len(myExperimentNames) > 0
    if len(myExperimentNames) > 1:
        # More than one replica: combine them and save the merged heatmaps.
        TR = HiCdataset(os.path.join(workingGenome, "%s-all_refined.frag" %
                                     experiment[0]), genome=genomeFolder(workingGenome))
        TR.merge(myExperimentNames)
        for label, res in (("100k", 100000), ("200k", 200000),
                           ("500k", 500000), ("1M", 1000000)):
            TR.saveHeatmap(os.path.join(
                workingGenome, "%s-all-%s.hm" % (experiment[0], label)), res)



#map(refine_paper,
#        [((source("SRR027961"),
#       source("SRR027960")),   os.path.join(workingGenome, "GM-NcoI-%s" % workingGenome ),"NcoI"),
Ejemplo n.º 18
0
# Build N synthetic read pairs. N and mygenome are defined outside this
# fragment.
# Side 1: a random chromosome index in [0, 22) for every read.
chrms1 = np.random.randint(0,22,N)
# Side 2 starts on the same chromosome; about half of the pairs are then
# redrawn to a random chromosome.
chrms2 = chrms1.copy()
mask = np.random.random(N) < 0.5
chrms2[mask] = np.random.randint(0,22,mask.sum())
# Side-1 position: uniform between 10% and 90% of the chromosome length.
pos1 = np.array(np.array((0.1 + 0.8 * np.random.random(N)) * mygenome.chrmLens[chrms1]), dtype=int)
# Signed offset whose magnitude is log-uniform between e^3 and 1,000,000.
offset1 = np.exp(3 + np.random.random(N) * (np.log(1000000) - 3)) * (2 * (np.random.random(N)>0.5) - 1 )
pos2 = np.array(pos1 + offset1, dtype=int)

# Random strands for both sides.
strands1 = np.random.random(N) > 0.5
strands2 = np.random.random(N) > 0.5



# Dict-like input with the keys passed to parseInputData below.
mydict = {"chrms1":chrms1, "chrms2":chrms2,"cuts1":pos1,"cuts2":pos2,"strands1":strands1,"strands2":strands2}

TR = HiCdataset("bla", genome=mygenome, enzymeName="MboI",maximumMoleculeLength=500, inMemory=True)
print "\nTesting loading new data without rsite information    "
TR.parseInputData(dictLike=mydict,
                  enzymeToFillRsites="MboI")
TR.filterLarge(cutlarge=50000, cutsmall=100)

# Plot the scaling of the synthetic data on log-log axes.
sc = TR.plotScaling()
print sc
plt.title("Scaling should be 1/x")
plt.plot(*sc)
plt.xscale("log")
plt.yscale("log")
plt.show()


Ejemplo n.º 19
0
def refine_paper(filename, create=True):
    """Regression test of the HiCdataset parse / merge / filter / heatmap path.

    filename[0] is a list of filenames of incoming files
    filename[1] is a path prefix for outgoing files

    create : bool, optional
        If True, parse every incoming file, merge the parsed files and run
        the filters on an in-memory "refined" dataset.
        NOTE(review): when create is False, ``TR`` is never assigned before
        it is used below, so the call fails with a NameError - confirm.

    Raises ValueError when the resulting metadata differ from the pickled
    reference in "sampleMetadata"; the heatmap checks use plain asserts.
    """
    if create == True:
        for onename in filename[0]:
            #Parsing individual files
            if not os.path.exists(onename):
                raise StandardError("path not found: %s" % onename)
            TR = HiCdataset("bla",
                            genome=genomeFolder,
                            enzymeName="HindIII",
                            maximumMoleculeLength=500,
                            inMemory=True)
            print "\nTesting loading new data without rsite information    "
            TR.parseInputData(dictLike=onename, enzymeToFillRsites="HindIII")
            #assert len(TR.DS) == 856143

            #assert len(TR.ufragments) == 634572
            TR.save(onename + "_parsed.frag")

        #Merging files all together, applying filters
        TR = HiCdataset(filename[1] + "_merged.frag",
                        enzymeName="HindIII",
                        genome=genomeFolder,
                        mode="w")
        TR.merge([i + "_parsed.frag" for i in filename[0]])

        # All filtering below happens on a separate in-memory dataset.
        TR = HiCdataset("refined",
                        genome=genomeFolder,
                        enzymeName="HindIII",
                        mode="w",
                        inMemory=True)

        print "\nTesting chunking during all tests"
        TR.chunksize = 30000
        #because we do many operations, we disable autoFlush here
        TR.load(filename[1] + "_merged.frag")

        print "\nTesting Rsite filter"
        TR.filterRsiteStart(offset=5)

        #assert len(TR.DS) == 832110

        print "\nTesting duplicate filter"
        TR.filterDuplicates(chunkSize=30000)

        #assert len(TR.DS) == 830275

        print "\nTesting small/large and extreme fragment filter"
        TR.filterLarge()

        #assert len(TR.DS) == 825442
        TR.filterExtreme(cutH=0.005, cutL=0)
        TR.writeFilteringStats()

        #assert len(TR.DS) == 803845

    #-------------------------------------------
    # Compare the metadata of this run, key by key, with the pickled
    # reference.
    TR.printMetadata(saveTo="metadata")
    import cPickle

    stop = False
    # NOTE(review): the file handle is never closed and is opened in text
    # mode.
    mdata = cPickle.load(open("sampleMetadata"))
    for i in sorted(mdata.keys()):
        if TR.metadata[i] != mdata[i]:
            print "Key {0} is not consistent: should be {1}, is {2}".format(
                i, mdata[i], TR.metadata[i])
            stop = True
    if stop == True:
        print("""------------_ERROR_--------------
        Inconsistent metadata: see above
        ----------------------------------------""")
        raise ValueError("Inconsistent Metadata")

    print "Testing allxall and by-chromosome heatmap counting diagonal twice"

    print "----> saving allxall heatmap"
    TR.saveHeatmap(filename[1] + "-1M.hm", 1000000, countDiagonalReads="twice")
    a = h5dict(filename[1] + "-1M.hm")
    # Bin ranges of chromosomes 1 and 2 inside the all-by-all heatmap.
    st, end = TR.genome.chrmStartsBinCont[1], TR.genome.chrmEndsBinCont[1]
    st2, end2 = TR.genome.chrmStartsBinCont[2], TR.genome.chrmEndsBinCont[2]
    chrom1 = a["heatmap"][st:end, st:end]
    chrom12 = a["heatmap"][st:end, st2:end2]
    setExceptionHook()
    print "----> saving by chromosome heatmap"
    # The by-chromosome heatmap is written to the same file name as the
    # all-by-all heatmap above; chrom1/chrom12 were extracted before this
    # call.
    TR.saveByChromosomeHeatmap(filename[1] + "-1M.hm",
                               resolution=1000000,
                               includeTrans=True,
                               countDiagonalReads="twice")

    b = h5dict(filename[1] + "-1M.hm")["1 1"]
    bb = h5dict(filename[1] + "-1M.hm")["1 2"]
    assert (b - chrom1).sum() == 0
    print "Cis heatmap consistent"
    assert (bb - chrom12).sum() == 0
    print 'Trans heatmap consistent'
    print a["heatmap"][::10, ::10].sum()
    #assert  a["heatmap"][::10, ::10].sum() == 21800
    print "Heatmap sum correct\n"

    #---------------------------------
    print "Testing allxall and by-chromosome heatmap counting diagonal once"

    TR.saveHeatmap(filename[1] + "-1M.hm", 1000000, countDiagonalReads="once")
    Ta = h5dict(filename[1] + "-1M.hm")
    st, end = TR.genome.chrmStartsBinCont[1], TR.genome.chrmEndsBinCont[1]
    st2, end2 = TR.genome.chrmStartsBinCont[2], TR.genome.chrmEndsBinCont[2]
    chrom1 = Ta["heatmap"][st:end, st:end]
    chrom12 = Ta["heatmap"][st:end, st2:end2]
    setExceptionHook()
    print "----> saving by chromosome heatmap"
    TR.saveByChromosomeHeatmap(filename[1] + "-1M-byChr.hm",
                               resolution=1000000,
                               includeTrans=True,
                               countDiagonalReads="once")

    TR.saveHiResHeatmapWithOverlaps(filename[1] + "-1M-highRes.hm",
                                    resolution=50000,
                                    countDiagonalReads="twice")
    TR.saveSuperHighResMapWithOverlaps(filename[1] + "-5k-SuperHighRes.hm",
                                       resolution=5000,
                                       chromosomes=[14],
                                       countDiagonalReads="twice")

    Tb = h5dict(filename[1] + "-1M-byChr.hm")["1 1"]
    Tbb = h5dict(filename[1] + "-1M-byChr.hm")["1 2"]
    assert ((Tb - chrom1) == 0).all()
    assert ((Tbb - chrom12) == 0).all()
    # Adding the diagonal of the "once" map a second time must reproduce the
    # "twice" map.
    assert ((Tb + np.diag(np.diag(Tb))) == b).all()
    print "Diagonal counting methods are consistent\n"

    # Double the diagonal of the "once" map so that it can be compared with
    # the high-resolution map, which was saved with the diagonal counted
    # twice.
    newchrom1 = chrom1.copy()
    for i in xrange(len(newchrom1)):
        newchrom1[i, i] = 2 * newchrom1[i, i]

    Tb = h5dict(filename[1] + "-1M-highRes.hm")["1 1"]
    assert np.abs(Tb.sum() - newchrom1.sum()) < 1
    # 50 kb bins coarse-grained by a factor of 20 give the 1 Mb bins.
    assert np.sum(np.abs(coarsegrain(Tb, 20, True) - newchrom1)) < 500

    #------------------------------
    print "Testing updateGenome method"
    from mirnylib.genome import Genome
    # Flag array indexed by chromosome index: entries 1-4 and 22 are 1.
    removeChromIDs = np.array([0, 1, 1, 1, 1] + [0] * 17 + [1] + [0])
    #print ((removeChromIDs[TR.chrms1] == 1) + (removeChromIDs[TR.chrms2] == 1) ).sum()
    # Expected number of reads after the update: both sides on flagged
    # chromosomes, or side 1 flagged and chrms2 == -1.
    t = ((removeChromIDs[TR.chrms1] == 1) *
         (removeChromIDs[TR.chrms2] == 1)).sum() + (
             (removeChromIDs[TR.chrms1] == 1) * (TR.chrms2 == -1)).sum()
    newGenome = Genome(genomePath=genomeFolder,
                       readChrms=["2", "3", "4", "5", "X"])
    TR.updateGenome(newGenome)
    assert TR.N == t

    a = h5dict(filename[1] + "-1M.hm")["heatmap"]
Ejemplo n.º 20
0
"""

from hiclib.fragmentHiC import HiCdataset
import os

from mirnylib import genome

# Genome object built from '../data/caul'; chromosome files are looked up with
# the "%s.fa" template and readChrms is passed as an empty list.
genomeDb = genome.Genome('../data/caul',
                         chrmFileTemplate="%s.fa",
                         readChrms=[])

# Every entry of the "caul" directory is treated as one experiment.
for expName in os.listdir("caul"):

    TR = HiCdataset(
        "bla",
        genome=genomeDb,
        inMemory=True,
    )  # inMemory, as files are probably small (less than hundreds of millions of reads)
    TR.parseInputData("caul/" + expName,
                      removeSS=True)  # We discard SS in our pipeline now
    # Print the metadata collected so far, before any filter is applied.
    TR.printMetadata()
    TR.filterRsiteStart(
        offset=5
    )  # We still do this filter to avoid strange "dangling end-like" molecules
    TR.filterDuplicates()
    #TR.save(out_file+".dat")
    # NOTE(review): presumably no fragment in this genome exceeds the 300 kb
    # upper cutoff, so only the 100 bp lower cutoff has an effect -- confirm.
    TR.filterLarge(
        cutlarge=300000, cutsmall=100
    )  # Don't filter any large fragments. This was relevant for eukaryotes with
    # their megabase-long stretches of repetitive or unmappable regions
    #TR.filterExtreme(cutH=0.0025, cutL=0)                    # All fragments in Caulobacter seemed to behave normally
Ejemplo n.º 21
0
    pass


# Merge the replicas of every experiment into one combined dataset.
# Each row of combinedExperimentNames is reduced to the key (row[0], row[2], row[3]);
# row[1] is left out of the key because it identifies the individual replica.
experiments = {(i[0], i[2], i[3]) for i in combinedExperimentNames}
print(experiments)

for experiment in experiments:
    # experiment[1] is used as the genome name and as the output directory,
    # experiment[2] is passed on as the enzyme name.
    workingGenome = experiment[1]
    myExperimentNames = [
        i[1] + "_refined.frag"
        for i in combinedExperimentNames
        if (i[0], i[2], i[3]) == experiment
    ]
    assert len(myExperimentNames) > 0
    if not myExperimentNames:
        continue

    # NOTE: the combined dataset is written even when there is a single replica.
    # Common stem "<experiment[0]>-all-<experiment[2]>" of all output files.
    stem = "%s-all-%s" % (experiment[0], experiment[2])
    TR = HiCdataset(os.path.join(workingGenome, stem + "_refined.frag"),
                    genome=getGenome(workingGenome),
                    enzymeName=experiment[2],
                    tmpFolder="tmp",
                    dictToStoreIDs="h5dict")
    statSaveName = os.path.join("statistics", workingGenome,
                                stem + "_refined.stat")

    TR.merge(myExperimentNames)
    TR.printMetadata(saveTo=statSaveName)

    # In each template "{0}" receives the resolution in kb; the save methods
    # themselves are given the resolution in bp.
    for res in wholeGenomeResolutionsKb:
        TR.saveHeatmap(
            os.path.join(workingGenome, stem + "-{0}k.hm").format(res),
            res * 1000)
    for res in byChromosomeResolutionsKb:
        TR.saveByChromosomeHeatmap(
            os.path.join(workingGenome, stem + "-{0}k.byChr").format(res),
            res * 1000)
    for res in HiResWithOverlapResolutionsKb:
        TR.saveHiResHeatmapWithOverlaps(
            os.path.join(workingGenome,
                         stem + "-{0}k_HighRes.byChr").format(res),
            res * 1000)
    for res in SuperHiResWithOverlapResolutionsKb:
        TR.saveSuperHighResMapWithOverlaps(
            os.path.join(workingGenome,
                         stem + "-{0}k_SuperHighRes.byChr").format(res),
            res * 1000)


Ejemplo n.º 22
0
def plotFigure2c():
    """
    Plot normalized per-bin coverage before and after correction.

    Loads "GM-all.refined" and builds two heatmaps with
    ``buildHeatmap(1, 1, 1000000, ...)``: one without weights and one with
    weights that were all reset to 1. For each heatmap three column-sum
    profiles are drawn, each divided by its own mean: the heatmap as built,
    after ``correct`` and after ``ultracorrect(..., 40)``. The top panel shows
    the unweighted heatmap, the bottom panel the weighted one. Columns with
    zero coverage in the unweighted heatmap are left out of every profile.
    """
    tickFont = 8
    dashPattern = (3, 3)

    def setTickFont(axes):
        # Same font size for the tick labels of both axes.
        for tickLabel in axes.get_xticklabels():
            tickLabel.set_fontsize(tickFont)
        for tickLabel in axes.get_yticklabels():
            tickLabel.set_fontsize(tickFont)

    def frameLessLegend(handles, names):
        # Small legend in the upper right corner, drawn without a frame.
        box = plt.legend(handles, names,
                         prop={"size": 6}, loc=1, handlelength=2)
        box.draw_frame(False)

    data = HiCdataset()
    data.load("GM-all.refined")
    rawMap = data.buildHeatmap(1, 1, 1000000, False, False)
    data.calculateWeights()
    # if you want to correct just by fragment density, not by length dependence
    data.weights = np.ones(len(data.weights), float)
    weightedMap = data.buildHeatmap(1, 1, 1000000, False, weights=True)
    weightedMap[np.isnan(weightedMap)] = 0

    rawCoverage = np.sum(rawMap, axis=0)
    covered = rawCoverage > 0

    # p1-p6 are the six lines to be plotted; everything below is plotting only.
    p1 = rawCoverage[covered]
    p3 = np.sum(correct(rawMap), axis=0)[covered]
    p5 = np.sum(ultracorrect(rawMap, 40), axis=0)[covered]
    p4 = np.sum(correct(weightedMap), axis=0)[covered]
    p2 = np.sum(weightedMap, axis=0)[covered]
    p6 = np.sum(ultracorrect(weightedMap, 40), axis=0)[covered]

    matplotlib.rcParams['font.sans-serif'] = 'Arial'
    plt.figure(figsize=(4, 4))

    # ---- top panel: heatmap built without weights ----
    ax = plt.subplot(2, 1, 1)
    plt.xlim((0, 80))
    plt.ylim((0, 2))
    plt.ylabel("Total coverage", fontsize=8)

    rawLine = plt.plot(p1 / p1.mean(), "-", linewidth=1, color="#e5a826")[0]
    singleLine = plt.plot(p3 / p3.mean(), "--", linewidth=1,
                          color="#e5a826")[0]
    singleLine.set_dashes(dashPattern)
    iterLine = plt.plot(p5 / p5.mean(), linewidth=1, color="grey")[0]

    setTickFont(ax)
    frameLessLegend(
        [rawLine, singleLine, iterLine],
        ["Raw data", "Single correction", "Iterative correction"])
    removeAxes(shift=0, ax=ax)

    # Hide the spines and draw plain black lines at x=0 and y=0 instead.
    for spine in ax.spines.values():
        spine.set_color('none')
    ax.axhline(linewidth=1, color='black')
    ax.axvline(linewidth=1, color='black')

    # ---- bottom panel: heatmap built with weights ----
    ax2 = plt.subplot(2, 1, 2, sharex=ax)
    plt.xlim((0, 80))
    plt.ylim((0, 2))
    plt.xlabel("Position on chom 1 (MB)", fontsize=8)
    plt.ylabel("Total coverage", fontsize=8)

    singleLine2 = plt.plot(p4 / p4.mean(), "--", color="#9b3811",
                           linewidth=1)[0]
    singleLine2.set_dashes(dashPattern)
    weightedLine = plt.plot(p2 / p2.mean(), "-", color="#9b3811",
                            linewidth=1)[0]
    iterLine2 = plt.plot(p6 / p6.mean(), linewidth=1, color="grey")[0]

    setTickFont(ax2)
    frameLessLegend(
        [weightedLine, singleLine2, iterLine2],
        ["HindIII corrected", "Single correction", "Iterative correction"])
    removeAxes(shift=0, ax=ax2)
    plotting.niceShow()
Ejemplo n.º 23
0
# Build N synthetic read pairs and check the contact-probability scaling that
# HiCdataset reports for them. N and mygenome are defined earlier in the script.

# First side: chromosome index drawn uniformly from 0..21.
chrms1 = np.random.randint(0,22,N)
# Second side: same chromosome, except that about half of the pairs get an
# independently drawn chromosome.
chrms2 = chrms1.copy()
mask = np.random.random(N) < 0.5
chrms2[mask] = np.random.randint(0,22,mask.sum())
# First cut position: uniform within the central 80% of the chromosome length.
pos1 = np.array(np.array((0.1 + 0.8 * np.random.random(N)) * mygenome.chrmLens[chrms1]), dtype=int)
# Separation: magnitude log-uniform between e**3 and 1,000,000, with a random sign.
offset1 = np.exp(3 + np.random.random(N) * (np.log(1000000) - 3)) * (2 * (np.random.random(N)>0.5) - 1 )
# NOTE(review): the offset is also applied to pairs whose second chromosome was
# redrawn, so pos2 is not checked against the length of chrms2 -- confirm this is fine.
pos2 = np.array(pos1 + offset1, dtype=int)

# Random boolean strand for each side.
strands1 = np.random.random(N) > 0.5
strands2 = np.random.random(N) > 0.5



# Dictionary handed to parseInputData in place of a mapped file.
mydict = {"chrms1":chrms1, "chrms2":chrms2,"cuts1":pos1,"cuts2":pos2,"strands1":strands1,"strands2":strands2}

TR = HiCdataset("bla", genome=mygenome, enzymeName="MboI",maximumMoleculeLength=500, inMemory=True)
print("\nTesting loading new data without rsite information    ")
TR.parseInputData(dictLike=mydict,
                  enzymeToFillRsites="MboI")
TR.filterLarge(cutlarge=50000, cutsmall=100)

sc = TR.plotScaling()

# NOTE(review): g is not used in the visible part of the script.
g = mygenome

print(sc)
# Log-uniform separations have a density proportional to 1/x, hence the title.
plt.title("Scaling should be 1/x")
plt.plot(*sc)

plt.xscale("log")
plt.yscale("log")
Ejemplo n.º 24
0
def refineDataset(filenames, create=True, delete=False, parseInMemory=True):
    """
    Parse, merge and filter the files of one experiment, then save coolers.

    Parameters
    ----------

    filenames[0] is a list of filenames of incoming files
    filenames[1] is a folder for outgoing file
    filenames[2] is a working genome, that is output directory
    filenames[3] is an enzyme for a given experiment


    create : bool, optional
        If True, parse each file, merge the parsed files and filter the result.
        If False, assume that this was already done and just load
        "<filenames[1]>_refined.frag"
        (e.g. if you are just playing around with filtering parameters)
    delete : bool, optional
        If True, delete parsed files after merging.
        Man, these files may be huge... if you don't have a 10TB RAID, this may be useful.
    parseInMemory : bool, optional
        Perform parsing input files in memory.

    One cooler per entry of the module-level ``coolerResolutions`` is written
    to "<filenames[1]>.<resolution>.cool"; statistics go under "statistics/".
    """
    in_files = filenames[0]
    out_file = filenames[1]

    statFolder = os.path.join("statistics", out_file)

    workingGenome = filenames[2]
    enzyme = filenames[3]

    if create:  # if we need to parse the input files (.hdf5 from mapping).

        def parse_onename(onename):
            # Parse one input file into onename + "_parsed.frag".
            # Reseeding makes the random dataset name below differ between
            # calls (presumably needed when calls run in forked workers that
            # share RNG state -- confirm).
            np.random.seed()
            if parseInMemory:
                finalname = onename + "_parsed.frag"

                # create dataset in memory, parse and then save to destination
                TR = HiCdataset(
                    "bla" + str(np.random.randint(100000000000)),
                    genome=getGenome(workingGenome),
                    maximumMoleculeLength=500,
                    enzymeName=enzyme,
                    tmpFolder="tmp",
                    inMemory=True
                )  # remove inMemory if you don't have enough RAM

                TR.parseInputData(dictLike=onename)
                print(onename)
                TR.save(ensure(finalname))

                # Statistics mirror the folder layout of the input file.
                folder, fname = os.path.split(onename)
                statSubFolder = os.path.join(statFolder, folder)
                TR.printMetadata(saveTo=ensure(
                    os.path.join(statSubFolder, fname + ".stat")))
            else:
                # Create dataset at destination, parse on HDD, then no need to save.
                TR = HiCdataset(ensure(onename + "_parsed.frag"),
                                genome=getGenome(workingGenome),
                                enzymeName=enzyme,
                                tmpFolder="tmp",
                                maximumMoleculeLength=500,
                                mode='w')
                TR.parseInputData(dictLike=onename, enzymeToFillRsites=enzyme)
                TR.printMetadata(
                    saveTo=ensure(os.path.join(statFolder, onename + ".stat")))

        for onename in in_files:
            parse_onename(onename)

        # Merging files alltogether, applying filters
        TR = HiCdataset(ensure(out_file + "_merged.frag"),
                        genome=getGenome(workingGenome),
                        enzymeName=enzyme,
                        tmpFolder="tmp",
                        dictToStoreIDs="h5dict",
                        mode="w")
        # Merge in all parsed files from one experiment
        TR.merge([i + "_parsed.frag" for i in in_files])

        if delete:  # cleaning up parsed files
            for delFile in [i + "_parsed.frag" for i in in_files]:
                os.remove(delFile)

        # Now opening new dataset for refined data, and performing all the filtering
        TR = HiCdataset(out_file + "_refined.frag",
                        enzymeName=enzyme,
                        genome=getGenome(workingGenome),
                        tmpFolder="tmp",
                        dictToStoreIDs="h5dict",
                        mode='w')
        TR.load(out_file + "_merged.frag")

        #----------------------------Set of filters applied -------------
        TR.filterDuplicates()
        #TR.save(out_file+".dat")
        #TR.filterExtreme(cutH=0.0001, cutL=0)
        #TR.filterRsiteStart()
        #TR.filterLarge()
        TR.writeFilteringStats()
        TR.printMetadata(saveTo=statFolder + ".stat")

        #------------------------End set of filters applied----------

    else:
        # If merging & filters has already been done, just load files
        TR = HiCdataset(out_file + "_working.frag",
                        enzymeName=enzyme,
                        mode='w',
                        genome=getGenome(workingGenome))
        TR.load(out_file + "_refined.frag")
        TR.printMetadata(saveTo=statFolder + ".stat")

    print("----->Building Raw heatmap at different resolutions")
    TR.printStats()
    for res in coolerResolutions:
        TR.saveCooler(out_file + ".{0}.cool".format(res), res)
Ejemplo n.º 25
0
def doOne(inData):
    """
    Merge the refined datasets of one sample and save the result.

    Parameters
    ----------
    inData : tuple
        ``(name, group)``. ``name`` is the sample name; an empty name is
        skipped. ``group`` is indexed with "genome" and "filenames", and both
        entries must expose ``.values`` (presumably a pandas group -- confirm
        against the caller).

    Returns
    -------
    None. Returns early, after printing a message, when the genome entry is
    empty or when none of the expected "<genome>/<file>_refined.frag" files
    exists. Otherwise the merged dataset, its statistics, a pickle of its
    metadata and one cooler per resolution are written to disk.
    """
    coolResolutions = [
        10000000, 5000000, 2000000, 1000000, 500000, 200000, 100000, 40000,
        20000, 10000, 5000, 2000, 1000
    ]
    name, group = inData
    if name == "":
        return
    print(name)
    gens = group["genome"]
    if len(gens) == 0:
        print("Genome not found")
        return
    genome = gens.values[0]
    out_file = "{1}/{0}_combined".format(name, genome)
    mygen = "/home/magus/HiC2011/data/{0}".format(genome)
    candidates = [
        "{1}/{0}_refined.frag".format(s, genome)
        for s in group["filenames"].values
    ]

    # Only merge the files that are actually present on disk.
    filenames = [f for f in candidates if os.path.exists(f)]
    if len(filenames) == 0:
        print("No filenames found!")
        return
    TR = HiCdataset("bla",
                    mygen,
                    "DpnII",
                    inMemory=True,
                    tmpFolder="/tmp",
                    dictToStoreIDs="dict")
    TR.merge(filenames)
    TR.setSimpleHighResHeatmap()
    TR.writeFilteringStats()
    TR.printMetadata(
        saveTo="statistics/{1}/{0}_combined.frag".format(name, genome))
    # Close the pickle file deterministically instead of leaking the handle.
    with open("statistics/{1}/{0}_combined.pkl".format(name, genome),
              'wb') as metadataFile:
        pickle.dump(TR.metadata, metadataFile)
    TR.save("{1}/{0}_combined_refined.frag".format(name, genome))
    for res in coolResolutions:
        TR.saveCooler(out_file + ".{0}.cool".format(res), res)
Ejemplo n.º 26
0
from hiclib.fragmentHiC import HiCdataset
from mirnylib.systemutils import fmap, setExceptionHook
from mirnylib.genome import Genome

import os
#from defineGenome import getGenome

# Open an existing refined mm10 dataset and save a 25 kb heatmap from it.
genomeName = "mm10"
# NOTE(review): "#" in readChrms presumably selects the numbered chromosomes -- confirm.
genome_db = Genome("/mnt/storage/home/vsfishman/HiC/fasta/" + genomeName,
                   readChrms=["#", "X", "Y"])

data_folder = "/mnt/storage/home/vsfishman/HiC/data/mESC/mapped-mm10/mm10/"
fdataset_fname = "mESC-all-HindIII_refined.frag"
setExceptionHook()

# print is written in call form so the script runs on Python 2 and Python 3.
print("Loading HiCdataset")
TR = HiCdataset(data_folder + fdataset_fname,
                enzymeName="HindIII",
                mode='r',
                genome=genome_db)
# Optional outputs at other resolutions, currently disabled:
#print("Saving heatmap")
#TR.saveHeatmap(data_folder + fdataset_fname + "1000k.hm", 1000000)
#print("Saving heatmap")
#TR.saveHeatmap(data_folder + fdataset_fname + "40k.hm", 40000)
print("Saving heatmap")
TR.saveHeatmap(data_folder + fdataset_fname + "25k.hm",
               25000,
               useFragmentOverlap=True)
#print("Saving Hi-Res heatmap")
#TR.saveHiResHeatmapWithOverlaps(data_folder + fdataset_fname + "10k_hiRes.hm", 10000)
Ejemplo n.º 27
0
def doSupplementaryCoveragePlot():
    """
    Scatter per-bin coverage before and after the ``dists1 > 500`` mask.

    For "GM-all.refined" (left panel) and "GM-NcoI.refined" (right panel) the
    fragment sums are computed twice: on the dataset as loaded, and after
    ``maskFilter(TR.dists1 > 500)`` followed by ``originalFragments()``. Both
    are accumulated into 1,000,000 bp bins and plotted against each other,
    coloured by the values read from the "GC1M" pickle. The Spearman
    correlation of the two coverages is printed for each dataset.
    """
    resolution = 1000000

    def fragmentSums(filename):
        # Load one dataset; return its fragment sums before and after the mask.
        TR = HiCdataset()
        TR.load(filename)
        s1 = TR.fragmentSum(strands=1)
        TR.saveFragments()
        TR.maskFilter(TR.dists1 > 500)
        TR.originalFragments()
        s2 = TR.fragmentSum(strands=1)
        return s1, s2, TR

    def coverage(s1, s2, TR):
        # Bin both fragment sums and draw the scatter plot on the current axes.
        genome = Genome()
        genome.createMapping(resolution)
        # Explicit floor division keeps the bin labels integral on Python 3
        # as well (assumes ufragments and fragIDmult are integers -- confirm).
        label = (genome.chromosomeStarts[TR.ufragments // TR.fragIDmult - 1]
                 + (TR.ufragments % TR.fragIDmult) // resolution)
        counts = np.bincount(label, weights=s1)
        counts2 = np.bincount(label, weights=s2)
        # Close the file once it is read instead of leaking the handle.
        with open("GC1M", 'rb') as gcFile:
            data = cPickle.load(gcFile)
        eigenvector = np.zeros(genome.chromosomeEnds[-1], float)
        inds = np.argsort(counts)
        # Indices of all bins except the 2% with the lowest coverage.
        mask = inds[int(0.02 * len(inds)):]
        # Lay the per-chromosome values of the first 23 chromosomes out along
        # the genome-wide bin axis.
        for chrom in range(23):
            start = genome.chromosomeStarts[chrom]
            eigenvector[start:start + len(data[chrom])] = data[chrom]
        # Values below 35 are raised to 35, which sets the floor of the colour scale.
        eigenvector[eigenvector < 35] = 35
        plt.scatter(counts[mask],
                    counts2[mask],
                    c=eigenvector[mask],
                    s=6,
                    linewidth=0)
        print(stats.spearmanr(counts[mask], counts2[mask]))
        plt.xlabel("Coverage from all reads")
        plt.xticks([0, 5000, 10000, 15000])
        plt.ylabel("Coverage from RBs")
        b = plt.colorbar()
        b.ax.set_xlabel("GC content")

    s1, s2, TR = fragmentSums("GM-all.refined")
    plt.subplot(121)
    plt.title("HinIII")
    coverage(s1, s2, TR)

    s1, s2, TR = fragmentSums("GM-NcoI.refined")
    plt.subplot(122)
    plt.title("NcoI")
    coverage(s1, s2, TR)
    plt.show()
Ejemplo n.º 28
0
# Now running refineDataset for each experiment.
# print is written in call form so the script runs on Python 2 and Python 3.
for i in byExperiment:
    print(i)
    refineDataset(i, create=True, delete=True)


# Now merging different experiments alltogether.
# Each row of combinedExperimentNames is reduced to the key (row[0], row[2], row[3]);
# the second column is not part of the key, as it names the replica.
experiments = set([(i[0], i[2], i[3]) for i in combinedExperimentNames])
print(experiments)

for experiment in experiments:
    # experiment[1] is used as the genome name and as the output directory.
    workingGenome = experiment[1]
    myExperimentNames = [i[1] + "_refined.frag"
                         for i in combinedExperimentNames
                         if (i[0], i[2], i[3]) == experiment]
    assert len(myExperimentNames) > 0
    if len(myExperimentNames) > 1:
        # If we have more than one experiment (replica) for the same data, we can combine.
        # Common stem "<experiment[0]>-all-<experiment[2]>" of all output files.
        stem = "%s-all-%s" % (experiment[0], experiment[2])
        TR = HiCdataset(os.path.join(workingGenome, stem + "_refined.frag"),
                        genome=genomeFolder(workingGenome))
        statSaveName = os.path.join("statistics", workingGenome,
                                    stem + "_refined.stat")

        TR.merge(myExperimentNames)
        TR.printMetadata(saveTo=statSaveName)

        # In each template "{0}" receives the resolution in kb; the save
        # methods themselves are given the resolution in bp.
        for res in wholeGenomeResolutionsKb:
            TR.saveHeatmap(
                os.path.join(workingGenome, stem + "-{0}k.hm").format(res),
                res * 1000)
        for res in byChromosomeResolutionsKb:
            TR.saveByChromosomeHeatmap(
                os.path.join(workingGenome, stem + "-{0}k.byChr").format(res),
                res * 1000)
        for res in HiResWithOverlapResolutionsKb:
            TR.saveHiResHeatmapWithOverlaps(
                os.path.join(workingGenome,
                             stem + "-{0}k_HighRes.byChr").format(res),
                res * 1000)


Ejemplo n.º 29
0
        def parse_onename(onename):
            """
            Parse one input file into ``onename + "_parsed.frag"``.

            Reads ``parseInMemory``, ``workingGenome``, ``enzyme`` and
            ``statFolder`` from the enclosing scope. Statistics of the parsed
            dataset are written below ``statFolder``.
            """
            # Reseeding makes the random dataset name below differ between
            # calls (presumably needed when calls run in forked workers that
            # share RNG state -- confirm).
            np.random.seed()
            if parseInMemory:
                finalname = onename + "_parsed.frag"

                # create dataset in memory, parse and then save to destination
                TR = HiCdataset(
                    "bla" + str(np.random.randint(100000000000)),
                    genome=getGenome(workingGenome),
                    maximumMoleculeLength=500,
                    enzymeName=enzyme,
                    tmpFolder="tmp",
                    inMemory=True
                )  # remove inMemory if you don't have enough RAM

                TR.parseInputData(dictLike=onename)
                print(onename)
                TR.save(ensure(finalname))

                # Statistics mirror the folder layout of the input file.
                folder, fname = os.path.split(onename)
                statSubFolder = os.path.join(statFolder, folder)
                TR.printMetadata(saveTo=ensure(
                    os.path.join(statSubFolder, fname + ".stat")))
            else:
                # Create dataset at destination, parse on HDD, then no need to save.
                TR = HiCdataset(ensure(onename + "_parsed.frag"),
                                genome=getGenome(workingGenome),
                                enzymeName=enzyme,
                                tmpFolder="tmp",
                                maximumMoleculeLength=500,
                                mode='w')
                TR.parseInputData(dictLike=onename, enzymeToFillRsites=enzyme)
                TR.printMetadata(
                    saveTo=ensure(os.path.join(statFolder, onename + ".stat")))
Ejemplo n.º 30
0
from mirnylib.systemutils import fmap, setExceptionHook
from mirnylib.genome import Genome

import os
#from defineGenome import getGenome

# Open an existing refined mm10 dataset and save a 15 kb heatmap and a
# 10 kb high-resolution heatmap from it.
genomeName = "mm10"
# NOTE(review): "#" in readChrms presumably selects the numbered chromosomes -- confirm.
genome_db = Genome("/mnt/storage/home/vsfishman/HiC/fasta/" + genomeName,
                   readChrms=["#", "X", "Y"])

data_folder = "/mnt/storage/home/vsfishman/HiC/data/mESC/mapped-mm10/mm10/"
fdataset_fname = "mESC-all-HindIII_refined.frag"
setExceptionHook()

# print is written in call form so the script runs on Python 2 and Python 3.
print("Loading HiCdataset")
TR = HiCdataset(data_folder + fdataset_fname,
                enzymeName="HindIII",
                mode='r',
                genome=genome_db)
# Optional outputs at other resolutions, currently disabled:
#print("Saving heatmap")
#TR.saveHeatmap(data_folder + fdataset_fname + "1000k.hm", 1000000)
#print("Saving heatmap")
#TR.saveHeatmap(data_folder + fdataset_fname + "40k.hm", 40000)
print("Saving heatmap")
TR.saveHeatmap(data_folder + fdataset_fname + "15k.hm",
               15000,
               useFragmentOverlap=True)
print("Saving Hi-Res heatmap")
TR.saveHiResHeatmapWithOverlaps(data_folder + fdataset_fname + "10k_hiRes.hm",
                                10000)
Ejemplo n.º 31
0
def doSupplementaryCoveragePlot():
    """
    Scatter per-bin coverage before and after the ``dists1 > 500`` mask.

    For "GM-all.refined" (left panel) and "GM-NcoI.refined" (right panel) the
    fragment sums are computed twice: on the dataset as loaded, and after
    ``maskFilter(TR.dists1 > 500)`` followed by ``originalFragments()``. Both
    are accumulated into 1,000,000 bp bins and plotted against each other,
    coloured by the values read from the "GC1M" pickle. The Spearman
    correlation of the two coverages is printed for each dataset.
    """
    resolution = 1000000

    def fragmentSums(filename):
        # Load one dataset; return its fragment sums before and after the mask.
        TR = HiCdataset()
        TR.load(filename)
        s1 = TR.fragmentSum(strands=1)
        TR.saveFragments()
        TR.maskFilter(TR.dists1 > 500)
        TR.originalFragments()
        s2 = TR.fragmentSum(strands=1)
        return s1, s2, TR

    def coverage(s1, s2, TR):
        # Bin both fragment sums and draw the scatter plot on the current axes.
        genome = Genome()
        genome.createMapping(resolution)
        # Explicit floor division keeps the bin labels integral on Python 3
        # as well (assumes ufragments and fragIDmult are integers -- confirm).
        label = (genome.chromosomeStarts[TR.ufragments // TR.fragIDmult - 1]
                 + (TR.ufragments % TR.fragIDmult) // resolution)
        counts = np.bincount(label, weights=s1)
        counts2 = np.bincount(label, weights=s2)
        # Close the file once it is read instead of leaking the handle.
        with open("GC1M", 'rb') as gcFile:
            data = cPickle.load(gcFile)
        eigenvector = np.zeros(genome.chromosomeEnds[-1], float)
        inds = np.argsort(counts)
        # Indices of all bins except the 2% with the lowest coverage.
        mask = inds[int(0.02 * len(inds)):]
        # Lay the per-chromosome values of the first 23 chromosomes out along
        # the genome-wide bin axis.
        for chrom in range(23):
            start = genome.chromosomeStarts[chrom]
            eigenvector[start:start + len(data[chrom])] = data[chrom]
        # Values below 35 are raised to 35, which sets the floor of the colour scale.
        eigenvector[eigenvector < 35] = 35
        plt.scatter(counts[mask], counts2[mask], c=eigenvector[mask],
                    s=6, linewidth=0)
        print(stats.spearmanr(counts[mask], counts2[mask]))
        plt.xlabel("Coverage from all reads")
        plt.xticks([0, 5000, 10000, 15000])
        plt.ylabel("Coverage from RBs")
        b = plt.colorbar()
        b.ax.set_xlabel("GC content")

    s1, s2, TR = fragmentSums("GM-all.refined")
    plt.subplot(121)
    plt.title("HinIII")
    coverage(s1, s2, TR)

    s1, s2, TR = fragmentSums("GM-NcoI.refined")
    plt.subplot(122)
    plt.title("NcoI")
    coverage(s1, s2, TR)
    plt.show()
Ejemplo n.º 32
0
"""
This script is a rip-off of the large mergeDatasets script, with certain adjustments.
Follow the comments throughout the code.
"""

from hiclib.fragmentHiC import HiCdataset
import os

from mirnylib import genome

# Genome object built from '../data/caul'; chromosome files are looked up with
# the "%s.fa" template and readChrms is passed as an empty list.
genomeDb = genome.Genome('../data/caul', chrmFileTemplate="%s.fa", readChrms=[])

# Every entry of the "caul" directory is treated as one experiment.
for expName in os.listdir("caul"):

    TR = HiCdataset("bla", genome=genomeDb, inMemory=True,)  # inMemory, as files are probably small (less than hundreds of millions of reads)
    TR.parseInputData("caul/" + expName, removeSS=True)  # We discard SS in our pipeline now
    TR.printMetadata()
    TR.filterRsiteStart(offset=5)  # We still do this filter to avoid strange "dangling end-like" molecules
    TR.filterDuplicates()
    #TR.save(out_file+".dat")
    # Don't filter any large fragments. This was relevant for eukaryotes with
    # their megabase-long stretches of repetitive or unmappable regions.
    TR.filterLarge(cutlarge=300000, cutsmall=100)
    #TR.filterExtreme(cutH=0.0025, cutL=0)                    # All fragments in Caulobacter seemed to behave normally
    TR.writeFilteringStats()
    # The original referenced an undefined `statFolder` here (NameError on the
    # first iteration). The statistics are now written next to the refined
    # dataset, in the "data/" folder that the saves below already rely on.
    TR.printMetadata(saveTo="data/" + expName + "_refined.stat")
    TR.save("data/" + expName + "_refined.frag")  # Saving filtered dataset

    # Below, saving the dataset as a 5 kb heatmap.
    # Also, using new feature - fragment overlaps - which assigns reads to all bins the fragment crosses.

    TR.saveHeatmap("data/" + expName + "-5k_overlap.hm", 5000, useFragmentOverlap=True)
Ejemplo n.º 33
0
def refineDataset(filenames, create=True, delete=True, parseInMemory=True):
    """
    Parse, merge and filter the files of one experiment, then save heatmaps.

    Parameters
    ----------

    filenames[0] is a list of filenames of incoming files
    filenames[1] is a folder for outgoing file
    filenames[2] is a working genome, that is output directory
    filenames[3] is an enzyme for a given experiment


    create : bool, optional
        If True, parse each file, merge the parsed files and filter the result.
        If False, assume that this was already done and just load
        "<filenames[1]>_refined.frag"
        (e.g. if you are just playing around with filtering parameters)
    delete : bool, optional
        If True, delete parsed files after merging.
        Man, these files may be huge... if you don't have a 10TB RAID, this may be useful.
    parseInMemory : bool, optional
        Perform parsing input files in memory.

    Heatmaps are saved for every resolution listed in the module-level
    ``wholeGenomeResolutionsKb``, ``byChromosomeResolutionsKb`` and
    ``HiResWithOverlapResolutionsKb``; statistics go under "statistics/".
    """
    in_files = filenames[0]
    out_file = filenames[1]

    statFolder = os.path.join("statistics", out_file)

    workingGenome = filenames[2]
    enzyme = filenames[3]

    if create:  # if we need to parse the input files (.hdf5 from mapping).
        for onename in in_files:
            # Parsing individual files
            if parseInMemory:
                # create dataset in memory, parse and then save to destination
                TR = HiCdataset("bla", genome=genomeFolder(workingGenome),
                                maximumMoleculeLength=500,
                                inMemory=True)  # remove inMemory if you don't have enough RAM

                TR.parseInputData(dictLike=onename, enzymeToFillRsites=enzyme)
                # Call form prints the same text on Python 2 and Python 3.
                print(onename)
                TR.save(ensure(onename + "_parsed.frag"))

                # Statistics mirror the folder layout of the input file.
                folder, fname = os.path.split(onename)
                statSubFolder = os.path.join(statFolder, folder)

                TR.printMetadata(saveTo=ensure(os.path.join(statSubFolder, fname + ".stat")))
            else:
                # Create dataset at destination, parse on HDD, then no need to save.
                TR = HiCdataset(ensure(onename + "_parsed.frag"),
                                genome=genomeFolder(workingGenome),
                                maximumMoleculeLength=500, mode='w')
                TR.parseInputData(dictLike=onename, enzymeToFillRsites=enzyme)
                TR.printMetadata(saveTo=ensure(os.path.join(statFolder, onename + ".stat")))

        # Merging files alltogether, applying filters
        TR = HiCdataset(ensure(out_file + "_merged.frag"),
                        genome=genomeFolder(workingGenome),
                        mode="w")
        # Merge in all parsed files from one experiment
        TR.merge([i + "_parsed.frag" for i in in_files])

        if delete:  # cleaning up parsed files
            for delFile in [i + "_parsed.frag" for i in in_files]:
                os.remove(delFile)

        # Now opening new dataset for refined data, and performing all the filtering
        TR = HiCdataset(out_file + "_refined.frag",
                        genome=genomeFolder(workingGenome),
                        mode='w')
        TR.load(out_file + "_merged.frag")

        #----------------------------Set of filters applied -------------
        TR.filterRsiteStart(offset=5)
        TR.filterDuplicates()
        #TR.save(out_file+".dat")
        TR.filterLarge()
        TR.filterExtreme(cutH=0.005, cutL=0)
        TR.writeFilteringStats()
        TR.printMetadata(saveTo=statFolder + ".stat")

        #------------------------End set of filters applied----------

    else:
        # If merging & filters has already been done, just load files
        TR = HiCdataset(out_file + "_working.frag",
                        mode='w', genome=genomeFolder(workingGenome))
        TR.load(out_file + "_refined.frag")
        TR.printMetadata(saveTo=statFolder + ".stat")

    print("----->Building Raw heatmap at different resolutions")
    TR.printStats()
    # Resolutions are listed in kb; the save methods are given bp.
    for res in wholeGenomeResolutionsKb:
        TR.saveHeatmap(out_file + "-{0}k.hm".format(res), res * 1000)
    for res in byChromosomeResolutionsKb:
        TR.saveByChromosomeHeatmap(out_file + "-{0}k.byChr".format(res), res * 1000)
    for res in HiResWithOverlapResolutionsKb:
        TR.saveHiResHeatmapWithOverlaps(out_file + "-{0}k_HighRes.byChr".format(res), res * 1000)
Ejemplo n.º 34
0
def heatmapFromHotFragments(dataset="../../../mouse/data/combined/"
                            "mouse1_merged.frag",
                            workingFile="../../../tcc/working/workingMouse.frag",
                            cacheFile="../../../tcc/working/workingMouseFiltered.frag",
                            genomeFolder="../../../data/mm9",
                            label=""):
    """
    Show heatmaps built only from reads that touch the highest-count fragments.

    The filtered dataset is cached in ``cacheFile``; it is rebuilt from
    ``dataset`` only when that file does not exist. Fragments are ranked by
    ``fragmentSum()``. The left panel shows the log of the 5 Mb heatmap of
    reads with at least one side in a fragment above the 99.8th percentile.
    The right panel shows the log of what the fragments between the 99.6th
    and 99.8th percentiles add on top of that. ``label`` is appended to both
    titles.
    """
    mirnylib.systemutils.setExceptionHook()

    cacheExists = os.path.exists(cacheFile)
    if not cacheExists:
        # Build the filtered dataset once and store it for later runs.
        hot = HiCdataset(workingFile, genomeFolder)
        hot.load(dataset)
        hot.filterRsiteStart(offset=5)
        hot.filterDuplicates()
        hot.filterLarge()
        hot.maskFilter(hot.DS)
        hot.save(cacheFile)

    hot = HiCdataset(workingFile, genomeFolder)
    hot.load(cacheFile)
    totals = hot.fragmentSum()
    cut996, cut998 = numpy.percentile(totals, [99.6, 99.8])
    above996 = hot.ufragments[totals > cut996]
    above998 = hot.ufragments[totals > cut998]

    def touching(fragments):
        # Mask of reads that have at least one side in the given fragments.
        firstSide = arrayInArray(hot.fragids1, fragments)
        secondSide = arrayInArray(hot.fragids2, fragments)
        return firstSide + secondSide

    # The second filter acts on the reads that survived the first one.
    hot.maskFilter(touching(above996))
    hm98100 = hot.buildAllHeatmap(5000000)
    hot.maskFilter(touching(above998))
    hm99100 = hot.buildAllHeatmap(5000000)

    plt.subplot(121)
    plt.imshow(numpy.log(hm99100 + 1), interpolation="nearest")
    plt.colorbar()
    plt.title("log # counts for top .2 % of fragments, " + label)

    plt.subplot(122)
    plt.imshow(numpy.log(hm98100 - hm99100 + 1), interpolation="nearest")
    plt.title("log # counts for second top .2%  (.996 - .998), " + label)
    plt.colorbar()
    plt.show()
Ejemplo n.º 35
0
def refine_paper(filename, create=True):
    """Build a refined HindIII dataset from raw files and self-check it.

    filename[0] is a list of filenames of incoming files.
    filename[1] is the path prefix of every outgoing file
    (``<prefix>_merged.frag``, ``<prefix>-1M.hm``, ``<prefix>-1M-byChr.hm``,
    ``<prefix>-1M-highRes.hm``, ``<prefix>-5k-SuperHighRes.hm``).

    create must be true.  The refined dataset is created with
    ``inMemory=True`` inside this call and every later check uses it, so
    there is nothing to test when creation is skipped.

    The function parses and merges the inputs, applies the filters, compares
    the resulting metadata with the pickled reference in "sampleMetadata",
    checks that all-by-all, by-chromosome and high-resolution heatmaps agree
    for both diagonal-counting modes, and finally checks ``updateGenome``.

    Raises:
        ValueError: create is false, or metadata differs from the reference.
        StandardError: one of the incoming files does not exist.
        AssertionError: one of the consistency checks fails.
    """
    if not create:
        # Previously this fell through to the checks with TR unbound and
        # died with an UnboundLocalError; say what is wrong instead.
        raise ValueError("create=False is not supported: the refined dataset "
                         "is only built in memory by this function")

    for onename in filename[0]:
        # Parsing individual files
        if not os.path.exists(onename):
            raise StandardError("path not found: %s" % onename)
        TR = HiCdataset("bla", genome=genomeFolder, enzymeName="HindIII",
                        maximumMoleculeLength=500, inMemory=True)
        print("\nTesting loading new data without rsite information    ")
        TR.parseInputData(dictLike=onename,
                          enzymeToFillRsites="HindIII")
        #assert len(TR.DS) == 856143

        #assert len(TR.ufragments) == 634572
        TR.save(onename + "_parsed.frag")

    # Merging files all together, applying filters
    TR = HiCdataset(filename[1] + "_merged.frag", enzymeName="HindIII",
                    genome=genomeFolder, mode="w")
    TR.merge([i + "_parsed.frag" for i in filename[0]])

    TR = HiCdataset("refined", genome=genomeFolder, enzymeName="HindIII",
                    mode="w", inMemory=True)

    print("\nTesting chunking during all tests")
    TR.chunksize = 30000
    TR.load(filename[1] + "_merged.frag")

    print("\nTesting Rsite filter")
    TR.filterRsiteStart(offset=5)

    #assert len(TR.DS) == 832110

    print("\nTesting duplicate filter")
    TR.filterDuplicates(chunkSize=30000)

    #assert len(TR.DS) == 830275

    print("\nTesting small/large and extreme fragment filter")
    TR.filterLarge()

    #assert len(TR.DS) == 825442
    TR.filterExtreme(cutH=0.005, cutL=0)
    TR.writeFilteringStats()

    #assert len(TR.DS) == 803845

    #-------------------------------------------
    TR.printMetadata(saveTo="metadata")
    import cPickle

    # Close the reference file as soon as it has been read.
    with open("sampleMetadata") as referenceFile:
        mdata = cPickle.load(referenceFile)

    stop = False
    for i in sorted(mdata):
        if TR.metadata[i] != mdata[i]:
            print("Key {0} is not consistent: should be {1}, is {2}".format(
                i, mdata[i], TR.metadata[i]))
            stop = True
    if stop:
        print ("""------------_ERROR_--------------
        Inconsistent metadata: see above
        ----------------------------------------""")
        raise ValueError("Inconsistent Metadata")

    print("Testing allxall and by-chromosome heatmap counting diagonal twice")

    print("----> saving allxall heatmap")
    TR.saveHeatmap(filename[1] + "-1M.hm", 1000000,
                   countDiagonalReads="twice")
    a = h5dict(filename[1] + "-1M.hm")
    st, end = TR.genome.chrmStartsBinCont[1], TR.genome.chrmEndsBinCont[1]
    st2, end2 = TR.genome.chrmStartsBinCont[2], TR.genome.chrmEndsBinCont[2]
    # Slices of the all-by-all heatmap, taken before the same file name is
    # reused for the by-chromosome heatmap below.
    chrom1 = a["heatmap"][st:end, st:end]
    chrom12 = a["heatmap"][st:end, st2:end2]
    setExceptionHook()
    print("----> saving by chromosome heatmap")
    TR.saveByChromosomeHeatmap(
        filename[1] + "-1M.hm", resolution=1000000, includeTrans=True,
        countDiagonalReads="twice")

    b = h5dict(filename[1] + "-1M.hm")["1 1"]
    bb = h5dict(filename[1] + "-1M.hm")["1 2"]
    # Element-wise comparison: a plain sum of differences would also be zero
    # when positive and negative mismatches cancel out.
    assert ((b - chrom1) == 0).all()
    print("Cis heatmap consistent")
    assert ((bb - chrom12) == 0).all()
    print('Trans heatmap consistent')
    print(a["heatmap"][::10, ::10].sum())
    #assert  a["heatmap"][::10, ::10].sum() == 21800
    print("Heatmap sum correct\n")

    #---------------------------------
    print("Testing allxall and by-chromosome heatmap counting diagonal once")

    TR.saveHeatmap(filename[1] + "-1M.hm", 1000000,
                   countDiagonalReads="once")
    Ta = h5dict(filename[1] + "-1M.hm")
    st, end = TR.genome.chrmStartsBinCont[1], TR.genome.chrmEndsBinCont[1]
    st2, end2 = TR.genome.chrmStartsBinCont[2], TR.genome.chrmEndsBinCont[2]
    chrom1 = Ta["heatmap"][st:end, st:end]
    chrom12 = Ta["heatmap"][st:end, st2:end2]
    setExceptionHook()
    print("----> saving by chromosome heatmap")
    TR.saveByChromosomeHeatmap(
        filename[1] + "-1M-byChr.hm", resolution=1000000, includeTrans=True,
        countDiagonalReads="once")

    TR.saveHiResHeatmapWithOverlaps(
        filename[1] + "-1M-highRes.hm", resolution=50000,
        countDiagonalReads="twice")
    TR.saveSuperHighResMapWithOverlaps(
        filename[1] + "-5k-SuperHighRes.hm", resolution=5000,
        chromosomes=[14], countDiagonalReads="twice")

    Tb = h5dict(filename[1] + "-1M-byChr.hm")["1 1"]
    Tbb = h5dict(filename[1] + "-1M-byChr.hm")["1 2"]
    assert ((Tb - chrom1) == 0).all()
    assert ((Tbb - chrom12) == 0).all()
    # Counting the diagonal "twice" must equal "once" plus the diagonal.
    assert ((Tb + np.diag(np.diag(Tb))) == b).all()
    print("Diagonal counting methods are consistent\n")

    # The high-resolution map was saved with the diagonal counted twice, so
    # double the diagonal of the "once" heatmap before comparing.
    newchrom1 = chrom1 + np.diag(np.diag(chrom1))

    Tb = h5dict(filename[1] + "-1M-highRes.hm")["1 1"]
    assert np.abs(Tb.sum() - newchrom1.sum()) < 1
    assert np.sum(np.abs(coarsegrain(Tb, 20, True) - newchrom1)) < 500

    #------------------------------
    print("Testing updateGenome method")
    from mirnylib.genome import Genome
    # 1 marks the chromosome indices that are listed in readChrms below.
    keptChromFlags = np.array([0, 1, 1, 1, 1] + [0] * 17 + [1] + [0])
    side1Kept = keptChromFlags[TR.chrms1] == 1
    side2Kept = keptChromFlags[TR.chrms2] == 1
    # Expected number of reads after the update: both sides on kept
    # chromosomes, or first side kept and second side equal to -1.
    t = (side1Kept * side2Kept).sum() + (side1Kept * (TR.chrms2 == -1)).sum()
    newGenome = Genome(genomePath=genomeFolder,
                       readChrms=["2", "3", "4", "5", "X"])
    TR.updateGenome(newGenome)
    assert TR.N == t

    # NOTE(review): this value is read but never used afterwards.
    a = h5dict(filename[1] + "-1M.hm")["heatmap"]
Ejemplo n.º 36
0
def plotCisToTransHotFragments(
        dataset="../../../mouse/data/combined/mouse1_merged.frag",
        workingFile="../../../tcc/working/workingMouse.frag",
        cacheFile="../../../tcc/working/workingMouseFiltered.frag",
        genomeFolder="../../../data/mm9", label=None):
    """Plot the cis share of reads as a function of reads per fragment.

    On the first run ``dataset`` is loaded, filtered and saved to
    ``cacheFile``; later runs start from the cache.  For every fragment the
    ``fragmentSum()`` after keeping only same-chromosome reads is divided by
    the ``fragmentSum()`` before that filter.  The ratios are averaged in
    logarithmic bins of the total count and drawn as dots, with vertical
    lines at the 99th, 99.5th and 99.9th percentiles of the total count.

    dataset -- fragment file loaded when the cache does not exist yet
    workingFile -- file name handed to every HiCdataset created here
    cacheFile -- filtered dataset written on the first run, read afterwards
    genomeFolder -- genome argument handed to HiCdataset
    label -- legend label of the plotted series
    """
    mirnylib.systemutils.setExceptionHook()
    if not os.path.exists(cacheFile):
        print("caching parsed data")
        data = HiCdataset(workingFile, genomeFolder)
        data.load(dataset)
        data.filterRsiteStart(offset=5)
        data.filterDuplicates()
        data.filterLarge()
        data.maskFilter(data.DS)
        data.save(cacheFile)

    # Same name on purpose: rebinding drops the object used for caching.
    data = HiCdataset(workingFile, genomeFolder)
    data.load(cacheFile)

    totalPerFragment = data.fragmentSum()

    # Keep only same-chromosome reads, but count them over the fragment
    # list saved before filtering so both sums can be indexed alike.
    data.saveFragments()
    data.maskFilter(data.chrms1 == data.chrms2)
    data.originalFragments()
    cisPerFragment = data.fragmentSum()

    order = numpy.argsort(totalPerFragment)
    sortedTotal = 1. * totalPerFragment[order]
    sortedCis = 1. * cisPerFragment[order]
    cisShare = sortedCis / sortedTotal

    cutoffs = numpy.percentile(sortedTotal, [99, 99.5, 99.9])

    edges = mirnylib.numutils.logbins(1, sortedTotal.max(), 1.08)
    fragmentsPerBin, _ = numpy.histogram(sortedTotal, edges)
    sharePerBin, binEdges = numpy.histogram(sortedTotal, edges,
                                            weights=cisShare)

    # Mean share per bin, drawn at the bin midpoints.
    midpoints = 0.5 * (binEdges[:-1] + binEdges[1:])
    plt.plot(midpoints, sharePerBin / fragmentsPerBin, '.', label=label)

    for cutoff in cutoffs:
        plt.vlines(cutoff, 0, 1)

    plt.xlabel("Counts per fragment")
    plt.ylabel("Cis-to-trans ratio")
    plt.title("Vertical lines are at 99%,99.5% and 99.9% reads per fragment")

    niceShow()