def plotCrossValidation():
    """Main-figure subplot: cross-validation of iterative correction.

    Loads two binned datasets made from a 10% / 90% split of the GM reads
    ("GM-all-10p" / "GM-all-90p", prepared by fragment-level analysis),
    runs iterative correction on both, and scatter-plots the resulting
    per-bin bias vectors against each other.
    """
    matplotlib.rcParams['font.sans-serif'] = 'Arial'
    plt.figure(figsize=(1, 1))  # tiny panel, sized for the paper figure
    # NOTE(review): FG is loaded but never used below - presumably a
    # leftover from fragment-level analysis; confirm before removing.
    FG = HiCdataset(workingFile1, myGenome)
    FG.load(GMFrag)
    Tanay = binnedData(1000000)  # 1 Mb bins
    Tanay.simpleLoad("GM-all-10p", "GM-1")
    #need to create these datasets using fragment-level analysis
    Tanay.simpleLoad("GM-all-90p", "GM-9")
    Tanay.removePoorRegions()
    Tanay.iterativeCorrectWithSS()
    Tanay.removeZeros()
    # Per-bin bias vectors from the 10% and 90% subsets
    b1, b2 = (Tanay.biasDict["GM-1"], Tanay.biasDict["GM-9"])
    # Checkpoint the biases so the plotting half below can be rerun alone
    cPickle.dump((b1, b2), open("CrossValidatioN", 'wb'))
    ax = plt.gca()
    b1, b2 = cPickle.load(open("CrossValidatioN", 'rb'))
    print cr(b1, b2)  # correlation between the two bias vectors
    plt.scatter(b1, b2, s=.7, color="k", linewidth=0)
    plt.xlabel(r"10% reads", fontsize=8)
    plt.ylabel(r"90% reads", fontsize=8)
    plt.xlim((0, 1.5))
    plt.ylim((0, 1.5))
    plt.xticks([0, 0.5, 1, 1.5])
    plt.yticks([0, 0.5, 1, 1.5])
    removeAxes(shift=0)
    fs = 6  # tick-label font size
    for xlabel_i in ax.get_xticklabels():
        xlabel_i.set_fontsize(fs)
    for xlabel_i in ax.get_yticklabels():
        xlabel_i.set_fontsize(fs)
    plt.show()
def getScaling(self):
    """Compute the whole-genome scaling curve for this dataset.

    Opens the refined fragment file read-only and delegates to
    HiCdataset.plotScaling, excluding the two nearest-neighbour fragments
    and normalizing the curve.

    Returns
    -------
    Whatever HiCdataset.plotScaling returns (distance / contact-probability
    curve; exact structure defined by hiclib).
    """
    # BUG FIX(review): the original passed tmpFolder="\tmp", which Python
    # reads as "<TAB>mp" (\t is a tab escape), not a directory name.
    # Use the plain "tmp" folder as sibling code in this file does.
    HD = HiCdataset(self.refined, self.getGenomeObject(), self.getEnzyme(),
                    1000, mode='r', tmpFolder="tmp",
                    dictToStoreIDs="h5dict")
    scal = HD.plotScaling(excludeNeighbors=2, normalize=True, mindist=2000)
    return scal
def getFrags():
    """
    Return boolean masks of restriction fragments covered in the reference
    hg19 and mm9 datasets.

    Used for the correct calculation of scalings: restricting all datasets
    to the same covered-fragment set.

    Returns
    -------
    (humanFrags, mouseFrags) : pair of boolean arrays, one entry per
        restriction fragment, True where fragmentSum() > 0.
    """
    mouse = "/home/magus/HiC2011/DrosophilaSingleCell2015/alternativeFiltering/mm9/oocyte_combined_refined.frag"
    human = "/home/magus/HiC2011/DrosophilaSingleCell2015/alternativeFiltering/hg19/K562_combined_refined.frag"

    def coveredMask(genomeDir, fragFile):
        # In-memory dataset: load the refined fragments and flag every
        # fragment that received at least one read.
        ds = HiCdataset("bla", os.path.join(genomeDir),
                        enzymeName=1000, inMemory=True)
        ds.load(fragFile)
        return ds.fragmentSum() > 0

    humanFrags = coveredMask("../../data/hg19", human)
    mouseFrags = coveredMask("../../data/mm9", mouse)
    return humanFrags, mouseFrags
def getByChromosomeScaling(self):
    """Compute scaling curves separately for each chromosome arm.

    For every chromosome the two arms are delimited by the centromere
    midpoint (cntrMids); each arm's scaling is computed independently.

    Returns
    -------
    dict mapping (chromosome index, arm index 0/1) to the scaling curve
    returned by HiCdataset.plotScaling for that arm.
    """
    # BUG FIX(review): tmpFolder="\tmp" in the original is "<TAB>mp" because
    # \t is a tab escape; replaced with "tmp" as used by sibling code.
    HD = HiCdataset(self.refined, self.getGenomeObject(), self.getEnzyme(),
                    1000, mode='r', tmpFolder="tmp",
                    dictToStoreIDs="h5dict")
    scals = {}
    for chrom in range(self.getGenomeObject().chrmCount):
        for arm in [0, 1]:
            if arm == 0:
                # p-arm: chromosome start to centromere midpoint
                region = (chrom, 0, self.genomeObject.cntrMids[chrom])
            else:
                # q-arm: centromere midpoint to chromosome end
                region = (chrom, self.genomeObject.cntrMids[chrom],
                          self.genomeObject.chrmLens[chrom])
            scal = HD.plotScaling(excludeNeighbors=2, normalize=True,
                                  mindist=2000, regions=[region])
            scals[(chrom, arm)] = scal
    return scals
def doScaling(dataset):
    """Compute scaling curves for one dataset, split by chromosome parity.

    Loads the refined fragment file, keeps only same-chromosome,
    same-strand contacts, and computes three scaling curves restricted to
    the fragments shared across datasets (module-level ``fragdict``):

    sc1 -- arms of even-numbered chromosomes
    sc2 -- arms of odd-numbered chromosomes
    sc3 -- all arms together
    """
    genome = os.path.split(os.path.split(dataset)[0])[-1]
    ds = HiCdataset("bla", os.path.join("../../data/", genome),
                    enzymeName=1000, inMemory=True)
    ds.load(dataset)
    # a.load("../hadjurCohesin2012/mm9/AST-WT-AdCre-R1-Hi ndIII_refined.frag")
    # Keep only cis contacts whose two reads lie on the same strand.
    ds.maskFilter((ds.chrms1 == ds.chrms2) * (ds.strands1 == ds.strands2))

    evenArms = []
    oddArms = []
    for chromosome in range(0, ds.genome.chrmCount):
        # Each chromosome contributes two arms, split at the centromere.
        arms = [(chromosome, 0, ds.genome.cntrMids[chromosome]),
                (chromosome, ds.genome.cntrMids[chromosome],
                 ds.genome.chrmLens[chromosome])]
        if chromosome % 2 == 0:
            evenArms.extend(arms)
        else:
            oddArms.extend(arms)

    frags = fragdict[genome]

    def scaling(regionList):
        # Common plotScaling invocation over the shared fragment set.
        return ds.plotScaling(excludeNeighbors=2, mindist=6000,
                              regions=regionList, plot=False,
                              fragids1=frags, fragids2=frags)

    sc1 = scaling(evenArms)
    sc2 = scaling(oddArms)
    sc3 = scaling(evenArms + oddArms)
    return sc1, sc2, sc3
#filenames = [j for j in os.listdir(path) if j.endswith("SC35-R1-HaeIII_merged.frag")] filenames = [j for j in os.listdir(path) if j.endswith("_refined.frag")] genomeName="hg19" genome_db = getGenome(genomeName) for fl in sorted(filenames): print fl out_file = path + fl.split("_refined.frag")[0] + "_refined_wo_man_dupl" f1 = HiCdataset(path + fl.split("_refined.frag")[0] + "_refined_wo_man_dupl.frag",enzymeName = enzyme, genome=getGenome(workingGenome),tmpFolder = "tmp",dictToStoreIDs="h5dict", mode='w') f1.load(path + fl) #f1 = mirnylib.h5dict.h5dict(path + fl ,'r+') print len(f1.chrms1) cuts1 = list(f1.cuts1) cuts2 = list(f1.cuts2) chrms1 = list(f1.chrms1)
import subprocess
import argparse
import sys

from mirnylib.genome import Genome
from hiclib.fragmentHiC import HiCdataset

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Sort contacts by position and order the reads of each pair so that all "
        "contacts are upper triangular with respect to the chromosome ordering "
        "given by the chromsizes file."
    )
    parser.add_argument("genome", help="hiclib genome path", metavar="GENOME_PATH")
    parser.add_argument("pairs", help="HDF5 hiclib read pairs file", metavar="PAIRS_PATH")
    # BUG FIX(review): the original read args["out"] but never declared the
    # argument, so every invocation died with KeyError. Declare it as an
    # optional output path; when omitted, the input file is sorted in place.
    parser.add_argument("--out", default=None, metavar="OUT_PATH",
                        help="optional output dataset path; sort the input in place if omitted")
    args = vars(parser.parse_args())

    genome_db = Genome(args["genome"])
    infile = args["pairs"]

    if args["out"] is not None:
        # Copy the pairs into a new dataset at the requested path, then sort.
        outfile = args["out"]
        ds = HiCdataset(outfile, genome_db, "HindIII")
        ds.load(infile)
        ds._sortData()
    else:
        # No output path: open the input dataset itself and sort in place.
        ds = HiCdataset(infile, genome_db, "HindIII")
        ds._sortData()
def plotFigure2c():
    """Figure 2c: effect of single vs iterative correction on coverage.

    Builds two 1 Mb heatmaps of chromosome 1 (raw, and weighted by fragment
    density only), applies single-pass and iterative (40 passes) correction
    to each, and plots the resulting total per-bin coverage profiles in two
    stacked panels.

    NOTE(review): this function is defined a second time later in this file;
    at import time the later definition shadows this one.
    """
    TR = HiCdataset()
    TR.load("GM-all.refined")
    # Raw chr1 x chr1 heatmap at 1 Mb
    hm = TR.buildHeatmap(1, 1, 1000000, False, False)
    TR.calculateWeights()
    TR.weights = np.ones(len(TR.weights), float)
    # if you want to correct just by fragment density, not by length dependence
    hm2 = TR.buildHeatmap(1, 1, 1000000, False, weights=True)
    hm2[np.isnan(hm2)] = 0
    mask = np.sum(hm, axis=0) > 0  # keep only bins with nonzero raw coverage
    """p1-6 are 6 lines to be plotted, below is plotting only"""
    p1 = np.sum(hm, axis=0)[mask]
    p3 = np.sum(correct(hm), axis=0)[mask]
    p5 = np.sum(ultracorrect(hm, 40), axis=0)[mask]
    p4 = np.sum(correct(hm2), axis=0)[mask]
    p2 = np.sum(hm2, axis=0)[mask]
    p6 = np.sum(ultracorrect(hm2, 40), axis=0)[mask]
    matplotlib.rcParams['font.sans-serif'] = 'Arial'
    dashstyle = (3, 3)
    plt.figure(figsize=(4, 4))
    # --- top panel: corrections of the raw heatmap ---
    ax = plt.subplot(2, 1, 1)
    plt.xlim((0, 80))
    plt.ylim((0, 2))
    plt.ylabel("Total coverage", fontsize=8)
    line21 = plt.plot(p1 / p1.mean(), "-", linewidth=1, color="#e5a826")[0]
    line22 = plt.plot(p3 / p3.mean(), "--", linewidth=1, color="#e5a826")[0]
    line22.set_dashes(dashstyle)
    line23 = plt.plot(p5 / p5.mean(), linewidth=1, color="grey")[0]
    for xlabel_i in ax.get_xticklabels():
        xlabel_i.set_fontsize(8)
    for xlabel_i in ax.get_yticklabels():
        xlabel_i.set_fontsize(8)
    legend = plt.legend([line21, line22, line23],
                        ["Raw data", "Single correction", "Iterative correction"],
                        prop={"size": 6}, loc=1, handlelength=2)
    legend.draw_frame(False)
    removeAxes(shift=0, ax=ax)
    for i in ax.spines.values():
        i.set_color('none')
    ax.axhline(linewidth=1, color='black')
    ax.axvline(linewidth=1, color='black')
    # --- bottom panel: corrections of the density-weighted heatmap ---
    ax2 = plt.subplot(2, 1, 2, sharex=ax)
    plt.xlim((0, 80))
    plt.ylim((0, 2))
    plt.xlabel("Position on chom 1 (MB)", fontsize=8)
    plt.ylabel("Total coverage", fontsize=8)
    line1 = plt.plot(p4 / p4.mean(), "--", color="#9b3811", linewidth=1)[0]
    line1.set_dashes(dashstyle)
    line2 = plt.plot(p2 / p2.mean(), "-", color="#9b3811", linewidth=1)[0]
    line3 = plt.plot(p6 / p6.mean(), linewidth=1, color="grey")[0]
    for xlabel_i in ax2.get_xticklabels():
        xlabel_i.set_fontsize(8)
    for xlabel_i in ax2.get_yticklabels():
        xlabel_i.set_fontsize(8)
    legend = plt.legend([line2, line1, line3],
                        ["HindIII corrected", "Single correction", "Iterative correction"],
                        prop={"size": 6}, loc=1, handlelength=2)
    legend.draw_frame(False)
    removeAxes(shift=0, ax=ax2)
    plotting.niceShow()
import numpy
np = numpy
from mirnylib import genome
from mirnylib.numutils import adaptiveSmoothing, trunc, ultracorrect
from mirnylib.h5dict import h5dict

# Caulobacter genome: one fasta per replicon, no chromosome-name filtering.
# NOTE(review): `os` and `HiCdataset` are not imported in this chunk -
# presumably imported earlier in the file; confirm.
genomeDb = genome.Genome('../data/caul', chrmFileTemplate="%s.fa", readChrms=[])
for expName in os.listdir("caul"):
    # this directory contains folders with names of experiments.
    # data will be loaded from different folders
    TR = HiCdataset(
        "bla",
        genome=genomeDb,
        inMemory=True,
    )  # inMemory, as data are small (<1e8 reads)
    TR.load("data/" + expName + "_refined.frag")  # load filtered data
    # Now save all heatmaps with different resolutions, etc.
    TR.saveHeatmap("data/" + expName + "-5k_overlap.hm", 5000, useFragmentOverlap=True)
    TR.saveHeatmap("data/" + expName + "-10k_overlap.hm", 10000, useFragmentOverlap=True)
    TR.saveHeatmap("data/" + expName + "-20k_overlap.hm", 20000, useFragmentOverlap=True)
    TR.saveHeatmap("data/" + expName + "-50k.hm", 50000)
from mirnylib.systemutils import fmap, setExceptionHook
from mirnylib.genome import Genome
import numpy as np
import os
import sys

# Chicken GalGal5 filtered-contig assembly; contig files are named "N<id>.fa"
genomeName = "GalGal5filtered"
genome_db = Genome("/mnt/storage/home/vsfishman/HiC/fasta/GalGal5/GCF_000002315.4_Gallus_gallus-5.0_assembly_structure/Primary_Assembly/galGal5_all_contigs.filtered/",
                   readChrms=[],
                   chrmFileTemplate="N%s.fa")
basefolder = "/mnt/storage/home/vsfishman/HiC/data/chick/mapped-GalGal5filtered/B1_TTAGGC_L001_/"
filename = "chunk0001.hdf5"
# Parse one mapped-reads chunk into a new on-disk dataset
TR = HiCdataset(basefolder + filename + ".HiCdataset",
                genome=genome_db,
                maximumMoleculeLength=500,
                enzymeName="HindIII",
                tmpFolder="tmp",
                mode='w')  # remove inMemory if you don't have enough RAM
TR.parseInputData(dictLike=basefolder + filename)
# Standard hiclib filtering chain
TR.filterDuplicates()
TR.filterLarge(10000, 10)
TR.filterExtreme(cutH=0.001, cutL=0)
TR.writeFilteringStats()
TR.printMetadata(saveTo=basefolder + filename + ".stat")
TR.saveHeatmap(basefolder + filename + ".hm-res-1000kb", 1000000)
# NOTE(review): the assignment below opens a triple-quoted string that is
# never closed within this chunk, so everything after it is inert text
# (commented-out pipeline code), not executed statements.
comment = """
#------------------------End set of filters applied----------
print("----->Building Raw heatmap at different resolutions")
TR.printStats()
for res in wholeGenomeResolutionsKb:
    TR.saveHeatmap(out_file + "-{0}k.hm".format(res), res*1000)
#Now merging different experiments alltogether #note that the first column is not here, as it is a replica experiments = set([(i[0], i[2], i[3]) for i in combinedExperimentNames]) print experiments for experiment in experiments: workingGenome = experiment[1] myExperimentNames = [ i[1] + "_refined.frag" for i in combinedExperimentNames if (i[0], i[2], i[3]) == (experiment[0], experiment[1], experiment[2]) ] assert len(myExperimentNames) > 0 if len(myExperimentNames) > 1: #If we have more than one experiment (replica) for the same data, we can combine. TR = HiCdataset(os.path.join( workingGenome, "%s-all-%s_refined.frag" % (experiment[0], experiment[2])), genome=genomeFolder(workingGenome)) statSaveName = os.path.join( "statistics", workingGenome, "%s-all-%s_refined.stat" % (experiment[0], experiment[2])) TR.merge(myExperimentNames) TR.printMetadata(saveTo=statSaveName) for res in wholeGenomeResolutionsKb: TR.saveHeatmap( os.path.join( workingGenome, "%s-all-%s-{0}k.hm" % (experiment[0], experiment[2])).format(res), res * 1000) for res in byChromosomeResolutionsKb: TR.saveByChromosomeHeatmap( os.path.join(
from mirnylib.genome import Genome
import os

resolution = 10000
genomeName = "mm10"
# "#" selects all numbered chromosomes in addition to X and Y
genome_db = Genome("/mnt/storage/home/vsfishman/HiC/fasta/" + genomeName,
                   readChrms=["#", "X", "Y"])
data_folder = "/mnt/storage/home/vsfishman/HiC/data/mESC/mapped-mm10/mm10/"
fdataset_fname = "mESC-all-NcoI_refined.frag"
setExceptionHook()  # drop into the debugger on uncaught exceptions
print "Loading HiCdataset"
# NOTE(review): the dataset file name says NcoI but enzymeName is
# "HindIII" - confirm which enzyme this dataset actually used.
TR = HiCdataset(data_folder + fdataset_fname,
                enzymeName="HindIII",
                mode='r',
                genome=genome_db)
#print "Saving heatmap"
#TR.saveHeatmap(data_folder + fdataset_fname + "_res"+ "1000k.hm", 1000000)
#print "Saving heatmap"
#TR.saveHeatmap(data_folder + fdataset_fname + "_res"+ "40k.hm", 40000)
#print "Saving heatmap"
#TR.saveHeatmap(data_folder + fdataset_fname + "_res" + "10k.hm", 10000,useFragmentOverlap=True)
# High-resolution overlap-based heatmaps at 10 kb and 25 kb
for resolution in [10000, 25000]:
    print "Saving heatmap"
    TR.saveHiResHeatmapWithOverlaps(
        data_folder + fdataset_fname + "_res" + str(resolution / 1000) + "k_hiRes.hm",
        resolution)
#TR.saveByChromosomeHeatmap(data_folder + fdataset_fname + "_res" + str(resolution/1000)+"k_bychr.hm", resolution=resolution,includeTrans=True)
    out.write(f.split(".")[0] + "\t" + str(chr_len) + "\n")
else:
    # Write one "<name>\t<length>" line per chromosome.  Assemblies with
    # very many sequences (contig-level) get an "N" prefix on the label,
    # matching the contig file naming used elsewhere in this file.
    for i in xrange(genome_db.chrmCount):
        if genome_db.chrmCount > 100:
            out.write("N" + genome_db.idx2label[i] + "\t" + str(genome_db.chrmLens[i]) + "\n")
        else:
            out.write(genome_db.idx2label[i] + "\t" + str(genome_db.chrmLens[i]) + "\n")
out.close()

#Step 2. Load fragments and save it in juicybox format
# NOTE(review): "fr_daraset" looks like a typo for "fr_dataset"; it is
# defined before this chunk and used consistently, so it is kept as-is.
# The random ".tmp<n>" suffix gives the scratch copy a unique name.
Fr = HiCdataset(fr_daraset + ".tmp" + str(random.randint(0, 10000)),
                enzymeName=enzyme,
                mode='w',
                genome=genome_db,
                inMemory=True)
Fr.load(fr_daraset)
out = open(out_file_prefix + ".contacts", "w")
print "Reporting ", Fr.N, " contacts"
dotsize = Fr.N / 500  # progress indicator: one dot per this many contacts
print "one dot is ", dotsize, " contacts"
# Pull the per-side read vectors as compact numpy arrays
strands1 = np.array(Fr._getVector("strands1"), dtype=np.int8)
cuts1 = np.array(Fr._getVector("cuts1"), dtype=np.uint64)
chrms1 = np.array(Fr._getVector("chrms1"), dtype=np.uint16)
fragids1 = np.array(Fr._getVector("rfragAbsIdxs1"), dtype=np.uint32)
strands2 = np.array(Fr._getVector("strands2"), dtype=np.int8)
def refine_dataset(filenames, niceval, delete=True, parse_in_memory=True):
    """
    Map the fragments from each replicate to chromosomes (in parallel)

    Parameters
    ----------
    filenames[0] is a list of filenames of incoming files
    filenames[1] is a folder for outgoing file
    filenames[2] is a working genome, that is output directory
    filenames[3] is an enzyme for a given experiment
    niceval : int
        niceness value passed to each parallel parsing worker
    delete : bool, optional
        If True, delete parsed files after merging.
        Man, these files may be huge... if you don't have a 10TB RAID, this may be useful.
    parse_in_memory : bool, optional
        Perform parsing input files in memory.
    """
    in_files = filenames[0]
    out_file = filenames[1]
    stat_folder = os.path.join("statistics", out_file)
    working_genome = filenames[2]
    enzyme = filenames[3]
    # Broadcast the per-call constants so zip() pairs them with each input file
    nice_list = [niceval for i in in_files]
    parse_list = [parse_in_memory for i in in_files]
    genome_list = [working_genome for i in in_files]
    enzyme_list = [enzyme for i in in_files]
    stat_folder_list = [stat_folder for i in in_files]
    #map(parse_onename, in_files)
    # Parse all replicates in parallel (20 workers)
    Parallel(n_jobs=20)(
        delayed(parse_mapped_reads)(infile, nice_val, genome, enzyme, stat_folder, parse_val)
        for infile, nice_val, genome, enzyme, stat_folder, parse_val
        in zip(in_files, nice_list, genome_list, enzyme_list, stat_folder_list, parse_list))
    # Merge in all parsed files from one experiment
    print("Merging files all together, applying filters...")
    TR = HiCdataset(ensure(out_file + "_merged.frag"),
                    genome=genomeFolder(working_genome),
                    enzymeName=enzyme,
                    tmpFolder="tmp",
                    dictToStoreIDs="h5dict",
                    mode="w")
    TR.merge([i + "_parsed.frag" for i in in_files])
    if delete:  # cleaning up parsed files
        for delFile in [i + "_parsed.frag" for i in in_files]:
            os.remove(delFile)
    print("done!")
    print("Filtering merged data...")
    TR = HiCdataset(out_file + "_refined.frag",
                    enzymeName=enzyme,
                    genome=genomeFolder(working_genome),
                    tmpFolder="tmp",
                    dictToStoreIDs="h5dict",
                    mode='w')
    TR.load(out_file + "_merged.frag")
    # ----------------------------Set of filters applied -------------
    TR.filterDuplicates()
    TR.filterLarge(10000, 10)
    TR.filterExtreme(cutH=0.001, cutL=0)
    TR.writeFilteringStats()
    TR.printMetadata(saveTo=stat_folder + ".stat")
    print("done!")
    # ------------------------End set of filters applied----------
    print("Building heatmaps at specified resolutions...")
    TR.printStats()
    for res in whole_genome_resolutions_Kb:
        TR.saveHeatmap(out_file + "-{0}k.hm".format(res), res * 1000)
    for res in by_chromosome_resolutions_Kb:
        TR.saveByChromosomeHeatmap(out_file + "-{0}k.byChr".format(res), res * 1000)
    for res in hi_res_with_overlap_resolutions_Kb:
        TR.saveHiResHeatmapWithOverlaps(out_file + "-{0}k_HighRes.byChr".format(res), res * 1000)
    # NOTE(review): `skip` is not defined in this function - presumably a
    # module-level setting; also both this loop and the one above write
    # files named "*_HighRes.byChr", which would collide if the two
    # resolution lists share a value - confirm.
    for res in super_hi_res_with_overlap_resolutions_Kb[:-skip]:
        TR.saveSuperHighResMapWithOverlaps(out_file + "-{0}k_HighRes.byChr".format(res), res * 1000)
    print("done!")
def parse_mapped_reads(onename, niceval, working_genome, enzyme, stat_folder, in_memory): """ Parse the given h5 mapped reads, output to a partial fragment file. :param onename: (string) the name of this fragment :param niceval: (int) positive int to rank the priority of this job :param working_genome: (string) name of the genome aganist which we mapped the reads :param enzyme: (string) name of the restriction enzyme used to cut fragments :param stat_folder: (string) folder in which to write fragment mapping stats :return: none explicit, h5 file is saved to disk """ # set the niceness of this sub-process: os.nice(niceval) np.random.seed() # Parsing individual files, either in memory or on disk if in_memory: finalname = onename + "_parsed.frag" TR = HiCdataset("bla" + str(np.random.randint(100000000000)), genome=genomeFolder(working_genome), maximumMoleculeLength=500, enzymeName=enzyme, tmpFolder="tmp", inMemory=True) # remove inMemory if you don't have enough RAM TR.parseInputData(dictLike=onename) print onename TR.save(ensure(finalname)) folder, fname = os.path.split(onename) statSubFolder = os.path.join(stat_folder, folder) TR.printMetadata(saveTo=ensure(os.path.join(statSubFolder, fname + ".stat"))) else: # Create dataset at destination, parse on HDD, then no need to save. TR = HiCdataset(ensure(onename + "_parsed.frag"), genome=genomeFolder(working_genome), enzymeName=enzyme, tmpFolder="tmp", maximumMoleculeLength=500, mode='w') TR.parseInputData(dictLike=onename, enzymeToFillRsites=enzyme) TR.printMetadata(saveTo=ensure(os.path.join(stat_folder, onename + ".stat")))
def refineDataset(filenames, create=True, delete=True, parseInMemory=True):
    """
    Parse, merge, filter one experiment and build its heatmaps.

    Parameters
    ----------
    filenames[0] is a list of filenames of incoming files
    filenames[1] is a folder for outgoing file
    filenames[2] is a working genome name, which is also the name of output directory
    create : bool, optional
        If True, parse each file.
        If False, assume that files were already parsed
        (e.g. if you are just playing around with filtering parameters)
    delete : bool, optional
        If True, delete parsed files after merging.
        Man, these files may be huge... if you don't have a 10TB RAID, this may be useful.
    parseInMemory : bool, optional
        Perform parsing input files in memory.
    """
    in_files = filenames[0]
    out_file = filenames[1]
    workingGenome = filenames[2]
    # Make sure the per-genome output directory exists
    if os.path.exists(workingGenome) == False:
        try:
            os.mkdir(workingGenome)
        except:
            print "Cannot create working directory"
            exit()
    if create == True:  # if we need to parse the input files (.hdf5 from mapping).
        for onename in in_files:
            #Parsing individual files
            if not os.path.exists(source(onename)):
                raise StandardError("path not found: %s" % onename)
            if parseInMemory == True:
                #create dataset in memory, parse and then save to destination
                TR = HiCdataset("bla", genome=genomeFolder(workingGenome),
                                maximumMoleculeLength=500, override=True,
                                inMemory=True)  # remove inMemory if you don't have enough RAM
                TR.parseInputData(dictLike=source(onename))
                TR.save(onename + "_parsed.frag")
            else:
                #Create dataset at destination, parse on HDD, then no need to save.
                TR = HiCdataset(onename + "_parsed.frag",
                                genome=genomeFolder(workingGenome),
                                maximumMoleculeLength=500, override=True)
                TR.parseInputData(dictLike=source(onename))
        "Merging files alltogether, applying filters"
        TR = HiCdataset(out_file + "_merged.frag",
                        genome=genomeFolder(workingGenome),
                        override=True)
        TR.merge([i + "_parsed.frag" for i in in_files])
        #Merge in all parsed files from one experiment
        if delete == True:  # cleaning up parsed files
            for delFile in [i + "_parsed.frag" for i in in_files]:
                os.remove(delFile)
        TR.flush()
        "Now opening new dataset for refined data, and performing all the filtering "
        TR = HiCdataset(out_file + "_refined.frag",
                        genome=genomeFolder(workingGenome),
                        override=True)
        TR.load(out_file + "_merged.frag")
        #----------------------------Set of filters applied -------------
        TR.filterRsiteStart(offset=5)
        TR.filterDuplicates()
        #TR.save(out_file+".dat")
        TR.filterLarge()
        TR.filterExtreme(cutH=0.005, cutL=0)
        #------------------------End set of filters applied----------
    else:
        #If merging & filters has already been done, just load files
        TR = HiCdataset(out_file + "_working.frag", override=True,
                        genome=genomeFolder(workingGenome))
        TR.load(out_file + "_refined.frag")
    print "----->Building Raw heatmap at two resolutions"
    TR.printStats()
    TR.saveHeatmap(out_file + "-200k.hm", 200000)
    TR.saveHeatmap(out_file + "-500k.hm", 500000)
    TR.saveHeatmap(out_file + "-1M.hm", 1000000)
    TR.saveHeatmap(out_file + "-2M.hm", 2000000)
    print "----->Building RB heatmap"
    # "Breaks" dataset: read pairs lying farther than maximumMoleculeLength
    # from a restriction site.
    TR = HiCdataset(out_file + "_breaks.frag",
                    genome=genomeFolder(workingGenome), override=True)
    TR.load(out_file + "_refined.frag")
    # NOTE(review): `*` binds tighter than `+`, so TR.DS multiplies only the
    # second comparison here - confirm whether the intent was
    # ((dists1 > M) + (dists2 > M)) * TR.DS.
    TR.maskFilter((TR.dists1 > TR.maximumMoleculeLength) +
                  (TR.dists2 > TR.maximumMoleculeLength) * TR.DS)
    TR.printStats()
    TR.saveHeatmap(out_file + "-200k-breaks.hm", 200000)
    TR.saveHeatmap(out_file + "-500k-breaks.hm", 500000)
    TR.saveHeatmap(out_file + "-1M-breaks.hm", 1000000)
    TR.saveHeatmap(out_file + "-2M-breaks.hm", 2000000)
#Now running refineDataset for each experiment for i in byExperiment: refineDataset(i, create=True, delete=True) #Now merging different experiments alltogether experiments = set([(i[0], i[2]) for i in newExperimentNames]) for experiment in experiments: workingGenome = experiment[1] myExperimentNames = [i[1] + "_refined.frag" for i in newExperimentNames if i[0] == experiment[0]] assert len(myExperimentNames) > 0 if len(myExperimentNames) > 1: TR = HiCdataset(os.path.join(workingGenome, "%s-all_refined.frag" % experiment[0]), genome=genomeFolder(workingGenome)) TR.merge(myExperimentNames) TR.saveHeatmap(os.path.join( workingGenome, "%s-all-100k.hm" % experiment[0]), 100000) TR.saveHeatmap(os.path.join( workingGenome, "%s-all-200k.hm" % experiment[0]), 200000) TR.saveHeatmap(os.path.join( workingGenome, "%s-all-500k.hm" % experiment[0]), 500000) TR.saveHeatmap(os.path.join( workingGenome, "%s-all-1M.hm" % experiment[0]), 1000000) #map(refine_paper, # [((source("SRR027961"), # source("SRR027960")), os.path.join(workingGenome, "GM-NcoI-%s" % workingGenome ),"NcoI"),
# Synthetic test data: N random contacts on a 22-chromosome genome
# (N and mygenome are defined before this chunk).
chrms1 = np.random.randint(0, 22, N)
chrms2 = chrms1.copy()
# ~half of the pairs become trans contacts with a re-drawn chromosome
mask = np.random.random(N) < 0.5
chrms2[mask] = np.random.randint(0, 22, mask.sum())
# First read placed within the middle 80% of its chromosome
pos1 = np.array(np.array((0.1 + 0.8 * np.random.random(N)) * mygenome.chrmLens[chrms1]),
                dtype=int)
# Log-uniform separations up to 1 Mb, in either direction
offset1 = np.exp(3 + np.random.random(N) * (np.log(1000000) - 3)) * \
    (2 * (np.random.random(N) > 0.5) - 1)
pos2 = np.array(pos1 + offset1, dtype=int)
strands1 = np.random.random(N) > 0.5
strands2 = np.random.random(N) > 0.5
mydict = {"chrms1": chrms1, "chrms2": chrms2, "cuts1": pos1, "cuts2": pos2,
          "strands1": strands1, "strands2": strands2}

TR = HiCdataset("bla", genome=mygenome, enzymeName="MboI",
                maximumMoleculeLength=500, inMemory=True)
print "\nTesting loading new data without rsite information "
TR.parseInputData(dictLike=mydict, enzymeToFillRsites="MboI")
TR.filterLarge(cutlarge=50000, cutsmall=100)
sc = TR.plotScaling()
print sc
# With separations drawn log-uniformly, P(s) should fall off as ~1/s
plt.title("Scaling should be 1/x")
plt.plot(*sc)
plt.xscale("log")
plt.yscale("log")
plt.show()
def refine_paper(filename, create=True):
    """filename[0] is a list of filenames of incoming files
    filename[1] is a folder for outgoing file

    Regression test of the fragment-level pipeline: parses, merges and
    filters the inputs, then checks metadata, heatmap saving, diagonal
    counting and genome updating against recorded reference values.
    """
    if create == True:
        for onename in filename[0]:
            #Parsing individual files
            if not os.path.exists(onename):
                raise StandardError("path not found: %s" % onename)
            TR = HiCdataset("bla", genome=genomeFolder, enzymeName="HindIII",
                            maximumMoleculeLength=500, inMemory=True)
            print "\nTesting loading new data without rsite information "
            TR.parseInputData(dictLike=onename, enzymeToFillRsites="HindIII")
            #assert len(TR.DS) == 856143
            #assert len(TR.ufragments) == 634572
            TR.save(onename + "_parsed.frag")
    #Merging files alltogether, applying filters
    TR = HiCdataset(filename[1] + "_merged.frag", enzymeName="HindIII",
                    genome=genomeFolder, mode="w")
    TR.merge([i + "_parsed.frag" for i in filename[0]])
    TR = HiCdataset("refined", genome=genomeFolder, enzymeName="HindIII",
                    mode="w", inMemory=True)
    print "\nTesting chunking during all tests"
    TR.chunksize = 30000
    #because we do many operations, we disable autoFlush here
    TR.load(filename[1] + "_merged.frag")
    print "\nTesting Rsite filter"
    TR.filterRsiteStart(offset=5)
    #assert len(TR.DS) == 832110
    print "\nTesting duplicate filter"
    TR.filterDuplicates(chunkSize=30000)
    #assert len(TR.DS) == 830275
    print "\nTesting small/large and extreme fragment filter"
    TR.filterLarge()
    #assert len(TR.DS) == 825442
    TR.filterExtreme(cutH=0.005, cutL=0)
    TR.writeFilteringStats()
    #assert len(TR.DS) == 803845
    #-------------------------------------------
    TR.printMetadata(saveTo="metadata")
    import cPickle
    stop = False
    # Compare collected metadata against the recorded reference values
    mdata = cPickle.load(open("sampleMetadata"))
    for i in sorted(mdata.keys()):
        if TR.metadata[i] != mdata[i]:
            print "Key {0} is not consistent: should be {1}, is {2}".format(
                i, mdata[i], TR.metadata[i])
            stop = True
    if stop == True:
        print("""------------_ERROR_--------------
        Inconsistent metadata: see above
        ----------------------------------------""")
        raise ValueError("Inconsistent Metadata")
    print "Testing allxall and by-chromosome heatmap counting diagonal twice"
    print "----> saving allxall heatmap"
    TR.saveHeatmap(filename[1] + "-1M.hm", 1000000, countDiagonalReads="twice")
    a = h5dict(filename[1] + "-1M.hm")
    # Bin ranges of chromosomes 1 and 2 in the genome-wide heatmap
    st, end = TR.genome.chrmStartsBinCont[1], TR.genome.chrmEndsBinCont[1]
    st2, end2 = TR.genome.chrmStartsBinCont[2], TR.genome.chrmEndsBinCont[2]
    chrom1 = a["heatmap"][st:end, st:end]
    chrom12 = a["heatmap"][st:end, st2:end2]
    setExceptionHook()
    print "----> saving by chromosome heatmap"
    TR.saveByChromosomeHeatmap(filename[1] + "-1M.hm", resolution=1000000,
                               includeTrans=True, countDiagonalReads="twice")
    # Per-chromosome maps must equal the corresponding all-by-all blocks
    b = h5dict(filename[1] + "-1M.hm")["1 1"]
    bb = h5dict(filename[1] + "-1M.hm")["1 2"]
    assert (b - chrom1).sum() == 0
    print "Cis heatmap consistent"
    assert (bb - chrom12).sum() == 0
    print 'Trans heatmap consistent'
    print a["heatmap"][::10, ::10].sum()
    #assert a["heatmap"][::10, ::10].sum() == 21800
    print "Heatmap sum correct\n"
    #---------------------------------
    print "Testing allxall and by-chromosome heatmap counting diagonal once"
    TR.saveHeatmap(filename[1] + "-1M.hm", 1000000, countDiagonalReads="once")
    Ta = h5dict(filename[1] + "-1M.hm")
    st, end = TR.genome.chrmStartsBinCont[1], TR.genome.chrmEndsBinCont[1]
    st2, end2 = TR.genome.chrmStartsBinCont[2], TR.genome.chrmEndsBinCont[2]
    chrom1 = Ta["heatmap"][st:end, st:end]
    chrom12 = Ta["heatmap"][st:end, st2:end2]
    setExceptionHook()
    print "----> saving by chromosome heatmap"
    TR.saveByChromosomeHeatmap(filename[1] + "-1M-byChr.hm", resolution=1000000,
                               includeTrans=True, countDiagonalReads="once")
    TR.saveHiResHeatmapWithOverlaps(filename[1] + "-1M-highRes.hm",
                                    resolution=50000, countDiagonalReads="twice")
    TR.saveSuperHighResMapWithOverlaps(filename[1] + "-5k-SuperHighRes.hm",
                                       resolution=5000, chromosomes=[14],
                                       countDiagonalReads="twice")
    Tb = h5dict(filename[1] + "-1M-byChr.hm")["1 1"]
    Tbb = h5dict(filename[1] + "-1M-byChr.hm")["1 2"]
    assert ((Tb - chrom1) == 0).all()
    assert ((Tbb - chrom12) == 0).all()
    # "once" plus its diagonal must equal the "twice" map saved earlier
    assert ((Tb + np.diag(np.diag(Tb))) == b).all()
    print "Diagonal counting methods are consistent\n"
    newchrom1 = chrom1.copy()
    for i in xrange(len(newchrom1)):
        newchrom1[i, i] = 2 * newchrom1[i, i]
    Tb = h5dict(filename[1] + "-1M-highRes.hm")["1 1"]
    assert np.abs(Tb.sum() - newchrom1.sum()) < 1
    # Coarse-graining the 50 kb map back to 1 Mb must reproduce chrom1
    assert np.sum(np.abs(coarsegrain(Tb, 20, True) - newchrom1)) < 500
    #------------------------------
    print "Testing updateGenome method"
    from mirnylib.genome import Genome
    removeChromIDs = np.array([0, 1, 1, 1, 1] + [0] * 17 + [1] + [0])
    #print ((removeChromIDs[TR.chrms1] == 1) + (removeChromIDs[TR.chrms2] == 1) ).sum()
    # Expected read count after restricting to chromosomes 2,3,4,5,X
    t = ((removeChromIDs[TR.chrms1] == 1) * (removeChromIDs[TR.chrms2] == 1)).sum() + \
        ((removeChromIDs[TR.chrms1] == 1) * (TR.chrms2 == -1)).sum()
    newGenome = Genome(genomePath=genomeFolder, readChrms=["2", "3", "4", "5", "X"])
    TR.updateGenome(newGenome)
    assert TR.N == t
    a = h5dict(filename[1] + "-1M.hm")["heatmap"]
""" from hiclib.fragmentHiC import HiCdataset import os from mirnylib import genome genomeDb = genome.Genome('../data/caul', chrmFileTemplate="%s.fa", readChrms=[]) for expName in os.listdir("caul"): TR = HiCdataset( "bla", genome=genomeDb, inMemory=True, ) # inMemory, as files are probably small (less than hundreds mililon reads) TR.parseInputData("caul/" + expName, removeSS=True) # We discard SS in our pipeline now TR.printMetadata() TR.filterRsiteStart( offset=5 ) # We still do this filter to avoid strange "dangling end-like" molecules TR.filterDuplicates() #TR.save(out_file+".dat") TR.filterLarge( cutlarge=300000, cutsmall=100 ) #Don't filter any large fragments. This was relevant for eucaryotes with #their megabase-long stretches of repetitive or unmappable regions #TR.filterExtreme(cutH=0.0025, cutL=0) #ALl fragments in Caulobacter seemed to behave normally
pass #Now merging different experiments alltogether #note that the first column is not here, as it is a replica experiments = set([(i[0], i[2], i[3]) for i in combinedExperimentNames]) print(experiments) for experiment in experiments: workingGenome = experiment[1] myExperimentNames = [i[1] + "_refined.frag" for i in combinedExperimentNames if (i[0], i[2], i[3]) == (experiment[0], experiment[1],experiment[2])] assert len(myExperimentNames) > 0 if len(myExperimentNames) > 0: #If we have more than one experiment (replica) for the same data, we can combine. TR = HiCdataset(os.path.join(workingGenome, "%s-all-%s_refined.frag" % (experiment[0],experiment[2])), genome=getGenome(workingGenome), enzymeName = experiment[2],tmpFolder = "tmp",dictToStoreIDs="h5dict") statSaveName = os.path.join("statistics", workingGenome, "%s-all-%s_refined.stat" % (experiment[0], experiment[2])) TR.merge(myExperimentNames) TR.printMetadata(saveTo=statSaveName) for res in wholeGenomeResolutionsKb: TR.saveHeatmap(os.path.join(workingGenome, "%s-all-%s-{0}k.hm" % (experiment[0], experiment[2])).format(res), res*1000) for res in byChromosomeResolutionsKb: TR.saveByChromosomeHeatmap(os.path.join(workingGenome, "%s-all-%s-{0}k.byChr" % (experiment[0], experiment[2])).format(res), res*1000) for res in HiResWithOverlapResolutionsKb: TR.saveHiResHeatmapWithOverlaps(os.path.join(workingGenome, "%s-all-%s-{0}k_HighRes.byChr" % (experiment[0], experiment[2])).format(res), res*1000) for res in SuperHiResWithOverlapResolutionsKb: TR.saveSuperHighResMapWithOverlaps(os.path.join(workingGenome, "%s-all-%s-{0}k_SuperHighRes.byChr" % (experiment[0], experiment[2])).format(res), res*1000)
def plotFigure2c():
    """Two-panel plot of total 1 Mb coverage of chromosome 1: raw vs.
    single-pass vs. iterative correction, for the raw heatmap (top) and the
    fragment-density-weighted heatmap (bottom)."""
    # NOTE(review): HiCdataset() with no arguments -- presumably an older API
    # with defaults; confirm against the hiclib version in use.
    TR = HiCdataset()
    TR.load("GM-all.refined")
    hm = TR.buildHeatmap(1, 1, 1000000, False, False)
    TR.calculateWeights()
    TR.weights = np.ones(
        len(TR.weights), float
    )  # if you want to correct just by fragment density, not by length dependence
    hm2 = TR.buildHeatmap(1, 1, 1000000, False, weights=True)
    hm2[np.isnan(hm2)] = 0
    # Keep only bins with nonzero raw coverage.
    mask = np.sum(hm, axis=0) > 0
    """p1-6 are 6 lines to be plotted, below is plotting only"""
    p1 = np.sum(hm, axis=0)[mask]
    p3 = np.sum(correct(hm), axis=0)[mask]
    p5 = np.sum(ultracorrect(hm, 40), axis=0)[mask]
    p4 = np.sum(correct(hm2), axis=0)[mask]
    p2 = np.sum(hm2, axis=0)[mask]
    p6 = np.sum(ultracorrect(hm2, 40), axis=0)[mask]
    matplotlib.rcParams['font.sans-serif'] = 'Arial'
    dashstyle = (3, 3)
    plt.figure(figsize=(4, 4))
    # Top panel: coverage of the raw heatmap under each correction scheme.
    ax = plt.subplot(2, 1, 1)
    plt.xlim((0, 80))
    plt.ylim((0, 2))
    plt.ylabel("Total coverage", fontsize=8)
    line21 = plt.plot(p1 / p1.mean(), "-", linewidth=1, color="#e5a826")[0]
    line22 = plt.plot(p3 / p3.mean(), "--", linewidth=1, color="#e5a826")[0]
    line22.set_dashes(dashstyle)
    line23 = plt.plot(p5 / p5.mean(), linewidth=1, color="grey")[0]
    for xlabel_i in ax.get_xticklabels():
        xlabel_i.set_fontsize(8)
    for xlabel_i in ax.get_yticklabels():
        xlabel_i.set_fontsize(8)
    legend = plt.legend(
        [line21, line22, line23],
        ["Raw data", "Single correction", "Iterative correction"],
        prop={"size": 6}, loc=1, handlelength=2)
    legend.draw_frame(False)
    removeAxes(shift=0, ax=ax)
    for i in ax.spines.values():
        i.set_color('none')
    ax.axhline(linewidth=1, color='black')
    ax.axvline(linewidth=1, color='black')
    # Bottom panel: same comparison for the density-only weighted heatmap.
    ax2 = plt.subplot(2, 1, 2, sharex=ax)
    plt.xlim((0, 80))
    plt.ylim((0, 2))
    plt.xlabel("Position on chom 1 (MB)", fontsize=8)
    plt.ylabel("Total coverage", fontsize=8)
    line1 = plt.plot(p4 / p4.mean(), "--", color="#9b3811", linewidth=1)[0]
    line1.set_dashes(dashstyle)
    line2 = plt.plot(p2 / p2.mean(), "-", color="#9b3811", linewidth=1)[0]
    line3 = plt.plot(p6 / p6.mean(), linewidth=1, color="grey")[0]
    for xlabel_i in ax2.get_xticklabels():
        xlabel_i.set_fontsize(8)
    for xlabel_i in ax2.get_yticklabels():
        xlabel_i.set_fontsize(8)
    legend = plt.legend(
        [line2, line1, line3],
        ["HindIII corrected", "Single correction", "Iterative correction"],
        prop={"size": 6}, loc=1, handlelength=2)
    legend.draw_frame(False)
    removeAxes(shift=0, ax=ax2)
    plotting.niceShow()
# Build a synthetic read set (N reads; N and mygenome defined elsewhere) to
# test parsing without restriction-site information.
chrms1 = np.random.randint(0, 22, N)
chrms2 = chrms1.copy()
# Re-draw the second chromosome for ~half of the reads to create trans contacts.
mask = np.random.random(N) < 0.5
chrms2[mask] = np.random.randint(0, 22, mask.sum())
# First side: uniform position within the central 80% of its chromosome.
pos1 = np.array(np.array((0.1 + 0.8 * np.random.random(N)) * mygenome.chrmLens[chrms1]), dtype=int)
# Separation: log-uniform magnitude between e^3 and 1e6 bases, random sign.
offset1 = np.exp(3 + np.random.random(N) * (np.log(1000000) - 3)) * (2 * (np.random.random(N) > 0.5) - 1)
pos2 = np.array(pos1 + offset1, dtype=int)
strands1 = np.random.random(N) > 0.5
strands2 = np.random.random(N) > 0.5
mydict = {"chrms1": chrms1, "chrms2": chrms2, "cuts1": pos1, "cuts2": pos2,
          "strands1": strands1, "strands2": strands2}
TR = HiCdataset("bla", genome=mygenome, enzymeName="MboI", maximumMoleculeLength=500, inMemory=True)
print("\nTesting loading new data without rsite information ")
TR.parseInputData(dictLike=mydict, enzymeToFillRsites="MboI")
TR.filterLarge(cutlarge=50000, cutsmall=100)
sc = TR.plotScaling()
g = mygenome  # NOTE(review): alias appears unused below -- verify before removing
print(sc)
# Log-uniform separations should yield a contact-probability scaling of ~1/x.
plt.title("Scaling should be 1/x")
plt.plot(*sc)
plt.xscale("log")
plt.yscale("log")
def refineDataset(filenames, create=True, delete=False, parseInMemory=True):
    """
    Parse, merge, filter one experiment and export coolers.

    Parameters
    ----------
    filenames[0] is a list of filenames of incoming files
    filenames[1] is a folder for outgoing file
    filenames[2] is a working genome, that is output directory
    filenames[3] is an enzyme for a given experiment

    create : bool, optional
        If True, parse each file.
        If False, assume that files were already parsed
        (e.g. if you are just playing around with filtering parameters)
    delete : bool, optional
        If True, delete parsed files after merging.
        Man, these files may be huge... if you don't have a 10TB RAID, this may be useful.
    parseInMemory : bool, optional
        Perform parsing input files in memory.
    """
    in_files = filenames[0]
    out_file = filenames[1]
    statFolder = os.path.join("statistics", out_file)
    workingGenome = filenames[2]
    enzyme = filenames[3]

    if create == True:  # if we need to parse the input files (.hdf5 from mapping).
        def parse_onename(onename):
            # Fresh RNG seed per call; the random suffix below keeps in-memory
            # dataset names unique.  NOTE(review): presumably intended for
            # parallel workers (fork + map) -- confirm.
            np.random.seed()
            #Parsing individual files
            if parseInMemory == True:
                finalname = onename + "_parsed.frag"
                #if not os.path.exists(finalname):
                # NOTE(review): "if True" disables the existence check above,
                # so files are always re-parsed; the else branch is unreachable.
                if True:
                    #create dataset in memory, parse and then save to destination
                    TR = HiCdataset(
                        "bla" + str(np.random.randint(100000000000)),
                        genome=getGenome(workingGenome),
                        maximumMoleculeLength=500,
                        enzymeName=enzyme,
                        tmpFolder="tmp",
                        inMemory=True
                    )  # remove inMemory if you don't have enough RAM
                    TR.parseInputData(dictLike=onename)
                    folder = os.path.split(onename)[0]
                    print(onename)
                    TR.save(ensure(finalname))
                    folder, fname = os.path.split(onename)
                    statSubFolder = os.path.join(statFolder, folder)
                    TR.printMetadata(saveTo=ensure(
                        os.path.join(statSubFolder, fname + ".stat")))
                else:
                    print("skipping parsed: ", onename)
            else:
                #Create dataset at destination, parse on HDD, then no need to save.
                TR = HiCdataset(ensure(onename + "_parsed.frag"),
                                genome=getGenome(workingGenome), enzymeName=enzyme,
                                tmpFolder="tmp", maximumMoleculeLength=500, mode='w')
                TR.parseInputData(dictLike=onename, enzymeToFillRsites=enzyme)
                TR.printMetadata(
                    saveTo=ensure(os.path.join(statFolder, onename + ".stat")))

        list(map(parse_onename, in_files))

        "Merging files alltogether, applying filters"
        TR = HiCdataset(ensure(out_file + "_merged.frag"),
                        genome=getGenome(workingGenome),
                        enzymeName=enzyme, tmpFolder="tmp", dictToStoreIDs="h5dict",
                        mode="w")
        TR.merge([i + "_parsed.frag" for i in in_files])
        #Merge in all parsed files from one experiment

        if delete == True:  # cleaning up parsed files
            for delFile in [i + "_parsed.frag" for i in in_files]:
                os.remove(delFile)

        "Now opening new dataset for refined data, and performing all the filtering "
        TR = HiCdataset(out_file + "_refined.frag", enzymeName=enzyme,
                        genome=getGenome(workingGenome),
                        tmpFolder="tmp", dictToStoreIDs="h5dict", mode='w')
        TR.load(out_file + "_merged.frag")

        #----------------------------Set of filters applied -------------
        TR.filterDuplicates()
        #TR.save(out_file+".dat")
        #TR.filterExtreme(cutH=0.0001, cutL=0)
        #TR.filterRsiteStart()
        #TR.filterLarge()
        TR.writeFilteringStats()
        TR.printMetadata(saveTo=statFolder + ".stat")
        #------------------------End set of filters applied----------

    else:
        #If merging & filters has already been done, just load files
        TR = HiCdataset(out_file + "_working.frag", enzymeName=enzyme,
                        mode='w', genome=getGenome(workingGenome))
        TR.load(out_file + "_refined.frag")
        TR.printMetadata(saveTo=statFolder + ".stat")

    print("----->Building Raw heatmap at different resolutions")
    TR.printStats()
    for res in coolerResolutions:
        TR.saveCooler(out_file + ".{0}.cool".format(res), res)
def doOne(inData):
    """Combine all refined fragment files of one experiment group.

    ``inData`` is a ``(name, table)`` pair: ``name`` is the experiment-group
    label and ``table`` carries per-replica "genome" and "filenames" columns.
    Merges every ``*_refined.frag`` replica that exists on disk, writes
    statistics, saves the combined dataset, and exports coolers at a fixed
    ladder of resolutions.  Returns early (doing nothing) when the group name
    is empty, the genome is unknown, or no replica files are found.
    """
    resolutions = [
        10000000, 5000000, 2000000, 1000000, 500000, 200000, 100000,
        40000, 20000, 10000, 5000, 2000, 1000
    ]
    i, j = inData
    # Guard: skip the empty group produced by blank table rows.
    if i == "":
        return
    print(i)
    gens = j["genome"]
    if len(gens) == 0:
        print("Genome not found")
        return
    genome = gens.values[0]
    out_file = "{1}/{0}_combined".format(i, genome)
    mygen = "/home/magus/HiC2011/data/{0}".format(genome)
    candidates = ["{1}/{0}_refined.frag".format(s, genome)
                  for s in j["filenames"].values]
    # Keep only the replicas that were actually refined on disk.
    filenames = list(filter(os.path.exists, candidates))
    if not filenames:
        print("No filenames found!")
        return
    TR = HiCdataset("bla", mygen, "DpnII", inMemory=True,
                    tmpFolder="/tmp", dictToStoreIDs="dict")
    TR.merge(filenames)
    TR.setSimpleHighResHeatmap()
    TR.writeFilteringStats()
    TR.printMetadata(
        saveTo="statistics/{1}/{0}_combined.frag".format(i, genome))
    pickle.dump(
        TR.metadata,
        open("statistics/{1}/{0}_combined.pkl".format(i, genome), 'wb'))
    TR.save("{1}/{0}_combined_refined.frag".format(i, genome))
    for res in resolutions:
        TR.saveCooler(out_file + ".{0}.cool".format(res), res)
# Python 2 script: build a 25 kb fragment-overlap heatmap for mESC HindIII data.
from hiclib.fragmentHiC import HiCdataset
from mirnylib.systemutils import fmap, setExceptionHook
from mirnylib.genome import Genome
import os
#from defineGenome import getGenome

genomeName = "mm10"
genome_db = Genome("/mnt/storage/home/vsfishman/HiC/fasta/" + genomeName,
                   readChrms=["#", "X", "Y"])
data_folder = "/mnt/storage/home/vsfishman/HiC/data/mESC/mapped-mm10/mm10/"
fdataset_fname = "mESC-all-HindIII_refined.frag"
setExceptionHook()

print "Loading HiCdataset"
# mode='r': the refined dataset already exists; opened read-only.
TR = HiCdataset(data_folder + fdataset_fname, enzymeName="HindIII",
                mode='r', genome=genome_db)
#print "Saving heatmap"
#TR.saveHeatmap(data_folder + fdataset_fname + "1000k.hm", 1000000)
#print "Saving heatmap"
#TR.saveHeatmap(data_folder + fdataset_fname + "40k.hm", 40000)
print "Saving heatmap"
# useFragmentOverlap assigns reads to all bins the fragment crosses.
TR.saveHeatmap(data_folder + fdataset_fname + "25k.hm", 25000, useFragmentOverlap=True)
#print "Saving Hi-Res heatmap"
#TR.saveHiResHeatmapWithOverlaps(data_folder + fdataset_fname + "10k_hiRes.hm", 10000)
def doSupplementaryCoveragePlot():
    """Scatter 1 Mb-binned coverage from all reads vs. coverage from reads
    with dists1 > 500, colored by values loaded from the "GC1M" pickle
    (colorbar labeled "GC content"), for the HindIII and NcoI datasets.

    NOTE(review): a near-identical copy of this function exists elsewhere in
    this file -- consider deduplicating.
    """
    # NOTE(review): no-arg HiCdataset() -- presumably an older API with
    # defaults; confirm against the hiclib version in use.
    TR = HiCdataset()
    TR.load("GM-all.refined")
    s1 = TR.fragmentSum(strands=1)     # coverage from all reads
    TR.saveFragments()
    TR.maskFilter(TR.dists1 > 500)     # keep reads with dists1 > 500
    TR.originalFragments()
    s2 = TR.fragmentSum(strands=1)     # coverage from the filtered subset
    resolution = 1000000

    def coverage(s1, s2, TR):
        # Map each fragment to its 1 Mb genomic bin and sum read counts per bin.
        genome = Genome()
        genome.createMapping(resolution)
        label = genome.chromosomeStarts[TR.ufragments / TR.fragIDmult - 1] + (
            TR.ufragments % TR.fragIDmult) / resolution
        counts = np.bincount(label, weights=s1)
        counts2 = np.bincount(label, weights=s2)
        # Per-bin values for coloring; file "GC1M" -- presumably GC content
        # per 1 Mb bin (colorbar label below).  TODO confirm.
        data = cPickle.load(open("GC1M", 'rb'))
        eigenvector = np.zeros(genome.chromosomeEnds[-1], float)
        inds = np.argsort(counts)
        # Drop the 2% lowest-coverage bins.
        mask = inds[int(0.02 * len(inds)):]
        for chrom in range(1, 24):
            eigenvector[genome.chromosomeStarts[chrom - 1]:genome.
                        chromosomeStarts[chrom - 1] +
                        len(data[chrom - 1])] = data[chrom - 1]
        eigenvector[eigenvector < 35] = 35  # clamp low values for the color scale
        plt.scatter(counts[mask], counts2[mask], c=eigenvector[mask],
                    s=6, linewidth=0)
        print stats.spearmanr(counts[mask], counts2[mask])
        plt.xlabel("Coverage from all reads")
        plt.xticks([0, 5000, 10000, 15000])
        plt.ylabel("Coverage from RBs")
        b = plt.colorbar()
        b.ax.set_xlabel("GC content")

    plt.subplot(121)
    plt.title("HinIII")
    coverage(s1, s2, TR)

    # Repeat for the NcoI dataset in the right panel.
    TR = HiCdataset()
    TR.load("GM-NcoI.refined")
    s1 = TR.fragmentSum(strands=1)
    TR.saveFragments()
    TR.maskFilter(TR.dists1 > 500)
    TR.originalFragments()
    s2 = TR.fragmentSum(strands=1)
    resolution = 1000000
    plt.subplot(122)
    plt.title("NcoI")
    coverage(s1, s2, TR)
    plt.show()
#Now running refineDataset for each experiment for i in byExperiment: print i refineDataset(i, create=True, delete=True) #Now merging different experiments alltogether #note that the first column is not here, as it is a replica experiments = set([(i[0], i[2], i[3]) for i in combinedExperimentNames]) print experiments for experiment in experiments: workingGenome = experiment[1] myExperimentNames = [i[1] + "_refined.frag" for i in combinedExperimentNames if (i[0], i[2], i[3]) == (experiment[0], experiment[1],experiment[2])] assert len(myExperimentNames) > 0 if len(myExperimentNames) > 1: #If we have more than one experiment (replica) for the same data, we can combine. TR = HiCdataset(os.path.join(workingGenome, "%s-all-%s_refined.frag" % (experiment[0],experiment[2])), genome=genomeFolder(workingGenome)) statSaveName = os.path.join("statistics", workingGenome, "%s-all-%s_refined.stat" % (experiment[0], experiment[2])) TR.merge(myExperimentNames) TR.printMetadata(saveTo=statSaveName) for res in wholeGenomeResolutionsKb: TR.saveHeatmap(os.path.join(workingGenome, "%s-all-%s-{0}k.hm" % (experiment[0], experiment[2])).format(res), res*1000) for res in byChromosomeResolutionsKb: TR.saveByChromosomeHeatmap(os.path.join(workingGenome, "%s-all-%s-{0}k.byChr" % (experiment[0], experiment[2])).format(res), res*1000) for res in HiResWithOverlapResolutionsKb: TR.saveHiResHeatmapWithOverlaps(os.path.join(workingGenome, "%s-all-%s-{0}k_HighRes.byChr" % (experiment[0], experiment[2])).format(res), res*1000)
def parse_onename(onename):
    """Parse one mapped-read file into a ``*_parsed.frag`` fragment dataset.

    Relies on enclosing-scope names: ``parseInMemory``, ``workingGenome``,
    ``enzyme``, ``statFolder``, and the helpers ``getGenome``/``ensure``.
    Writes the parsed dataset next to the input and a ``.stat`` metadata file
    under ``statFolder``.
    """
    # Fresh RNG seed so parallel workers (fork + map) don't share state; the
    # random suffix below keeps in-memory dataset names unique.
    np.random.seed()
    if parseInMemory == True:
        finalname = onename + "_parsed.frag"
        # Parse in RAM, then save to destination.  (A disabled
        # "skip if already parsed" check used to live here behind an
        # always-true `if True:`; the dead scaffold has been removed --
        # files are always re-parsed, exactly as before.)
        TR = HiCdataset(
            "bla" + str(np.random.randint(100000000000)),
            genome=getGenome(workingGenome),
            maximumMoleculeLength=500,
            enzymeName=enzyme,
            tmpFolder="tmp",
            inMemory=True)  # remove inMemory if you don't have enough RAM
        TR.parseInputData(dictLike=onename)
        print(onename)
        TR.save(ensure(finalname))
        folder, fname = os.path.split(onename)
        statSubFolder = os.path.join(statFolder, folder)
        TR.printMetadata(saveTo=ensure(
            os.path.join(statSubFolder, fname + ".stat")))
    else:
        # Create dataset at destination, parse on HDD; no separate save needed.
        TR = HiCdataset(ensure(onename + "_parsed.frag"),
                        genome=getGenome(workingGenome), enzymeName=enzyme,
                        tmpFolder="tmp", maximumMoleculeLength=500, mode='w')
        TR.parseInputData(dictLike=onename, enzymeToFillRsites=enzyme)
        TR.printMetadata(
            saveTo=ensure(os.path.join(statFolder, onename + ".stat")))
# Python 2 script: build 15 kb and 10 kb heatmaps for mESC HindIII data.
# NOTE(review): near-duplicate of the 25 kb script in this file; also,
# HiCdataset is used below but not imported in this chunk -- confirm the
# import exists earlier in the real file.
from mirnylib.systemutils import fmap, setExceptionHook
from mirnylib.genome import Genome
import os
#from defineGenome import getGenome

genomeName = "mm10"
genome_db = Genome("/mnt/storage/home/vsfishman/HiC/fasta/" + genomeName,
                   readChrms=["#", "X", "Y"])
data_folder = "/mnt/storage/home/vsfishman/HiC/data/mESC/mapped-mm10/mm10/"
fdataset_fname = "mESC-all-HindIII_refined.frag"
setExceptionHook()

print "Loading HiCdataset"
TR = HiCdataset(data_folder + fdataset_fname, enzymeName="HindIII",
                mode='r', genome=genome_db)
#print "Saving heatmap"
#TR.saveHeatmap(data_folder + fdataset_fname + "1000k.hm", 1000000)
#print "Saving heatmap"
#TR.saveHeatmap(data_folder + fdataset_fname + "40k.hm", 40000)
print "Saving heatmap"
# useFragmentOverlap assigns reads to all bins the fragment crosses.
TR.saveHeatmap(data_folder + fdataset_fname + "15k.hm", 15000, useFragmentOverlap=True)
print "Saving Hi-Res heatmap"
TR.saveHiResHeatmapWithOverlaps(data_folder + fdataset_fname + "10k_hiRes.hm", 10000)
def doSupplementaryCoveragePlot():
    """Scatter 1 Mb-binned coverage from all reads vs. coverage from reads
    with dists1 > 500, colored by values from the "GC1M" pickle, for HindIII
    and NcoI datasets.

    NOTE(review): duplicate of doSupplementaryCoveragePlot defined elsewhere
    in this file (formatting differences only) -- consider removing one copy.
    """
    TR = HiCdataset()  # NOTE(review): no-arg constructor -- older API, confirm
    TR.load("GM-all.refined")
    s1 = TR.fragmentSum(strands=1)     # coverage from all reads
    TR.saveFragments()
    TR.maskFilter(TR.dists1 > 500)
    TR.originalFragments()
    s2 = TR.fragmentSum(strands=1)     # coverage from the filtered subset
    resolution = 1000000

    def coverage(s1, s2, TR):
        # Map each fragment to its 1 Mb genomic bin and sum read counts per bin.
        genome = Genome()
        genome.createMapping(resolution)
        label = genome.chromosomeStarts[TR.ufragments / TR.fragIDmult - 1] + (
            TR.ufragments % TR.fragIDmult) / resolution
        counts = np.bincount(label, weights=s1)
        counts2 = np.bincount(label, weights=s2)
        data = cPickle.load(open("GC1M", 'rb'))  # per-bin color values; presumably GC content -- TODO confirm
        eigenvector = np.zeros(genome.chromosomeEnds[-1], float)
        inds = np.argsort(counts)
        mask = inds[int(0.02 * len(inds)):]  # drop 2% lowest-coverage bins
        for chrom in range(1, 24):
            eigenvector[genome.chromosomeStarts[chrom - 1]:genome.chromosomeStarts[
                chrom - 1] + len(data[chrom - 1])] = data[chrom - 1]
        eigenvector[eigenvector < 35] = 35  # clamp low values for the color scale
        plt.scatter(counts[mask], counts2[mask], c=eigenvector[mask],
                    s=6, linewidth=0)
        print stats.spearmanr(counts[mask], counts2[mask])
        plt.xlabel("Coverage from all reads")
        plt.xticks([0, 5000, 10000, 15000])
        plt.ylabel("Coverage from RBs")
        b = plt.colorbar()
        b.ax.set_xlabel("GC content")

    plt.subplot(121)
    plt.title("HinIII")
    coverage(s1, s2, TR)

    # Right panel: same analysis for the NcoI dataset.
    TR = HiCdataset()
    TR.load("GM-NcoI.refined")
    s1 = TR.fragmentSum(strands=1)
    TR.saveFragments()
    TR.maskFilter(TR.dists1 > 500)
    TR.originalFragments()
    s2 = TR.fragmentSum(strands=1)
    resolution = 1000000
    plt.subplot(122)
    plt.title("NcoI")
    coverage(s1, s2, TR)
    plt.show()
""" This scripts is a rip-off of a large mergeDatasets script with certain adjustments. Follow comments along the text. """ from hiclib.fragmentHiC import HiCdataset import os from mirnylib import genome genomeDb = genome.Genome('../data/caul', chrmFileTemplate="%s.fa", readChrms=[]) for expName in os.listdir("caul"): TR = HiCdataset("bla", genome=genomeDb, inMemory=True,) # inMemory, as files are probably small (less than hundreds mililon reads) TR.parseInputData("caul/" + expName, removeSS=True) # We discard SS in our pipeline now TR.printMetadata() TR.filterRsiteStart(offset=5) # We still do this filter to avoid strange "dangling end-like" molecules TR.filterDuplicates() #TR.save(out_file+".dat") TR.filterLarge(cutlarge=300000, cutsmall=100) #Don't filter any large fragments. This was relevant for eucaryotes with #their megabase-long stretches of repetitive or unmappable regions #TR.filterExtreme(cutH=0.0025, cutL=0) #ALl fragments in Caulobacter seemed to behave normally TR.writeFilteringStats() TR.printMetadata(saveTo=statFolder + ".stat") TR.save("data/" + expName + "_refined.frag") #Saving filtered dataset #Below, saving all datasets at different resolutions. #Also, using new feature - fragment overlaps - which assins reads to all bins the fragment crosses. TR.saveHeatmap("data/" + expName + "-5k_overlap.hm", 5000, useFragmentOverlap=True)
def refineDataset(filenames, create=True, delete=True, parseInMemory=True):
    """
    Parse, merge, filter one experiment and export heatmaps (Python 2 variant).

    Parameters
    ----------
    filenames[0] is a list of filenames of incoming files
    filenames[1] is a folder for outgoing file
    filenames[2] is a working genome, that is output directory
    filenames[3] is an enzyme for a given experiment

    create : bool, optional
        If True, parse each file.
        If False, assume that files were already parsed
        (e.g. if you are just playing around with filtering parameters)
    delete : bool, optional
        If True, delete parsed files after merging.
        Man, these files may be huge... if you don't have a 10TB RAID, this may be useful.
    parseInMemory : bool, optional
        Perform parsing input files in memory.

    NOTE(review): a Python 3 refineDataset with different filters/defaults
    exists elsewhere in this file -- confirm which version is current.
    """
    in_files = filenames[0]
    out_file = filenames[1]
    statFolder = os.path.join("statistics", out_file)
    workingGenome = filenames[2]
    enzyme = filenames[3]
    if create == True:  # if we need to parse the input files (.hdf5 from mapping).
        for onename in in_files:
            #Parsing individual files
            if parseInMemory == True:
                #create dataset in memory, parse and then save to destination
                TR = HiCdataset("bla", genome=genomeFolder(workingGenome),
                                maximumMoleculeLength=500,
                                inMemory=True)  # remove inMemory if you don't have enough RAM
                TR.parseInputData(dictLike=onename, enzymeToFillRsites=enzyme)
                # NOTE(review): this assignment is immediately overwritten below.
                folder = os.path.split(onename)[0]
                print onename
                TR.save(ensure(onename + "_parsed.frag"))
                folder, fname = os.path.split(onename)
                statSubFolder = os.path.join(statFolder, folder)
                TR.printMetadata(saveTo=ensure(os.path.join(statSubFolder,
                                                            fname + ".stat")))
            else:
                #Create dataset at destination, parse on HDD, then no need to save.
                TR = HiCdataset(ensure(onename + "_parsed.frag"),
                                genome=genomeFolder(workingGenome),
                                maximumMoleculeLength=500, mode='w')
                TR.parseInputData(dictLike=onename, enzymeToFillRsites=enzyme)
                TR.printMetadata(saveTo=ensure(os.path.join(statFolder,
                                                            onename + ".stat")))
        "Merging files alltogether, applying filters"
        TR = HiCdataset(ensure(out_file + "_merged.frag"),
                        genome=genomeFolder(workingGenome), mode="w")
        TR.merge([i + "_parsed.frag" for i in in_files])
        #Merge in all parsed files from one experiment
        if delete == True:  # cleaning up parsed files
            for delFile in [i + "_parsed.frag" for i in in_files]:
                os.remove(delFile)
        "Now opening new dataset for refined data, and performing all the filtering "
        TR = HiCdataset(out_file + "_refined.frag",
                        genome=genomeFolder(workingGenome), mode='w')
        TR.load(out_file + "_merged.frag")
        #----------------------------Set of filters applied -------------
        TR.filterRsiteStart(offset=5)
        TR.filterDuplicates()
        #TR.save(out_file+".dat")
        TR.filterLarge()
        TR.filterExtreme(cutH=0.005, cutL=0)
        TR.writeFilteringStats()
        TR.printMetadata(saveTo=statFolder + ".stat")
        #------------------------End set of filters applied----------
    else:
        #If merging & filters has already been done, just load files
        TR = HiCdataset(out_file + "_working.frag", mode='w',
                        genome=genomeFolder(workingGenome))
        TR.load(out_file + "_refined.frag")
        TR.printMetadata(saveTo=statFolder + ".stat")
    print "----->Building Raw heatmap at different resolutions"
    TR.printStats()
    # Resolution lists are module-level configuration (values in kb).
    for res in wholeGenomeResolutionsKb:
        TR.saveHeatmap(out_file + "-{0}k.hm".format(res), res*1000)
    for res in byChromosomeResolutionsKb:
        TR.saveByChromosomeHeatmap(out_file + "-{0}k.byChr".format(res), res*1000)
    for res in HiResWithOverlapResolutionsKb:
        TR.saveHiResHeatmapWithOverlaps(out_file + "-{0}k_HighRes.byChr".format(res), res*1000)
def heatmapFromHotFragments(dataset="../../../mouse/data/combined/"
                            "mouse1_merged.frag",
                            workingFile="../../../tcc/working/workingMouse.frag",
                            cacheFile="../../../tcc/working/workingMouseFiltered.frag",
                            genomeFolder="../../../data/mm9", label=""):
    """Show 5 Mb heatmaps built from the most-covered ("hot") fragments only.

    Left panel: contacts touching the top 0.2% most-covered fragments.
    Right panel: contacts of the next 0.2% (percentile band 99.6-99.8).
    The filtered dataset is cached in ``cacheFile`` and reused on later runs.
    """
    mirnylib.systemutils.setExceptionHook()

    # Build the filtered cache once; subsequent runs load it directly.
    if not os.path.exists(cacheFile):
        builder = HiCdataset(workingFile, genomeFolder)
        builder.load(dataset)
        builder.filterRsiteStart(offset=5)
        builder.filterDuplicates()
        builder.filterLarge()
        builder.maskFilter(builder.DS)
        builder.save(cacheFile)

    hd = HiCdataset(workingFile, genomeFolder)
    hd.load(cacheFile)

    fragCoverage = hd.fragmentSum()
    cut996, cut998 = numpy.percentile(fragCoverage, [99.6, 99.8])
    hotTop04 = hd.ufragments[fragCoverage > cut996]  # hottest 0.4% of fragments
    hotTop02 = hd.ufragments[fragCoverage > cut998]  # hottest 0.2% of fragments

    # Reads with at least one side in the top-0.4% set...
    hd.maskFilter(arrayInArray(hd.fragids1, hotTop04) +
                  arrayInArray(hd.fragids2, hotTop04))
    heatTop04 = hd.buildAllHeatmap(5000000)
    # ...then narrowed further to the top-0.2% subset.
    hd.maskFilter(arrayInArray(hd.fragids1, hotTop02) +
                  arrayInArray(hd.fragids2, hotTop02))
    heatTop02 = hd.buildAllHeatmap(5000000)

    plt.subplot(121)
    plt.imshow(numpy.log(heatTop02 + 1), interpolation="nearest")
    plt.colorbar()
    plt.title("log # counts for top .2 % of fragments, " + label)
    plt.subplot(122)
    plt.imshow(numpy.log(heatTop04 - heatTop02 + 1), interpolation="nearest")
    plt.title("log # counts for second top .2% (.996 - .998), " + label)
    plt.colorbar()
    plt.show()
def refine_paper(filename, create=True):
    """filename[0] is a list of filenames of incoming files
    filename[1] is a folder for outgoing file

    Python 2 test routine: parses, merges and filters the inputs, then checks
    heatmap-saving methods for internal consistency via asserts.
    NOTE(review): when create=False, TR is never defined before use below --
    this function only works with create=True as written.
    """
    if create == True:
        for onename in filename[0]:
            #Parsing individual files
            if not os.path.exists(onename):
                raise StandardError("path not found: %s" % onename)
            TR = HiCdataset("bla", genome=genomeFolder, enzymeName="HindIII",
                            maximumMoleculeLength=500, inMemory=True)
            print "\nTesting loading new data without rsite information "
            TR.parseInputData(dictLike=onename, enzymeToFillRsites="HindIII")
            #assert len(TR.DS) == 856143
            #assert len(TR.ufragments) == 634572
            TR.save(onename + "_parsed.frag")
        #Merging files alltogether, applying filters
        TR = HiCdataset(filename[1] + "_merged.frag", enzymeName="HindIII",
                        genome=genomeFolder, mode="w")
        TR.merge([i + "_parsed.frag" for i in filename[0]])
        TR = HiCdataset("refined", genome=genomeFolder, enzymeName="HindIII",
                        mode="w", inMemory=True)
        print "\nTesting chunking during all tests"
        TR.chunksize = 30000
        #because we do many operations, we disable autoFlush here
        TR.load(filename[1] + "_merged.frag")
        print "\nTesting Rsite filter"
        TR.filterRsiteStart(offset=5)
        #assert len(TR.DS) == 832110
        print "\nTesting duplicate filter"
        TR.filterDuplicates(chunkSize=30000)
        #assert len(TR.DS) == 830275
        print "\nTesting small/large and extreme fragment filter"
        TR.filterLarge()
        #assert len(TR.DS) == 825442
        TR.filterExtreme(cutH=0.005, cutL=0)
        TR.writeFilteringStats()
        #assert len(TR.DS) == 803845
        #-------------------------------------------
        TR.printMetadata(saveTo="metadata")
        import cPickle
        # Compare produced metadata against the stored reference sample.
        stop = False
        mdata = cPickle.load(open("sampleMetadata"))
        for i in sorted(mdata.keys()):
            if TR.metadata[i] != mdata[i]:
                print "Key {0} is not consistent: should be {1}, is {2}".format(i, mdata[i], TR.metadata[i])
                stop = True
        if stop == True:
            print ("""------------_ERROR_--------------
            Inconsistent metadata: see above
            ----------------------------------------""")
            raise ValueError("Inconsistent Metadata")
    # --- Heatmap consistency checks (require TR from the branch above) ---
    print "Testing allxall and by-chromosome heatmap counting diagonal twice"
    print "----> saving allxall heatmap"
    TR.saveHeatmap(filename[1] + "-1M.hm", 1000000, countDiagonalReads="twice")
    a = h5dict(filename[1] + "-1M.hm")
    st, end = TR.genome.chrmStartsBinCont[1], TR.genome.chrmEndsBinCont[1]
    st2, end2 = TR.genome.chrmStartsBinCont[2], TR.genome.chrmEndsBinCont[2]
    chrom1 = a["heatmap"][st:end, st:end]
    chrom12 = a["heatmap"][st:end, st2:end2]
    setExceptionHook()
    print "----> saving by chromosome heatmap"
    TR.saveByChromosomeHeatmap(
        filename[1] + "-1M.hm", resolution=1000000, includeTrans=True,
        countDiagonalReads="twice")
    # By-chromosome blocks must match the corresponding all-by-all slices.
    b = h5dict(filename[1] + "-1M.hm")["1 1"]
    bb = h5dict(filename[1] + "-1M.hm")["1 2"]
    assert (b - chrom1).sum() == 0
    print "Cis heatmap consistent"
    assert (bb - chrom12).sum() == 0
    print 'Trans heatmap consistent'
    print a["heatmap"][::10, ::10].sum()
    #assert a["heatmap"][::10, ::10].sum() == 21800
    print "Heatmap sum correct\n"
    #---------------------------------
    print "Testing allxall and by-chromosome heatmap counting diagonal once"
    TR.saveHeatmap(filename[1] + "-1M.hm", 1000000, countDiagonalReads="once")
    Ta = h5dict(filename[1] + "-1M.hm")
    st, end = TR.genome.chrmStartsBinCont[1], TR.genome.chrmEndsBinCont[1]
    st2, end2 = TR.genome.chrmStartsBinCont[2], TR.genome.chrmEndsBinCont[2]
    chrom1 = Ta["heatmap"][st:end, st:end]
    chrom12 = Ta["heatmap"][st:end, st2:end2]
    setExceptionHook()
    print "----> saving by chromosome heatmap"
    TR.saveByChromosomeHeatmap(
        filename[1] + "-1M-byChr.hm", resolution=1000000, includeTrans=True,
        countDiagonalReads="once")
    TR.saveHiResHeatmapWithOverlaps(filename[1]+"-1M-highRes.hm",
                                    resolution=50000, countDiagonalReads="twice")
    TR.saveSuperHighResMapWithOverlaps(filename[1]+"-5k-SuperHighRes.hm",
                                       resolution=5000, chromosomes=[14],
                                       countDiagonalReads="twice")
    Tb = h5dict(filename[1] + "-1M-byChr.hm")["1 1"]
    Tbb = h5dict(filename[1] + "-1M-byChr.hm")["1 2"]
    assert ((Tb - chrom1) == 0).all()
    assert ((Tbb - chrom12) == 0).all()
    # once + diagonal == twice: doubling the diagonal must recover "twice" counts.
    assert ((Tb + np.diag(np.diag(Tb))) == b).all()
    print "Diagonal counting methods are consistent\n"
    newchrom1 = chrom1.copy()
    for i in xrange(len(newchrom1)):
        newchrom1[i, i] = 2 * newchrom1[i, i]
    Tb = h5dict(filename[1] + "-1M-highRes.hm")["1 1"]
    # High-res map coarse-grained back to 1M should match within tolerance.
    assert np.abs(Tb.sum() - newchrom1.sum()) < 1
    assert np.sum(np.abs(coarsegrain(Tb, 20, True) - newchrom1)) < 500
    #------------------------------
    print "Testing updateGenome method"
    from mirnylib.genome import Genome
    removeChromIDs = np.array([0, 1, 1, 1, 1] + [0] * 17 + [1] + [0])
    #print ((removeChromIDs[TR.chrms1] == 1) + (removeChromIDs[TR.chrms2] == 1) ).sum()
    # Expected read count after restriction to the kept chromosomes.
    t = ((removeChromIDs[TR.chrms1] == 1) * (removeChromIDs[TR.chrms2] == 1)).sum() + ((removeChromIDs[TR.chrms1] == 1) * (TR.chrms2 == -1)).sum()
    newGenome = Genome(genomePath=genomeFolder, readChrms=["2", "3", "4", "5", "X"])
    TR.updateGenome(newGenome)
    assert TR.N == t
    a = h5dict(filename[1] + "-1M.hm")["heatmap"]
def plotCisToTransHotFragments(dataset="../../../mouse/data/combined/"
                               "mouse1_merged.frag",
                               workingFile="../../../tcc/working/workingMouse.frag",
                               cacheFile="../../../tcc/working/workingMouseFiltered.frag",
                               genomeFolder="../../../data/mm9", label=None):
    """Plot the cis-to-trans read ratio as a function of per-fragment coverage.

    Coverage is binned logarithmically; vertical lines mark the 99 / 99.5 /
    99.9 coverage percentiles.  The filtered dataset is cached in
    ``cacheFile`` and reused on later runs.
    """
    mirnylib.systemutils.setExceptionHook()

    # Build the filtered cache once; subsequent runs load it directly.
    if not os.path.exists(cacheFile):
        print("caching parsed data")
        builder = HiCdataset(workingFile, genomeFolder)
        builder.load(dataset)
        builder.filterRsiteStart(offset=5)
        builder.filterDuplicates()
        builder.filterLarge()
        builder.maskFilter(builder.DS)
        builder.save(cacheFile)

    hd = HiCdataset(workingFile, genomeFolder)
    hd.load(cacheFile)

    totalCounts = hd.fragmentSum()
    # Restrict to cis reads, then restore the original fragment list so both
    # coverage vectors stay aligned fragment-by-fragment.
    hd.saveFragments()
    hd.maskFilter(hd.chrms1 == hd.chrms2)
    hd.originalFragments()
    cisCounts = hd.fragmentSum()

    order = numpy.argsort(totalCounts)
    totalSorted = 1. * totalCounts[order]
    cisSorted = 1. * cisCounts[order]
    cisToTrans = cisSorted / totalSorted

    p1, p2, p3 = numpy.percentile(totalSorted, [99, 99.5, 99.9])

    # Average the cis ratio within logarithmic coverage bins.
    bins = mirnylib.numutils.logbins(1, totalSorted.max(), 1.08)
    counts = numpy.histogram(totalSorted, bins)
    values = numpy.histogram(totalSorted, bins, weights=cisToTrans)
    plt.plot(0.5 * (values[1][:-1] + values[1][1:]),
             values[0] / counts[0], '.', label=label)

    for cutoff in (p1, p2, p3):
        plt.vlines(cutoff, 0, 1)
    plt.xlabel("Counts per fragment")
    plt.ylabel("Cis-to-trans ratio")
    plt.title("Vertical lines are at 99%,99.5% and 99.9% reads per fragment")
    niceShow()