def Generate_whole_genome_chromosome_file(): max_dist_kb = 100.0 min_dist_kb = 5.0 fragment_dataset_filename=base_folder+'fragment_dataset_'+base_filename+'.hdf5' o_file = base_out_folder + "Ps_distr/"+base_filename if not os.path.isfile(fragment_dataset_filename): fragments = fragmentHiC.HiCdataset( filename=fragment_dataset_filename, genome=genome_db, maximumMoleculeLength=500, mode='w') fragments.parseInputData( dictLike=maped_reads_filepath, removeSS=True) fragments.filterRsiteStart(offset=5) fragments.filterDuplicates() fragments.filterLarge() fragments.filterExtreme(cutH=0.005, cutL=0) else: print "fragment_dataset "+ fragment_dataset_filename + " found.\n IMPORTANT: considering all requerd filters have been done" fragments = fragmentHiC.HiCdataset( filename=fragment_dataset_filename, genome=genome_db, maximumMoleculeLength=500, mode='a') Ps = np.zeros(DistancetoBinN(max_dist_kb)+1) distances = fragments.distances l = len(distances) count = 0 for i in xrange(l): if (i % (l/10)) == 0: print (i / (l/10)),"0 %" d = distances[i] if d == -1: continue count += 1 d = abs(d)/1000.0 if (d > max_dist_kb) or ((d < min_dist_kb)): continue cell = DistancetoBinN(d) Ps[cell] += 1 Ps = Ps / count f_out = open (o_file+".ps","w") for i in xrange(len(Ps)): f_out.write(str(BinNtoDistance(i))) f_out.write("\t") f_out.write(str(Ps[i])) f_out.write("\n") f_out.close()
def step2(hiclib_path, sraid, res=1000000):
    ''' 2. Filter the dataset at the restriction fragment level.
    http://mirnylab.bitbucket.org/hiclib/tutorial/02_fragment_filtering.html
    '''
    from mirnylib import genome
    from hiclib import fragmentHiC

    # hg19 with all numbered chromosomes plus X.
    hg19 = genome.Genome(hiclib_path + '/fasta/hg19', readChrms=['#', 'X'])

    # Fresh fragment-level dataset; maximumMoleculeLength drives the
    # dangling-end filter applied while parsing below.
    ds = fragmentHiC.HiCdataset(filename=sraid + '_fragment_dataset.hdf5',
                                genome=hg19,
                                maximumMoleculeLength=500,
                                enzymeName='HindIII',
                                mode='w')
    ds.parseInputData(dictLike=sraid + '_mapped_reads.hdf5')

    ds.filterRsiteStart(offset=5)
    ds.filterDuplicates()
    ds.filterLarge()
    # TCC runs (SRR071231/SRR071232) use the stricter 0.1% cut;
    # the Hi-C default is 0.5%.
    cut = 0.001 if sraid in ["SRR071231", "SRR071232"] else 0.005
    ds.filterExtreme(cutH=cut, cutL=0)

    ds.saveHeatmap(sraid + '_map-res%sk.hdf5' % (res / 1000), resolution=res)
def filterFragments(genome_db):
    ''' Filter the data at the level of individual restriction fragments.

    Discards: reads starting within 5 bp of a restriction site, identical
    read pairs (both ends at the same positions), reads from extremely
    large/small restriction fragments (length > 10^5 bp or < 100 bp), and
    reads from the top 1% most frequently detected fragments. The rationale
    for each filter is discussed in the hiclib publication; the API
    documentation describes the filters themselves.
    '''
    prefix = options.outputDir + options.experiment
    ds = fragmentHiC.HiCdataset(filename=prefix + '-fragment_dataset.hdf5',
                                genome=genome_db,
                                maximumMoleculeLength=500,
                                mode='w')
    # Parsing applies the dangling-end filter via maximumMoleculeLength.
    ds.parseInputData(dictLike=prefix + '-mapped_reads.hdf5')
    ds.save(prefix + '-fragment_unfiltered.hdf5')   # keep an unfiltered copy
    ds.filterRsiteStart()                           # reads within 5 bp of a rsite
    ds.filterDuplicates()                           # duplicate DS molecules
    ds.filterLarge()                                # out-of-range fragment lengths
    ds.filterExtreme(cutH=0.01, cutL=0)             # top-1% hottest fragments
    ds.calculateFragmentWeights()
    ds.save(prefix + '-fragment_filtered.hdf5')     # filtered data
    ds.saveHeatmap(prefix + '-1M.hdf5')             # binned heatmap
def func():
    """Parse one pair of mapped SAM chunks and load them into a fragment dataset."""
    stem = '/exports/eddie/scratch/s1529682/bams/' + basename + '_fixed_'
    mapping.parse_sam(sam_basename1=stem + '1.fq.gz' + chunk,
                      sam_basename2=stem + '2.fq.gz' + chunk,
                      out_dict=mapped_reads,
                      genome_db=genome_db,
                      enzyme_name='DpnII')
    ds = fragmentHiC.HiCdataset(filename=fragments_file,
                                genome=genome_db,
                                maximumMoleculeLength=700,
                                mode='w')
    # The dangling-end filter is applied here, driven by maximumMoleculeLength.
    ds.parseInputData(dictLike=reads_file)
genome_db.posBinCont[i] + genome_db.binSizesBp[curChrmIdx][curRelativeBinNumb]) for j in range(len(BD.dataDict['HindIII_GM_1'])): strToWrite += "\t" + str(BD.dataDict['HindIII_GM_1'][i][j]) strToWrite += "\n" f.write(strToWrite) f.close() ###################Step 1 - create heatmap with defined resolution################ if ((not os.path.isfile(fragment_dataset_filename)) or (not os.path.isfile(heatmap_filepath + '-raw'))): # Create a HiCdataset object. print "Crating new fragments dataset\n" fragments = fragmentHiC.HiCdataset(filename=fragment_dataset_filename, genome=genome_db, maximumMoleculeLength=500, mode='w') # Load the parsed reads into the HiCdataset. The dangling-end filter is applied # at this stage, with maximumMoleculeLength specified at the initiation of the # object. print "Filetering fragments\n" fragments.parseInputData(dictLike=maped_reads_filepath) fragments.filterRsiteStart(offset=5) fragments.filterDuplicates() fragments.filterLarge() fragments.filterExtreme(cutH=0.005, cutL=0) print "Saving raw heatmap\n" fragments.saveHeatmap(heatmap_filepath + '-raw', domain_res)
#!/usr/bin/env python
# Timing benchmark: duplicate- and extreme-fragment filtering with hiclib.
import sys

from hiclib import mapping, fragmentHiC
from mirnylib import h5dict, genome

basedir = sys.argv[1]

# mm9 genome, chromosome 1 only.
gdb = genome.Genome('%s/Data/Genome/mm9_fasta' % basedir,
                    readChrms=['1'],
                    chrmFileTemplate="%s.fa")

ds = fragmentHiC.HiCdataset(filename='temp1',
                            genome=gdb,
                            maximumMoleculeLength=500,
                            mode='w',
                            enzymeName="NcoI",
                            inMemory=True)
ds.load('%s/Data/Timing/hiclib_data.hdf5' % basedir)
ds.filterDuplicates()
# NOTE: cutL=0.005 removes the *least* covered 0.5% of fragments; cutH is off.
ds.filterExtreme(cutH=0, cutL=0.005)
ds.save('%s/Data/Timing/hiclib_data_filt.hdf5' % basedir)
# Build a 1 Mb whole-genome heatmap for the Sp sample (mm9).
from mirnylib import genome
from hiclib import fragmentHiC

# mm9 genome: numbered chromosomes plus X.
mm9 = genome.Genome('../fasta/mm9', readChrms=['#', 'X'])

ds = fragmentHiC.HiCdataset(
    filename='../../data/serov/fragment_dataset_Sp.hdf5',
    genome=mm9,
    maximumMoleculeLength=500,
    mode='w')

# Parsing applies the dangling-end filter (maximumMoleculeLength above).
ds.parseInputData(dictLike='../../data/serov/mapped_reads_Sp.hdf5')

ds.filterRsiteStart(offset=5)
ds.filterDuplicates()
ds.filterLarge()
ds.filterExtreme(cutH=0.005, cutL=0)

ds.saveHeatmap('../../data/serov/heatmap-res-1M_Sp.hdf5', resolution=1000000)
# Build a 1 Mb heatmap for the mESC sample (mm10).
from mirnylib import genome
from hiclib import fragmentHiC

genome_name = 'mm10'

# Numbered chromosomes plus X.
gdb = genome.Genome('../../fasta/' + genome_name, readChrms=['#', 'X'])

sample_dir = '/ifs/DATA/opistorchis/Fishman/data/Sample/mESC/'

ds = fragmentHiC.HiCdataset(
    filename=sample_dir + 'fragment_dataset_mESC.hdf5',
    genome=gdb,
    maximumMoleculeLength=500,
    mode='w')

# Parsing applies the dangling-end filter (maximumMoleculeLength above).
ds.parseInputData(dictLike=sample_dir + 'mapped_reads_mESC.hdf5')

ds.filterRsiteStart(offset=5)
ds.filterDuplicates()
ds.filterLarge()
ds.filterExtreme(cutH=0.005, cutL=0)

ds.saveHeatmap(sample_dir + 'heatmap-res-1M_mESC.hdf5', resolution=1000000)
binsize = int(binsize) if trans in ['1', 'true', 'True', 'TRUE']: trans = True else: trans = False genome_db = genome.Genome(fasta_dir, readChrms=['#', 'X'], chrmFileTemplate="%s.fa") temp = h5py.File(data_fname, 'r') if 'weights' in temp: weights = temp['weights'][...] else: weights = temp['fragmentWeights'][...] temp.close() fragments = fragmentHiC.HiCdataset( filename='temp', genome=genome_db, maximumMoleculeLength=500, mode='a', enzymeName=re_name, inMemory=True) fragments.load(data_fname) fragments.weights = weights fragments.fragmentWeights = weights fragments.vectors['weights'] = 'float32' if trans: heatmap = fragments.buildAllHeatmap(resolution=binsize, useWeights=True) fragments.genome.setResolution(binsize) chr_indices = numpy.r_[fragments.genome.chrmStartsBinCont(numpy.arange()), heatmap.shape[0]] output = h5py.File(out_fname, 'w') for i in range(chr_indices.shape[0] - 1): positions = numpy.zeros((chr_indices[i + 1] - chr_indices[i], 2), dtype=numpy.int32)
# "/mnt/storage/home/vsfishman/HiC/fasta/GalGal5/GCF_000002315.4_Gallus_gallus-5.0_genomic.gff.Blood.topFPKM.ChrmLevel.regions"], "FibsLow": [ "/mnt/storage/home/vsfishman/HiC/tutorial_Fishman/chick/mapped-GalGal5filtered/GalGal5filteredChrmLevel/ChEF-all-HindIII_refined.frag", "/mnt/storage/home/vsfishman/HiC/fasta/GalGal5/GCF_000002315.4_Gallus_gallus-5.0_genomic.gff.Fibs.test2FPKM.ChrmLevel.regions" ], # "BloodLow" : ["/mnt/storage/home/vsfishman/HiC/tutorial_Fishman/chick/mapped-GalGal5filtered/GalGal5filteredChrmLevel/Blood-all-HindIII_refined.frag", # "/mnt/storage/home/vsfishman/HiC/fasta/GalGal5/GCF_000002315.4_Gallus_gallus-5.0_genomic.gff.Blood.lowFPKM.ChrmLevel.regions"] } ########################### for tmp_ind, key in enumerate(sorted(data)): if (tmp_ind == 0) or data[sorted(data)[tmp_ind - 1]] != key: fragments = fragmentHiC.HiCdataset(filename=data[key][0] + ".tmp", inMemory=True, genome=genome_db, maximumMoleculeLength=500, mode='w', enzymeName="HindIII") fragments.load(data[key][0]) assert fragments._isSorted() fragids1 = fragments._getVector("fragids1") fragids2 = fragments._getVector("fragids2") fs = fragments.fragmentSum( ) #contains total N of contacts for each fragment regions = np.genfromtxt(data[key][1], dtype=None) desierd_fragids = [] #contains ids of fragment froma desierd regions # id is a nt of rfrag midpoint + chrm*genome_db.fragIDmult print "---------------------------Processing ", key, "--------------------" print "defining desired fragments"
def plotCorrelationAtDifferentBinning():
    """Plots figure with correlation at different binning.

    Compares two halves of one dataset (HindIII) against a down-sampled
    second dataset (NcoI): Spearman correlation of trans contact maps at
    1..10 Mb bin sizes, both raw and after iterative correction.
    Note the caching and creating of binned heatmaps flags below.
    Suppplementary paper figure
    """
    sizes = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]  # bin sizes, Mb
    setExceptionHook()
    # cache: rebuild the split/down-sampled fragment datasets from scratch;
    # create: rebuild the binned heatmaps. Both False -> plot from caches.
    cache = False
    create = False
    if create == True:
        if cache == True:
            # -------------------standard version code-----------------
            FR = fragmentHiC.HiCdataset("bla", "../../../data/hg18",
                                        override=False, inMemory=True)
            FR.load("../../../ErezPaperData/hg18/GM-HindIII-hg18_refined.frag")

            FR3 = fragmentHiC.HiCdataset("bla", "../../../data/hg18",
                                         override=False, inMemory=True)
            FR3.load("../../../ErezPaperData/hg18/GM-HindIII-hg18"\
                     "_refined.frag")

            FR2 = fragmentHiC.HiCdataset("bla", "../../../data/hg18",
                                         override=False, inMemory=True)
            FR2.load("../../../ErezPaperData/hg18/GM-NcoI-hg18_refined.frag")

            # ----------------------cross-check code----------------
            # (alternative configuration: swap the roles of the two enzymes)
            # FR = fragmentHiC.HiCdataset("bla", "../../../data/hg18",
            #                             override=False, inMemory=True)
            # FR.load("../../../ErezPaperData/hg18/GM-NcoI-hg18_refined.frag")
            #
            # FR3 = fragmentHiC.HiCdataset("bla", "../../../data/hg18",
            #                              override=False, inMemory=True)
            # FR3.load("../../../ErezPaperData/hg18/GM-NcoI-hg18_refined.frag")
            #
            # FR2 = fragmentHiC.HiCdataset("bla", "../../../data/hg18",
            #                              override=False, inMemory=True)
            # FR2.load("../../../ErezPaperData/hg18/G"\
            #          "M-HindIII-hg18_refined.frag")
            # -------end corss-check code ---------------------------------

            # --------Filter only trans DS reads-----------------
            FR.maskFilter(FR.DS * (FR.chrms1 != FR.chrms2))
            FR2.maskFilter(FR2.DS * (FR2.chrms1 != FR2.chrms2))
            FR3.maskFilter(FR3.DS * (FR3.chrms1 != FR3.chrms2))

            # Now create two halfs of one dataset and down-sample second dataset
            # ----------------------standard version code--------
            fraction = 0.5 * len(FR.DS) / float(len(FR2.DS))

            rarray = numpy.random.random(len(FR.DS))
            mask1 = rarray < 0.5
            mask3 = rarray >= 0.5
            mask2 = numpy.random.random(len(FR2.DS)) < fraction

            # -------------------- cross-check code ---------
            # fraction = 0.5 * len(FR2.DS) / float(len(FR.DS))
            # rarray = numpy.random.random(len(FR.DS))
            # mask1 = rarray < fraction
            # mask3 = (rarray > fraction) * (rarray < fraction * 2)
            # mask2 = numpy.random.random(len(FR2.DS)) > 0.5
            # -----------------------------------------

            FR.maskFilter(mask1)
            FR2.maskFilter(mask2)
            FR3.maskFilter(mask3)

            FR.save("../../../tcc/working/cache1")
            FR2.save("../../../tcc/working/cache2")
            FR3.save("../../../tcc/working/cache3")
        else:
            # Reuse the previously split / down-sampled datasets.
            FR = fragmentHiC.HiCdataset("bla", "../../../data/hg18",
                                        override=False, inMemory=True)
            FR.load("../../../tcc/working/cache1")

            FR3 = fragmentHiC.HiCdataset("bla", "../../../data/hg18",
                                         override=False, inMemory=True)
            FR3.load("../../../tcc/working/cache3")

            FR2 = fragmentHiC.HiCdataset("bla", "../../../data/hg18",
                                         override=False, inMemory=True)
            FR2.load("../../../tcc/working/cache2")

        # Bin each dataset at every requested resolution.
        for size in sizes:
            FR.saveHeatmap("../../../tcc/working/HindIII_%d.hm" % size,
                           size * 1000000)
            FR2.saveHeatmap("../../../tcc/working/NcoI_%d.hm" % size,
                            size * 1000000)
            FR3.saveHeatmap("../../../tcc/working/control_%d.hm" % size,
                            size * 1000000)

    # p1: raw, between enzymes; p2: corrected, between; p3: corrected, within
    # (control); p4: raw, within (control).
    p1 = []
    p2 = []
    p3 = []
    p4 = []
    evs = []  # interchromosomal eigenvector values, collected at 1 Mb only
    for size in sizes:
        BD = binnedDataAnalysis(size * 1000000, "../../../data/hg18")
        BD.simpleLoad("../../../tcc/working/HindIII_%d.hm" % size, "HindIII")
        BD.simpleLoad("../../../tcc/working/NcoI_%d.hm" % size, "NcoI")
        BD.simpleLoad("../../../tcc/working/control_%d.hm" % size, "control")
        BD.removeDiagonal()
        BD.removePoorRegions(cutoff=2)
        BD.removeCis()
        data1 = BD.dataDict["HindIII"]
        data2 = BD.dataDict["NcoI"]
        data3 = BD.dataDict["control"]
        # Compare only trans pixels where both datasets have coverage.
        mask = (numpy.sum(
            data1, axis=0) > 0) * (numpy.sum(data2, axis=0) > 0)
        validMask = mask[:, None] * mask[None, :]
        transmask = BD.chromosomeIndex[:, None] != BD.chromosomeIndex[None, :]
        cormask = transmask * validMask
        c1 = scipy.stats.spearmanr(data1[cormask], data2[cormask])[0]
        c4 = scipy.stats.spearmanr(data1[cormask], data3[cormask])[0]
        if size == 1:
            evs.append(BD.interchromosomalValues("HindIII"))
            evs.append(BD.interchromosomalValues("NcoI"))
            evs.append(BD.interchromosomalValues("control"))
        p4.append(c4)
        p1.append(c1)
        # Trailing comma: the "corrected" print below continues this line.
        print "size\t%d\traw:" % size, c1,

        BD.removeZeros()
        BD.fakeCis()  # does iterative correction as well
        BD.restoreZeros(value=0)
        data1 = BD.dataDict["HindIII"]
        data2 = BD.dataDict["NcoI"]
        data3 = BD.dataDict["control"]
        c2 = scipy.stats.spearmanr(data1[cormask], data2[cormask])[0]
        c3 = scipy.stats.spearmanr(data1[cormask], data3[cormask])[0]
        if size == 1:
            evs.append(BD.interchromosomalValues("HindIII"))
            evs.append(BD.interchromosomalValues("NcoI"))
            evs.append(BD.interchromosomalValues("control"))
            print evs
        p3.append(c3)
        p2.append(c2)
        print "\tcorrected:", c2, "\tcontrol", c3

    plt.plot(sizes, p1, label="Raw data, between enzymes")
    plt.plot(sizes, p2, label="Iteratively corrected, between")
    plt.plot(sizes, p3, label="IC, within")
    plt.xlabel("Bin size, MB")
    plt.xticks(range(1, 11))
    plt.ylabel("Spearman correlation coefficient")
    plt.legend()
    niceShow()
    setExceptionHook()
    0 / 0  # deliberate crash: drop into the exception hook / debugger here
from itertools import repeat import argparse import tempfile parser = argparse.ArgumentParser() parser.add_argument("sample") args = parser.parse_args() sample = args.sample genome_db = genome.Genome('../genomes/mm9/fasta', readChrms=['#', 'X']) genome_db.setEnzyme('DpnII') path = '/exports/eddie/scratch/s1529682/processed/' fragments = fragmentHiC.HiCdataset(filename='bla', inMemory=True, genome=genome_db, maximumMoleculeLength=750, mode='w', tmpFolder=tempfile.gettempdir()) fragments.load(path + 'merged/' + sample + '_fragment_dataset_filtered.hdf5') juicepath = path + 'merged/for_juicebox/' + sample + '_for_juicebox.txt' with open(juicepath, 'w') as f: for str1, str2, chr1, chr2, pos1, pos2, dist1, dist2, mapq in zip( fragments.h5dict['strands1'], fragments.h5dict['strands2'], fragments.h5dict['chrms1'], fragments.h5dict['chrms2'], fragments.h5dict['cuts1'], fragments.h5dict['cuts2'], fragments._getVector('dists1'), fragments._getVector('dists2'), repeat(32)): chr1 = 'chr' + str(chr1 + 1) if chr1 < 19 else 'chrX' chr2 = 'chr' + str(chr2 + 1) if chr2 < 19 else 'chrX'
# Merge the per-chunk fragment datasets of one sample, then filter the merge.
parser = argparse.ArgumentParser()
parser.add_argument("sample")
args = parser.parse_args()
sample = args.sample

genome_db = genome.Genome('../genomes/mm9/fasta', readChrms=['#', 'X'])
genome_db.setEnzyme('DpnII')

path = '/exports/eddie/scratch/s1529682/processed/'
scratch_tmp = '/exports/eddie/scratch/s1529682/tmp'
merged_stem = path + 'merged/' + sample

# All per-chunk datasets belonging to this sample.
files = glob(path + sample + '*fragment_dataset.hdf5')
print(sample)

fragments = fragmentHiC.HiCdataset(
    filename=merged_stem + '_fragment_dataset.hdf5',
    genome=genome_db,
    maximumMoleculeLength=750,
    mode='w',
    tmpFolder=scratch_tmp)
fragments.merge(files)
print('Merged', sample)

fragments = fragmentHiC.HiCdataset(
    filename=merged_stem + '_fragment_dataset_filtered.hdf5',
    genome=genome_db,
    maximumMoleculeLength=750,
    mode='w',
    tmpFolder=scratch_tmp)
fragments.load(merged_stem + '_fragment_dataset.hdf5')
fragments.filterDuplicates(mode='ram')
fragments.filterLarge(10000, 10)  # DpnII fragment-size bounds
fragments.filterExtreme(cutH=0.0001, cutL=0)
def filtration(chromosome_names, cell_line, path, genome_version, enzyme, resolution_list):
    """Filter mapped Hi-C reads and export stats, P(s) curves and coolers.

    `chromosome_names` is a list of chromosome-name lists: an entry with
    several names is treated as the full-genome run, an entry with exactly
    one name as a single-chromosome run. For every entry this builds a
    fragment-level dataset from the parsed reads, removes duplicates,
    writes filtering statistics, computes normalized and raw contact
    probability vs. distance tables, and saves one cooler per resolution.
    """
    for chrm_list in chromosome_names:
        genome_db = genome.Genome('/home/magnitov/data/genomes/' + genome_version,
                                  gapFile='gap.txt',
                                  readChrms=chrm_list,
                                  forceOrder=True)
        # Read mapped reads
        if len(chrm_list) > 1:
            # Full-genome dataset.
            fragments = fragmentHiC.HiCdataset(
                filename=path + 'filtered_maps/' + cell_line + '/fragment_dataset_full.hdf5',
                genome=genome_db,
                enzymeName=enzyme,
                mode='w')
            fragments.parseInputData(
                dictLike=path + 'maps/' + cell_line + '/mapped_reads_full.hdf5')
        else:
            # Single-chromosome dataset.
            fragments = fragmentHiC.HiCdataset(
                filename=path + 'filtered_maps/' + cell_line + '/fragment_dataset_' + chrm_list[0] + '.hdf5',
                genome=genome_db,
                enzymeName=enzyme,
                mode='w')
            fragments.parseInputData(
                dictLike=path + 'maps/' + cell_line + '/mapped_reads_' + chrm_list[0] + '.hdf5')
        # Apply filters
        fragments.filterDuplicates()
        # Save statistics: a shared stats file for the full-genome run (or
        # when only one run was requested), per-chromosome files otherwise.
        if len(chrm_list) > 1 or len(chromosome_names) == 1:
            fragments.writeFilteringStats()
            fragments.printMetadata(
                saveTo=path + 'processing_stats/' + cell_line + '/processing_stats_' + cell_line + '.txt')
        if len(chrm_list) == 1 and len(chromosome_names) > 1:
            fragments.writeFilteringStats()
            fragments.printMetadata(
                saveTo=path + 'processing_stats/' + cell_line + '/processing_stats_' + cell_line + '_' + chrm_list[0] + '.txt')
        # Sort reads and calculate contact probability (both normalized and not)
        fragments._sortData()
        if len(chrm_list) > 1:
            contact_probs = fragments.plotScaling(normalize=True, plot=False)
            pd.DataFrame({
                'Distance': contact_probs[0],
                'Probability': contact_probs[1]
            }).to_csv(path + 'contact_probs/' + cell_line + '/contact_probs_' + cell_line + '_full_norm.txt',
                      header=1, index=0, sep='\t')
            contact_probs = fragments.plotScaling(normalize=False, plot=False)
            pd.DataFrame({
                'Distance': contact_probs[0],
                'Probability': contact_probs[1]
            }).to_csv(path + 'contact_probs/' + cell_line + '/contact_probs_' + cell_line + '_full.txt',
                      header=1, index=0, sep='\t')
        if len(chrm_list) == 1:
            contact_probs = fragments.plotScaling(normalize=True, plot=False)
            pd.DataFrame({
                'Distance': contact_probs[0],
                'Probability': contact_probs[1]
            }).to_csv(path + 'contact_probs/' + cell_line + '/contact_probs_' + cell_line + '_' + chrm_list[0] + '_norm.txt',
                      header=1, index=0, sep='\t')
            contact_probs = fragments.plotScaling(normalize=False, plot=False)
            pd.DataFrame({
                'Distance': contact_probs[0],
                'Probability': contact_probs[1]
            }).to_csv(path + 'contact_probs/' + cell_line + '/contact_probs_' + cell_line + '_' + chrm_list[0] + '.txt',
                      header=1, index=0, sep='\t')
        # Save into .cool and .hdf5 files
        # NOTE(review): the cooler filename uses chrm_list[0] even for the
        # multi-chromosome (full-genome) run -- confirm this is intended.
        for resolution in resolution_list:
            fragments.saveCooler(
                filename=path + 'filtered_maps/' + cell_line + '/heatmap-' + chrm_list[0] + '-' + str(resolution / 1000) + 'K.cool',
                resolution=resolution)
def Generate_one_chromosome_file(chrNumb): o_file = base_out_folder + "fitHiC/i_files/" + base_filename + ".fithic" fragment_dataset_filename = base_out_folder + "fitHiC/i_files/" + 'fragment_dataset_' + base_filename + '_chr' + str( chrNumb) + '.hdf5' if not os.path.isfile(fragment_dataset_filename): fragments = fragmentHiC.HiCdataset(filename=fragment_dataset_filename, genome=genome_db, maximumMoleculeLength=500, mode='w') fragments.parseInputData(dictLike=maped_reads_filepath, removeSS=True) fragments.filterRsiteStart(offset=5) fragments.filterDuplicates() fragments.filterLarge() fragments.filterExtreme(cutH=0.005, cutL=0) else: fragments = fragmentHiC.HiCdataset(filename=fragment_dataset_filename, genome=genome_db, maximumMoleculeLength=500, mode='a') print "Filtering, leaving only chr ", genome_db.idx2label[chrNumb] #leave only frgaments from the chrNumb (nterchromosomal) fragments.maskFilter((fragments.chrms1 == chrNumb)) fragments.maskFilter((fragments.chrms2 == chrNumb)) print "Seting RE" #Setting info about restriction enzyme, calculating absolute indexes fragments.setRfragAbsIdxs('HindIII') numBins = len(fragments.genome.rsites[chrNumb]) print "Total numBins (RSites) on chr ", genome_db.idx2label[ chrNumb], " = ", numBins rfragAbsIdxs1 = fragments.rfragAbsIdxs1 - fragments.genome.chrmStartsRfragCont[ chrNumb] rfragAbsIdxs2 = fragments.rfragAbsIdxs2 - fragments.genome.chrmStartsRfragCont[ chrNumb] print "Total number of fragments = ", len(rfragAbsIdxs1) if len(rfragAbsIdxs1) != len(rfragAbsIdxs2): print "rfragAbsIdxs1=", rfragAbsIdxs1 print "rfragAbsIdxs2=", rfragAbsIdxs2 print "len(rfragAbsIdxs1)=", len(rfragAbsIdxs1) print "len(rfragAbsIdxs2)=", len(rfragAbsIdxs2) raise "FRAGMENT INDEXING ERROR 1!!!" if (min(rfragAbsIdxs1) < 0 or min(rfragAbsIdxs2) < 0): print "min(rfragAbsIdxs1)=", min(rfragAbsIdxs1) print "min(rfragAbsIdxs2)=", min(rfragAbsIdxs2) raise "FRAGMENT INDEXING ERROR 2!!!" 
if (max(rfragAbsIdxs1) > numBins - 1 or max(rfragAbsIdxs2) > numBins - 1): print "max (rfragAbsIdxs1)=", max(rfragAbsIdxs1) print "max (rfragAbsIdxs2)=", max(rfragAbsIdxs2) print "numBins=", numBins raise "FRAGMENT INDEXING ERROR 3!!!" print "FRAGMENT INDEXING - passed" #Creating label array label = np.array(rfragAbsIdxs1, dtype='int64') label *= numBins label += rfragAbsIdxs2 #Creating count array counts = np.bincount(label, minlength=numBins**2) counts.shape = (numBins, numBins) #Counting for i in xrange(len(counts)): counts[i, i:] += counts[i:, i] counts[i:, i] = counts[i, i:] #Filling diagonal reads #diag = np.diag(counts) #fillDiagonal(counts, diag/2) fillDiagonal(counts, 0) BinsToDescribe = np.zeros( numBins ) # Info about which RSites should be described in .fragments file later # f_out = gzip.open (o_file+"_chr"+str(chrNumb)+".contacts.zip","w") f_out = open(o_file + "_chr" + str(chrNumb) + ".contacts.zip", "w") print "Writing file ", o_file + "_chr" + str(chrNumb) + ".contacts.zip" for i in range(numBins - 1): for j in range(i + 1, numBins): if (counts[i, j] != 0): s = "" s += str(chrNumb) + "\t" s += str(fragments.genome.rfragMids[chrNumb][i]) + "\t" s += str(chrNumb) + "\t" s += str(fragments.genome.rfragMids[chrNumb][j]) + "\t" s += str(counts[i, j]) + "\n" f_out.write(s) BinsToDescribe[i] = 1 BinsToDescribe[j] = 1 f_out.close() # f_out = gzip.open (o_file+"_chr"+str(chrNumb)+".fragments.zip","w") f_out = open(o_file + "_chr" + str(chrNumb) + ".fragments.zip", "w") print "Writing file ", o_file + "_chr" + str(chrNumb) + ".fragments.zip" for ind, val in enumerate(BinsToDescribe): if (val == 1): s = "" s += str(chrNumb) + "\t0\t" s += str(fragments.genome.rfragMids[chrNumb][ind]) + "\t" s += str(sum(counts[ind])) + "\t" s += "1\n" f_out.write(s) f_out.close()
def Generate_whole_genome_chromosome_file( mappability=1): #TODO - use mappability, now it's always =1 fragment_dataset_filename = base_folder + 'fragment_dataset_' + base_filename + '.hdf5' o_file = base_out_folder + "fitHiC/i_files/" + base_filename if not os.path.isfile(fragment_dataset_filename): fragments = fragmentHiC.HiCdataset(filename=fragment_dataset_filename, genome=genome_db, maximumMoleculeLength=500, mode='w') fragments.parseInputData(dictLike=maped_reads_filepath, removeSS=True) fragments.filterRsiteStart(offset=5) fragments.filterDuplicates() fragments.filterLarge() fragments.filterExtreme(cutH=0.005, cutL=0) else: print "fragment_dataset " + fragment_dataset_filename + " found.\n IMPORTANT: considering all requerd filters have been done" fragments = fragmentHiC.HiCdataset(filename=fragment_dataset_filename, genome=genome_db, maximumMoleculeLength=500, mode='a') print "Seting RE" #Setting info about restriction enzyme, calculating absolute indexes fragments.setRfragAbsIdxs('HindIII') rfragAbsIdxs1 = fragments.rfragAbsIdxs1 rfragAbsIdxs2 = fragments.rfragAbsIdxs2 print "Total number of fragments = ", len(rfragAbsIdxs1) if len(rfragAbsIdxs1) != len(rfragAbsIdxs2): print "rfragAbsIdxs1=", rfragAbsIdxs1 print "rfragAbsIdxs2=", rfragAbsIdxs2 print "len(rfragAbsIdxs1)=", len(rfragAbsIdxs1) print "len(rfragAbsIdxs2)=", len(rfragAbsIdxs2) raise "FRAGMENT INDEXING ERROR 1!!!" if (min(rfragAbsIdxs1) < 0 or min(rfragAbsIdxs2) < 0): print "min(rfragAbsIdxs1)=", min(rfragAbsIdxs1) print "min(rfragAbsIdxs2)=", min(rfragAbsIdxs2) raise "FRAGMENT INDEXING ERROR 2!!!" if (max(rfragAbsIdxs1) > fragments.genome.chrmEndsRfragCont[-1] or max(rfragAbsIdxs2) > fragments.genome.chrmEndsRfragCont[-1]): print "max (rfragAbsIdxs1)=", max(rfragAbsIdxs1) print "max (rfragAbsIdxs2)=", max(rfragAbsIdxs2) print "numBins=", fragments.genome.chrmEndsRfragCont[-1] raise "FRAGMENT INDEXING ERROR 3!!!" 
print "FRAGMENT INDEXING - passed" print "Initialyzing heatmap" max_rsites_number = max([len(i) for i in fragments.genome.rsites]) heatmap = np.zeros(shape=(fragments.genome.chrmCount, max_rsites_number, max_rsites_number), dtype=np.uint16) print "Max numBins (RSites) in chromosome= ", max_rsites_number print "Creating chrRsites array" RsiteToChr = np.zeros(max_rsites_number * fragments.genome.chrmCount, dtype=np.uint8) for i in range(0, fragments.genome.chrmCount): RsiteToChr[fragments.genome.chrmStartsRfragCont[i]:fragments.genome. chrmEndsRfragCont[i]] = i print "Filling heatmap" BinsToDescribe = np.zeros(shape=(fragments.genome.chrmCount, max_rsites_number), dtype=np.int8) l = len(rfragAbsIdxs1) for i in xrange(l): if (i % (l / 10)) == 0: print(i / (l / 10)), "0 %" ChrN1 = RsiteToChr[rfragAbsIdxs1[i]] ChrN2 = RsiteToChr[rfragAbsIdxs2[i]] if (ChrN1 == ChrN2): #if it is intrachromosomal contact rfragAbsIdxs1_onChr = rfragAbsIdxs1[ i] - fragments.genome.chrmStartsRfragCont[ChrN1] rfragAbsIdxs2_onChr = rfragAbsIdxs2[ i] - fragments.genome.chrmStartsRfragCont[ChrN1] BinsToDescribe[ChrN1][rfragAbsIdxs1_onChr] = 1 BinsToDescribe[ChrN2][rfragAbsIdxs2_onChr] = 1 heatmap[ChrN1][rfragAbsIdxs1_onChr][rfragAbsIdxs2_onChr] += 1 if heatmap[ChrN1][rfragAbsIdxs1_onChr][ rfragAbsIdxs2_onChr] >= 64000: raise "Type int16 used in heatmap is not compatible with N of contact >64000" f_out = open(o_file + ".contacts", "w") print "Total number of non-empty bins (rfrags)=", np.sum(BinsToDescribe) print "Writing file ", o_file + ".contacts" f_out = open(o_file + ".contacts", "w") for i in xrange(l): if (i % (l / 10)) == 0: print(i / (l / 10)), "0 %" ChrN1 = RsiteToChr[rfragAbsIdxs1[i]] ChrN2 = RsiteToChr[rfragAbsIdxs2[i]] if (ChrN1 == ChrN2): #if it is intrachromosomal contact rfragAbsIdxs1_onChr = rfragAbsIdxs1[ i] - fragments.genome.chrmStartsRfragCont[ChrN1] rfragAbsIdxs2_onChr = rfragAbsIdxs2[ i] - fragments.genome.chrmStartsRfragCont[ChrN1] if 
(heatmap[ChrN1][rfragAbsIdxs1_onChr][rfragAbsIdxs2_onChr] != -1): s = "" s += str(i) + "\t" s += str(fragments.genome.rfragMids[ChrN1] [rfragAbsIdxs1_onChr]) + "\t" s += str(i) + "\t" s += str(fragments.genome.rfragMids[ChrN2] [rfragAbsIdxs2_onChr]) + "\t" s += str(heatmap[ChrN1, rfragAbsIdxs1_onChr, rfragAbsIdxs2_onChr]) + "\n" heatmap[ChrN1][rfragAbsIdxs1_onChr][rfragAbsIdxs2_onChr] = -1 f_out.write(s) f_out.close() f_out = open(o_file + ".fragments", "w") print "Writing file ", o_file + ".fragments" for i in range(fragments.genome.chrmCount): for j in xrange(max_rsites_number): if (BinsToDescribe[i][j] == 1): s = "" chrNumb = i s += str(chrNumb) + "\t0\t" s += str(fragments.genome.rfragMids[i][j]) + "\t" s += str(sum(heatmap[i][j])) + "\t" s += "1\n" f_out.write(s) f_out.close()