Ejemplo n.º 1
0
def Generate_whole_genome_chromosome_file():
	max_dist_kb = 100.0
	min_dist_kb = 5.0


	fragment_dataset_filename=base_folder+'fragment_dataset_'+base_filename+'.hdf5'
	o_file = base_out_folder + "Ps_distr/"+base_filename

	if not os.path.isfile(fragment_dataset_filename):
		fragments = fragmentHiC.HiCdataset(
			filename=fragment_dataset_filename,
			genome=genome_db,
			maximumMoleculeLength=500,
			mode='w')
		fragments.parseInputData(
              dictLike=maped_reads_filepath, removeSS=True)

		fragments.filterRsiteStart(offset=5)
		fragments.filterDuplicates()
		fragments.filterLarge()
		fragments.filterExtreme(cutH=0.005, cutL=0)
	else:
		print "fragment_dataset "+ fragment_dataset_filename + " found.\n IMPORTANT: considering all requerd filters have been done"
		fragments = fragmentHiC.HiCdataset(
			filename=fragment_dataset_filename,
			genome=genome_db,
			maximumMoleculeLength=500,
			mode='a')

	Ps = np.zeros(DistancetoBinN(max_dist_kb)+1)
	
	distances = fragments.distances
	l = len(distances)
	
	count = 0
	for i in xrange(l):
		if (i % (l/10)) == 0:
			print (i / (l/10)),"0 %"
		d = distances[i]
		if d == -1:
			continue
		count += 1
		d = abs(d)/1000.0
		if (d > max_dist_kb) or ((d < min_dist_kb)):
				continue
		
		cell = DistancetoBinN(d)
		Ps[cell] += 1
			
	Ps = Ps / count

	f_out = open (o_file+".ps","w")
	for i in xrange(len(Ps)):
		f_out.write(str(BinNtoDistance(i)))
		f_out.write("\t")
		f_out.write(str(Ps[i]))
		f_out.write("\n")
	f_out.close()
Ejemplo n.º 2
0
def step2(hiclib_path, sraid, res=1000000):
    '''2. Filter the dataset at the restriction fragment level.

    See http://mirnylab.bitbucket.org/hiclib/tutorial/02_fragment_filtering.html
    '''
    from mirnylib import genome
    from hiclib import fragmentHiC

    # Genome description: hg19 with numbered chromosomes plus X.
    genome_db = genome.Genome(hiclib_path + '/fasta/hg19',
                              readChrms=['#', 'X'])

    # Fresh fragment-level dataset; maximumMoleculeLength given here
    # controls the dangling-end filter applied during parsing.
    fragments = fragmentHiC.HiCdataset(filename=sraid +
                                       '_fragment_dataset.hdf5',
                                       genome=genome_db,
                                       maximumMoleculeLength=500,
                                       enzymeName='HindIII',
                                       mode='w')

    # Loading the parsed reads applies the dangling-end filter.
    fragments.parseInputData(dictLike=sraid + '_mapped_reads.hdf5')

    fragments.filterRsiteStart(offset=5)
    fragments.filterDuplicates()
    fragments.filterLarge()

    # TCC runs get a stricter 0.1% high cut; plain Hi-C keeps 0.5%.
    tcc_runs = ("SRR071231", "SRR071232")
    cut_high = 0.001 if sraid in tcc_runs else 0.005
    fragments.filterExtreme(cutH=cut_high, cutL=0)

    fragments.saveHeatmap(sraid + '_map-res%sk.hdf5' % (res / 1000),
                          resolution=res)
Ejemplo n.º 3
0
def filterFragments(genome_db):
    """Filter the data at the level of individual restriction fragments.

    The following reads are removed from the dataset:

    - reads that start within 5 bp of a restriction site
    - identical read pairs, with both ends starting at exactly the same
      positions
    - reads from extremely large or extremely small restriction fragments
      (length > 10^5 bp or length < 100 bp)
    - reads from the top 1% most frequently detected restriction fragments

    The rationale behind each filter is discussed in the hiclib
    publication; the API documentation describes the filters.
    """
    prefix = options.outputDir + options.experiment

    fragments = fragmentHiC.HiCdataset(filename=prefix +
                                       '-fragment_dataset.hdf5',
                                       genome=genome_db,
                                       maximumMoleculeLength=500,
                                       mode='w')

    # Parsing applies the dangling-end filter, controlled by the
    # maximumMoleculeLength given at construction time.
    fragments.parseInputData(dictLike=prefix + '-mapped_reads.hdf5')

    # Keep an unfiltered copy before any filtering.
    fragments.save(prefix + '-fragment_unfiltered.hdf5')

    fragments.filterRsiteStart()                 # reads within 5 bp (default) of rsite
    fragments.filterDuplicates()                 # duplicate DS molecules
    fragments.filterLarge()                      # very large / very small fragments
    fragments.filterExtreme(cutH=0.01, cutL=0)   # top 1% busiest fragments

    # Compute per-fragment weights.
    fragments.calculateFragmentWeights()

    # Persist the filtered dataset and a binned heatmap.
    fragments.save(prefix + '-fragment_filtered.hdf5')
    fragments.saveHeatmap(prefix + '-1M.hdf5')
Ejemplo n.º 4
0
def func():
    """Parse the two mapped SAM halves and start a fresh fragment dataset.

    Relies on module globals: basename, chunk, mapped_reads, genome_db,
    fragments_file, reads_file.
    """
    bam_dir = '/exports/eddie/scratch/s1529682/bams/'

    mapping.parse_sam(
        sam_basename1=bam_dir + basename + '_fixed_1.fq.gz' + chunk,
        sam_basename2=bam_dir + basename + '_fixed_2.fq.gz' + chunk,
        out_dict=mapped_reads,
        genome_db=genome_db,
        enzyme_name='DpnII')

    fragments = fragmentHiC.HiCdataset(filename=fragments_file,
                                       genome=genome_db,
                                       maximumMoleculeLength=700,
                                       mode='w')

    # Loading the parsed reads applies the dangling-end filter, using the
    # maximumMoleculeLength specified at construction.
    fragments.parseInputData(dictLike=reads_file)
Ejemplo n.º 5
0
                genome_db.posBinCont[i] +
                genome_db.binSizesBp[curChrmIdx][curRelativeBinNumb])
        for j in range(len(BD.dataDict['HindIII_GM_1'])):
            strToWrite += "\t" + str(BD.dataDict['HindIII_GM_1'][i][j])
        strToWrite += "\n"
        f.write(strToWrite)
    f.close()


###################Step 1 - create heatmap with defined resolution################
if ((not os.path.isfile(fragment_dataset_filename))
        or (not os.path.isfile(heatmap_filepath + '-raw'))):
    # Create a HiCdataset object.
    print "Crating new fragments dataset\n"
    fragments = fragmentHiC.HiCdataset(filename=fragment_dataset_filename,
                                       genome=genome_db,
                                       maximumMoleculeLength=500,
                                       mode='w')
    # Load the parsed reads into the HiCdataset. The dangling-end filter is applied
    # at this stage, with maximumMoleculeLength specified at the initiation of the
    # object.
    print "Filetering fragments\n"
    fragments.parseInputData(dictLike=maped_reads_filepath)

    fragments.filterRsiteStart(offset=5)
    fragments.filterDuplicates()
    fragments.filterLarge()
    fragments.filterExtreme(cutH=0.005, cutL=0)

    print "Saving raw heatmap\n"
    fragments.saveHeatmap(heatmap_filepath + '-raw', domain_res)
Ejemplo n.º 6
0
#!/usr/bin/env python
"""Load a chromosome-1 mm9 Hi-C dataset, filter it, and save the result."""

import sys

from hiclib import mapping, fragmentHiC
from mirnylib import h5dict, genome

# Base directory is the single command-line argument.
basedir = sys.argv[1]

src = '%s/Data/Timing/hiclib_data.hdf5' % basedir
dst = '%s/Data/Timing/hiclib_data_filt.hdf5' % basedir

# Genome: mm9, chromosome 1 only, FASTA files named "<chrm>.fa".
genome_db = genome.Genome('%s/Data/Genome/mm9_fasta' % basedir,
                          readChrms=['1'],
                          chrmFileTemplate="%s.fa")

# In-memory dataset; 'temp1' is only a placeholder filename.
fragments = fragmentHiC.HiCdataset(filename='temp1',
                                   genome=genome_db,
                                   maximumMoleculeLength=500,
                                   mode='w',
                                   enzymeName="NcoI",
                                   inMemory=True)

fragments.load(src)
fragments.filterDuplicates()
fragments.filterExtreme(cutH=0, cutL=0.005)
fragments.save(dst)
Ejemplo n.º 7
0
from mirnylib import genome
from hiclib import fragmentHiC

# All inputs and outputs for the Sp sample live in this directory.
data_dir = '../../data/serov/'

# Genome description: mm9 with numbered chromosomes plus X.
genome_db = genome.Genome('../fasta/mm9', readChrms=['#', 'X'])

# Fresh fragment-level dataset for the Sp sample.
fragments = fragmentHiC.HiCdataset(
    filename=data_dir + 'fragment_dataset_Sp.hdf5',
    genome=genome_db,
    maximumMoleculeLength=500,
    mode='w')

# Parsing the mapped reads applies the dangling-end filter, controlled by
# the maximumMoleculeLength given at construction.
fragments.parseInputData(dictLike=data_dir + 'mapped_reads_Sp.hdf5')

# Standard hiclib fragment-level filters.
fragments.filterRsiteStart(offset=5)
fragments.filterDuplicates()
fragments.filterLarge()
fragments.filterExtreme(cutH=0.005, cutL=0)

# Save a 1 Mb resolution heatmap.
fragments.saveHeatmap(data_dir + 'heatmap-res-1M_Sp.hdf5', resolution=1000000)
Ejemplo n.º 8
0
from mirnylib import genome
from hiclib import fragmentHiC

genome_name = 'mm10'

# All mESC sample files live in this directory.
sample_dir = '/ifs/DATA/opistorchis/Fishman/data/Sample/mESC/'

# Genome description: numbered chromosomes plus X.
genome_db = genome.Genome('../../fasta/' + genome_name, readChrms=['#', 'X'])

# Fresh fragment-level dataset for the mESC sample.
fragments = fragmentHiC.HiCdataset(
    filename=sample_dir + 'fragment_dataset_mESC.hdf5',
    genome=genome_db,
    maximumMoleculeLength=500,
    mode='w')

# Parsing the mapped reads applies the dangling-end filter, controlled by
# the maximumMoleculeLength given at construction.
fragments.parseInputData(dictLike=sample_dir + 'mapped_reads_mESC.hdf5')

# Standard hiclib fragment-level filters.
fragments.filterRsiteStart(offset=5)
fragments.filterDuplicates()
fragments.filterLarge()
fragments.filterExtreme(cutH=0.005, cutL=0)

# Save a 1 Mb resolution heatmap.
fragments.saveHeatmap(sample_dir + 'heatmap-res-1M_mESC.hdf5',
                      resolution=1000000)
Ejemplo n.º 9
0
# Normalise command-line-style inputs.
binsize = int(binsize)
trans = trans in ['1', 'true', 'True', 'TRUE']

genome_db = genome.Genome(fasta_dir, readChrms=['#', 'X'], chrmFileTemplate="%s.fa")

# Pull fragment weights out of the data file; older files store them
# under 'fragmentWeights' instead of 'weights'.
temp = h5py.File(data_fname, 'r')
if 'weights' in temp:
    weights = temp['weights'][...]
else:
    weights = temp['fragmentWeights'][...]
temp.close()

# In-memory dataset; 'temp' is only a placeholder filename.
fragments = fragmentHiC.HiCdataset(
    filename='temp',
    genome=genome_db,
    maximumMoleculeLength=500,
    mode='a',
    enzymeName=re_name,
    inMemory=True)
fragments.load(data_fname)

# Attach the weights under both attribute names and register the vector.
fragments.weights = weights
fragments.fragmentWeights = weights
fragments.vectors['weights'] = 'float32'
if trans:
    heatmap = fragments.buildAllHeatmap(resolution=binsize, useWeights=True)
    fragments.genome.setResolution(binsize)
    chr_indices = numpy.r_[fragments.genome.chrmStartsBinCont(numpy.arange()), heatmap.shape[0]]
    output = h5py.File(out_fname, 'w')
    for i in range(chr_indices.shape[0] - 1):
        positions = numpy.zeros((chr_indices[i + 1] - chr_indices[i], 2), dtype=numpy.int32)
Ejemplo n.º 10
0
    #			"/mnt/storage/home/vsfishman/HiC/fasta/GalGal5/GCF_000002315.4_Gallus_gallus-5.0_genomic.gff.Blood.topFPKM.ChrmLevel.regions"],
    "FibsLow": [
        "/mnt/storage/home/vsfishman/HiC/tutorial_Fishman/chick/mapped-GalGal5filtered/GalGal5filteredChrmLevel/ChEF-all-HindIII_refined.frag",
        "/mnt/storage/home/vsfishman/HiC/fasta/GalGal5/GCF_000002315.4_Gallus_gallus-5.0_genomic.gff.Fibs.test2FPKM.ChrmLevel.regions"
    ],
    #	"BloodLow" : ["/mnt/storage/home/vsfishman/HiC/tutorial_Fishman/chick/mapped-GalGal5filtered/GalGal5filteredChrmLevel/Blood-all-HindIII_refined.frag",
    #			"/mnt/storage/home/vsfishman/HiC/fasta/GalGal5/GCF_000002315.4_Gallus_gallus-5.0_genomic.gff.Blood.lowFPKM.ChrmLevel.regions"]
}

###########################

for tmp_ind, key in enumerate(sorted(data)):
    if (tmp_ind == 0) or data[sorted(data)[tmp_ind - 1]] != key:
        fragments = fragmentHiC.HiCdataset(filename=data[key][0] + ".tmp",
                                           inMemory=True,
                                           genome=genome_db,
                                           maximumMoleculeLength=500,
                                           mode='w',
                                           enzymeName="HindIII")
        fragments.load(data[key][0])

        assert fragments._isSorted()
        fragids1 = fragments._getVector("fragids1")
        fragids2 = fragments._getVector("fragids2")
        fs = fragments.fragmentSum(
        )  #contains total N of contacts for each fragment

    regions = np.genfromtxt(data[key][1], dtype=None)
    desierd_fragids = []  #contains ids of fragment froma desierd regions
    # id is a nt of rfrag midpoint + chrm*genome_db.fragIDmult
    print "---------------------------Processing ", key, "--------------------"
    print "defining desired fragments"
Ejemplo n.º 11
0
def plotCorrelationAtDifferentBinning():
    """Plots figure with correlation at different binning.

    Compares trans (inter-chromosomal) contact maps of a HindIII dataset,
    an NcoI dataset, and a split-in-half HindIII control at bin sizes of
    1..10 Mb, using Spearman correlation before and after iterative
    correction, then plots the three curves.

    Note the caching and creating of binned heatmaps flags below.
    Suppplementary paper figure
    """

    sizes = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]  # bin sizes in megabases
    setExceptionHook()

    # cache=True rebuilds the three fragment datasets from the refined
    # .frag files, splits/down-samples them, and saves them to cache1/2/3;
    # cache=False reuses those cached files.  create=True (re)generates
    # the per-size binned heatmaps.
    cache = False
    create = False

    if create == True:
        if cache == True:
            #-------------------standard version code-----------------
            # FR and FR3 are two copies of the HindIII data; together they
            # form the two halves of the split-dataset control.  FR2 holds
            # the NcoI data.
            FR = fragmentHiC.HiCdataset("bla", "../../../data/hg18",
                                        override=False, inMemory=True)
            FR.load("../../../ErezPaperData/hg18/GM-HindIII-hg18_refined.frag")

            FR3 = fragmentHiC.HiCdataset("bla", "../../../data/hg18",
                                         override=False, inMemory=True)
            FR3.load("../../../ErezPaperData/hg18/GM-HindIII-hg18"\
                     "_refined.frag")

            FR2 = fragmentHiC.HiCdataset("bla", "../../../data/hg18",
                                         override=False, inMemory=True)
            FR2.load("../../../ErezPaperData/hg18/GM-NcoI-hg18_refined.frag")

            #----------------------cross-check code----------------
#            FR = fragmentHiC.HiCdataset("bla", "../../../data/hg18",
#                                        override=False, inMemory=True)
#            FR.load("../../../ErezPaperData/hg18/GM-NcoI-hg18_refined.frag")
#
#            FR3 = fragmentHiC.HiCdataset("bla", "../../../data/hg18",
#                                         override=False, inMemory=True)
#            FR3.load("../../../ErezPaperData/hg18/GM-NcoI-hg18_refined.frag")
#
#            FR2 = fragmentHiC.HiCdataset("bla", "../../../data/hg18",
#                                         override=False, inMemory=True)
#            FR2.load("../../../ErezPaperData/hg18/G"\
#                    "M-HindIII-hg18_refined.frag")
            #-------end cross-check code ---------------------------------
            #--------Filter only trans DS reads-----------------
            FR.maskFilter(FR.DS * (FR.chrms1 != FR.chrms2))
            FR2.maskFilter(FR2.DS * (FR2.chrms1 != FR2.chrms2))
            FR3.maskFilter(FR3.DS * (FR3.chrms1 != FR3.chrms2))

            #Now create two halfs of one dataset and down-sample second dataset
            #----------------------standard version code--------
            # fraction of the NcoI reads to keep so FR2 ends up roughly the
            # same size as each HindIII half.
            fraction = 0.5 * len(FR.DS) / float(len(FR2.DS))

            # Split HindIII randomly into two halves (mask1 / mask3) and
            # down-sample NcoI with mask2.
            rarray = numpy.random.random(len(FR.DS))
            mask1 = rarray < 0.5
            mask3 = rarray >= 0.5
            mask2 = numpy.random.random(len(FR2.DS)) < fraction

            #-------------------- cross-check code---------
            #fraction = 0.5 * len(FR2.DS) / float(len(FR.DS))

            #rarray = numpy.random.random(len(FR.DS))
            #mask1 =  rarray  < fraction
            #mask3 = (rarray > fraction) * (rarray < fraction * 2)
            #mask2 =  numpy.random.random(len(FR2.DS)) > 0.5
            #-----------------------------------------

            FR.maskFilter(mask1)
            FR2.maskFilter(mask2)
            FR3.maskFilter(mask3)

            FR.save("../../../tcc/working/cache1")
            FR2.save("../../../tcc/working/cache2")
            FR3.save("../../../tcc/working/cache3")
        else:
            # Reuse the previously saved split/down-sampled datasets.
            FR = fragmentHiC.HiCdataset("bla", "../../../data/hg18",
                                        override=False, inMemory=True)
            FR.load("../../../tcc/working/cache1")

            FR3 = fragmentHiC.HiCdataset("bla", "../../../data/hg18",
                                         override=False, inMemory=True)
            FR3.load("../../../tcc/working/cache3")

            FR2 = fragmentHiC.HiCdataset("bla", "../../../data/hg18",
                                         override=False, inMemory=True)
            FR2.load("../../../tcc/working/cache2")

        # Bin each dataset at every requested resolution.
        for size in sizes:
            FR.saveHeatmap("../../../tcc/working/HindIII_%d.hm" %
                           size, size * 1000000)
            FR2.saveHeatmap("../../../tcc/working/NcoI_%d.hm" %
                            size, size * 1000000)
            FR3.saveHeatmap("../../../tcc/working/control_%d.hm" %
                            size, size * 1000000)

    # p1/p4: raw correlations; p2/p3: corrected correlations;
    # evs: interchromosomal eigenvectors collected at 1 Mb only.
    p1 = []
    p2 = []
    p3 = []
    p4 = []
    evs = []
    for size in sizes:

        BD = binnedDataAnalysis(size * 1000000, "../../../data/hg18")
        BD.simpleLoad("../../../tcc/working/HindIII_%d.hm" % size, "HindIII")
        BD.simpleLoad("../../../tcc/working/NcoI_%d.hm" % size, "NcoI")
        BD.simpleLoad("../../../tcc/working/control_%d.hm" % size, "control")
        BD.removeDiagonal()
        BD.removePoorRegions(cutoff=2)
        BD.removeCis()

        data1 = BD.dataDict["HindIII"]
        data2 = BD.dataDict["NcoI"]
        data3 = BD.dataDict["control"]

        # Correlate only trans bin pairs where both maps have signal.
        mask = (numpy.sum(
            data1, axis=0) > 0) * (numpy.sum(data2, axis=0) > 0)
        validMask = mask[:, None] * mask[None, :]
        transmask = BD.chromosomeIndex[:, None] != BD.chromosomeIndex[None, :]
        cormask = transmask * validMask

        # c1: raw HindIII vs NcoI; c4: raw HindIII vs split-half control.
        c1 = scipy.stats.spearmanr(data1[cormask], data2[cormask])[0]
        c4 = scipy.stats.spearmanr(data1[cormask], data3[cormask])[0]

        if size == 1:
            evs.append(BD.interchromosomalValues("HindIII"))
            evs.append(BD.interchromosomalValues("NcoI"))
            evs.append(BD.interchromosomalValues("control"))
        p4.append(c4)
        p1.append(c1)

        print "size\t%d\traw:" % size, c1,
        BD.removeZeros()
        BD.fakeCis()  # does iterative correction as well
        BD.restoreZeros(value=0)

        # Re-correlate after correction (same mask as before).
        data1 = BD.dataDict["HindIII"]
        data2 = BD.dataDict["NcoI"]
        data3 = BD.dataDict["control"]
        c2 = scipy.stats.spearmanr(data1[cormask], data2[cormask])[0]
        c3 = scipy.stats.spearmanr(data1[cormask], data3[cormask])[0]

        if size == 1:
            evs.append(BD.interchromosomalValues("HindIII"))
            evs.append(BD.interchromosomalValues("NcoI"))
            evs.append(BD.interchromosomalValues("control"))
            print evs

        p3.append(c3)
        p2.append(c2)

        print "\tcorrected:", c2, "\tcontrol", c3

    plt.plot(sizes, p1, label="Raw data, between enzymes")
    plt.plot(sizes, p2, label="Iteratively corrected, between")
    plt.plot(sizes, p3, label="IC, within")
    plt.xlabel("Bin size, MB")
    plt.xticks(range(1, 11))
    plt.ylabel("Spearman correlation coefficient")
    plt.legend()
    niceShow()

    setExceptionHook()
    # NOTE(review): apparently a deliberate crash — with the exception hook
    # installed above, dividing by zero drops into a debugger here. Confirm
    # before "fixing".
    0 / 0
Ejemplo n.º 12
0
from itertools import repeat
import argparse
import tempfile

# Sample name is the single command-line argument.
parser = argparse.ArgumentParser()
parser.add_argument("sample")
args = parser.parse_args()
sample = args.sample

# Mouse mm9 genome, digested with DpnII.
genome_db = genome.Genome('../genomes/mm9/fasta', readChrms=['#', 'X'])
genome_db.setEnzyme('DpnII')

path = '/exports/eddie/scratch/s1529682/processed/'

# In-memory dataset ('bla' is a placeholder filename) holding the merged,
# filtered fragment data for this sample.
fragments = fragmentHiC.HiCdataset(filename='bla',
                                   inMemory=True,
                                   genome=genome_db,
                                   maximumMoleculeLength=750,
                                   mode='w',
                                   tmpFolder=tempfile.gettempdir())
fragments.load(path + 'merged/' + sample + '_fragment_dataset_filtered.hdf5')

# Destination for the Juicebox-format text export.
juicepath = path + 'merged/for_juicebox/' + sample + '_for_juicebox.txt'

with open(juicepath, 'w') as f:
    for str1, str2, chr1, chr2, pos1, pos2, dist1, dist2, mapq in zip(
            fragments.h5dict['strands1'], fragments.h5dict['strands2'],
            fragments.h5dict['chrms1'], fragments.h5dict['chrms2'],
            fragments.h5dict['cuts1'], fragments.h5dict['cuts2'],
            fragments._getVector('dists1'), fragments._getVector('dists2'),
            repeat(32)):
        chr1 = 'chr' + str(chr1 + 1) if chr1 < 19 else 'chrX'
        chr2 = 'chr' + str(chr2 + 1) if chr2 < 19 else 'chrX'
Ejemplo n.º 13
0
# Sample name is the single command-line argument.
parser = argparse.ArgumentParser()
parser.add_argument("sample")
args = parser.parse_args()
sample = args.sample

# Mouse mm9 genome, digested with DpnII.
genome_db = genome.Genome('../genomes/mm9/fasta', readChrms=['#', 'X'])
genome_db.setEnzyme('DpnII')
path = '/exports/eddie/scratch/s1529682/processed/'

# Every per-chunk fragment dataset belonging to this sample.
files = glob(path + sample + '*fragment_dataset.hdf5')

print(sample)

merged_name = path + 'merged/' + sample + '_fragment_dataset.hdf5'
filtered_name = path + 'merged/' + sample + '_fragment_dataset_filtered.hdf5'
tmp_dir = '/exports/eddie/scratch/s1529682/tmp'

# Merge the chunk datasets into one file on disk.
fragments = fragmentHiC.HiCdataset(
    filename=merged_name,
    genome=genome_db,
    maximumMoleculeLength=750,
    mode='w',
    tmpFolder=tmp_dir)
fragments.merge(files)
print('Merged', sample)

# Reload the merged data into a second dataset and filter it.
fragments = fragmentHiC.HiCdataset(
    filename=filtered_name,
    genome=genome_db,
    maximumMoleculeLength=750,
    mode='w',
    tmpFolder=tmp_dir)
fragments.load(merged_name)
fragments.filterDuplicates(mode='ram')
fragments.filterLarge(10000, 10)  # DpnII
fragments.filterExtreme(cutH=0.0001, cutL=0)
Ejemplo n.º 14
0
def _save_contact_probs(fragments, out_path, normalize):
    """Compute a contact-probability curve and write it as a TSV file.

    `fragments.plotScaling` returns (distances, probabilities); the pair is
    written as two columns named Distance / Probability.
    """
    contact_probs = fragments.plotScaling(normalize=normalize, plot=False)
    pd.DataFrame({
        'Distance': contact_probs[0],
        'Probability': contact_probs[1]
    }).to_csv(out_path, header=1, index=0, sep='\t')


def filtration(chromosome_names, cell_line, path, genome_version, enzyme,
               resolution_list):
    """Parse, filter, and export Hi-C data per chromosome set.

    For each entry of `chromosome_names` (a list of chromosome-name lists;
    an entry with more than one name is treated as a whole-genome run named
    'full'), this parses the mapped reads, removes duplicates, writes
    filtering statistics, saves normalized and raw contact-probability
    curves, and exports .cool heatmaps at every resolution in
    `resolution_list`.
    """
    for chrm_list in chromosome_names:
        genome_db = genome.Genome('/home/magnitov/data/genomes/' +
                                  genome_version,
                                  gapFile='gap.txt',
                                  readChrms=chrm_list,
                                  forceOrder=True)

        # Whole-genome runs use the 'full' suffix; single-chromosome runs
        # use the chromosome name.  (The original duplicated both dataset
        # constructions; the file names are identical with this suffix.)
        suffix = 'full' if len(chrm_list) > 1 else chrm_list[0]

        # Read mapped reads
        fragments = fragmentHiC.HiCdataset(
            filename=path + 'filtered_maps/' + cell_line +
            '/fragment_dataset_' + suffix + '.hdf5',
            genome=genome_db,
            enzymeName=enzyme,
            mode='w')
        fragments.parseInputData(dictLike=path + 'maps/' + cell_line +
                                 '/mapped_reads_' + suffix + '.hdf5')

        # Apply filters
        fragments.filterDuplicates()

        # Save statistics (per-genome stats for whole-genome runs or a
        # single-entry run; per-chromosome stats otherwise)
        if len(chrm_list) > 1 or len(chromosome_names) == 1:
            fragments.writeFilteringStats()
            fragments.printMetadata(saveTo=path + 'processing_stats/' +
                                    cell_line + '/processing_stats_' +
                                    cell_line + '.txt')
        if len(chrm_list) == 1 and len(chromosome_names) > 1:
            fragments.writeFilteringStats()
            fragments.printMetadata(saveTo=path + 'processing_stats/' +
                                    cell_line + '/processing_stats_' +
                                    cell_line + '_' + chrm_list[0] + '.txt')

        # Sort reads and calculate contact probability (both normalized and not)
        fragments._sortData()
        probs_prefix = (path + 'contact_probs/' + cell_line +
                        '/contact_probs_' + cell_line)
        if len(chrm_list) > 1:
            _save_contact_probs(fragments, probs_prefix + '_full_norm.txt',
                                True)
            _save_contact_probs(fragments, probs_prefix + '_full.txt', False)
        if len(chrm_list) == 1:
            _save_contact_probs(fragments,
                                probs_prefix + '_' + chrm_list[0] +
                                '_norm.txt', True)
            _save_contact_probs(fragments,
                                probs_prefix + '_' + chrm_list[0] + '.txt',
                                False)

        # Save into .cool files at every requested resolution
        for resolution in resolution_list:
            fragments.saveCooler(filename=path + 'filtered_maps/' + cell_line +
                                 '/heatmap-' + chrm_list[0] + '-' +
                                 str(resolution / 1000) + 'K.cool',
                                 resolution=resolution)
Ejemplo n.º 15
0
def Generate_one_chromosome_file(chrNumb):

    o_file = base_out_folder + "fitHiC/i_files/" + base_filename + ".fithic"
    fragment_dataset_filename = base_out_folder + "fitHiC/i_files/" + 'fragment_dataset_' + base_filename + '_chr' + str(
        chrNumb) + '.hdf5'

    if not os.path.isfile(fragment_dataset_filename):
        fragments = fragmentHiC.HiCdataset(filename=fragment_dataset_filename,
                                           genome=genome_db,
                                           maximumMoleculeLength=500,
                                           mode='w')
        fragments.parseInputData(dictLike=maped_reads_filepath, removeSS=True)

        fragments.filterRsiteStart(offset=5)
        fragments.filterDuplicates()
        fragments.filterLarge()
        fragments.filterExtreme(cutH=0.005, cutL=0)
    else:
        fragments = fragmentHiC.HiCdataset(filename=fragment_dataset_filename,
                                           genome=genome_db,
                                           maximumMoleculeLength=500,
                                           mode='a')

    print "Filtering, leaving only chr ", genome_db.idx2label[chrNumb]
    #leave only frgaments from the chrNumb (nterchromosomal)
    fragments.maskFilter((fragments.chrms1 == chrNumb))
    fragments.maskFilter((fragments.chrms2 == chrNumb))

    print "Seting RE"
    #Setting info about restriction enzyme, calculating absolute indexes
    fragments.setRfragAbsIdxs('HindIII')

    numBins = len(fragments.genome.rsites[chrNumb])
    print "Total numBins (RSites) on chr ", genome_db.idx2label[
        chrNumb], " = ", numBins

    rfragAbsIdxs1 = fragments.rfragAbsIdxs1 - fragments.genome.chrmStartsRfragCont[
        chrNumb]
    rfragAbsIdxs2 = fragments.rfragAbsIdxs2 - fragments.genome.chrmStartsRfragCont[
        chrNumb]
    print "Total number of fragments = ", len(rfragAbsIdxs1)

    if len(rfragAbsIdxs1) != len(rfragAbsIdxs2):
        print "rfragAbsIdxs1=", rfragAbsIdxs1
        print "rfragAbsIdxs2=", rfragAbsIdxs2
        print "len(rfragAbsIdxs1)=", len(rfragAbsIdxs1)
        print "len(rfragAbsIdxs2)=", len(rfragAbsIdxs2)
        raise "FRAGMENT INDEXING ERROR 1!!!"
    if (min(rfragAbsIdxs1) < 0 or min(rfragAbsIdxs2) < 0):
        print "min(rfragAbsIdxs1)=", min(rfragAbsIdxs1)
        print "min(rfragAbsIdxs2)=", min(rfragAbsIdxs2)
        raise "FRAGMENT INDEXING ERROR 2!!!"
    if (max(rfragAbsIdxs1) > numBins - 1 or max(rfragAbsIdxs2) > numBins - 1):
        print "max (rfragAbsIdxs1)=", max(rfragAbsIdxs1)
        print "max (rfragAbsIdxs2)=", max(rfragAbsIdxs2)
        print "numBins=", numBins
        raise "FRAGMENT INDEXING ERROR 3!!!"

    print "FRAGMENT INDEXING - passed"

    #Creating label array
    label = np.array(rfragAbsIdxs1, dtype='int64')
    label *= numBins
    label += rfragAbsIdxs2

    #Creating count array
    counts = np.bincount(label, minlength=numBins**2)
    counts.shape = (numBins, numBins)

    #Counting
    for i in xrange(len(counts)):
        counts[i, i:] += counts[i:, i]
        counts[i:, i] = counts[i, i:]

    #Filling diagonal reads

    #diag = np.diag(counts)
    #fillDiagonal(counts, diag/2)
    fillDiagonal(counts, 0)

    BinsToDescribe = np.zeros(
        numBins
    )  # Info about which RSites should be described in .fragments file later

    #	f_out = gzip.open (o_file+"_chr"+str(chrNumb)+".contacts.zip","w")
    f_out = open(o_file + "_chr" + str(chrNumb) + ".contacts.zip", "w")
    print "Writing file ", o_file + "_chr" + str(chrNumb) + ".contacts.zip"
    for i in range(numBins - 1):
        for j in range(i + 1, numBins):
            if (counts[i, j] != 0):
                s = ""
                s += str(chrNumb) + "\t"
                s += str(fragments.genome.rfragMids[chrNumb][i]) + "\t"
                s += str(chrNumb) + "\t"
                s += str(fragments.genome.rfragMids[chrNumb][j]) + "\t"
                s += str(counts[i, j]) + "\n"
                f_out.write(s)
                BinsToDescribe[i] = 1
                BinsToDescribe[j] = 1

    f_out.close()

    #	f_out = gzip.open (o_file+"_chr"+str(chrNumb)+".fragments.zip","w")
    f_out = open(o_file + "_chr" + str(chrNumb) + ".fragments.zip", "w")
    print "Writing file ", o_file + "_chr" + str(chrNumb) + ".fragments.zip"

    for ind, val in enumerate(BinsToDescribe):
        if (val == 1):
            s = ""
            s += str(chrNumb) + "\t0\t"
            s += str(fragments.genome.rfragMids[chrNumb][ind]) + "\t"
            s += str(sum(counts[ind])) + "\t"
            s += "1\n"
            f_out.write(s)
    f_out.close()
Ejemplo n.º 16
0
def Generate_whole_genome_chromosome_file(
        mappability=1):  #TODO - use mappability, now it's always =1
    fragment_dataset_filename = base_folder + 'fragment_dataset_' + base_filename + '.hdf5'
    o_file = base_out_folder + "fitHiC/i_files/" + base_filename

    if not os.path.isfile(fragment_dataset_filename):
        fragments = fragmentHiC.HiCdataset(filename=fragment_dataset_filename,
                                           genome=genome_db,
                                           maximumMoleculeLength=500,
                                           mode='w')
        fragments.parseInputData(dictLike=maped_reads_filepath, removeSS=True)

        fragments.filterRsiteStart(offset=5)
        fragments.filterDuplicates()
        fragments.filterLarge()
        fragments.filterExtreme(cutH=0.005, cutL=0)
    else:
        print "fragment_dataset " + fragment_dataset_filename + " found.\n IMPORTANT: considering all requerd filters have been done"
        fragments = fragmentHiC.HiCdataset(filename=fragment_dataset_filename,
                                           genome=genome_db,
                                           maximumMoleculeLength=500,
                                           mode='a')

    print "Seting RE"
    #Setting info about restriction enzyme, calculating absolute indexes
    fragments.setRfragAbsIdxs('HindIII')

    rfragAbsIdxs1 = fragments.rfragAbsIdxs1
    rfragAbsIdxs2 = fragments.rfragAbsIdxs2
    print "Total number of fragments = ", len(rfragAbsIdxs1)

    if len(rfragAbsIdxs1) != len(rfragAbsIdxs2):
        print "rfragAbsIdxs1=", rfragAbsIdxs1
        print "rfragAbsIdxs2=", rfragAbsIdxs2
        print "len(rfragAbsIdxs1)=", len(rfragAbsIdxs1)
        print "len(rfragAbsIdxs2)=", len(rfragAbsIdxs2)
        raise "FRAGMENT INDEXING ERROR 1!!!"
    if (min(rfragAbsIdxs1) < 0 or min(rfragAbsIdxs2) < 0):
        print "min(rfragAbsIdxs1)=", min(rfragAbsIdxs1)
        print "min(rfragAbsIdxs2)=", min(rfragAbsIdxs2)
        raise "FRAGMENT INDEXING ERROR 2!!!"
    if (max(rfragAbsIdxs1) > fragments.genome.chrmEndsRfragCont[-1]
            or max(rfragAbsIdxs2) > fragments.genome.chrmEndsRfragCont[-1]):
        print "max (rfragAbsIdxs1)=", max(rfragAbsIdxs1)
        print "max (rfragAbsIdxs2)=", max(rfragAbsIdxs2)
        print "numBins=", fragments.genome.chrmEndsRfragCont[-1]
        raise "FRAGMENT INDEXING ERROR 3!!!"

    print "FRAGMENT INDEXING - passed"

    print "Initialyzing heatmap"
    max_rsites_number = max([len(i) for i in fragments.genome.rsites])
    heatmap = np.zeros(shape=(fragments.genome.chrmCount, max_rsites_number,
                              max_rsites_number),
                       dtype=np.uint16)
    print "Max numBins (RSites) in chromosome= ", max_rsites_number

    print "Creating chrRsites array"
    RsiteToChr = np.zeros(max_rsites_number * fragments.genome.chrmCount,
                          dtype=np.uint8)
    for i in range(0, fragments.genome.chrmCount):
        RsiteToChr[fragments.genome.chrmStartsRfragCont[i]:fragments.genome.
                   chrmEndsRfragCont[i]] = i

    print "Filling heatmap"
    BinsToDescribe = np.zeros(shape=(fragments.genome.chrmCount,
                                     max_rsites_number),
                              dtype=np.int8)

    l = len(rfragAbsIdxs1)
    for i in xrange(l):
        if (i % (l / 10)) == 0:
            print(i / (l / 10)), "0 %"

        ChrN1 = RsiteToChr[rfragAbsIdxs1[i]]
        ChrN2 = RsiteToChr[rfragAbsIdxs2[i]]
        if (ChrN1 == ChrN2):  #if it is intrachromosomal contact
            rfragAbsIdxs1_onChr = rfragAbsIdxs1[
                i] - fragments.genome.chrmStartsRfragCont[ChrN1]
            rfragAbsIdxs2_onChr = rfragAbsIdxs2[
                i] - fragments.genome.chrmStartsRfragCont[ChrN1]
            BinsToDescribe[ChrN1][rfragAbsIdxs1_onChr] = 1
            BinsToDescribe[ChrN2][rfragAbsIdxs2_onChr] = 1
            heatmap[ChrN1][rfragAbsIdxs1_onChr][rfragAbsIdxs2_onChr] += 1
            if heatmap[ChrN1][rfragAbsIdxs1_onChr][
                    rfragAbsIdxs2_onChr] >= 64000:
                raise "Type int16 used in heatmap is not compatible with N of contact >64000"

    f_out = open(o_file + ".contacts", "w")

    print "Total number of non-empty bins (rfrags)=", np.sum(BinsToDescribe)

    print "Writing file ", o_file + ".contacts"
    f_out = open(o_file + ".contacts", "w")
    for i in xrange(l):
        if (i % (l / 10)) == 0:
            print(i / (l / 10)), "0 %"

        ChrN1 = RsiteToChr[rfragAbsIdxs1[i]]
        ChrN2 = RsiteToChr[rfragAbsIdxs2[i]]

        if (ChrN1 == ChrN2):  #if it is intrachromosomal contact
            rfragAbsIdxs1_onChr = rfragAbsIdxs1[
                i] - fragments.genome.chrmStartsRfragCont[ChrN1]
            rfragAbsIdxs2_onChr = rfragAbsIdxs2[
                i] - fragments.genome.chrmStartsRfragCont[ChrN1]
            if (heatmap[ChrN1][rfragAbsIdxs1_onChr][rfragAbsIdxs2_onChr] !=
                    -1):
                s = ""
                s += str(i) + "\t"
                s += str(fragments.genome.rfragMids[ChrN1]
                         [rfragAbsIdxs1_onChr]) + "\t"
                s += str(i) + "\t"
                s += str(fragments.genome.rfragMids[ChrN2]
                         [rfragAbsIdxs2_onChr]) + "\t"
                s += str(heatmap[ChrN1, rfragAbsIdxs1_onChr,
                                 rfragAbsIdxs2_onChr]) + "\n"
                heatmap[ChrN1][rfragAbsIdxs1_onChr][rfragAbsIdxs2_onChr] = -1
                f_out.write(s)

    f_out.close()

    f_out = open(o_file + ".fragments", "w")

    print "Writing file ", o_file + ".fragments"
    for i in range(fragments.genome.chrmCount):
        for j in xrange(max_rsites_number):
            if (BinsToDescribe[i][j] == 1):
                s = ""
                chrNumb = i
                s += str(chrNumb) + "\t0\t"
                s += str(fragments.genome.rfragMids[i][j]) + "\t"
                s += str(sum(heatmap[i][j])) + "\t"
                s += "1\n"
                f_out.write(s)
    f_out.close()