def step2(hiclib_path, sraid, res=1000000):
    ''' 2. Filter the dataset at the restriction fragment level.
    http://mirnylab.bitbucket.org/hiclib/tutorial/02_fragment_filtering.html
    '''
    from mirnylib import genome
    from hiclib import fragmentHiC

    # Create a HiCdataset object.
    genome_db = genome.Genome(hiclib_path + '/fasta/hg19', readChrms=['#', 'X'])
    fragments = fragmentHiC.HiCdataset(filename=sraid + '_fragment_dataset.hdf5',
                                       genome=genome_db,
                                       maximumMoleculeLength=500,
                                       enzymeName='HindIII',
                                       mode='w')

    # Load the parsed reads into the HiCdataset. The dangling-end filter is
    # applied at this stage, with maximumMoleculeLength specified at the
    # initiation of the object.
    fragments.parseInputData(dictLike=sraid + '_mapped_reads.hdf5')

    fragments.filterRsiteStart(offset=5)
    fragments.filterDuplicates()
    fragments.filterLarge()
    if sraid in ["SRR071231", "SRR071232"]:  ## set to 0.1% for TCC
        fragments.filterExtreme(cutH=0.001, cutL=0)
    else:  ## default for Hi-C is 0.5%
        fragments.filterExtreme(cutH=0.005, cutL=0)

    # fragments.saveFragments()
    fragments.saveHeatmap(sraid + '_map-res%sk.hdf5' % (res / 1000),
                          resolution=res)
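# A minimal sketch of inspecting what step2 wrote; the file name below is a
# placeholder following step2's output pattern for res=1000000, and the
# 'resolution'/'heatmap' keys are the ones hiclib's binned format uses
# (step3 below reads 'resolution' back the same way).
from mirnylib import h5dict

raw = h5dict.h5dict('SRR071231_map-res1000k.hdf5', mode='r')
print raw['resolution']     # bin size in bp
print raw['heatmap'].shape  # genome-wide binned contact matrix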
from mirnylib import genome

allGenomes = {}  # module-level cache so each genome is initialized only once


def getGenome(name):
    if name in allGenomes:
        return allGenomes[name]
    if name == "hg19":
        genome_db = genome.Genome("../data/hg19")
    elif name == "hg18":
        genome_db = genome.Genome("../data/hg18")
    elif name == "mm9":
        genome_db = genome.Genome("../data/mm9")
    elif name == "mm10":
        genome_db = genome.Genome("../data/mm10")
        # You can also use genomes with only numbered and X chromosomes:
        # genome_db = genome.Genome("../data/hg19", readChrms=["#", "X"])
    elif name == "dm3":
        # Drosophila (example of specifying exact chromosomal order)
        genome_db = genome.Genome("../data/dm3",
                                  readChrms=["2L", "2R", "3L", "3R", "4", "X",
                                             "2LHet", "2RHet", "3LHet", "3RHet",
                                             "XHet", "YHet", "U", "Uextra", "M"],
                                  forceOrder=True)
    elif name == "cb10":
        genome_db = genome.Genome("../data/cb10",
                                  readChrms=["I", "II", "III", "IV", "V", "X", "M"],
                                  forceOrder=True)
    else:
        raise ValueError(
            "Genome {0} not defined. Edit defineGenome.py and define it".format(name))
    allGenomes[name] = genome_db
    return genome_db
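# Usage sketch (assumes the genome FASTA folders exist under ../data/ as in
# the paths above): repeated calls for the same name hit the allGenomes cache.
genome_db = getGenome("mm9")
same_db = getGenome("mm9")
assert genome_db is same_db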
def step4(hiclib_path, sraid, res=1000000):
    ''' 4. Eigenvector decomposition.
    /examples/iterativeCorrectionEigenvectorExpansion/eigenvectorAnalysis.py
    '''
    import matplotlib.pyplot as plt
    import numpy as np
    from mirnylib import genome
    from mirnylib import h5dict
    from mirnylib import plotting
    from hiclib import binnedData

    genome_db = genome.Genome(hiclib_path + '/fasta/hg19', readChrms=['#', 'X'])

    # Read resolution from the dataset.
    raw_heatmap = h5dict.h5dict(sraid + '_map-res%sk.hdf5' % (res / 1000), mode='r')
    resolution = int(raw_heatmap['resolution'])

    # Create a binnedData object, load the data.
    BD = binnedData.binnedData(resolution, genome_db)
    BD.simpleLoad(sraid + '_map-res%sk.hdf5' % (res / 1000), 'DataName')

    # Do the eigen decomposition.
    BD.removeDiagonal()
    BD.removeBySequencedCount(0.5)
    BD.removeCis()
    BD.truncTrans(high=0.0005)
    BD.removePoorRegions(cutoff=1)
    BD.fakeCis()
    BD.removeZeros()
    BD.doEig(numPCs=30, force=True)  ## first 30 eigenvectors
    BD.restoreZeros(value=0)

    eig = BD.eigEigenvalueDict['DataName']
    eig_v = BD.EigDict['DataName']

    # Plot the heatmap reconstructed from the eigenvectors.
    plotting.plot_matrix(np.log(np.dot(np.dot(eig_v.T, np.diag(eig)), eig_v)))
    plt.savefig(sraid + '_map-res%sk-eig.pdf' % (res / 1000))
    plt.clf()

    outfile = open(sraid + "_map-res%sk-ic-eig.txt" % (res / 1000), "w")
    for i in xrange(len(BD.chromosomeIndex)):
        chro = BD.genome.idx2label[BD.chromosomeIndex[i]]
        posi = BD.positionIndex[i]
        outfile.write("chr%s\t%s\t%s" % (chro, posi, posi + res))
        for eigenvector in eig_v:
            outfile.write("\t%s" % eigenvector[i])
        outfile.write("\n")
    outfile.close()
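# Each row of the text file written above is chrom, start, end followed by one
# column per eigenvector; a minimal sketch for reading the first eigenvector
# back (placeholder file name following step4's output pattern).
import numpy as np

rows = [line.rstrip('\n').split('\t')
        for line in open('SRR071231_map-res1000k-ic-eig.txt')]
eig1 = np.array([float(r[3]) for r in rows])  # columns 0-2 are the bin coords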
import os

from mirnylib import genome, h5dict
from hiclib import mapping


def parse_bams(chromosome_names, cell_line, path, genome_version, enzyme):
    if not os.path.exists(path + 'maps/' + cell_line):
        os.mkdir(path + 'maps/' + cell_line)
    for chrm_list in chromosome_names:
        if len(chrm_list) > 1:
            mapped_reads = h5dict.h5dict(path + 'maps/' + cell_line +
                                         '/mapped_reads_full.hdf5')
        else:
            mapped_reads = h5dict.h5dict(path + 'maps/' + cell_line +
                                         '/mapped_reads_' + chrm_list[0] + '.hdf5')
        genome_db = genome.Genome('/home/magnitov/data/genomes/' + genome_version,
                                  gapFile='gap.txt',
                                  readChrms=chrm_list,
                                  forceOrder=True)
        mapping.parse_sam(
            sam_basename1=path + 'bam/' + cell_line + '/' + cell_line + '_R1.bam',
            sam_basename2=path + 'bam/' + cell_line + '/' + cell_line + '_R2.bam',
            out_dict=mapped_reads,
            genome_db=genome_db,
            enzyme_name=enzyme)
from mirnylib import genome
from hiclib import fragmentHiC

# Create a HiCdataset object.
genome_db = genome.Genome('../fasta/mm9', readChrms=['#', 'X'])
fragments = fragmentHiC.HiCdataset(
    filename='../../data/serov/fragment_dataset_Sp.hdf5',
    genome=genome_db,
    maximumMoleculeLength=500,
    mode='w')

# Load the parsed reads into the HiCdataset. The dangling-end filter is applied
# at this stage, with maximumMoleculeLength specified at the initiation of the
# object.
fragments.parseInputData(dictLike='../../data/serov/mapped_reads_Sp.hdf5')

fragments.filterRsiteStart(offset=5)
fragments.filterDuplicates()
fragments.filterLarge()
fragments.filterExtreme(cutH=0.005, cutL=0)

fragments.saveHeatmap('../../data/serov/heatmap-res-1M_Sp.hdf5',
                      resolution=1000000)
import gzip
import math
import subprocess

from mirnylib import genome
from mirnylib import h5dict
from mirnylib import plotting
from hiclib import binnedData
from hiclib import fragmentHiC
from mirnylib.numutils import fillDiagonal

domain_res = 'fragment'
genome_name = 'mm9'
genome_folder = '/mnt/storage/home/vsfishman/HiC/fasta/'
genome_db = genome.Genome(genome_folder + genome_name, readChrms=['#', 'X'])

base_filename = "Sp_full"
base_folder = '/mnt/storage/home/vsfishman/HiC/data/'
mapped_reads_filepath = base_folder + 'mapped_reads_' + base_filename + '.hdf5'
base_out_folder = "/mnt/storage/home/vsfishman/tmp/HiC_tmp/data/"

B = 5.0   # linear bin width
c = 1.12  # base for the (disabled) geometric binning


def DistancetoBinN(distance):
    # Geometric binning alternative:
    # return math.ceil(math.log(distance / B, c))
    return int(distance / B)
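# A worked comparison of the two binning schemes (values chosen only for
# illustration): the active version uses fixed-width bins of B = 5.0, while
# the commented-out geometric version grows bin width by a factor of c = 1.12
# per bin, so distant contacts land in coarser bins.
for distance in (5.0, 50.0, 500.0):
    linear = int(distance / B)
    geometric = int(math.ceil(math.log(distance / B, c)))
    print("distance %6.1f -> linear bin %3d, geometric bin %2d"
          % (distance, linear, geometric))
# distance    5.0 -> linear bin   1, geometric bin  0
# distance   50.0 -> linear bin  10, geometric bin 21
# distance  500.0 -> linear bin 100, geometric bin 41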
import sys
import os

from hiclib import mapping, fragmentHiC
from mirnylib import h5dict, genome

fasta_dir, re_name, out_fname, in_dir = sys.argv[1:5]
in_prefices = sys.argv[5:]
basedir = os.path.split(os.path.abspath(out_fname))[0]

mapped_reads = []
for prefix in in_prefices:
    mapped_reads.append(h5dict.h5dict('%s/%s.hdf5' % (basedir, prefix)))

genome_db = genome.Genome(fasta_dir, readChrms=['#', 'X'],
                          chrmFileTemplate="%s.fa")

for i, name in enumerate(mapped_reads):
    mapping.parse_sam(sam_basename1="%s/%s_1.bam" % (in_dir, in_prefices[i]),
                      sam_basename2="%s/%s_2.bam" % (in_dir, in_prefices[i]),
                      out_dict=name,
                      genome_db=genome_db,
                      enzyme_name=re_name)

for i, name in enumerate(mapped_reads):
    fragments = fragmentHiC.HiCdataset(filename='temp',
                                       genome=genome_db,
                                       maximumMoleculeLength=500,
                                       mode='w',
                                       enzymeName=re_name)
#!/usr/bin/env python

import matplotlib.pyplot as plt
import numpy as np

from mirnylib import genome
from mirnylib import h5dict
from mirnylib import plotting
from hiclib import binnedData

genome_db = genome.Genome('../Ref/hg19', readChrms=['#', 'X'])

raw_heatmap = h5dict.h5dict('../2_filtering_reads/heatmap-res-1M.hdf5', mode='r')
resolution = int(raw_heatmap['resolution'])

BD = binnedData.binnedData(resolution, genome_db)
BD.simpleLoad('../2_filtering_reads/heatmap-res-1M.hdf5', 'Rao2014_10M')

#BD.removeDiagonal()
BD.removeBySequencedCount(0.5)
BD.removePoorRegions(cutoff=1)
BD.truncTrans(high=0.0005)
BD.iterativeCorrectWithoutSS()
BD.export('Rao2014_10M', './IC-heatmap-res-1M.hdf5')

fig = plt.figure()
plotting.plot_matrix(np.log(BD.dataDict['Rao2014_10M'] + 1.0))
fig.savefig('./heatmap.pdf')
def step3(hiclib_path, sraid, res=1000000):
    ''' 3. Filter and iteratively correct heatmaps.
    http://mirnylab.bitbucket.org/hiclib/tutorial/03_heatmap_processing.html
    '''
    import matplotlib.pyplot as plt
    import numpy as np
    from mirnylib import genome
    from mirnylib import h5dict
    from mirnylib import plotting
    from hiclib import binnedData

    genome_db = genome.Genome(hiclib_path + '/fasta/hg19', readChrms=['#', 'X'])

    # Read resolution from the dataset.
    raw_heatmap = h5dict.h5dict(sraid + '_map-res%sk.hdf5' % (res / 1000), mode='r')
    resolution = int(raw_heatmap['resolution'])

    # Create a binnedData object, load the data.
    BD = binnedData.binnedData(resolution, genome_db)
    BD.simpleLoad(sraid + '_map-res%sk.hdf5' % (res / 1000), 'DataName')

    # Plot the raw heatmap.
    plotting.plot_matrix(np.log(BD.dataDict['DataName']))
    plt.savefig(sraid + '_map-res%sk.pdf' % (res / 1000))
    plt.clf()

    # Remove the contacts between loci located within the same bin.
    BD.removeDiagonal()
    # Remove bins with less than half of a bin sequenced.
    BD.removeBySequencedCount(0.5)
    # Remove 1% of regions with the lowest coverage.
    BD.removePoorRegions(cutoff=1)
    # Truncate the top 0.05% of interchromosomal counts (possibly PCR blowouts).
    BD.truncTrans(high=0.0005)
    # Perform iterative correction.
    BD.iterativeCorrectWithoutSS()

    # Save the iteratively corrected heatmap.
    BD.export('DataName', sraid + '_map-res%sk-ic.hdf5' % (res / 1000))

    # Plot the corrected heatmap.
    plotting.plot_matrix(np.log(BD.dataDict['DataName']))
    plt.savefig(sraid + '_map-res%sk-ic.pdf' % (res / 1000))
    plt.clf()

    # Save per-bin biases.
    outfile = open(sraid + "_map-res%sk-ic-bias.txt" % (res / 1000), "w")
    for i in xrange(len(BD.chromosomeIndex)):
        chro = BD.genome.idx2label[BD.chromosomeIndex[i]]
        posi = BD.positionIndex[i]
        outfile.write("chr%s\t%s\t%s" % (chro, posi, posi + res))
        outfile.write("\t%s" % BD.biasDict['DataName'][i])
        outfile.write("\n")
    outfile.close()
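# The bias file written above has one row per bin: chrom, start, end, bias.
# A minimal sketch for loading it back into a dict (placeholder file name
# following step3's output pattern).
biases = {}
for line in open('SRR071231_map-res1000k-ic-bias.txt'):
    chrom, start, end, bias = line.rstrip('\n').split('\t')
    biases[(chrom, int(start))] = float(bias)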
import logging
import argparse

from mirnylib import genome, h5dict
from hiclib import mapping, fragmentHiC

logging.basicConfig(level=logging.DEBUG)

parser = argparse.ArgumentParser()
parser.add_argument("basename")
parser.add_argument("chunkNumber")
args = parser.parse_args()
basename = args.basename
chunk = args.chunkNumber
print(basename)

# B. Parse the mapped sequences into a Python data structure,
#    assign the ultra-sonic fragments to restriction fragments.
reads_file = ('/exports/eddie/scratch/s1529682/processed/' + basename + '_' +
              chunk + '_mapped_reads.hdf5')
fragments_file = ('/exports/eddie/scratch/s1529682/processed/' + basename + '_' +
                  chunk + '_fragment_dataset.hdf5')

mapped_reads = h5dict.h5dict(reads_file)
genome_db = genome.Genome('../genomes/mm9/fasta', readChrms=['#', 'X'])


def func():
    mapping.parse_sam(
        sam_basename1='/exports/eddie/scratch/s1529682/bams/' + basename +
                      '_fixed_1.fq.gz' + chunk,
        sam_basename2='/exports/eddie/scratch/s1529682/bams/' + basename +
                      '_fixed_2.fq.gz' + chunk,
        out_dict=mapped_reads,
        genome_db=genome_db,
        enzyme_name='DpnII')


fragments = fragmentHiC.HiCdataset(
    filename=fragments_file,
    genome=genome_db,
    maximumMoleculeLength=700,
    mode='w')

# Load the parsed reads into the HiCdataset. The dangling-end filter is applied
# at this stage, with maximumMoleculeLength specified at the initiation of the
# object.
inFastqDir = "/mnt/storage/home/vsfishman/tmp/HiC_polipedium/MySeq/Control/K1/sample/" # for mode="fastq" only #sidePrefixes = ("side1", "side2") # a version for naming ....side1.fastq.gz sidePrefixes = ( "R1_001", "R2_001" ) # a prefix preceeding .fastq.gz, which will be used to distinguish side 1 and side 2 # If your files are named "run32167_something_side1_somethingElse.fastq.gz", then "side1_somethingElse" should be the prefix. threads = 10 tmpDir = "/mnt/storage/home/vsfishman/tmp/HiC_tmp/3/" # this will contain up to 3X the size of the largest input .sra file (256GB is enough for (Rao, 2014), but 128 is not) # Make sure your system drive (where /tmp usually is) has enough space. If it is a small SSD, it may not. # Also, there is a lot of IO through the tmpDir. Put it on a local drive, not on a network drive, if you can. genomeName = "pv11_scaffolds.v2" genome_db = genome.Genome( "/mnt/storage/home/vsfishman/tmp/HiC_polipedium/genome/pv11_scaffolds.v2.fasta", readChrms=[], chrmFileTemplate="N%s.fa") bowtiePath = "/mnt/storage/home/vsfishman/HiC/bin/bowtie2/bowtie2" bowtieIndex = "/mnt/storage/home/vsfishman/HiC/fasta/GalGal5/GCF_000002315.4_Gallus_gallus-5.0_assembly_structure/Primary_Assembly/galGal5_all_contigs.filtered" bowtieFlags = "--very-sensitive" "IDs from GEO (SRR numbers)" #GEOids = list(range(1665087,1665096)) # Set this for for mapping .sra files # You can do it like this: # GEOids = range(1658523,1658540) + [398318, 398920,398921] #taken from an actual study seqSkipStart = 0 # skip first 2 bp of the read, if you want minMapLen = 18 # start mapping at this length # This will adjust iterative mapping automatically
def process():
    global options
    global args
    global pp

    if (options.verbose):
        print >> sys.stdout, "*** START processing"

    fig = plt.figure()
    pp = PdfPages(options.outputDir + options.experiment + '.pdf')

    logging.basicConfig(level=logging.DEBUG)

    if (options.verbose):
        print >> sys.stdout, "** Create directories"
    if not os.path.exists(options.tmpDir):
        os.mkdir(options.tmpDir)
    if not os.path.exists(options.outputDir):
        os.mkdir(options.outputDir)

    if (options.verbose):
        print >> sys.stdout, "** Create data objects"
    mapped_reads = h5dict.h5dict(options.outputDir + options.experiment +
                                 '-mapped_reads.hdf5')
    genome_db = genome.Genome(options.genome,
                              gapFile=options.gapFile,
                              readChrms=['#', 'X', 'Y'])

    bams = []
    if (options.inputFormat != 'bam'):
        bams = mapFiles()
    else:
        bams = args[0:]

    if (options.verbose):
        print >> sys.stdout, "** Collect mapped reads"
    collectMappedReads(bams[0], bams[1], mapped_reads, genome_db)

    if (options.verbose):
        print >> sys.stdout, "** Filter fragments"
    filterFragments(genome_db)

    if (options.verbose):
        print >> sys.stdout, "** Iterative filtering of fragments"
    iterativeFiltering(genome_db, '-1M.hdf5')
    iterativeFiltering(genome_db, '-200k.hdf5')

    # plotting
    correctedScalingPlot(200000, options.outputDir + options.experiment +
                         '-200k.hdf5', options.experiment, genome_db)
    doArmPlot(1000000, options.outputDir + options.experiment + '-1M.hdf5',
              options.experiment, genome_db)

    if (options.verbose):
        print >> sys.stdout, "*** FINISHED processing"
    pp.close()
if os.path.exists(filename):
    os.remove(filename)

genomeName = "mm10"
threads = 10
bowtiePath = "../bin/bowtie2/bowtie2"
if not os.path.exists(bowtiePath):
    raise IOError("bowtie2 executable not found at {0}".format(bowtiePath))
fastqDir = "fastq"
bowtieIndex = "../bin/bowtie2/index/{0}".format(genomeName)
tmpDir = "/tmp"
samFolder = "sams-{0}".format(genomeName)
savePath = "mapped-{0}".format(genomeName)

# Specify location of the genome files here
genome_db = genome.Genome('../data/{0}'.format(genomeName), readChrms=["#", "X"])

if not os.path.exists(samFolder):
    os.mkdir(samFolder)
if not os.path.exists(savePath):
    os.mkdir(savePath)


def calculateStep(length, minlen, approxStep=10, maxSteps=4):
    """returns minimum length and step based on the length
    of sequence and proposed minimum length"""
    actualDif = length - minlen
    if actualDif < approxStep * 0.6:
        return length, 100
    # tail of a helper that turns a chromosome-sizes file into cumulative bin
    # offsets; chr_size_file, bin_size and chr_size_list come from the
    # enclosing function
    max_length = 0
    cur_sum = 0
    chr_size_list.append(cur_sum)
    for line in chr_size_file:
        line = line[:-1]
        pair = line.split('\t')
        size = int(pair[1]) / bin_size + 1  # Python 2 integer division
        cur_sum = cur_sum + size
        chr_size_list.append(cur_sum)
        max_length = max(max_length, size)
    return max_length


# initialize the file and retrieve the matrix from the file
genome_db = genome.Genome(
    '/net/noble/vol1/data/reference_genomes/mm9/chromosomes',
    readChrms=['#', 'X'])

# Read resolution from the dataset.
raw_heatmap = h5dict.h5dict('~/2016ACCOST/data/1000000.either.hdf5', mode='r')
resolution = int(raw_heatmap['resolution'])

data = binnedData.binnedData(resolution, genome_db)
data.simpleLoad('~/2016ACCOST/data/1000000.either.hdf5', 'matrix')
matrix = data.dataDict['matrix']

# find the rows that are entirely zeros
zero_list = set()
for x in range(0, len(matrix)):
    if sum(matrix[x]) == 0:
        zero_list.add(x)
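# chr_size_list holds cumulative bin counts, so mapping a (chromosome index,
# position) pair to a row of the genome-wide matrix is an offset plus a
# division; a hypothetical helper, assuming the populated chr_size_list and
# bin_size from above.
def to_global_bin(chrom_index, position, chr_size_list, bin_size):
    # chr_size_list[i] is the first global bin of chromosome i
    return chr_size_list[chrom_index] + position // bin_size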
# -------------------- Parameter definitions ------------

inFastqDir = "fastq"  # for mode="fastq" only

sidePrefixes = ("side1", "side2")  # a version for naming ...side1.fastq.gz
# sidePrefixes = ("_R1_001", "_R2_001")
# A prefix preceding .fastq.gz, which will be used to distinguish side 1 and side 2.
# If your files are named "run32167_something_side1_somethingElse.fastq.gz",
# then "side1_somethingElse" should be the prefix.

threads = 7

tmpDir = "/tmp"
# This will contain up to 3X the size of the largest input .sra file
# (256 GB is enough for (Rao, 2014), but 128 GB is not).
# Make sure your system drive (where /tmp usually is) has enough space.
# If it is a small SSD, it may not. Also, there is a lot of IO through the
# tmpDir. Put it on a local drive, not on a network drive, if you can.

"Initializing Genome object"
# Drosophila example (specifying exact chromosomal order):
# genome_db = genome.Genome("../data/dm3",
#                           readChrms=["2L", "2R", "3L", "3R", "4", "X",
#                                      "2LHet", "2RHet", "3LHet", "3RHet",
#                                      "XHet", "YHet", "U", "Uextra", "M"],
#                           forceOrder=True)
genome_db = genome.Genome("../data/mm9", readChrms=["#", "X", "Y", "M"],
                          cacheDir="tmpDir")
genomeName = genome_db.folderName
# automatically infer genome name from the folder name;
# the name is used for naming folders ("mapped-hg19", etc.)

bowtiePath = "../bin/bowtie2/bowtie2"
bowtieIndex = "../bin/bowtie2/index/{0}".format(genomeName)
# change this if your index is named differently from the genome
bowtieFlags = "--very-sensitive"

"IDs from GEO (SRR numbers)"
GEOids = [1658652]  # a small file for testing purposes
# Set this for mapping .sra files. You can do it like this:
# GEOids = range(1658523, 1658540) + [398318, 398920, 398921]  # taken from an actual study
""" This scripts is a rip-off of a large mergeDatasets script with certain adjustments. Follow comments along the text. """ from hiclib.fragmentHiC import HiCdataset import os from mirnylib import genome genomeDb = genome.Genome('../data/caul', chrmFileTemplate="%s.fa", readChrms=[]) for expName in os.listdir("caul"): TR = HiCdataset( "bla", genome=genomeDb, inMemory=True, ) # inMemory, as files are probably small (less than hundreds mililon reads) TR.parseInputData("caul/" + expName, removeSS=True) # We discard SS in our pipeline now TR.printMetadata() TR.filterRsiteStart( offset=5 ) # We still do this filter to avoid strange "dangling end-like" molecules TR.filterDuplicates() #TR.save(out_file+".dat") TR.filterLarge( cutlarge=300000, cutsmall=100
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

from lib_matrix_operations import *
import os

from mirnylib import genome
from mirnylib import h5dict
from mirnylib import plotting
from hiclib import binnedData
from hiclib import fragmentHiC
from mirnylib.h5dict import h5dict
# note: this last import shadows the mirnylib.h5dict module imported above
# with the h5dict class itself

genome_name = 'mm9'
#genome_db = genome.Genome('../fasta/' + genome_name, readChrms=['#', 'X'])
#genome_name = 'hg19'
genome_db = genome.Genome('../fasta/' + genome_name, readChrms=['M'])

domain_res = 'fragment'
base_folder = '/mnt/storage/home/vsfishman/HiC/data/'
#base_filename = 'SRR443884_mESC_2'
base_filename = 'Fib_full2_chrM'

#raw = "-raw"
raw = ""

#heatmap_filepath = base_folder + 'heatmap-res-' + str(domain_res / 1000) + 'KB_' + base_filename + '.hdf5' + raw
heatmap_filepath = (base_folder + 'heatmap-res-fragment_KB_' + base_filename +
                    '.hdf5' + raw)
#figure_path = base_folder + base_filename + "_" + str(domain_res / 1000) + 'kb' + raw + '.eps'
figure_path = base_folder + base_filename + '_fragment_kb' + raw + '.eps'
### Reading genome
logging.info("Preparation of genome...")
cmd_bgn_time = time.time()

PATH_ABSOLUTE = os.path.dirname(os.path.realpath(__file__))
fasta_path = os.path.join(PATH_ABSOLUTE, "data/genome/hg19.fa")
# Note that a single file with all chromosomes is preferred.

chrms = [
    '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '1', '20',
    '21', '22', '2', '3', '4', '5', '6', '7', '8', '9', 'M', 'X', 'Y'
]  # Note the chromosomes order

# For other genomes, you might want to use a gap file, e.g. for hg38:
# http://hgdownload.cse.ucsc.edu/goldenpath/hg38/database/gap.txt.gz
# genome_db = genome.Genome(fasta_path, readChrms=chrms, gapFile="gap.hg38.txt")
# For hg19, this variant works perfectly with mirnylib.genome:
genome_db = genome.Genome(fasta_path, readChrms=chrms)

enzymes = ['NlaIII', 'MmeI', 'DpnII', 'MseI']

# Applying restriction enzymes
genome_db.setEnzyme('MmeI')

strands = [[] for i in range(len(genome_db.seqs))]
true_poss = [[] for i in range(len(genome_db.seqs))]

for i in range(len(genome_db.seqs)):
    s = genome_db.seqs[i]
    for pos in genome_db.rsites[i]:
        # Check that the found position is not located at the end of the chromosome:
        if pos + 16 > len(s) or pos + 22 > len(s) or pos - 28 < 0 or pos - 22 < 0:
            strand = -1
            true_pos = -1
print "resolution defined by heatmap: ", domain_res return domain_res domain_res = get_domain_res(heatmap_filepath) print "Domain resolution = " + str(domain_res) base_filename = heatmap_filepath.split("/")[-1].replace("-", "_") #Define files and directories base_folder = '/mnt/storage/home/vsfishman/HiC/data/chick/DixonDomains' HMM_file_path = '/mnt/storage/home/vsfishman/HiC/DI/domaincall_software/' DI_from_matrix_file_path = "/mnt/storage/home/vsfishman/HiC/tutorial_Fishman/DI_from_matrix_minja.pl" genome_db = genome.Genome( "/mnt/storage/home/vsfishman/HiC/fasta/GalGal5/GCF_000002315.4_Gallus_gallus-5.0_assembly_structure/Primary_Assembly/assembled_chromosomes/FASTA/", readChrms=[], chrmFileTemplate="%s.fna") genome_fai_filepath = genome_db.genomePath + '/GalGal5ChrmLevel.fai' def executeBashCommand(bashCommand, doPrint=True): print "executing command %s \n" % (bashCommand) p = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE) if doPrint: print p.communicate()[0] return "" else: return p.communicate()[0]
#!/usr/bin/env python

import sys

from hiclib import mapping, fragmentHiC
from mirnylib import h5dict, genome

basedir = sys.argv[1]

genome_db = genome.Genome('%s/Data/Genome/mm9_fasta' % basedir,
                          readChrms=['1'],
                          chrmFileTemplate="%s.fa")

fragments = fragmentHiC.HiCdataset(filename='temp1',
                                   genome=genome_db,
                                   maximumMoleculeLength=500,
                                   mode='w',
                                   enzymeName="NcoI",
                                   inMemory=True)
fragments.load('%s/Data/Timing/hiclib_data.hdf5' % basedir)
fragments.filterDuplicates()
fragments.filterExtreme(cutH=0, cutL=0.005)
fragments.save('%s/Data/Timing/hiclib_data_filt.hdf5' % basedir)
#inFastqDir = "/mnt/storage/home/vsfishman/tmp/Distr/ESC/"  # for mode="fastq" only

# sidePrefixes = ("side1", "side2")  # a version for naming ...side1.fastq.gz
sidePrefixes = ("R1_001", "R2_001")
# A prefix preceding .fastq.gz, which will be used to distinguish side 1 and side 2.
# If your files are named "run32167_something_side1_somethingElse.fastq.gz",
# then "side1_somethingElse" should be the prefix.

threads = 4

tmpDir = "/mnt/storage/home/vsfishman/HiC_temp/"
# This will contain up to 3X the size of the largest input .sra file
# (256 GB is enough for (Rao, 2014), but 128 GB is not).
# Make sure your system drive (where /tmp usually is) has enough space.
# If it is a small SSD, it may not. Also, there is a lot of IO through the
# tmpDir. Put it on a local drive, not on a network drive, if you can.

genomeName = "mm10"
genome_db = genome.Genome(
    "/mnt/storage/home/vsfishman/HiC/fasta/" + genomeName,
    readChrms=["#", "X", "Y"],
)

bowtiePath = "/mnt/storage/home/vsfishman/HiC/bin/bowtie2/bowtie2"
bowtieIndex = ("/mnt/storage/home/vsfishman/HiC/bin/bowtie2/index" +
               "/{0}".format(genomeName))
# change this if your index is named differently from the genome
bowtieFlags = "--very-sensitive"

"IDs from GEO (SRR numbers)"
#GEOids = list(range(1665087, 1665096))
#GEOids = [400251, 400252, 400253, 400254, 400255, 400256,
#          443883, 443884, 443885, 443886, 443887, 443888]  # TODO 443883
#GEOids = [400253, 443884, 443885, 443886, 443887, 443888]
sra_folder = "/mnt/storage/home/vsfishman/tmp/Distr/ESC/newESC/"
#GEOids = [f.split("SRR")[-1].split(".")[0] for f in os.listdir(sra_folder) if f.endswith(".sra")]
print str(now)
print "\n"

import matplotlib.pyplot as plt
import numpy as np
import os
import subprocess

from mirnylib import genome
from mirnylib import h5dict
from mirnylib import plotting
from hiclib import binnedData
from hiclib import fragmentHiC

genome_name = 'mm9'
genome_db = genome.Genome('../../fasta/' + genome_name, readChrms=['#', 'X'])

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--res")
args = parser.parse_args()
domain_res = args.res

if (domain_res is None) or (domain_res == ""):
    domain_res = 100000
domain_res = int(domain_res)

print "Domain resolution = " + str(domain_res)
def step1(hiclib_path,  ## the path of the hiclib folder on the machine
          dataset='Kalhor2012NB',
          sraid='SRR071231',
          readlen=40):  ## each read has length 40
    ''' 1. Map reads to the genome
    http://mirnylab.bitbucket.org/hiclib/tutorial/01_iterative_mapping.html
    '''
    ## Adopted from the hiclib tutorial
    import os
    import logging
    from hiclib import mapping
    from mirnylib import h5dict, genome

    logging.basicConfig(level=logging.DEBUG)

    # A. Map the reads iteratively.
    mapping.iterative_mapping(
        bowtie_path=hiclib_path + '/bin/bowtie2/bowtie2',
        bowtie_index_path=hiclib_path + '/bin/bowtie2/index/hg19',
        fastq_path='../data/SRA/' + dataset + '/' + sraid + '/' + sraid + '.sra',
        out_sam_path='../data/SRA/' + sraid + '_1.bam',
        min_seq_len=25,
        len_step=5,
        seq_start=0,
        seq_end=readlen,
        nthreads=12,  # on Intel Core i7 CPUs 4 threads are as fast as 8,
                      # but leave some room for your other applications
        #max_reads_per_chunk=10000000,  # optional, on low-memory machines
        temp_dir='../data/SRA/',  # optional, keep temporary files here
        bowtie_flags='--very-sensitive',
        bash_reader=hiclib_path + '/bin/sra/bin/fastq-dump -Z')

    mapping.iterative_mapping(
        bowtie_path=hiclib_path + '/bin/bowtie2/bowtie2',
        bowtie_index_path=hiclib_path + '/bin/bowtie2/index/hg19',
        fastq_path='../data/SRA/' + dataset + '/' + sraid + '/' + sraid + '.sra',
        out_sam_path='../data/SRA/' + sraid + '_2.bam',
        min_seq_len=25,
        len_step=5,
        seq_start=readlen,
        seq_end=2 * readlen,
        nthreads=12,
        #max_reads_per_chunk=10000000,
        temp_dir='../data/SRA/',
        bowtie_flags='--very-sensitive',
        bash_reader=hiclib_path + '/bin/sra/bin/fastq-dump -Z')

    # B. Parse the mapped sequences into a Python data structure,
    #    assign the ultra-sonic fragments to restriction fragments.
    mapped_reads = h5dict.h5dict(sraid + '_mapped_reads.hdf5')  ## local folder
    genome_db = genome.Genome(hiclib_path + '/fasta/hg19', readChrms=['#', 'X'])

    mapping.parse_sam(sam_basename1='../data/SRA/' + sraid + '_1.bam',
                      sam_basename2='../data/SRA/' + sraid + '_2.bam',
                      out_dict=mapped_reads,
                      genome_db=genome_db,
                      enzyme_name='HindIII')
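# A minimal driver chaining the four steps above, assuming they live in one
# script; the hiclib path is a placeholder and the SRA ID is the default used
# throughout these functions.
if __name__ == '__main__':
    hiclib_path = '/path/to/hiclib'  # placeholder; point at your install
    sraid = 'SRR071231'

    step1(hiclib_path, dataset='Kalhor2012NB', sraid=sraid, readlen=40)
    step2(hiclib_path, sraid, res=1000000)  # fragment-level filtering
    step3(hiclib_path, sraid, res=1000000)  # iterative correction
    step4(hiclib_path, sraid, res=1000000)  # eigenvector decomposition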
def process():
    global options
    global args
    global pp
    outfilename = []

    # check that the datasets exist
    for i in xrange(len(args)):
        if (not os.path.isfile(args[i].replace('-fragment_dataset.hdf5', '-1M.hdf5'))):
            print '[ERROR] Could not find: ' + args[i].replace('-fragment_dataset.hdf5', '-1M.hdf5')
            sys.exit(1)
        if (not os.path.isfile(args[i].replace('-fragment_dataset.hdf5', '-200k.hdf5'))):
            print '[ERROR] Could not find: ' + args[i].replace('-fragment_dataset.hdf5', '-200k.hdf5')
            sys.exit(1)
        if (not os.path.isfile(args[i].replace('-fragment_dataset.hdf5', '-IC-1M.hdf5'))):
            print '[ERROR] Could not find: ' + args[i].replace('-fragment_dataset.hdf5', '-IC-1M.hdf5')
            sys.exit(1)

        # note: str.strip() would remove *characters* from both ends, not the
        # suffix, so the suffix is dropped with replace() instead
        outfilename += ["_".join(
            os.path.basename(args[i]).replace('-fragment_dataset.hdf5', '').split("_")[1:])]

    genome_db = genome.Genome(options.genome,
                              gapFile=options.gapFile,
                              readChrms=['#', 'X', 'Y'])

    outfilename = "-".join(outfilename)
    outfile = open(options.outputDir + outfilename + '-HiC_correlate.txt', "w")

    fig = plt.figure()
    pp = PdfPages(options.outputDir + outfilename + '-HiC_correlate.pdf')

    for i in xrange(len(args)):
        print " Process file " + str(i) + ":" + args[i]
        enzyme_i = os.path.basename(args[i]).split("_")[0]
        experiment_i = "_".join(
            os.path.basename(args[i]).replace('-fragment_dataset.hdf5', '').split("_")[1:])

        for j in xrange(i + 1, len(args)):
            enzyme_j = os.path.basename(args[j]).split("_")[0]
            experiment_j = "_".join(
                os.path.basename(args[j]).replace('-fragment_dataset.hdf5', '').split("_")[1:])

            compareCorrelationOfEigenvectors(
                1000000,
                args[i].replace('-fragment_dataset.hdf5', '-1M.hdf5'),
                args[j].replace('-fragment_dataset.hdf5', '-1M.hdf5'),
                experiment_i, experiment_j, genome_db)
            calculateTanayCorrelation(
                1000000,
                args[i].replace('-fragment_dataset.hdf5', '-1M.hdf5'),
                args[j].replace('-fragment_dataset.hdf5', '-1M.hdf5'),
                experiment_i, experiment_j, genome_db, outfile)
            plotDiagonalCorrelation(
                200000,
                args[i].replace('-fragment_dataset.hdf5', '-200k.hdf5'),
                args[j].replace('-fragment_dataset.hdf5', '-200k.hdf5'),
                experiment_i, experiment_j, genome_db)
            compareInterarmMaps(
                1000000,
                args[i].replace('-fragment_dataset.hdf5', '-1M.hdf5'),
                args[j].replace('-fragment_dataset.hdf5', '-1M.hdf5'),
                experiment_i, experiment_j, genome_db)

    if (options.verbose):
        print >> sys.stdout, "print plots into pdf:%s" % (
            options.outputDir + outfilename + '-HiC_correlate.pdf')
    outfile.close()
    pp.close()
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

from lib_matrix_operations import *
import os

from mirnylib import genome
from mirnylib import h5dict
from mirnylib import plotting
from hiclib import binnedData
from hiclib import fragmentHiC

#genome_name = 'mm9'
#genome_db = genome.Genome('../fasta/' + genome_name, readChrms=['#', 'X'])
genome_name = 'mm9'
genome_db = genome.Genome('../fasta/' + genome_name, readChrms=["#", "X", "Y"])

domain_res = 200000
base_folder = '/mnt/storage/home/vsfishman/HiC/data/'
#base_filename = 'SRR443884_mESC_2'
base_filename = 'Sp_full2_Y'
#base_filename = 'LA'

raw = "-raw"
#raw = ""

heatmap_filepath = (base_folder + 'heatmap-res-' + str(domain_res / 1000) +
                    'KB_' + base_filename + '.hdf5' + raw)
#heatmap_filepath = base_folder + 'heatmap-res-' + str('fragment_') + 'KB_' + base_filename + '.hdf5' + raw
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

from mirnylib import genome
from mirnylib import h5dict
from hiclib import binnedData
import numpy as np

from mirnylib.systemutils import setExceptionHook
setExceptionHook()

######## define file names and other params ########

# mirnylib genome params
genomeName = "GalGal5filtered"
genome_db = genome.Genome(
    "/mnt/storage/home/vsfishman/HiC/fasta/GalGal5/GCF_000002315.4_Gallus_gallus-5.0_assembly_structure/Primary_Assembly/galGal5_all_contigs.filtered/",
    readChrms=[],
    chrmFileTemplate="N%s.fa")

# where to find the hiclib heatmap
basefolder = "/mnt/storage/home/vsfishman/HiC/data/chick/mapped-GalGal5filtered/B1_TTAGGC_L001_/"
filename = "chunk0001.hdf5.hm-res-1000kb"

# resulting file name
out_file = "all.glm"

### parameters required by LACHESIS to be in the header
header_string = """# GenomeLinkMatrix file - see GenomeLinkMatrix.h for documentation of this object type
# Species = chick
# N_bins = 524
# bin_size = 0
# RE_sites_file = /mnt/storage/home/vsfishman/HiC/fasta/GalGal5/GCF_000002315.4_Gallus_gallus-5.0_assembly_structure/Primary_Assembly/galGal5_all_contigs.filtered.fa.counts_AAGCTT.txt
import pandas as pd

from mirnylib import genome
from hiclib import fragmentHiC


def filtration(chromosome_names, cell_line, path, genome_version, enzyme,
               resolution_list):
    for chrm_list in chromosome_names:
        genome_db = genome.Genome('/home/magnitov/data/genomes/' + genome_version,
                                  gapFile='gap.txt',
                                  readChrms=chrm_list,
                                  forceOrder=True)

        # Read mapped reads
        if len(chrm_list) > 1:
            fragments = fragmentHiC.HiCdataset(
                filename=path + 'filtered_maps/' + cell_line +
                '/fragment_dataset_full.hdf5',
                genome=genome_db,
                enzymeName=enzyme,
                mode='w')
            fragments.parseInputData(dictLike=path + 'maps/' + cell_line +
                                     '/mapped_reads_full.hdf5')
        else:
            fragments = fragmentHiC.HiCdataset(
                filename=path + 'filtered_maps/' + cell_line +
                '/fragment_dataset_' + chrm_list[0] + '.hdf5',
                genome=genome_db,
                enzymeName=enzyme,
                mode='w')
            fragments.parseInputData(dictLike=path + 'maps/' + cell_line +
                                     '/mapped_reads_' + chrm_list[0] + '.hdf5')

        # Apply filters
        fragments.filterDuplicates()

        # Save statistics
        if len(chrm_list) > 1 or len(chromosome_names) == 1:
            fragments.writeFilteringStats()
            fragments.printMetadata(saveTo=path + 'processing_stats/' + cell_line +
                                    '/processing_stats_' + cell_line + '.txt')
        if len(chrm_list) == 1 and len(chromosome_names) > 1:
            fragments.writeFilteringStats()
            fragments.printMetadata(saveTo=path + 'processing_stats/' + cell_line +
                                    '/processing_stats_' + cell_line + '_' +
                                    chrm_list[0] + '.txt')

        # Sort reads and calculate contact probability (both normalized and not)
        fragments._sortData()
        if len(chrm_list) > 1:
            contact_probs = fragments.plotScaling(normalize=True, plot=False)
            pd.DataFrame({'Distance': contact_probs[0],
                          'Probability': contact_probs[1]}).to_csv(
                path + 'contact_probs/' + cell_line + '/contact_probs_' +
                cell_line + '_full_norm.txt', header=1, index=0, sep='\t')
            contact_probs = fragments.plotScaling(normalize=False, plot=False)
            pd.DataFrame({'Distance': contact_probs[0],
                          'Probability': contact_probs[1]}).to_csv(
                path + 'contact_probs/' + cell_line + '/contact_probs_' +
                cell_line + '_full.txt', header=1, index=0, sep='\t')
        if len(chrm_list) == 1:
            contact_probs = fragments.plotScaling(normalize=True, plot=False)
            pd.DataFrame({'Distance': contact_probs[0],
                          'Probability': contact_probs[1]}).to_csv(
                path + 'contact_probs/' + cell_line + '/contact_probs_' +
                cell_line + '_' + chrm_list[0] + '_norm.txt',
                header=1, index=0, sep='\t')
            contact_probs = fragments.plotScaling(normalize=False, plot=False)
            pd.DataFrame({'Distance': contact_probs[0],
                          'Probability': contact_probs[1]}).to_csv(
                path + 'contact_probs/' + cell_line + '/contact_probs_' +
                cell_line + '_' + chrm_list[0] + '.txt',
                header=1, index=0, sep='\t')

        # Save into .cool and .hdf5 files
        for resolution in resolution_list:
            fragments.saveCooler(filename=path + 'filtered_maps/' + cell_line +
                                 '/heatmap-' + chrm_list[0] + '-' +
                                 str(resolution / 1000) + 'K.cool',
                                 resolution=resolution)
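# A hypothetical invocation: one multi-chromosome list for the genome-wide
# map plus per-chromosome lists, matching the layout that parse_bams writes;
# all paths and names below are placeholders.
chromosome_names = [['1', '2', 'X']] + [[c] for c in ['1', '2', 'X']]
filtration(chromosome_names,
           cell_line='K562',
           path='/home/magnitov/data/hic/',
           genome_version='hg19',
           enzyme='HindIII',
           resolution_list=[1000000, 100000])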