Beispiel #1
0
def step2(hiclib_path, sraid, res=1000000):
    ''' 2. Fragment-level filtering of the mapped reads of one run.
        http://mirnylab.bitbucket.org/hiclib/tutorial/02_fragment_filtering.html

        hiclib_path -- hiclib folder; the hg19 genome is taken from
                       <hiclib_path>/fasta/hg19
        sraid       -- run id, used as the prefix of every file name
        res         -- heatmap resolution passed to saveHeatmap
    '''
    from mirnylib import genome
    from hiclib import fragmentHiC

    # Numbered chromosomes plus X of hg19.
    hg19 = genome.Genome(hiclib_path + '/fasta/hg19', readChrms=['#', 'X'])

    # The dataset is (re)created on disk; maximumMoleculeLength is what the
    # dangling-end filter uses when the reads are loaded below.
    dataset_file = sraid + '_fragment_dataset.hdf5'
    dataset = fragmentHiC.HiCdataset(filename=dataset_file,
                                     genome=hg19,
                                     maximumMoleculeLength=500,
                                     enzymeName='HindIII',
                                     mode='w')
    dataset.parseInputData(dictLike=sraid + '_mapped_reads.hdf5')

    dataset.filterRsiteStart(offset=5)
    dataset.filterDuplicates()
    dataset.filterLarge()

    # The two TCC runs use a 0.1% cutoff, everything else the 0.5% Hi-C
    # default.
    is_tcc_run = sraid in ["SRR071231", "SRR071232"]
    upper_cut = 0.001 if is_tcc_run else 0.005
    dataset.filterExtreme(cutH=upper_cut, cutL=0)

    # dataset.saveFragments() is intentionally left disabled.
    heatmap_file = sraid + '_map-res%sk.hdf5' % (res / 1000)
    dataset.saveHeatmap(heatmap_file, resolution=res)
Beispiel #2
0
def getGenome(name):
    """Return the Genome object for *name*, building it on first use.

    Built genomes are stored in the module-level ``allGenomes`` mapping and
    returned from there on later calls.  Raises ValueError for a name that
    is not defined here.
    """
    if name in allGenomes:
        return allGenomes[name]

    # Genomes loaded with the default chromosome selection.
    # (A genome can also be restricted to numbered and X chromosomes with
    # readChrms=["#", "X"].)
    default_order = {
        "hg19": "../data/hg19",
        "hg18": "../data/hg18",
        "mm9": "../data/mm9",
        "mm10": "../data/mm10",
    }
    # Genomes that need an exact chromosome order: folder and chromosome list.
    forced_order = {
        # Drosophila
        "dm3": ("../data/dm3",
                ["2L", "2R", "3L", "3R", "4", "X",
                 "2LHet", "2RHet", "3LHet", "3RHet",
                 "XHet", "YHet", "U", "Uextra", "M"]),
        "cb10": ("../data/cb10",
                 ["I", "II", "III", "IV", "V", "X", "M"]),
    }

    if name in default_order:
        built = genome.Genome(default_order[name])
    elif name in forced_order:
        folder, chromosomes = forced_order[name]
        built = genome.Genome(folder, readChrms=chromosomes, forceOrder=True)
    else:
        raise ValueError(
            "Genome {0} not defined. Edit defineGenome.py and define it".
            format(name))

    allGenomes[name] = built
    return built
Beispiel #3
0
def step4(hiclib_path, sraid, res=1000000):
    ''' 4. Eigen vector decomposition
    /examples/iterativeCorrectionEigenvectorExpansion/eigenvectorAnalysis.py

    hiclib_path -- hiclib folder; the hg19 genome is taken from
                   <hiclib_path>/fasta/hg19
    sraid       -- run id, used as the prefix of every file name
    res         -- bin size used in the heatmap file name and for the end
                   coordinate of each row of the text output

    Reads  <sraid>_map-res<res/1000>k.hdf5.
    Writes <sraid>_map-res<res/1000>k-eig.pdf and
           <sraid>_map-res<res/1000>k-ic-eig.txt (tab separated:
           chr, start, end, then one column per eigenvector).
    '''
    import matplotlib.pyplot as plt
    import numpy as np
    from mirnylib import genome
    from mirnylib import h5dict
    from mirnylib import plotting
    from hiclib import binnedData

    genome_db = genome.Genome(hiclib_path + '/fasta/hg19',
                              readChrms=['#', 'X'])

    # Common stem of the input and output file names.
    prefix = sraid + '_map-res%sk' % (res / 1000)
    heatmap_file = prefix + '.hdf5'

    # Read resolution from the dataset.
    raw_heatmap = h5dict.h5dict(heatmap_file, mode='r')
    resolution = int(raw_heatmap['resolution'])

    # Create a binnedData object, load the data.
    BD = binnedData.binnedData(resolution, genome_db)
    BD.simpleLoad(heatmap_file, 'DataName')

    # Do eigen decomposition
    BD.removeDiagonal()
    BD.removeBySequencedCount(0.5)
    BD.removeCis()
    BD.truncTrans(high=0.0005)
    BD.removePoorRegions(cutoff=1)
    BD.fakeCis()
    BD.removeZeros()
    BD.doEig(numPCs=30, force=True)  ## First 30 EIGs
    BD.restoreZeros(value=0)

    eig = BD.eigEigenvalueDict['DataName']
    eig_v = BD.EigDict['DataName']

    # Plot the matrix rebuilt from the eigenvectors and eigenvalues.
    plotting.plot_matrix(np.log(np.dot(np.dot(eig_v.T, np.diag(eig)), eig_v)))
    plt.savefig(prefix + '-eig.pdf')
    plt.clf()

    # One row per bin.  The with-block closes the file even when a write
    # fails part-way through.
    with open(prefix + "-ic-eig.txt", "w") as outfile:
        for i, chrm_idx in enumerate(BD.chromosomeIndex):
            chro = BD.genome.idx2label[chrm_idx]
            posi = BD.positionIndex[i]
            fields = ["chr%s" % chro, "%s" % posi, "%s" % (posi + res)]
            fields.extend("%s" % eigenvector[i] for eigenvector in eig_v)
            outfile.write("\t".join(fields) + "\n")
Beispiel #4
0
def parse_bams(chromosome_names, cell_line, path, genome_version, enzyme):
    """Parse the paired BAM files of *cell_line* once per chromosome list.

    chromosome_names -- iterable of chromosome lists; each list gets its own
                        genome object and its own mapped-reads file
    cell_line        -- sub-folder / file-name stem under <path>bam/ and
                        <path>maps/
    path             -- base folder, expected to end with a path separator
    genome_version   -- genome folder name under /home/magnitov/data/genomes/
    enzyme           -- enzyme name handed to mapping.parse_sam
    """
    maps_dir = path + 'maps/' + cell_line
    if not os.path.exists(maps_dir):
        os.mkdir(maps_dir)

    bam_stem = path + 'bam/' + cell_line + '/' + cell_line

    for chrm_list in chromosome_names:
        # A multi-chromosome list goes to the "full" file, a single
        # chromosome to a file named after it.
        if len(chrm_list) > 1:
            reads_name = '/mapped_reads_full.hdf5'
        else:
            reads_name = '/mapped_reads_' + chrm_list[0] + '.hdf5'
        mapped_reads = h5dict.h5dict(maps_dir + reads_name)

        genome_db = genome.Genome(
            '/home/magnitov/data/genomes/' + genome_version,
            gapFile='gap.txt',
            readChrms=chrm_list,
            forceOrder=True)

        mapping.parse_sam(sam_basename1=bam_stem + '_R1.bam',
                          sam_basename2=bam_stem + '_R2.bam',
                          out_dict=mapped_reads,
                          genome_db=genome_db,
                          enzyme_name=enzyme)
from mirnylib import genome
from hiclib import fragmentHiC 

# Files of the "Sp" sample.
dataset_file = '../../data/serov/fragment_dataset_Sp.hdf5'
reads_file = '../../data/serov/mapped_reads_Sp.hdf5'
heatmap_file = '../../data/serov/heatmap-res-1M_Sp.hdf5'

# mm9, numbered chromosomes plus X.
genome_db = genome.Genome('../fasta/mm9', readChrms=['#', 'X'])

# Create the HiCdataset (mode 'w').  maximumMoleculeLength given here is the
# limit the dangling-end filter uses when the reads are loaded below.
fragments = fragmentHiC.HiCdataset(filename=dataset_file,
                                   genome=genome_db,
                                   maximumMoleculeLength=500,
                                   mode='w')
fragments.parseInputData(dictLike=reads_file)

# Fragment-level filters, in this order.
fragments.filterRsiteStart(offset=5)
fragments.filterDuplicates()
fragments.filterLarge()
fragments.filterExtreme(cutH=0.005, cutL=0)

# Save the heatmap at a resolution of 1000000.
fragments.saveHeatmap(heatmap_file, resolution=1000000)
Beispiel #6
0
import gzip

import math
import subprocess
from mirnylib import genome
from mirnylib import h5dict
from mirnylib import plotting
from hiclib import binnedData
from hiclib import fragmentHiC
from mirnylib.numutils import fillDiagonal

# Resolution label; here the literal string 'fragment' rather than a number.
domain_res='fragment'

# Genome: <genome_folder><genome_name>, numbered chromosomes plus X.
genome_name='mm9'
genome_folder='/mnt/storage/home/vsfishman/HiC/fasta/'
genome_db = genome.Genome(genome_folder+genome_name, readChrms=['#', 'X'])

# Sample name and the folder holding its data files.
base_filename="Sp_full"
base_folder='/mnt/storage/home/vsfishman/HiC/data/'

# Path of the mapped-reads file of the sample.
maped_reads_filepath=base_folder+'mapped_reads_'+base_filename+'.hdf5'

base_out_folder = "/mnt/storage/home/vsfishman/tmp/HiC_tmp/data/"

# B is the divisor used by DistancetoBinN; c only appears in the disabled
# logarithmic formula there.
B = 5.0
c = 1.12

def DistancetoBinN(distance):
    """Return the number of the bin of width ``B`` that contains *distance*.

    The quotient is truncated by ``int``.  A logarithmic variant,
    ceil(log(distance / B, c)), is left disabled.
    """
    bin_position = distance / B
    return int(bin_position)
Beispiel #7
0
import sys
import os

from hiclib import mapping, fragmentHiC
from mirnylib import h5dict, genome

# Command line: fasta_dir re_name out_fname in_dir prefix [prefix ...]
fasta_dir, re_name, out_fname, in_dir = sys.argv[1:5]
in_prefices = sys.argv[5:]

# The per-prefix hdf5 files live in the folder of the output file.
basedir = os.path.dirname(os.path.abspath(out_fname))
mapped_reads = [
    h5dict.h5dict('%s/%s.hdf5' % (basedir, prefix)) for prefix in in_prefices
]

genome_db = genome.Genome(fasta_dir,
                          readChrms=['#', 'X'],
                          chrmFileTemplate="%s.fa")

# Parse the two BAM files of every prefix into its h5dict.
for prefix, name in zip(in_prefices, mapped_reads):
    first_side = "%s/%s_1.bam" % (in_dir, prefix)
    second_side = "%s/%s_2.bam" % (in_dir, prefix)
    mapping.parse_sam(sam_basename1=first_side,
                      sam_basename2=second_side,
                      out_dict=name,
                      genome_db=genome_db,
                      enzyme_name=re_name)

for i, name in enumerate(mapped_reads):
    fragments = fragmentHiC.HiCdataset(filename='temp',
                                       genome=genome_db,
                                       maximumMoleculeLength=500,
                                       mode='w',
                                       enzymeName=re_name,
#!/usr/bin/env python

import matplotlib.pyplot as plt
import numpy as np

from mirnylib import genome
from mirnylib import h5dict
from mirnylib import plotting
from hiclib import binnedData

# Input heatmap and the name the data is stored under in the binnedData object.
heatmap_path = '../2_filtering_reads/heatmap-res-1M.hdf5'
dataset_name = 'Rao2014_10M'

genome_db = genome.Genome('../Ref/hg19', readChrms=['#', 'X'])

# The resolution is stored inside the heatmap file.
raw_heatmap = h5dict.h5dict(heatmap_path, mode='r')
resolution = int(raw_heatmap['resolution'])

BD = binnedData.binnedData(resolution, genome_db)
BD.simpleLoad(heatmap_path, dataset_name)

# Filtering and iterative correction; BD.removeDiagonal() is deliberately
# left out.
BD.removeBySequencedCount(0.5)
BD.removePoorRegions(cutoff=1)
BD.truncTrans(high=0.0005)
BD.iterativeCorrectWithoutSS()

BD.export(dataset_name, './IC-heatmap-res-1M.hdf5')

# Plot log(count + 1) of the corrected matrix.
fig = plt.figure()
corrected = BD.dataDict[dataset_name]
plotting.plot_matrix(np.log(corrected + 1.0))
fig.savefig('./heatmap.pdf')
'''
Beispiel #9
0
def step3(hiclib_path, sraid, res=1000000):
    ''' 3. Filter and iteratively correct heatmaps.
        http://mirnylab.bitbucket.org/hiclib/tutorial/03_heatmap_processing.html

        hiclib_path -- hiclib folder; the hg19 genome is taken from
                       <hiclib_path>/fasta/hg19
        sraid       -- run id, used as the prefix of every file name
        res         -- bin size used in the file names and for the end
                       coordinate of each row of the bias table

        Reads  <sraid>_map-res<res/1000>k.hdf5.
        Writes the raw plot (.pdf), the corrected heatmap (-ic.hdf5), the
        corrected plot (-ic.pdf) and the bias table (-ic-bias.txt, tab
        separated: chr, start, end, bias).
    '''
    import matplotlib.pyplot as plt
    import numpy as np

    from mirnylib import genome
    from mirnylib import h5dict
    from mirnylib import plotting
    from hiclib import binnedData

    genome_db = genome.Genome(hiclib_path + '/fasta/hg19',
                              readChrms=['#', 'X'])

    # Common stem of the input and output file names.
    prefix = sraid + '_map-res%sk' % (res / 1000)
    heatmap_file = prefix + '.hdf5'

    # Read resolution from the dataset.
    raw_heatmap = h5dict.h5dict(heatmap_file, mode='r')
    resolution = int(raw_heatmap['resolution'])

    # Create a binnedData object, load the data.
    BD = binnedData.binnedData(resolution, genome_db)
    BD.simpleLoad(heatmap_file, 'DataName')

    # Plot the raw heatmap.
    plotting.plot_matrix(np.log(BD.dataDict['DataName']))
    plt.savefig(prefix + '.pdf')
    plt.clf()

    # Remove the contacts between loci located within the same bin.
    BD.removeDiagonal()

    # Remove bins with less than half of a bin sequenced.
    BD.removeBySequencedCount(0.5)

    # Remove 1% of regions with low coverage.
    BD.removePoorRegions(cutoff=1)

    # Truncate top 0.05% of interchromosomal counts (possibly, PCR blowouts).
    BD.truncTrans(high=0.0005)

    # Perform iterative correction.
    BD.iterativeCorrectWithoutSS()

    # Save the iteratively corrected heatmap.
    BD.export('DataName', prefix + '-ic.hdf5')

    # Plot the corrected heatmap.
    plotting.plot_matrix(np.log(BD.dataDict['DataName']))
    plt.savefig(prefix + '-ic.pdf')
    plt.clf()

    # Save the bias, one row per bin.  The with-block closes the file even
    # when a write fails part-way through.
    bias = BD.biasDict['DataName']
    with open(prefix + "-ic-bias.txt", "w") as outfile:
        for i, chrm_idx in enumerate(BD.chromosomeIndex):
            chro = BD.genome.idx2label[chrm_idx]
            posi = BD.positionIndex[i]
            outfile.write("chr%s\t%s\t%s\t%s\n" %
                          (chro, posi, posi + res, bias[i]))
Beispiel #10
0
logging.basicConfig(level=logging.DEBUG)

# Command line: <basename> <chunkNumber>
parser = argparse.ArgumentParser()
for argument_name in ("basename", "chunkNumber"):
    parser.add_argument(argument_name)
args = parser.parse_args()
basename, chunk = args.basename, args.chunkNumber
print(basename)

# B. Parse the mapped sequences into a Python data structure,
#    assign the ultra-sonic fragments to restriction fragments.
# Both output files share the stem <processed folder><basename>_<chunk>.
processed_stem = '/exports/eddie/scratch/s1529682/processed/' + basename + '_' + chunk
reads_file = processed_stem + '_mapped_reads.hdf5'
fragments_file = processed_stem + '_fragment_dataset.hdf5'

mapped_reads = h5dict.h5dict(reads_file)
genome_db = genome.Genome('../genomes/mm9/fasta', readChrms=['#', 'X'])
def func():
    mapping.parse_sam(
        sam_basename1='/exports/eddie/scratch/s1529682/bams/'+basename+'_fixed_1.fq.gz'+chunk,
        sam_basename2='/exports/eddie/scratch/s1529682/bams/'+basename+'_fixed_2.fq.gz'+chunk,
        out_dict=mapped_reads,
        genome_db=genome_db, 
        enzyme_name='DpnII')
    fragments = fragmentHiC.HiCdataset(
        filename=fragments_file,
        genome=genome_db,
        maximumMoleculeLength=700,
        mode='w')
    
    # Load the parsed reads into the HiCdataset. The dangling-end filter is applied
    # at this stage, with maximumMoleculeLength specified at the initiation of the 
Beispiel #11
0
inFastqDir = "/mnt/storage/home/vsfishman/tmp/HiC_polipedium/MySeq/Control/K1/sample/"  # for mode="fastq" only

#sidePrefixes = ("side1", "side2")   # a version for naming ....side1.fastq.gz
sidePrefixes = (
    "R1_001", "R2_001"
)  # a prefix preceding .fastq.gz, which will be used to distinguish side 1 and side 2
# If your files are named "run32167_something_side1_somethingElse.fastq.gz", then "side1_somethingElse" should be the prefix.

threads = 10
tmpDir = "/mnt/storage/home/vsfishman/tmp/HiC_tmp/3/"  # this will contain up to 3X the size of the largest input .sra file (256GB is enough for (Rao, 2014), but 128 is not)
# Make sure the drive holding tmpDir has enough space. If it is a small SSD, it may not.
# Also, there is a lot of IO through the tmpDir. Put it on a local drive, not on a network drive, if you can.

# Genome definition: readChrms is empty and chromosome files follow the "N%s.fa" template.
genomeName = "pv11_scaffolds.v2"
genome_db = genome.Genome(
    "/mnt/storage/home/vsfishman/tmp/HiC_polipedium/genome/pv11_scaffolds.v2.fasta",
    readChrms=[],
    chrmFileTemplate="N%s.fa")

# NOTE(review): genome_db above points at the pv11 scaffolds while bowtieIndex
# below points at a GalGal5 index -- confirm that this combination is intended.
bowtiePath = "/mnt/storage/home/vsfishman/HiC/bin/bowtie2/bowtie2"
bowtieIndex = "/mnt/storage/home/vsfishman/HiC/fasta/GalGal5/GCF_000002315.4_Gallus_gallus-5.0_assembly_structure/Primary_Assembly/galGal5_all_contigs.filtered"
bowtieFlags = "--very-sensitive"

"IDs from GEO (SRR numbers)"
#GEOids = list(range(1665087,1665096))
# Set this for mapping .sra files
# You can do it like this:
# GEOids = range(1658523,1658540) + [398318, 398920,398921]  #taken from an actual study

seqSkipStart = 0  # number of bp to skip at the start of each read (e.g. 2 to skip the first 2 bp)
minMapLen = 18  # start mapping at this length
# This will adjust iterative mapping automatically
Beispiel #12
0
def process():
    """Run the whole pipeline for the experiment described by ``options``.

    Uses the module-level ``options`` and ``args`` and (re)binds the
    module-level ``pp`` to the multi-page PDF
    <outputDir><experiment>.pdf, which is closed at the end.

    Steps: create tmpDir/outputDir, open the PDF, map the input files
    (unless the input format is 'bam', in which case ``args`` already holds
    the BAM files), collect the mapped reads, filter fragments, run the
    iterative filtering for the 1M and 200k heatmaps and draw the plots.
    """
    global options
    global args
    global pp

    if (options.verbose):
        print >> sys.stdout, "*** START processing"

    logging.basicConfig(level=logging.DEBUG)

    if (options.verbose):
        print >> sys.stdout, "**  Create directories"

    # The directories are created before the PDF is opened: the PDF lives in
    # outputDir, and PdfPages may open its target file as soon as it is
    # constructed, which fails when the folder does not exist yet.
    for directory in (options.tmpDir, options.outputDir):
        if not os.path.exists(directory):
            os.mkdir(directory)

    # Start a new current figure and open the report PDF.
    plt.figure()
    pp = PdfPages(options.outputDir + options.experiment + '.pdf')

    if (options.verbose):
        print >> sys.stdout, "**  Create data objects"

    mapped_reads = h5dict.h5dict(options.outputDir + options.experiment +
                                 '-mapped_reads.hdf5')
    genome_db = genome.Genome(options.genome,
                              gapFile=options.gapFile,
                              readChrms=['#', 'X', 'Y'])

    # Either map the input now or take the BAM files from the command line.
    if (options.inputFormat != 'bam'):
        bams = mapFiles()
    else:
        bams = args[0:]

    if (options.verbose):
        print >> sys.stdout, "**  Collect mapped reads"

    collectMappedReads(bams[0], bams[1], mapped_reads, genome_db)

    if (options.verbose):
        print >> sys.stdout, "**  Filter fragments"

    filterFragments(genome_db)

    if (options.verbose):
        print >> sys.stdout, "**  Iterative filtering of fragments"

    iterativeFiltering(genome_db, '-1M.hdf5')
    iterativeFiltering(genome_db, '-200k.hdf5')

    # plotting
    correctedScalingPlot(200000,
                         options.outputDir + options.experiment + '-200k.hdf5',
                         options.experiment, genome_db)

    doArmPlot(1000000, options.outputDir + options.experiment + '-1M.hdf5',
              options.experiment, genome_db)

    if (options.verbose):
        print >> sys.stdout, "*** FINISHED processing"

    pp.close()
    if os.path.exists(filename):
        os.remove(filename)


# Mapping configuration.
genomeName = "mm10"
threads = 10
bowtiePath = "../bin/bowtie2/bowtie2"
# Stop early with a readable message when bowtie2 is missing.  (A bare
# ``raise`` with no active exception only produces an unrelated error.)
if not os.path.exists(bowtiePath):
    raise IOError("bowtie2 executable not found: {0}".format(bowtiePath))
fastqDir = "fastq"
bowtieIndex = "../bin/bowtie2/index/{0}".format(genomeName)
tmpDir = "/tmp"
samFolder = "sams-{0}".format(genomeName)
savePath = "mapped-{0}".format(genomeName)

# Specify location of the genome files here
genome_db = genome.Genome('../data/{0}'.format(genomeName),
                          readChrms=["#", "X"])

# Make sure both output folders exist.
if not os.path.exists(samFolder):
    os.mkdir(samFolder)

if not os.path.exists(savePath):
    os.mkdir(savePath)


def calculateStep(length, minlen, approxStep=10, maxSteps=4):
    """returns minimum length and step based on the
    length of sequence and proposed minimum length"""

    actualDif = length - minlen
    if actualDif < approxStep * 0.6:
        return length, 100
Beispiel #14
0
    max_length = 0
    cur_sum = 0
    chr_size_list.append(cur_sum)
    for line in chr_size_file:
        line = line[:-1]
        pair = line.split('\t')
        size = int(pair[1]) / bin_size + 1
        cur_sum = cur_sum + size
        chr_size_list.append(cur_sum)
        max_length = max(max_length, size)
    return max_length


# Load the binned contact matrix.
heatmap_path = '~/2016ACCOST/data/1000000.either.hdf5'

genome_db = genome.Genome(
    '/net/noble/vol1/data/reference_genomes/mm9/chromosomes',
    readChrms=['#', 'X'])

# The resolution is stored inside the heatmap file.
raw_heatmap = h5dict.h5dict(heatmap_path, mode='r')
resolution = int(raw_heatmap['resolution'])

# Load the heatmap under the name 'matrix' and keep a handle to the data.
data = binnedData.binnedData(resolution, genome_db)
data.simpleLoad(heatmap_path, 'matrix')
matrix = data.dataDict['matrix']

# find the row where the whole row are zeros
zero_list = set()
for x in range(0, len(matrix)):
    if sum(matrix[x]) == 0:
Beispiel #15
0
# -------------------- Parameter definitions ------------
inFastqDir = "fastq"  # for mode="fastq" only

sidePrefixes = ("side1", "side2")  # a version for naming ....side1.fastq.gz
# sidePrefixes = ("_R1_001","_R2_001")  # a prefix preceding .fastq.gz, which will be used to distinguish side 1 and side 2
# If your files are named "run32167_something_side1_somethingElse.fastq.gz", then "side1_somethingElse" should be the prefix.

threads = 7
tmpDir = "/tmp"  # this will contain up to 3X the size of the largest input .sra file (256GB is enough for (Rao, 2014), but 128 is not)
# Make sure your system drive (where /tmp usually is) has enough space. If it is a small SSD, it may not.
# Also, there is a lot of IO through the tmpDir. Put it on a local drive, not on a network drive, if you can.

"Initializing Genome object"
# genome_db = genome.Genome("../data/dm3", readChrms=["2L", "2R", "3L", "3R", "4", "X", "2LHet","2RHet","3LHet","3RHet","XHet","YHet","U","Uextra","M"], forceOrder=True)  #drosophila example
# NOTE(review): cacheDir is the literal string "tmpDir", not the tmpDir
# variable defined above -- confirm which one is intended.
genome_db = genome.Genome("../data/mm9",
                          readChrms=["#", "X", "Y", "M"],
                          cacheDir="tmpDir")
genomeName = genome_db.folderName  # automatically infer genome name from the folder name. Name is used for naming folders ("mapped-hg19", etc)

bowtiePath = "../bin/bowtie2/bowtie2"
bowtieIndex = "../bin/bowtie2/index/{0}".format(
    genomeName
)  # change this if your index is named differently from the genome
bowtieFlags = "--very-sensitive"

"IDs from GEO (SRR numbers)"
GEOids = [1658652]  # a small file for testing purposes
# Set this for mapping .sra files
# You can do it like this:
# GEOids = range(1658523,1658540) + [398318, 398920,398921]  #taken from an actual study
Beispiel #16
0
"""
This script is a cut-down version of the larger mergeDatasets script, with certain adjustments.
Follow the comments along the code.
"""

from hiclib.fragmentHiC import HiCdataset
import os

from mirnylib import genome

# Genome from ../data/caul; readChrms is empty and chromosome files are
# named "<name>.fa".
genomeDb = genome.Genome('../data/caul',
                         chrmFileTemplate="%s.fa",
                         readChrms=[])

for expName in os.listdir("caul"):

    TR = HiCdataset(
        "bla",
        genome=genomeDb,
        inMemory=True,
    )  # inMemory, as files are probably small (less than hundreds mililon reads)
    TR.parseInputData("caul/" + expName,
                      removeSS=True)  # We discard SS in our pipeline now
    TR.printMetadata()
    TR.filterRsiteStart(
        offset=5
    )  # We still do this filter to avoid strange "dangling end-like" molecules
    TR.filterDuplicates()
    #TR.save(out_file+".dat")
    TR.filterLarge(
        cutlarge=300000, cutsmall=100
Beispiel #17
0
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from lib_matrix_operations import *
import os

from mirnylib import genome
from mirnylib import h5dict
from mirnylib import plotting
from hiclib import binnedData
from hiclib import fragmentHiC
from mirnylib.h5dict import h5dict

# Genome: only chromosome M of mm9 is loaded here.
genome_name = 'mm9'
#genome_db = genome.Genome('../fasta/'+genome_name, readChrms=['#', 'X'])
#genome_name='hg19'
genome_db = genome.Genome('../fasta/' + genome_name, readChrms=['M'])

# Resolution label; a string, so the file names below are written out
# literally instead of being computed from a number.
domain_res = 'fragment'

base_folder = '/mnt/storage/home/vsfishman/HiC/data/'
#base_filename = 'SRR443884_mESC_2'
base_filename = 'Fib_full2_chrM'

# Optional suffix appended to the heatmap and figure names; empty here.
#raw="-raw"
raw = ""

# Input heatmap file.
#heatmap_filepath=base_folder+'heatmap-res-'+str(domain_res/1000)+'KB_'+base_filename+'.hdf5'+raw
heatmap_filepath = base_folder + 'heatmap-res-fragment_KB_' + base_filename + '.hdf5' + raw

# Output figure file.
#figure_path=base_folder+base_filename+"_"+str(domain_res/1000)+'kb'+raw+'.eps'
figure_path = base_folder + base_filename + '_fragment_kb' + raw + '.eps'
Beispiel #18
0
### Reading genome
logging.info("Preparation of genome...")
cmd_bgn_time = time.time()  # start time of this step

# The genome FASTA is located relative to this script.
PATH_ABSOLUTE = os.path.dirname(os.path.realpath(__file__))
fasta_path = os.path.join(
    PATH_ABSOLUTE, "data/genome/hg19.fa"
)  # Note that a single file with all chromosomes is preferred
chrms = [
    '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '1', '20',
    '21', '22', '2', '3', '4', '5', '6', '7', '8', '9', 'M', 'X', 'Y'
]  # Note the chromosomes order
# For other genomes, you might want to use gap file, e.g. for hg38: http://hgdownload.cse.ucsc.edu/goldenpath/hg38/database/gap.txt.gz
#genome_db    = genome.Genome(fasta_path, readChrms=chrms, gapFile="gap.hg38.txt")
# For hg19, this variant works perfectly with mirnylib.genome
genome_db = genome.Genome(fasta_path, readChrms=chrms)
# NOTE(review): only 'MmeI' is applied below; this list is not used in the
# visible part of the script.
enzymes = ['NlaIII', 'MmeI', 'DpnII', 'MseI']

# Applying restriction enzymes
genome_db.setEnzyme('MmeI')

# One empty list per chromosome sequence, filled by the loop that follows.
strands = [[] for i in range(len(genome_db.seqs))]
true_poss = [[] for i in range(len(genome_db.seqs))]
for i in range(len(genome_db.seqs)):
    s = genome_db.seqs[i]
    for pos in genome_db.rsites[i]:
        # Check that the found position is not located at the end of chromosome:
        if pos + 16 > len(s) or pos + 22 > len(
                s) or pos - 28 < 0 or pos - 22 < 0:
            strand = -1
            true_pos = -1
Beispiel #19
0
    print "resolution defined by heatmap: ", domain_res
    return domain_res


# Resolution as reported by get_domain_res for the heatmap file.
domain_res = get_domain_res(heatmap_filepath)

print "Domain resolution = " + str(domain_res)
# File name of the heatmap (last path component) with '-' replaced by '_'.
base_filename = heatmap_filepath.split("/")[-1].replace("-", "_")

# Define files and directories
base_folder = '/mnt/storage/home/vsfishman/HiC/data/chick/DixonDomains'
HMM_file_path = '/mnt/storage/home/vsfishman/HiC/DI/domaincall_software/'
DI_from_matrix_file_path = "/mnt/storage/home/vsfishman/HiC/tutorial_Fishman/DI_from_matrix_minja.pl"

# GalGal5 assembled chromosomes; readChrms is empty and chromosome files are
# named "<name>.fna".
genome_db = genome.Genome(
    "/mnt/storage/home/vsfishman/HiC/fasta/GalGal5/GCF_000002315.4_Gallus_gallus-5.0_assembly_structure/Primary_Assembly/assembled_chromosomes/FASTA/",
    readChrms=[],
    chrmFileTemplate="%s.fna")

# The .fai file is looked up inside the genome folder.
genome_fai_filepath = genome_db.genomePath + '/GalGal5ChrmLevel.fai'


def executeBashCommand(bashCommand, doPrint=True):
    """Run *bashCommand* and print or return what it wrote to stdout.

    The command string is split on whitespace and executed directly (no
    shell), so quoting, pipes and redirections are not interpreted.

    bashCommand -- command line as a single string
    doPrint     -- if true, print the captured stdout and return "";
                   otherwise return the captured stdout

    The single-argument ``print(...)`` form is used because it behaves the
    same as the print statement on Python 2 and is also valid on Python 3.
    """
    print("executing command %s \n" % (bashCommand))
    child = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE)
    # communicate() waits for the command to finish and returns
    # (stdout, stderr); stderr is not captured here.
    captured = child.communicate()[0]
    if doPrint:
        print(captured)
        return ""
    return captured

Beispiel #20
0
#!/usr/bin/env python

import sys

from hiclib import mapping, fragmentHiC
from mirnylib import h5dict, genome

# Command line: <basedir>
basedir = sys.argv[1]
genome_dir = '%s/Data/Genome/mm9_fasta' % basedir
loaded_file = '%s/Data/Timing/hiclib_data.hdf5' % basedir
filtered_file = '%s/Data/Timing/hiclib_data_filt.hdf5' % basedir

# Chromosome 1 only; chromosome files are named "<name>.fa".
genome_db = genome.Genome(genome_dir,
                          readChrms=['1'],
                          chrmFileTemplate="%s.fa")

# In-memory dataset used to filter an existing dataset file.
dataset_options = dict(filename='temp1',
                       genome=genome_db,
                       maximumMoleculeLength=500,
                       mode='w',
                       enzymeName="NcoI",
                       inMemory=True)
fragments = fragmentHiC.HiCdataset(**dataset_options)

# Load, filter, save.
fragments.load(loaded_file)
fragments.filterDuplicates()
fragments.filterExtreme(cutH=0, cutL=0.005)
fragments.save(filtered_file)
Beispiel #21
0
#inFastqDir = "/mnt/storage/home/vsfishman/tmp/Distr/ESC/"  # for mode="fastq" only

#sidePrefixes = ("side1", "side2")   # a version for naming ....side1.fastq.gz
sidePrefixes = (
    "R1_001", "R2_001"
)  # a prefix preceding .fastq.gz, which will be used to distinguish side 1 and side 2
# If your files are named "run32167_something_side1_somethingElse.fastq.gz", then "side1_somethingElse" should be the prefix.

threads = 4
tmpDir = "/mnt/storage/home/vsfishman/HiC_temp/"  # this will contain up to 3X the size of the largest input .sra file (256GB is enough for (Rao, 2014), but 128 is not)
# Make sure the drive holding tmpDir has enough space. If it is a small SSD, it may not.
# Also, there is a lot of IO through the tmpDir. Put it on a local drive, not on a network drive, if you can.

# Genome: numbered chromosomes plus X and Y of mm10.
genomeName = "mm10"
genome_db = genome.Genome(
    "/mnt/storage/home/vsfishman/HiC/fasta/" + genomeName,
    readChrms=["#", "X", "Y"],
)

bowtiePath = "/mnt/storage/home/vsfishman/HiC/bin/bowtie2/bowtie2"
bowtieIndex = "/mnt/storage/home/vsfishman/HiC/bin/bowtie2/index" + "/{0}".format(
    genomeName
)  # change this if your index is named differently from the genome
bowtieFlags = "--very-sensitive"

"IDs from GEO (SRR numbers)"
#GEOids = list(range(1665087,1665096))
#GEOids = [400251,400252,400253,400254,400255,400256,443883,443884,443885,443886,443887,443888]
#TODO 443883
#GEOids = [400253,443884,443885,443886,443887,443888]
# Folder holding the .sra files; the disabled line below derives GEOids from it.
sra_folder = "/mnt/storage/home/vsfishman/tmp/Distr/ESC/newESC/"
#GEOids = [f.split("SRR")[-1].split(".")[0] for f in os.listdir(sra_folder) if f.endswith(".sra")]
Beispiel #22
0
print str(now)
print "\n"

import matplotlib.pyplot as plt
import numpy as np
import os
import subprocess

from mirnylib import genome
from mirnylib import h5dict
from mirnylib import plotting
from hiclib import binnedData
from hiclib import fragmentHiC

# Genome: numbered chromosomes plus X of mm9.
genome_name = 'mm9'
genome_db = genome.Genome('../../fasta/' + genome_name, readChrms=['#', 'X'])

import argparse

# Optional --res argument: domain resolution.
parser = argparse.ArgumentParser()
parser.add_argument("--res")
args = parser.parse_args()

domain_res = args.res

# Fall back to 100000 when --res is missing or empty.
if domain_res is None or domain_res == "":
    domain_res = 100000

domain_res = int(domain_res)

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("Domain resolution = " + str(domain_res))
Beispiel #23
0
def step1(
        hiclib_path,  ## the path of hiclib folder on machine
        dataset='Kalhor2012NB',
        sraid='SRR071231',
        readlen=40):  ## each read with length 40
    ''' 1. Map reads to the genome
        http://mirnylab.bitbucket.org/hiclib/tutorial/01_iterative_mapping.html

        Both sides of every read pair are stored one after the other in the
        same .sra record: bases [0, readlen) are mapped to <sraid>_1.bam and
        bases [readlen, 2 * readlen) to <sraid>_2.bam.  The two BAM files
        are then parsed into <sraid>_mapped_reads.hdf5 in the current folder.
    '''

    ## Adopted from hiclib tutorial
    import os
    import logging
    from hiclib import mapping
    from mirnylib import h5dict, genome

    logging.basicConfig(level=logging.DEBUG)

    sra_dir = '../data/SRA/'

    # A. Map the reads iteratively.
    # Options shared by both sides.  nthreads: on intel corei7 CPUs 4 threads
    # are as fast as 8, but leave some room for other applications.
    # max_reads_per_chunk=10000000 can be added on low-memory machines.
    shared_options = dict(
        bowtie_path=hiclib_path + '/bin/bowtie2/bowtie2',
        bowtie_index_path=hiclib_path + '/bin/bowtie2/index/hg19',
        fastq_path=sra_dir + dataset + '/' + sraid + '/' + sraid + '.sra',
        min_seq_len=25,
        len_step=5,
        nthreads=12,
        temp_dir='../data/SRA/',  # optional, keep temporary files here
        bowtie_flags='--very-sensitive',
        bash_reader=hiclib_path + '/bin/sra/bin/fastq-dump -Z')

    # (output suffix, first base, end base) of each side of the pair.
    read_sides = (('_1.bam', 0, readlen),
                  ('_2.bam', readlen, 2 * readlen))
    for bam_suffix, first_base, end_base in read_sides:
        mapping.iterative_mapping(out_sam_path=sra_dir + sraid + bam_suffix,
                                  seq_start=first_base,
                                  seq_end=end_base,
                                  **shared_options)

    # B. Parse the mapped sequences into a Python data structure,
    #    assign the ultra-sonic fragments to restriction fragments.
    reads_store = h5dict.h5dict(sraid + '_mapped_reads.hdf5')  ## local folder
    hg19 = genome.Genome(hiclib_path + '/fasta/hg19', readChrms=['#', 'X'])

    mapping.parse_sam(sam_basename1=sra_dir + sraid + '_1.bam',
                      sam_basename2=sra_dir + sraid + '_2.bam',
                      out_dict=reads_store,
                      genome_db=hg19,
                      enzyme_name='HindIII')
Beispiel #24
0
def process():
    """Compare every pair of Hi-C experiments listed in the global ``args``.

    Each entry of ``args`` is the path of a
    ``<enzyme>_<experiment>-fragment_dataset.hdf5`` file.  Next to it the
    files ``-1M.hdf5``, ``-200k.hdf5`` and ``-IC-1M.hdf5`` must exist;
    otherwise an error is printed and the program exits with status 1.

    For every pair of experiments the eigenvector correlation, the Tanay
    correlation (written to <outputDir><names>-HiC_correlate.txt), the
    diagonal correlation and the inter-arm maps are computed.  The
    module-level ``pp`` is bound to <outputDir><names>-HiC_correlate.pdf and
    closed at the end.
    """
    global options
    global args
    global pp

    dataset_suffix = '-fragment_dataset.hdf5'

    def derived(path, suffix):
        # Path of a file derived from the fragment dataset at `path`.
        return path.replace(dataset_suffix, suffix)

    def experiment_name(path):
        # Experiment part of "<enzyme>_<experiment>-fragment_dataset.hdf5".
        # The suffix is cut off explicitly: str.strip() removes a *set of
        # characters* from both ends, not a suffix, and would also eat
        # characters of the experiment name itself.
        base = os.path.basename(path)
        if base.endswith(dataset_suffix):
            base = base[:-len(dataset_suffix)]
        return "_".join(base.split("_")[1:])

    # check dataset exist
    for path in args:
        for suffix in ('-1M.hdf5', '-200k.hdf5', '-IC-1M.hdf5'):
            required = derived(path, suffix)
            if not os.path.isfile(required):
                print('[ERROR] Could not find: ' + required)
                sys.exit(1)

    genome_db = genome.Genome(options.genome,
                              gapFile=options.gapFile,
                              readChrms=['#', 'X', 'Y'])

    outfilename = "-".join(experiment_name(path) for path in args)
    pdf_path = options.outputDir + outfilename + '-HiC_correlate.pdf'

    outfile = open(options.outputDir + outfilename + '-HiC_correlate.txt', "w")
    try:
        # Start a new current figure and open the report PDF.
        plt.figure()
        pp = PdfPages(pdf_path)
        try:
            for i, path_i in enumerate(args):
                print(" Process file " + str(i) + ":" + path_i)
                experiment_i = experiment_name(path_i)
                for path_j in args[i + 1:]:
                    experiment_j = experiment_name(path_j)

                    compareCorrelationOfEigenvectors(
                        1000000, derived(path_i, '-1M.hdf5'),
                        derived(path_j, '-1M.hdf5'),
                        experiment_i, experiment_j, genome_db)

                    calculateTanayCorrelation(
                        1000000, derived(path_i, '-1M.hdf5'),
                        derived(path_j, '-1M.hdf5'),
                        experiment_i, experiment_j, genome_db, outfile)

                    plotDiagonalCorrelation(
                        200000, derived(path_i, '-200k.hdf5'),
                        derived(path_j, '-200k.hdf5'),
                        experiment_i, experiment_j, genome_db)

                    compareInterarmMaps(
                        1000000, derived(path_i, '-1M.hdf5'),
                        derived(path_j, '-1M.hdf5'),
                        experiment_i, experiment_j, genome_db)

            if (options.verbose):
                sys.stdout.write("print plots into pdf:%s\n" % (pdf_path))
        finally:
            # Close the PDF even when one of the comparisons fails.
            pp.close()
    finally:
        outfile.close()
Beispiel #25
0
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from lib_matrix_operations import *
import os

from mirnylib import genome
from mirnylib import h5dict
from mirnylib import plotting
from hiclib import binnedData
from hiclib import fragmentHiC

# ---- Run configuration -----------------------------------------------------
# Genome assembly to load; the FASTA directory is '../fasta/<genome_name>'.
# Alternative setup without the Y chromosome:
#genome_name='mm9'
#genome_db = genome.Genome('../fasta/'+genome_name, readChrms=['#', 'X'])
genome_name='mm9'
# NOTE(review): '#' presumably selects the numbered chromosomes (mirnylib
# convention) - confirm against the mirnylib.genome documentation.
genome_db = genome.Genome('../fasta/'+genome_name, readChrms=["#","X","Y"])

# Heatmap resolution; only used below to build the heatmap file name,
# where it is divided by 1000 and labelled 'KB'.
domain_res=200000

# Directory holding the heatmap files and the dataset to pick from it.
base_folder='/mnt/storage/home/vsfishman/HiC/data/'
#base_filename = 'SRR443884_mESC_2'
base_filename = 'Sp_full2_Y'
#base_filename = 'LA'


# Suffix appended after '.hdf5' in the heatmap file name; set to "" to drop it.
raw="-raw"
#raw=""

# Floor division keeps the resolution label an integer ('200KB') on both
# Python 2 and Python 3; true division would yield '200.0KB' on Python 3 and
# point at a differently named file.
heatmap_filepath=base_folder+'heatmap-res-'+str(domain_res//1000)+'KB_'+base_filename+'.hdf5'+raw
#heatmap_filepath=base_folder+'heatmap-res-'+str('fragment_')+'KB_'+base_filename+'.hdf5'+raw
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from mirnylib import genome
from mirnylib import h5dict
from hiclib import binnedData
import numpy as np
from mirnylib.systemutils import setExceptionHook

# Call mirnylib's exception-hook installer before doing any work.
# NOTE(review): presumably this replaces sys.excepthook to ease debugging -
# confirm in mirnylib.systemutils.
setExceptionHook()

######## Define file names and other parameters
# mirnylib genome parameters
# genomeName is a plain label; it is not referenced in the visible part of
# this script.
genomeName = "GalGal5filtered"
# Genome object built from the filtered GalGal5 contig directory.
# NOTE(review): readChrms=[] together with chrmFileTemplate="N%s.fa"
# presumably makes mirnylib pick up every FASTA file matching the template -
# confirm against the mirnylib.genome.Genome documentation.
genome_db = genome.Genome(
    "/mnt/storage/home/vsfishman/HiC/fasta/GalGal5/GCF_000002315.4_Gallus_gallus-5.0_assembly_structure/Primary_Assembly/galGal5_all_contigs.filtered/",
    readChrms=[],
    chrmFileTemplate="N%s.fa")

# Where to find the hiclib heatmap: directory and file name are kept separate.
# NOTE(review): the file name suggests a 1000 kb resolution heatmap - verify.
basefolder = "/mnt/storage/home/vsfishman/HiC/data/chick/mapped-GalGal5filtered/B1_TTAGGC_L001_/"
filename = "chunk0001.hdf5.hm-res-1000kb"

# Name of the resulting output file
out_file = "all.glm"
###parameters required by LACHESIS to be in the header
header_string = """# GenomeLinkMatrix file - see GenomeLinkMatrix.h for documentation of this object type
# Species = chick
# N_bins = 524
# bin_size = 0
# RE_sites_file = /mnt/storage/home/vsfishman/HiC/fasta/GalGal5/GCF_000002315.4_Gallus_gallus-5.0_assembly_structure/Primary_Assembly/galGal5_all_contigs.filtered.fa.counts_AAGCTT.txt
def filtration(chromosome_names, cell_line, path, genome_version, enzyme,
               resolution_list):
    """Filter mapped Hi-C reads and export statistics, scalings and heatmaps.

    For every chromosome group in ``chromosome_names`` the mapped reads are
    loaded into a ``fragmentHiC.HiCdataset``, duplicates are filtered out,
    filtering statistics are written, contact probabilities (normalized and
    not) are saved as tab-separated tables, and a cooler file is saved for
    each requested resolution.

    Args:
        chromosome_names: iterable of chromosome lists. A list with more
            than one entry is handled as the "full" dataset; otherwise the
            first entry names the per-chromosome dataset.
        cell_line: name used for sub-directories and output file names.
        path: root of the input/output tree. It is concatenated as-is, so
            it must already end with a path separator.
        genome_version: genome directory name below
            ``/home/magnitov/data/genomes/``.
        enzyme: restriction enzyme name handed to ``HiCdataset``.
        resolution_list: resolutions passed to ``saveCooler``
            (presumably in base pairs - confirm with the caller).
    """
    for chrm_list in chromosome_names:
        genome_db = genome.Genome(
            '/home/magnitov/data/genomes/' + genome_version,
            gapFile='gap.txt',
            readChrms=chrm_list,
            forceOrder=True)

        # Multi-chromosome groups share the 'full' file tag; a single
        # chromosome is tagged with its own name.
        whole_genome = len(chrm_list) > 1
        tag = 'full' if whole_genome else chrm_list[0]

        # Read mapped reads into a freshly created fragment dataset.
        fragments = fragmentHiC.HiCdataset(
            filename=(path + 'filtered_maps/' + cell_line +
                      '/fragment_dataset_' + tag + '.hdf5'),
            genome=genome_db,
            enzymeName=enzyme,
            mode='w')
        fragments.parseInputData(
            dictLike=(path + 'maps/' + cell_line +
                      '/mapped_reads_' + tag + '.hdf5'))

        # Apply filters.
        fragments.filterDuplicates()

        # Save statistics. The stats file carries no chromosome suffix for
        # the full dataset or when only one group is processed; otherwise
        # the chromosome name is appended.
        shared_stats = whole_genome or len(chromosome_names) == 1
        fragments.writeFilteringStats()
        if shared_stats:
            stats_ending = '.txt'
        else:
            stats_ending = '_' + chrm_list[0] + '.txt'
        fragments.printMetadata(
            saveTo=(path + 'processing_stats/' + cell_line +
                    '/processing_stats_' + cell_line + stats_ending))

        # Sort reads, then export contact probability: first normalized,
        # then not normalized.
        fragments._sortData()
        probs_prefix = (path + 'contact_probs/' + cell_line +
                        '/contact_probs_' + cell_line + '_' + tag)
        for normalize, ending in ((True, '_norm.txt'), (False, '.txt')):
            contact_probs = fragments.plotScaling(normalize=normalize,
                                                  plot=False)
            table = pd.DataFrame({
                'Distance': contact_probs[0],
                'Probability': contact_probs[1]
            })
            table.to_csv(probs_prefix + ending, header=1, index=0, sep='\t')

        # Save one cooler heatmap per requested resolution.
        # NOTE(review): the heatmap name always uses the first chromosome of
        # the group (even for the 'full' dataset), and `resolution / 1000`
        # is true division under Python 3 (e.g. '100.0K') - both kept
        # unchanged to preserve the existing file names; confirm intent.
        for resolution in resolution_list:
            cooler_name = (path + 'filtered_maps/' + cell_line + '/heatmap-' +
                           chrm_list[0] + '-' + str(resolution / 1000) +
                           'K.cool')
            fragments.saveCooler(filename=cooler_name, resolution=resolution)