def step2(hiclib_path, sraid, res=1000000):
    ''' 2. Filter the dataset at the restriction fragment level.
    http://mirnylab.bitbucket.org/hiclib/tutorial/02_fragment_filtering.html
    '''
    from mirnylib import genome
    from hiclib import fragmentHiC

    # Create a HiCdataset object.
    genome_db = genome.Genome(hiclib_path + '/fasta/hg19', readChrms=['#', 'X'])
    fragments = fragmentHiC.HiCdataset(filename=sraid + '_fragment_dataset.hdf5',
                                       genome=genome_db,
                                       maximumMoleculeLength=500,
                                       enzymeName='HindIII',
                                       mode='w')

    # Load the parsed reads into the HiCdataset. The dangling-end filter is
    # applied at this stage, with maximumMoleculeLength specified at the
    # initiation of the object.
    fragments.parseInputData(dictLike=sraid + '_mapped_reads.hdf5')

    fragments.filterRsiteStart(offset=5)
    fragments.filterDuplicates()
    fragments.filterLarge()
    if sraid in ["SRR071231", "SRR071232"]:  ## set to 0.1% for TCC
        fragments.filterExtreme(cutH=0.001, cutL=0)
    else:  ## default for Hi-C is 0.5%
        fragments.filterExtreme(cutH=0.005, cutL=0)

    # fragments.saveFragments()
    fragments.saveHeatmap(sraid + '_map-res%sk.hdf5' % (res / 1000),
                          resolution=res)
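# A minimal sketch of inspecting what step2 wrote; the file name below is a
# placeholder following step2's output pattern for res=1000000, and the
# 'resolution'/'heatmap' keys are the ones hiclib's binned format uses
# (step3 below reads 'resolution' back the same way).
from mirnylib import h5dict

raw = h5dict.h5dict('SRR071231_map-res1000k.hdf5', mode='r')
print raw['resolution']     # bin size in bp
print raw['heatmap'].shape  # genome-wide binned contact matrix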
from mirnylib import genome

allGenomes = {}  # module-level cache so each genome is initialized only once


def getGenome(name):
    if name in allGenomes:
        return allGenomes[name]
    if name == "hg19":
        genome_db = genome.Genome("../data/hg19")
    elif name == "hg18":
        genome_db = genome.Genome("../data/hg18")
    elif name == "mm9":
        genome_db = genome.Genome("../data/mm9")
    elif name == "mm10":
        genome_db = genome.Genome("../data/mm10")
        # You can also use genomes with only numbered and X chromosomes:
        # genome_db = genome.Genome("../data/hg19", readChrms=["#", "X"])
    elif name == "dm3":
        # Drosophila (example of specifying exact chromosomal order)
        genome_db = genome.Genome("../data/dm3",
                                  readChrms=["2L", "2R", "3L", "3R", "4", "X",
                                             "2LHet", "2RHet", "3LHet", "3RHet",
                                             "XHet", "YHet", "U", "Uextra", "M"],
                                  forceOrder=True)
    elif name == "cb10":
        genome_db = genome.Genome("../data/cb10",
                                  readChrms=["I", "II", "III", "IV", "V", "X", "M"],
                                  forceOrder=True)
    else:
        raise ValueError(
            "Genome {0} not defined. Edit defineGenome.py and define it".format(name))
    allGenomes[name] = genome_db
    return genome_db
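# Usage sketch (assumes the genome FASTA folders exist under ../data/ as in
# the paths above): repeated calls for the same name hit the allGenomes cache.
genome_db = getGenome("mm9")
same_db = getGenome("mm9")
assert genome_db is same_db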
def step4(hiclib_path, sraid, res=1000000):
    ''' 4. Eigenvector decomposition.
    /examples/iterativeCorrectionEigenvectorExpansion/eigenvectorAnalysis.py
    '''
    import matplotlib.pyplot as plt
    import numpy as np
    from mirnylib import genome
    from mirnylib import h5dict
    from mirnylib import plotting
    from hiclib import binnedData

    genome_db = genome.Genome(hiclib_path + '/fasta/hg19', readChrms=['#', 'X'])

    # Read resolution from the dataset.
    raw_heatmap = h5dict.h5dict(sraid + '_map-res%sk.hdf5' % (res / 1000), mode='r')
    resolution = int(raw_heatmap['resolution'])

    # Create a binnedData object, load the data.
    BD = binnedData.binnedData(resolution, genome_db)
    BD.simpleLoad(sraid + '_map-res%sk.hdf5' % (res / 1000), 'DataName')

    # Do the eigen decomposition.
    BD.removeDiagonal()
    BD.removeBySequencedCount(0.5)
    BD.removeCis()
    BD.truncTrans(high=0.0005)
    BD.removePoorRegions(cutoff=1)
    BD.fakeCis()
    BD.removeZeros()
    BD.doEig(numPCs=30, force=True)  ## first 30 eigenvectors
    BD.restoreZeros(value=0)

    eig = BD.eigEigenvalueDict['DataName']
    eig_v = BD.EigDict['DataName']

    # Plot the heatmap reconstructed from the eigenvectors.
    plotting.plot_matrix(np.log(np.dot(np.dot(eig_v.T, np.diag(eig)), eig_v)))
    plt.savefig(sraid + '_map-res%sk-eig.pdf' % (res / 1000))
    plt.clf()

    outfile = open(sraid + "_map-res%sk-ic-eig.txt" % (res / 1000), "w")
    for i in xrange(len(BD.chromosomeIndex)):
        chro = BD.genome.idx2label[BD.chromosomeIndex[i]]
        posi = BD.positionIndex[i]
        outfile.write("chr%s\t%s\t%s" % (chro, posi, posi + res))
        for eigenvector in eig_v:
            outfile.write("\t%s" % eigenvector[i])
        outfile.write("\n")
    outfile.close()
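# Each row of the text file written above is chrom, start, end followed by one
# column per eigenvector; a minimal sketch for reading the first eigenvector
# back (placeholder file name following step4's output pattern).
import numpy as np

rows = [line.rstrip('\n').split('\t')
        for line in open('SRR071231_map-res1000k-ic-eig.txt')]
eig1 = np.array([float(r[3]) for r in rows])  # columns 0-2 are the bin coords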
import os

from mirnylib import genome, h5dict
from hiclib import mapping


def parse_bams(chromosome_names, cell_line, path, genome_version, enzyme):
    if not os.path.exists(path + 'maps/' + cell_line):
        os.mkdir(path + 'maps/' + cell_line)
    for chrm_list in chromosome_names:
        if len(chrm_list) > 1:
            mapped_reads = h5dict.h5dict(path + 'maps/' + cell_line +
                                         '/mapped_reads_full.hdf5')
        else:
            mapped_reads = h5dict.h5dict(path + 'maps/' + cell_line +
                                         '/mapped_reads_' + chrm_list[0] + '.hdf5')
        genome_db = genome.Genome('/home/magnitov/data/genomes/' + genome_version,
                                  gapFile='gap.txt',
                                  readChrms=chrm_list,
                                  forceOrder=True)
        mapping.parse_sam(
            sam_basename1=path + 'bam/' + cell_line + '/' + cell_line + '_R1.bam',
            sam_basename2=path + 'bam/' + cell_line + '/' + cell_line + '_R2.bam',
            out_dict=mapped_reads,
            genome_db=genome_db,
            enzyme_name=enzyme)
from mirnylib import genome
from hiclib import fragmentHiC

# Create a HiCdataset object.
genome_db = genome.Genome('../fasta/mm9', readChrms=['#', 'X'])
fragments = fragmentHiC.HiCdataset(
    filename='../../data/serov/fragment_dataset_Sp.hdf5',
    genome=genome_db,
    maximumMoleculeLength=500,
    mode='w')

# Load the parsed reads into the HiCdataset. The dangling-end filter is applied
# at this stage, with maximumMoleculeLength specified at the initiation of the
# object.
fragments.parseInputData(dictLike='../../data/serov/mapped_reads_Sp.hdf5')

fragments.filterRsiteStart(offset=5)
fragments.filterDuplicates()
fragments.filterLarge()
fragments.filterExtreme(cutH=0.005, cutL=0)

fragments.saveHeatmap('../../data/serov/heatmap-res-1M_Sp.hdf5',
                      resolution=1000000)
import gzip
import math
import subprocess

from mirnylib import genome
from mirnylib import h5dict
from mirnylib import plotting
from hiclib import binnedData
from hiclib import fragmentHiC
from mirnylib.numutils import fillDiagonal

domain_res = 'fragment'
genome_name = 'mm9'
genome_folder = '/mnt/storage/home/vsfishman/HiC/fasta/'
genome_db = genome.Genome(genome_folder + genome_name, readChrms=['#', 'X'])

base_filename = "Sp_full"
base_folder = '/mnt/storage/home/vsfishman/HiC/data/'
mapped_reads_filepath = base_folder + 'mapped_reads_' + base_filename + '.hdf5'
base_out_folder = "/mnt/storage/home/vsfishman/tmp/HiC_tmp/data/"

B = 5.0   # linear bin width
c = 1.12  # base for the (disabled) geometric binning


def DistancetoBinN(distance):
    # Geometric binning alternative:
    # return math.ceil(math.log(distance / B, c))
    return int(distance / B)
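# A worked comparison of the two binning schemes (values chosen only for
# illustration): the active version uses fixed-width bins of B = 5.0, while
# the commented-out geometric version grows bin width by a factor of c = 1.12
# per bin, so distant contacts land in coarser bins.
for distance in (5.0, 50.0, 500.0):
    linear = int(distance / B)
    geometric = int(math.ceil(math.log(distance / B, c)))
    print("distance %6.1f -> linear bin %3d, geometric bin %2d"
          % (distance, linear, geometric))
# distance    5.0 -> linear bin   1, geometric bin  0
# distance   50.0 -> linear bin  10, geometric bin 21
# distance  500.0 -> linear bin 100, geometric bin 41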
import sys
import os

from hiclib import mapping, fragmentHiC
from mirnylib import h5dict, genome

fasta_dir, re_name, out_fname, in_dir = sys.argv[1:5]
in_prefices = sys.argv[5:]
basedir = os.path.split(os.path.abspath(out_fname))[0]

mapped_reads = []
for prefix in in_prefices:
    mapped_reads.append(h5dict.h5dict('%s/%s.hdf5' % (basedir, prefix)))

genome_db = genome.Genome(fasta_dir, readChrms=['#', 'X'],
                          chrmFileTemplate="%s.fa")

for i, name in enumerate(mapped_reads):
    mapping.parse_sam(sam_basename1="%s/%s_1.bam" % (in_dir, in_prefices[i]),
                      sam_basename2="%s/%s_2.bam" % (in_dir, in_prefices[i]),
                      out_dict=name,
                      genome_db=genome_db,
                      enzyme_name=re_name)

for i, name in enumerate(mapped_reads):
    fragments = fragmentHiC.HiCdataset(filename='temp',
                                       genome=genome_db,
                                       maximumMoleculeLength=500,
                                       mode='w',
                                       enzymeName=re_name)
#!/usr/bin/env python

import matplotlib.pyplot as plt
import numpy as np

from mirnylib import genome
from mirnylib import h5dict
from mirnylib import plotting
from hiclib import binnedData

genome_db = genome.Genome('../Ref/hg19', readChrms=['#', 'X'])

raw_heatmap = h5dict.h5dict('../2_filtering_reads/heatmap-res-1M.hdf5', mode='r')
resolution = int(raw_heatmap['resolution'])

BD = binnedData.binnedData(resolution, genome_db)
BD.simpleLoad('../2_filtering_reads/heatmap-res-1M.hdf5', 'Rao2014_10M')

#BD.removeDiagonal()
BD.removeBySequencedCount(0.5)
BD.removePoorRegions(cutoff=1)
BD.truncTrans(high=0.0005)
BD.iterativeCorrectWithoutSS()
BD.export('Rao2014_10M', './IC-heatmap-res-1M.hdf5')

fig = plt.figure()
plotting.plot_matrix(np.log(BD.dataDict['Rao2014_10M'] + 1.0))
fig.savefig('./heatmap.pdf')
def step3(hiclib_path, sraid, res=1000000):
    ''' 3. Filter and iteratively correct heatmaps.
    http://mirnylab.bitbucket.org/hiclib/tutorial/03_heatmap_processing.html
    '''
    import matplotlib.pyplot as plt
    import numpy as np
    from mirnylib import genome
    from mirnylib import h5dict
    from mirnylib import plotting
    from hiclib import binnedData

    genome_db = genome.Genome(hiclib_path + '/fasta/hg19', readChrms=['#', 'X'])

    # Read resolution from the dataset.
    raw_heatmap = h5dict.h5dict(sraid + '_map-res%sk.hdf5' % (res / 1000), mode='r')
    resolution = int(raw_heatmap['resolution'])

    # Create a binnedData object, load the data.
    BD = binnedData.binnedData(resolution, genome_db)
    BD.simpleLoad(sraid + '_map-res%sk.hdf5' % (res / 1000), 'DataName')

    # Plot the raw heatmap.
    plotting.plot_matrix(np.log(BD.dataDict['DataName']))
    plt.savefig(sraid + '_map-res%sk.pdf' % (res / 1000))
    plt.clf()

    # Remove the contacts between loci located within the same bin.
    BD.removeDiagonal()
    # Remove bins with less than half of a bin sequenced.
    BD.removeBySequencedCount(0.5)
    # Remove 1% of regions with the lowest coverage.
    BD.removePoorRegions(cutoff=1)
    # Truncate the top 0.05% of interchromosomal counts (possibly PCR blowouts).
    BD.truncTrans(high=0.0005)
    # Perform iterative correction.
    BD.iterativeCorrectWithoutSS()

    # Save the iteratively corrected heatmap.
    BD.export('DataName', sraid + '_map-res%sk-ic.hdf5' % (res / 1000))

    # Plot the corrected heatmap.
    plotting.plot_matrix(np.log(BD.dataDict['DataName']))
    plt.savefig(sraid + '_map-res%sk-ic.pdf' % (res / 1000))
    plt.clf()

    # Save per-bin biases.
    outfile = open(sraid + "_map-res%sk-ic-bias.txt" % (res / 1000), "w")
    for i in xrange(len(BD.chromosomeIndex)):
        chro = BD.genome.idx2label[BD.chromosomeIndex[i]]
        posi = BD.positionIndex[i]
        outfile.write("chr%s\t%s\t%s" % (chro, posi, posi + res))
        outfile.write("\t%s" % BD.biasDict['DataName'][i])
        outfile.write("\n")
    outfile.close()
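# The bias file written above has one row per bin: chrom, start, end, bias.
# A minimal sketch for loading it back into a dict (placeholder file name
# following step3's output pattern).
biases = {}
for line in open('SRR071231_map-res1000k-ic-bias.txt'):
    chrom, start, end, bias = line.rstrip('\n').split('\t')
    biases[(chrom, int(start))] = float(bias)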
import logging
import argparse

from mirnylib import genome, h5dict
from hiclib import mapping, fragmentHiC

logging.basicConfig(level=logging.DEBUG)

parser = argparse.ArgumentParser()
parser.add_argument("basename")
parser.add_argument("chunkNumber")
args = parser.parse_args()
basename = args.basename
chunk = args.chunkNumber
print(basename)

# B. Parse the mapped sequences into a Python data structure,
#    assign the ultra-sonic fragments to restriction fragments.
reads_file = ('/exports/eddie/scratch/s1529682/processed/' + basename + '_' +
              chunk + '_mapped_reads.hdf5')
fragments_file = ('/exports/eddie/scratch/s1529682/processed/' + basename + '_' +
                  chunk + '_fragment_dataset.hdf5')

mapped_reads = h5dict.h5dict(reads_file)
genome_db = genome.Genome('../genomes/mm9/fasta', readChrms=['#', 'X'])


def func():
    mapping.parse_sam(
        sam_basename1='/exports/eddie/scratch/s1529682/bams/' + basename +
                      '_fixed_1.fq.gz' + chunk,
        sam_basename2='/exports/eddie/scratch/s1529682/bams/' + basename +
                      '_fixed_2.fq.gz' + chunk,
        out_dict=mapped_reads,
        genome_db=genome_db,
        enzyme_name='DpnII')


fragments = fragmentHiC.HiCdataset(
    filename=fragments_file,
    genome=genome_db,
    maximumMoleculeLength=700,
    mode='w')

# Load the parsed reads into the HiCdataset. The dangling-end filter is applied
# at this stage, with maximumMoleculeLength specified at the initiation of the
# object.
inFastqDir = "/mnt/storage/home/vsfishman/tmp/HiC_polipedium/MySeq/Control/K1/sample/" # for mode="fastq" only #sidePrefixes = ("side1", "side2") # a version for naming ....side1.fastq.gz sidePrefixes = ( "R1_001", "R2_001" ) # a prefix preceeding .fastq.gz, which will be used to distinguish side 1 and side 2 # If your files are named "run32167_something_side1_somethingElse.fastq.gz", then "side1_somethingElse" should be the prefix. threads = 10 tmpDir = "/mnt/storage/home/vsfishman/tmp/HiC_tmp/3/" # this will contain up to 3X the size of the largest input .sra file (256GB is enough for (Rao, 2014), but 128 is not) # Make sure your system drive (where /tmp usually is) has enough space. If it is a small SSD, it may not. # Also, there is a lot of IO through the tmpDir. Put it on a local drive, not on a network drive, if you can. genomeName = "pv11_scaffolds.v2" genome_db = genome.Genome( "/mnt/storage/home/vsfishman/tmp/HiC_polipedium/genome/pv11_scaffolds.v2.fasta", readChrms=[], chrmFileTemplate="N%s.fa") bowtiePath = "/mnt/storage/home/vsfishman/HiC/bin/bowtie2/bowtie2" bowtieIndex = "/mnt/storage/home/vsfishman/HiC/fasta/GalGal5/GCF_000002315.4_Gallus_gallus-5.0_assembly_structure/Primary_Assembly/galGal5_all_contigs.filtered" bowtieFlags = "--very-sensitive" "IDs from GEO (SRR numbers)" #GEOids = list(range(1665087,1665096)) # Set this for for mapping .sra files # You can do it like this: # GEOids = range(1658523,1658540) + [398318, 398920,398921] #taken from an actual study seqSkipStart = 0 # skip first 2 bp of the read, if you want minMapLen = 18 # start mapping at this length # This will adjust iterative mapping automatically
def process():
    global options
    global args
    global pp

    if (options.verbose):
        print >> sys.stdout, "*** START processing"

    fig = plt.figure()
    pp = PdfPages(options.outputDir + options.experiment + '.pdf')

    logging.basicConfig(level=logging.DEBUG)

    if (options.verbose):
        print >> sys.stdout, "** Create directories"
    if not os.path.exists(options.tmpDir):
        os.mkdir(options.tmpDir)
    if not os.path.exists(options.outputDir):
        os.mkdir(options.outputDir)

    if (options.verbose):
        print >> sys.stdout, "** Create data objects"
    mapped_reads = h5dict.h5dict(options.outputDir + options.experiment +
                                 '-mapped_reads.hdf5')
    genome_db = genome.Genome(options.genome,
                              gapFile=options.gapFile,
                              readChrms=['#', 'X', 'Y'])

    bams = []
    if (options.inputFormat != 'bam'):
        bams = mapFiles()
    else:
        bams = args[0:]

    if (options.verbose):
        print >> sys.stdout, "** Collect mapped reads"
    collectMappedReads(bams[0], bams[1], mapped_reads, genome_db)

    if (options.verbose):
        print >> sys.stdout, "** Filter fragments"
    filterFragments(genome_db)

    if (options.verbose):
        print >> sys.stdout, "** Iterative filtering of fragments"
    iterativeFiltering(genome_db, '-1M.hdf5')
    iterativeFiltering(genome_db, '-200k.hdf5')

    # plotting
    correctedScalingPlot(200000, options.outputDir + options.experiment +
                         '-200k.hdf5', options.experiment, genome_db)
    doArmPlot(1000000, options.outputDir + options.experiment + '-1M.hdf5',
              options.experiment, genome_db)

    if (options.verbose):
        print >> sys.stdout, "*** FINISHED processing"
    pp.close()
if os.path.exists(filename):
    os.remove(filename)

genomeName = "mm10"
threads = 10
bowtiePath = "../bin/bowtie2/bowtie2"
if not os.path.exists(bowtiePath):
    raise IOError("bowtie2 executable not found at {0}".format(bowtiePath))
fastqDir = "fastq"
bowtieIndex = "../bin/bowtie2/index/{0}".format(genomeName)
tmpDir = "/tmp"
samFolder = "sams-{0}".format(genomeName)
savePath = "mapped-{0}".format(genomeName)

# Specify location of the genome files here
genome_db = genome.Genome('../data/{0}'.format(genomeName), readChrms=["#", "X"])

if not os.path.exists(samFolder):
    os.mkdir(samFolder)
if not os.path.exists(savePath):
    os.mkdir(savePath)


def calculateStep(length, minlen, approxStep=10, maxSteps=4):
    """returns minimum length and step based on the length
    of sequence and proposed minimum length"""
    actualDif = length - minlen
    if actualDif < approxStep * 0.6:
        return length, 100
    # tail of a helper that turns a chromosome-sizes file into cumulative bin
    # offsets; chr_size_file, bin_size and chr_size_list come from the
    # enclosing function
    max_length = 0
    cur_sum = 0
    chr_size_list.append(cur_sum)
    for line in chr_size_file:
        line = line[:-1]
        pair = line.split('\t')
        size = int(pair[1]) / bin_size + 1  # Python 2 integer division
        cur_sum = cur_sum + size
        chr_size_list.append(cur_sum)
        max_length = max(max_length, size)
    return max_length


# initialize the file and retrieve the matrix from the file
genome_db = genome.Genome(
    '/net/noble/vol1/data/reference_genomes/mm9/chromosomes',
    readChrms=['#', 'X'])

# Read resolution from the dataset.
raw_heatmap = h5dict.h5dict('~/2016ACCOST/data/1000000.either.hdf5', mode='r')
resolution = int(raw_heatmap['resolution'])

data = binnedData.binnedData(resolution, genome_db)
data.simpleLoad('~/2016ACCOST/data/1000000.either.hdf5', 'matrix')
matrix = data.dataDict['matrix']

# find the rows that are entirely zeros
zero_list = set()
for x in range(0, len(matrix)):
    if sum(matrix[x]) == 0:
        zero_list.add(x)
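# chr_size_list holds cumulative bin counts, so mapping a (chromosome index,
# position) pair to a row of the genome-wide matrix is an offset plus a
# division; a hypothetical helper, assuming the populated chr_size_list and
# bin_size from above.
def to_global_bin(chrom_index, position, chr_size_list, bin_size):
    # chr_size_list[i] is the first global bin of chromosome i
    return chr_size_list[chrom_index] + position // bin_size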
# -------------------- Parameter definitions ------------

inFastqDir = "fastq"  # for mode="fastq" only

sidePrefixes = ("side1", "side2")  # a version for naming ...side1.fastq.gz
# sidePrefixes = ("_R1_001", "_R2_001")
# A prefix preceding .fastq.gz, which will be used to distinguish side 1 and side 2.
# If your files are named "run32167_something_side1_somethingElse.fastq.gz",
# then "side1_somethingElse" should be the prefix.

threads = 7

tmpDir = "/tmp"
# This will contain up to 3X the size of the largest input .sra file
# (256 GB is enough for (Rao, 2014), but 128 GB is not).
# Make sure your system drive (where /tmp usually is) has enough space.
# If it is a small SSD, it may not. Also, there is a lot of IO through the
# tmpDir. Put it on a local drive, not on a network drive, if you can.

"Initializing Genome object"
# Drosophila example (specifying exact chromosomal order):
# genome_db = genome.Genome("../data/dm3",
#                           readChrms=["2L", "2R", "3L", "3R", "4", "X",
#                                      "2LHet", "2RHet", "3LHet", "3RHet",
#                                      "XHet", "YHet", "U", "Uextra", "M"],
#                           forceOrder=True)
genome_db = genome.Genome("../data/mm9", readChrms=["#", "X", "Y", "M"],
                          cacheDir="tmpDir")
genomeName = genome_db.folderName
# automatically infer genome name from the folder name;
# the name is used for naming folders ("mapped-hg19", etc.)

bowtiePath = "../bin/bowtie2/bowtie2"
bowtieIndex = "../bin/bowtie2/index/{0}".format(genomeName)
# change this if your index is named differently from the genome
bowtieFlags = "--very-sensitive"

"IDs from GEO (SRR numbers)"
GEOids = [1658652]  # a small file for testing purposes
# Set this for mapping .sra files. You can do it like this:
# GEOids = range(1658523, 1658540) + [398318, 398920, 398921]  # taken from an actual study
""" This scripts is a rip-off of a large mergeDatasets script with certain adjustments. Follow comments along the text. """ from hiclib.fragmentHiC import HiCdataset import os from mirnylib import genome genomeDb = genome.Genome('../data/caul', chrmFileTemplate="%s.fa", readChrms=[]) for expName in os.listdir("caul"): TR = HiCdataset( "bla", genome=genomeDb, inMemory=True, ) # inMemory, as files are probably small (less than hundreds mililon reads) TR.parseInputData("caul/" + expName, removeSS=True) # We discard SS in our pipeline now TR.printMetadata() TR.filterRsiteStart( offset=5 ) # We still do this filter to avoid strange "dangling end-like" molecules TR.filterDuplicates() #TR.save(out_file+".dat") TR.filterLarge( cutlarge=300000, cutsmall=100
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

from lib_matrix_operations import *
import os

from mirnylib import genome
from mirnylib import h5dict
from mirnylib import plotting
from hiclib import binnedData
from hiclib import fragmentHiC
from mirnylib.h5dict import h5dict
# note: this last import shadows the mirnylib.h5dict module imported above
# with the h5dict class itself

genome_name = 'mm9'
#genome_db = genome.Genome('../fasta/' + genome_name, readChrms=['#', 'X'])
#genome_name = 'hg19'
genome_db = genome.Genome('../fasta/' + genome_name, readChrms=['M'])

domain_res = 'fragment'
base_folder = '/mnt/storage/home/vsfishman/HiC/data/'
#base_filename = 'SRR443884_mESC_2'
base_filename = 'Fib_full2_chrM'

#raw = "-raw"
raw = ""

#heatmap_filepath = base_folder + 'heatmap-res-' + str(domain_res / 1000) + 'KB_' + base_filename + '.hdf5' + raw
heatmap_filepath = (base_folder + 'heatmap-res-fragment_KB_' + base_filename +
                    '.hdf5' + raw)
#figure_path = base_folder + base_filename + "_" + str(domain_res / 1000) + 'kb' + raw + '.eps'
figure_path = base_folder + base_filename + '_fragment_kb' + raw + '.eps'
### Reading genome
logging.info("Preparation of genome...")
cmd_bgn_time = time.time()

PATH_ABSOLUTE = os.path.dirname(os.path.realpath(__file__))
fasta_path = os.path.join(PATH_ABSOLUTE, "data/genome/hg19.fa")
# Note that a single file with all chromosomes is preferred.

chrms = [
    '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '1', '20',
    '21', '22', '2', '3', '4', '5', '6', '7', '8', '9', 'M', 'X', 'Y'
]  # Note the chromosomes order

# For other genomes, you might want to use a gap file, e.g. for hg38:
# http://hgdownload.cse.ucsc.edu/goldenpath/hg38/database/gap.txt.gz
# genome_db = genome.Genome(fasta_path, readChrms=chrms, gapFile="gap.hg38.txt")
# For hg19, this variant works perfectly with mirnylib.genome:
genome_db = genome.Genome(fasta_path, readChrms=chrms)

enzymes = ['NlaIII', 'MmeI', 'DpnII', 'MseI']

# Applying restriction enzymes
genome_db.setEnzyme('MmeI')

strands = [[] for i in range(len(genome_db.seqs))]
true_poss = [[] for i in range(len(genome_db.seqs))]

for i in range(len(genome_db.seqs)):
    s = genome_db.seqs[i]
    for pos in genome_db.rsites[i]:
        # Check that the found position is not located at the end of the chromosome:
        if pos + 16 > len(s) or pos + 22 > len(s) or pos - 28 < 0 or pos - 22 < 0:
            strand = -1
            true_pos = -1
print "resolution defined by heatmap: ", domain_res return domain_res domain_res = get_domain_res(heatmap_filepath) print "Domain resolution = " + str(domain_res) base_filename = heatmap_filepath.split("/")[-1].replace("-", "_") #Define files and directories base_folder = '/mnt/storage/home/vsfishman/HiC/data/chick/DixonDomains' HMM_file_path = '/mnt/storage/home/vsfishman/HiC/DI/domaincall_software/' DI_from_matrix_file_path = "/mnt/storage/home/vsfishman/HiC/tutorial_Fishman/DI_from_matrix_minja.pl" genome_db = genome.Genome( "/mnt/storage/home/vsfishman/HiC/fasta/GalGal5/GCF_000002315.4_Gallus_gallus-5.0_assembly_structure/Primary_Assembly/assembled_chromosomes/FASTA/", readChrms=[], chrmFileTemplate="%s.fna") genome_fai_filepath = genome_db.genomePath + '/GalGal5ChrmLevel.fai' def executeBashCommand(bashCommand, doPrint=True): print "executing command %s \n" % (bashCommand) p = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE) if doPrint: print p.communicate()[0] return "" else: return p.communicate()[0]
#!/usr/bin/env python

import sys

from hiclib import mapping, fragmentHiC
from mirnylib import h5dict, genome

basedir = sys.argv[1]

genome_db = genome.Genome('%s/Data/Genome/mm9_fasta' % basedir,
                          readChrms=['1'],
                          chrmFileTemplate="%s.fa")

fragments = fragmentHiC.HiCdataset(filename='temp1',
                                   genome=genome_db,
                                   maximumMoleculeLength=500,
                                   mode='w',
                                   enzymeName="NcoI",
                                   inMemory=True)
fragments.load('%s/Data/Timing/hiclib_data.hdf5' % basedir)
fragments.filterDuplicates()
fragments.filterExtreme(cutH=0, cutL=0.005)
fragments.save('%s/Data/Timing/hiclib_data_filt.hdf5' % basedir)
#inFastqDir = "/mnt/storage/home/vsfishman/tmp/Distr/ESC/"  # for mode="fastq" only

# sidePrefixes = ("side1", "side2")  # a version for naming ...side1.fastq.gz
sidePrefixes = ("R1_001", "R2_001")
# A prefix preceding .fastq.gz, which will be used to distinguish side 1 and side 2.
# If your files are named "run32167_something_side1_somethingElse.fastq.gz",
# then "side1_somethingElse" should be the prefix.

threads = 4

tmpDir = "/mnt/storage/home/vsfishman/HiC_temp/"
# This will contain up to 3X the size of the largest input .sra file
# (256 GB is enough for (Rao, 2014), but 128 GB is not).
# Make sure your system drive (where /tmp usually is) has enough space.
# If it is a small SSD, it may not. Also, there is a lot of IO through the
# tmpDir. Put it on a local drive, not on a network drive, if you can.

genomeName = "mm10"
genome_db = genome.Genome(
    "/mnt/storage/home/vsfishman/HiC/fasta/" + genomeName,
    readChrms=["#", "X", "Y"],
)

bowtiePath = "/mnt/storage/home/vsfishman/HiC/bin/bowtie2/bowtie2"
bowtieIndex = ("/mnt/storage/home/vsfishman/HiC/bin/bowtie2/index" +
               "/{0}".format(genomeName))
# change this if your index is named differently from the genome
bowtieFlags = "--very-sensitive"

"IDs from GEO (SRR numbers)"
#GEOids = list(range(1665087, 1665096))
#GEOids = [400251, 400252, 400253, 400254, 400255, 400256,
#          443883, 443884, 443885, 443886, 443887, 443888]  # TODO 443883
#GEOids = [400253, 443884, 443885, 443886, 443887, 443888]
sra_folder = "/mnt/storage/home/vsfishman/tmp/Distr/ESC/newESC/"
#GEOids = [f.split("SRR")[-1].split(".")[0] for f in os.listdir(sra_folder) if f.endswith(".sra")]
print str(now)
print "\n"

import matplotlib.pyplot as plt
import numpy as np
import os
import subprocess

from mirnylib import genome
from mirnylib import h5dict
from mirnylib import plotting
from hiclib import binnedData
from hiclib import fragmentHiC

genome_name = 'mm9'
genome_db = genome.Genome('../../fasta/' + genome_name, readChrms=['#', 'X'])

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--res")
args = parser.parse_args()
domain_res = args.res

if (domain_res is None) or (domain_res == ""):
    domain_res = 100000
domain_res = int(domain_res)

print "Domain resolution = " + str(domain_res)
def step1(hiclib_path,  ## the path of the hiclib folder on the machine
          dataset='Kalhor2012NB',
          sraid='SRR071231',
          readlen=40):  ## each read has length 40
    ''' 1. Map reads to the genome
    http://mirnylab.bitbucket.org/hiclib/tutorial/01_iterative_mapping.html
    '''
    ## Adopted from the hiclib tutorial
    import os
    import logging
    from hiclib import mapping
    from mirnylib import h5dict, genome

    logging.basicConfig(level=logging.DEBUG)

    # A. Map the reads iteratively.
    mapping.iterative_mapping(
        bowtie_path=hiclib_path + '/bin/bowtie2/bowtie2',
        bowtie_index_path=hiclib_path + '/bin/bowtie2/index/hg19',
        fastq_path='../data/SRA/' + dataset + '/' + sraid + '/' + sraid + '.sra',
        out_sam_path='../data/SRA/' + sraid + '_1.bam',
        min_seq_len=25,
        len_step=5,
        seq_start=0,
        seq_end=readlen,
        nthreads=12,  # on Intel Core i7 CPUs 4 threads are as fast as 8,
                      # but leave some room for your other applications
        #max_reads_per_chunk=10000000,  # optional, on low-memory machines
        temp_dir='../data/SRA/',  # optional, keep temporary files here
        bowtie_flags='--very-sensitive',
        bash_reader=hiclib_path + '/bin/sra/bin/fastq-dump -Z')

    mapping.iterative_mapping(
        bowtie_path=hiclib_path + '/bin/bowtie2/bowtie2',
        bowtie_index_path=hiclib_path + '/bin/bowtie2/index/hg19',
        fastq_path='../data/SRA/' + dataset + '/' + sraid + '/' + sraid + '.sra',
        out_sam_path='../data/SRA/' + sraid + '_2.bam',
        min_seq_len=25,
        len_step=5,
        seq_start=readlen,
        seq_end=2 * readlen,
        nthreads=12,
        #max_reads_per_chunk=10000000,
        temp_dir='../data/SRA/',
        bowtie_flags='--very-sensitive',
        bash_reader=hiclib_path + '/bin/sra/bin/fastq-dump -Z')

    # B. Parse the mapped sequences into a Python data structure,
    #    assign the ultra-sonic fragments to restriction fragments.
    mapped_reads = h5dict.h5dict(sraid + '_mapped_reads.hdf5')  ## local folder
    genome_db = genome.Genome(hiclib_path + '/fasta/hg19', readChrms=['#', 'X'])

    mapping.parse_sam(sam_basename1='../data/SRA/' + sraid + '_1.bam',
                      sam_basename2='../data/SRA/' + sraid + '_2.bam',
                      out_dict=mapped_reads,
                      genome_db=genome_db,
                      enzyme_name='HindIII')
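# A minimal driver chaining the four steps above, assuming they live in one
# script; the hiclib path is a placeholder and the SRA ID is the default used
# throughout these functions.
if __name__ == '__main__':
    hiclib_path = '/path/to/hiclib'  # placeholder; point at your install
    sraid = 'SRR071231'

    step1(hiclib_path, dataset='Kalhor2012NB', sraid=sraid, readlen=40)
    step2(hiclib_path, sraid, res=1000000)  # fragment-level filtering
    step3(hiclib_path, sraid, res=1000000)  # iterative correction
    step4(hiclib_path, sraid, res=1000000)  # eigenvector decomposition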
def process():
    global options
    global args
    global pp
    outfilename = []

    # check that the datasets exist
    for i in xrange(len(args)):
        if (not os.path.isfile(args[i].replace('-fragment_dataset.hdf5', '-1M.hdf5'))):
            print '[ERROR] Could not find: ' + args[i].replace('-fragment_dataset.hdf5', '-1M.hdf5')
            sys.exit(1)
        if (not os.path.isfile(args[i].replace('-fragment_dataset.hdf5', '-200k.hdf5'))):
            print '[ERROR] Could not find: ' + args[i].replace('-fragment_dataset.hdf5', '-200k.hdf5')
            sys.exit(1)
        if (not os.path.isfile(args[i].replace('-fragment_dataset.hdf5', '-IC-1M.hdf5'))):
            print '[ERROR] Could not find: ' + args[i].replace('-fragment_dataset.hdf5', '-IC-1M.hdf5')
            sys.exit(1)

        # note: str.strip() would remove *characters* from both ends, not the
        # suffix, so the suffix is dropped with replace() instead
        outfilename += ["_".join(
            os.path.basename(args[i]).replace('-fragment_dataset.hdf5', '').split("_")[1:])]

    genome_db = genome.Genome(options.genome,
                              gapFile=options.gapFile,
                              readChrms=['#', 'X', 'Y'])

    outfilename = "-".join(outfilename)
    outfile = open(options.outputDir + outfilename + '-HiC_correlate.txt', "w")

    fig = plt.figure()
    pp = PdfPages(options.outputDir + outfilename + '-HiC_correlate.pdf')

    for i in xrange(len(args)):
        print " Process file " + str(i) + ":" + args[i]
        enzyme_i = os.path.basename(args[i]).split("_")[0]
        experiment_i = "_".join(
            os.path.basename(args[i]).replace('-fragment_dataset.hdf5', '').split("_")[1:])

        for j in xrange(i + 1, len(args)):
            enzyme_j = os.path.basename(args[j]).split("_")[0]
            experiment_j = "_".join(
                os.path.basename(args[j]).replace('-fragment_dataset.hdf5', '').split("_")[1:])

            compareCorrelationOfEigenvectors(
                1000000,
                args[i].replace('-fragment_dataset.hdf5', '-1M.hdf5'),
                args[j].replace('-fragment_dataset.hdf5', '-1M.hdf5'),
                experiment_i, experiment_j, genome_db)
            calculateTanayCorrelation(
                1000000,
                args[i].replace('-fragment_dataset.hdf5', '-1M.hdf5'),
                args[j].replace('-fragment_dataset.hdf5', '-1M.hdf5'),
                experiment_i, experiment_j, genome_db, outfile)
            plotDiagonalCorrelation(
                200000,
                args[i].replace('-fragment_dataset.hdf5', '-200k.hdf5'),
                args[j].replace('-fragment_dataset.hdf5', '-200k.hdf5'),
                experiment_i, experiment_j, genome_db)
            compareInterarmMaps(
                1000000,
                args[i].replace('-fragment_dataset.hdf5', '-1M.hdf5'),
                args[j].replace('-fragment_dataset.hdf5', '-1M.hdf5'),
                experiment_i, experiment_j, genome_db)

    if (options.verbose):
        print >> sys.stdout, "print plots into pdf:%s" % (
            options.outputDir + outfilename + '-HiC_correlate.pdf')
    outfile.close()
    pp.close()
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

from lib_matrix_operations import *
import os

from mirnylib import genome
from mirnylib import h5dict
from mirnylib import plotting
from hiclib import binnedData
from hiclib import fragmentHiC

#genome_name = 'mm9'
#genome_db = genome.Genome('../fasta/' + genome_name, readChrms=['#', 'X'])
genome_name = 'mm9'
genome_db = genome.Genome('../fasta/' + genome_name, readChrms=["#", "X", "Y"])

domain_res = 200000
base_folder = '/mnt/storage/home/vsfishman/HiC/data/'
#base_filename = 'SRR443884_mESC_2'
base_filename = 'Sp_full2_Y'
#base_filename = 'LA'

raw = "-raw"
#raw = ""

heatmap_filepath = (base_folder + 'heatmap-res-' + str(domain_res / 1000) +
                    'KB_' + base_filename + '.hdf5' + raw)
#heatmap_filepath = base_folder + 'heatmap-res-' + str('fragment_') + 'KB_' + base_filename + '.hdf5' + raw
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

from mirnylib import genome
from mirnylib import h5dict
from hiclib import binnedData
import numpy as np

from mirnylib.systemutils import setExceptionHook
setExceptionHook()

######## define file names and other params ########

# mirnylib genome params
genomeName = "GalGal5filtered"
genome_db = genome.Genome(
    "/mnt/storage/home/vsfishman/HiC/fasta/GalGal5/GCF_000002315.4_Gallus_gallus-5.0_assembly_structure/Primary_Assembly/galGal5_all_contigs.filtered/",
    readChrms=[],
    chrmFileTemplate="N%s.fa")

# where to find the hiclib heatmap
basefolder = "/mnt/storage/home/vsfishman/HiC/data/chick/mapped-GalGal5filtered/B1_TTAGGC_L001_/"
filename = "chunk0001.hdf5.hm-res-1000kb"

# resulting file name
out_file = "all.glm"

### parameters required by LACHESIS to be in the header
header_string = """# GenomeLinkMatrix file - see GenomeLinkMatrix.h for documentation of this object type
# Species = chick
# N_bins = 524
# bin_size = 0
# RE_sites_file = /mnt/storage/home/vsfishman/HiC/fasta/GalGal5/GCF_000002315.4_Gallus_gallus-5.0_assembly_structure/Primary_Assembly/galGal5_all_contigs.filtered.fa.counts_AAGCTT.txt
import pandas as pd

from mirnylib import genome
from hiclib import fragmentHiC


def filtration(chromosome_names, cell_line, path, genome_version, enzyme,
               resolution_list):
    for chrm_list in chromosome_names:
        genome_db = genome.Genome('/home/magnitov/data/genomes/' + genome_version,
                                  gapFile='gap.txt',
                                  readChrms=chrm_list,
                                  forceOrder=True)

        # Read mapped reads
        if len(chrm_list) > 1:
            fragments = fragmentHiC.HiCdataset(
                filename=path + 'filtered_maps/' + cell_line +
                '/fragment_dataset_full.hdf5',
                genome=genome_db,
                enzymeName=enzyme,
                mode='w')
            fragments.parseInputData(dictLike=path + 'maps/' + cell_line +
                                     '/mapped_reads_full.hdf5')
        else:
            fragments = fragmentHiC.HiCdataset(
                filename=path + 'filtered_maps/' + cell_line +
                '/fragment_dataset_' + chrm_list[0] + '.hdf5',
                genome=genome_db,
                enzymeName=enzyme,
                mode='w')
            fragments.parseInputData(dictLike=path + 'maps/' + cell_line +
                                     '/mapped_reads_' + chrm_list[0] + '.hdf5')

        # Apply filters
        fragments.filterDuplicates()

        # Save statistics
        if len(chrm_list) > 1 or len(chromosome_names) == 1:
            fragments.writeFilteringStats()
            fragments.printMetadata(saveTo=path + 'processing_stats/' + cell_line +
                                    '/processing_stats_' + cell_line + '.txt')
        if len(chrm_list) == 1 and len(chromosome_names) > 1:
            fragments.writeFilteringStats()
            fragments.printMetadata(saveTo=path + 'processing_stats/' + cell_line +
                                    '/processing_stats_' + cell_line + '_' +
                                    chrm_list[0] + '.txt')

        # Sort reads and calculate contact probability (both normalized and not)
        fragments._sortData()
        if len(chrm_list) > 1:
            contact_probs = fragments.plotScaling(normalize=True, plot=False)
            pd.DataFrame({'Distance': contact_probs[0],
                          'Probability': contact_probs[1]}).to_csv(
                path + 'contact_probs/' + cell_line + '/contact_probs_' +
                cell_line + '_full_norm.txt', header=1, index=0, sep='\t')
            contact_probs = fragments.plotScaling(normalize=False, plot=False)
            pd.DataFrame({'Distance': contact_probs[0],
                          'Probability': contact_probs[1]}).to_csv(
                path + 'contact_probs/' + cell_line + '/contact_probs_' +
                cell_line + '_full.txt', header=1, index=0, sep='\t')
        if len(chrm_list) == 1:
            contact_probs = fragments.plotScaling(normalize=True, plot=False)
            pd.DataFrame({'Distance': contact_probs[0],
                          'Probability': contact_probs[1]}).to_csv(
                path + 'contact_probs/' + cell_line + '/contact_probs_' +
                cell_line + '_' + chrm_list[0] + '_norm.txt',
                header=1, index=0, sep='\t')
            contact_probs = fragments.plotScaling(normalize=False, plot=False)
            pd.DataFrame({'Distance': contact_probs[0],
                          'Probability': contact_probs[1]}).to_csv(
                path + 'contact_probs/' + cell_line + '/contact_probs_' +
                cell_line + '_' + chrm_list[0] + '.txt',
                header=1, index=0, sep='\t')

        # Save into .cool and .hdf5 files
        for resolution in resolution_list:
            fragments.saveCooler(filename=path + 'filtered_maps/' + cell_line +
                                 '/heatmap-' + chrm_list[0] + '-' +
                                 str(resolution / 1000) + 'K.cool',
                                 resolution=resolution)
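# A hypothetical invocation: one multi-chromosome list for the genome-wide
# map plus per-chromosome lists, matching the layout that parse_bams writes;
# all paths and names below are placeholders.
chromosome_names = [['1', '2', 'X']] + [[c] for c in ['1', '2', 'X']]
filtration(chromosome_names,
           cell_line='K562',
           path='/home/magnitov/data/hic/',
           genome_version='hg19',
           enzyme='HindIII',
           resolution_list=[1000000, 100000])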