def step1(hiclib_path,  # path of the hiclib folder on this machine
          dataset='Kalhor2012NB',
          sraid='SRR071231',
          readlen=40):  # each read is 40 bp long
    '''
    1. Map reads to the genome
    http://mirnylab.bitbucket.org/hiclib/tutorial/01_iterative_mapping.html
    '''
    # Adapted from the hiclib tutorial
    import os
    import logging

    from hiclib import mapping
    from mirnylib import h5dict, genome

    logging.basicConfig(level=logging.DEBUG)

    # A. Map the reads iteratively.
    mapping.iterative_mapping(
        bowtie_path=hiclib_path + '/bin/bowtie2/bowtie2',
        bowtie_index_path=hiclib_path + '/bin/bowtie2/index/hg19',
        fastq_path='../data/SRA/' + dataset + '/' + sraid + '/' + sraid + '.sra',
        out_sam_path='../data/SRA/' + sraid + '_1.bam',
        min_seq_len=25,
        len_step=5,
        seq_start=0,
        seq_end=readlen,
        nthreads=12,  # on Intel Core i7 CPUs 4 threads are as fast as 8,
                      # but leave some room for your other applications
        # max_reads_per_chunk=10000000,  # optional, for low-memory machines
        temp_dir='../data/SRA/',  # optional, keep temporary files here
        bowtie_flags='--very-sensitive',
        bash_reader=hiclib_path + '/bin/sra/bin/fastq-dump -Z')

    mapping.iterative_mapping(
        bowtie_path=hiclib_path + '/bin/bowtie2/bowtie2',
        bowtie_index_path=hiclib_path + '/bin/bowtie2/index/hg19',
        fastq_path='../data/SRA/' + dataset + '/' + sraid + '/' + sraid + '.sra',
        out_sam_path='../data/SRA/' + sraid + '_2.bam',
        min_seq_len=25,
        len_step=5,
        seq_start=readlen,
        seq_end=2 * readlen,
        nthreads=12,
        # max_reads_per_chunk=10000000,
        temp_dir='../data/SRA/',
        bowtie_flags='--very-sensitive',
        bash_reader=hiclib_path + '/bin/sra/bin/fastq-dump -Z')

    # B. Parse the mapped sequences into a Python data structure and
    #    assign the ultrasonic fragments to restriction fragments.
    mapped_reads = h5dict.h5dict(sraid + '_mapped_reads.hdf5')  # in the local folder
    genome_db = genome.Genome(hiclib_path + '/fasta/hg19', readChrms=['#', 'X'])

    mapping.parse_sam(
        sam_basename1='../data/SRA/' + sraid + '_1.bam',
        sam_basename2='../data/SRA/' + sraid + '_2.bam',
        out_dict=mapped_reads,
        genome_db=genome_db,
        enzyme_name='HindIII')
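# A minimal usage sketch (not from the original source): the hiclib path below
# is an illustrative placeholder, and the call assumes the ../data/SRA/ layout
# and bowtie2/sra-tools binaries expected by step1() are already in place.
if __name__ == '__main__':
    step1('/opt/hiclib',
          dataset='Kalhor2012NB',
          sraid='SRR071231',
          readlen=40)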
def collectMappedReads(bam_read1, bam_read2, mapped_reads, genome_db):
    global options
    global args

    mapping.parse_sam(
        sam_basename1=bam_read1,
        sam_basename2=bam_read2,
        out_dict=mapped_reads,
        genome_db=genome_db,
        enzyme_name=options.enzyme)
def doOne(inData, saveSams=True):
    file1, file2, outfile = inData
    print("Mapping {0} and {1} into {2}".format(*inData))

    for onefile in file1, file2:
        a = gzip.open(onefile, 'r')
        a.readline()
        length = len(a.readline()) - 1
        if length < 10:
            raise ValueError(
                "Length of your sequence is {0}. Something is wrong".format(length))
        minlen, step = calculateStep(length - seqSkipStart, minMapLen)

        mapping.iterative_mapping(
            bowtie_path=bowtiePath,
            bowtie_index_path=bowtieIndex,
            fastq_path=onefile,
            out_sam_path=os.path.join(samFolder, os.path.split(onefile)[1] + ".sam"),
            seq_start=seqSkipStart,
            min_seq_len=minlen,  # for bacteria the minimal mappable length is 15 bp,
                                 # so start with something slightly longer
            len_step=step,       # and go with the usual step
            nthreads=threads,    # on Intel Core i7 CPUs 4 threads are as fast as 8,
                                 # but leave some room for your other applications
            # max_reads_per_chunk=10000000,  # optional, for low-memory machines
            temp_dir=tmpDir,
            bowtie_flags=bowtieFlags,
        )

    os.remove(file1)
    os.remove(file2)

    # Second step. Parse the mapped sequences into a Python data structure and
    # assign the ultrasonic fragments to restriction fragments.
    mapped_reads = h5dict.h5dict(outfile)
    sf1, sf2 = [
        os.path.join(samFolder, os.path.split(onefile)[1] + ".sam")
        for onefile in [file1, file2]
    ]

    mapping.parse_sam(sam_basename1=sf1,
                      sam_basename2=sf2,
                      out_dict=mapped_reads,
                      genome_db=genome_db,
                      save_seqs=False,
                      maxReads=int(chunkSize * 1.6),
                      IDLen=50)

    for i in os.listdir(samFolder):
        if ((os.path.split(file1)[1] in i) or
                (os.path.split(file2)[1] in i)) and not saveSams:
            print("deleting", i)
            os.remove(os.path.join(samFolder, i))
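# A hypothetical call of doOne() with illustrative file names, assuming the
# module-level globals (bowtiePath, samFolder, genome_db, chunkSize, ...) used
# above are already set up. Note that doOne() deletes its input FASTQ files
# after mapping, so pass copies if you need to keep them.
doOne(('run_R1.fastq.gz', 'run_R2.fastq.gz', 'run_mapped_reads.hdf5'),
      saveSams=True)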
def func():
    mapping.parse_sam(
        sam_basename1='/exports/eddie/scratch/s1529682/bams/' + basename + '_fixed_1.fq.gz' + chunk,
        sam_basename2='/exports/eddie/scratch/s1529682/bams/' + basename + '_fixed_2.fq.gz' + chunk,
        out_dict=mapped_reads,
        genome_db=genome_db,
        enzyme_name='DpnII')

    fragments = fragmentHiC.HiCdataset(
        filename=fragments_file,
        genome=genome_db,
        maximumMoleculeLength=700,
        mode='w')

    # Load the parsed reads into the HiCdataset. The dangling-end filter is
    # applied at this stage, with maximumMoleculeLength specified at the
    # initiation of the object.
    fragments.parseInputData(dictLike=reads_file)
def map_reads(first_fq, second_fq, outfile, nice):
    # set the niceness of this sub-process:
    os.nice(nice)

    first_sam = first_fq.split(".fastq.gz")[0] + ".sam"
    second_sam = second_fq.split(".fastq.gz")[0] + ".sam"

    # map the first fastq file -> sam file
    length = check_len(first_fq)
    min_len, step_size = calculate_step(length - seq_skip_start, min_map_len)
    mapping.iterative_mapping(
        bowtie_path=bowtie_path,
        bowtie_index_path=bowtie_index,
        fastq_path=first_fq,
        out_sam_path=os.path.join(args.samdir, first_sam),
        min_seq_len=min_len,
        len_step=step_size,
        seq_start=seq_skip_start,
        nthreads=threads,
        bowtie_flags=bowtie_flags)

    # map the second fastq file -> sam file
    length = check_len(second_fq)
    min_len, step_size = calculate_step(length - seq_skip_start, min_map_len)
    mapping.iterative_mapping(
        bowtie_path=bowtie_path,
        bowtie_index_path=bowtie_index,
        fastq_path=second_fq,
        out_sam_path=os.path.join(args.samdir, second_sam),
        min_seq_len=min_len,
        len_step=step_size,
        seq_start=seq_skip_start,
        nthreads=threads,
        bowtie_flags=bowtie_flags)

    # parse the mapped sequences into the hdf5 dict structure and assign each
    # sonication (ultrasonic) fragment end to the restriction fragment it falls in
    out_dict = os.path.join(args.samdir, outfile)
    mapped_reads = h5dict.h5dict(out_dict)
    sf1, sf2 = [os.path.join(args.samdir, first_sam),
                os.path.join(args.samdir, second_sam)]
    mapping.parse_sam(sam_basename1=sf1,
                      sam_basename2=sf2,
                      out_dict=mapped_reads,
                      genome_db=genome_db,
                      save_seqs=False,
                      maxReads=10000000,
                      IDLen=50,
                      enzyme_name='HindIII')
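# Hypothetical invocation of map_reads() (file names are illustrative), assuming
# args, genome_db, bowtie paths and the other module-level settings used above
# were configured by the surrounding script. nice=0 keeps normal priority.
map_reads('sample_R1.fastq.gz', 'sample_R2.fastq.gz',
          outfile='sample_mapped_reads.hdf5', nice=0)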
def parse_bams(chromosome_names, cell_line, path, genome_version, enzyme):
    if not os.path.exists(path + 'maps/' + cell_line):
        os.mkdir(path + 'maps/' + cell_line)

    for chrm_list in chromosome_names:
        if len(chrm_list) > 1:
            mapped_reads = h5dict.h5dict(path + 'maps/' + cell_line +
                                         '/mapped_reads_full.hdf5')
        else:
            mapped_reads = h5dict.h5dict(path + 'maps/' + cell_line +
                                         '/mapped_reads_' + chrm_list[0] + '.hdf5')

        genome_db = genome.Genome('/home/magnitov/data/genomes/' + genome_version,
                                  gapFile='gap.txt',
                                  readChrms=chrm_list,
                                  forceOrder=True)

        mapping.parse_sam(
            sam_basename1=path + 'bam/' + cell_line + '/' + cell_line + '_R1.bam',
            sam_basename2=path + 'bam/' + cell_line + '/' + cell_line + '_R2.bam',
            out_dict=mapped_reads,
            genome_db=genome_db,
            enzyme_name=enzyme)
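# A hypothetical call of parse_bams() with illustrative values (cell line and
# path are placeholders): parse all numbered chromosomes plus X in one map, and
# chromosome 1 on its own, assuming <path>/bam/<cell_line>/<cell_line>_R1.bam
# and _R2.bam exist.
parse_bams(chromosome_names=[['#', 'X'], ['1']],
           cell_line='K562',
           path='/home/magnitov/data/hic/',
           genome_version='hg19',
           enzyme='HindIII')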
#    temp_dir=tmp_folder,  # optional, keep temporary files here
#    bowtie_flags='--very-sensitive',
#    bash_reader=None)  # '../../bin/sra/bin/fastq-dump -Z'

#mapping.iterative_mapping(
#    bowtie_path='../bin/bowtie2/bowtie2',
#    bowtie_index_path='../bin/bowtie2/index/' + genome_name,
#    fastq_path=FASTQ_fpath,
#    out_sam_path=out_sam_fpath + '_2.bam',
#    min_seq_len=25,
#    len_step=5,
#    seq_start=40,
#    seq_end=79,
#    nthreads=8,
##    max_reads_per_chunk=10000000,
#    temp_dir=tmp_folder,
#    bowtie_flags='--very-sensitive',
#    bash_reader=None)  # '../../bin/sra/bin/fastq-dump -Z'

# B. Parse the mapped sequences into a Python data structure and
#    assign the ultrasonic fragments to restriction fragments.
mapped_reads = h5dict.h5dict(maped_reads_filepath)
genome_db = genome.Genome('../fasta/' + genome_name, readChrms=['#', 'X', 'M'])

mapping.parse_sam(sam_basename1=out_sam_fpath + '_1.bam',
                  sam_basename2=out_sam_fpath + '_2.bam',
                  out_dict=mapped_reads,
                  genome_db=genome_db,
                  enzyme_name='MboI',
                  save_seqs=True)
fasta_dir, re_name, out_fname, in_dir = sys.argv[1:5]
in_prefices = sys.argv[5:]
basedir = os.path.split(os.path.abspath(out_fname))[0]

mapped_reads = []
for prefix in in_prefices:
    mapped_reads.append(h5dict.h5dict('%s/%s.hdf5' % (basedir, prefix)))

genome_db = genome.Genome(fasta_dir, readChrms=['#', 'X'], chrmFileTemplate="%s.fa")

for i, name in enumerate(mapped_reads):
    mapping.parse_sam(sam_basename1="%s/%s_1.bam" % (in_dir, in_prefices[i]),
                      sam_basename2="%s/%s_2.bam" % (in_dir, in_prefices[i]),
                      out_dict=name,
                      genome_db=genome_db,
                      enzyme_name=re_name)

for i, name in enumerate(mapped_reads):
    fragments = fragmentHiC.HiCdataset(filename='temp',
                                       genome=genome_db,
                                       maximumMoleculeLength=500,
                                       mode='w',
                                       enzymeName=re_name,
                                       inMemory=True)
    # use the prefix for this iteration, not the stale loop variable from above
    fragments.parseInputData(dictLike="%s/%s.hdf5" % (basedir, in_prefices[i]))
    if i != len(mapped_reads) - 1:
        fragments.save("%s/%s_data.hdf5" % (basedir, in_prefices[i]))
    else:
        frag_files = []
    # max_reads_per_chunk=10000000,  # optional, for low-memory machines
    temp_dir=tmp_folder,  # optional, keep temporary files here
    bowtie_flags='--very-sensitive',
    bash_reader='../../bin/sra/bin/fastq-dump -Z')

mapping.iterative_mapping(
    bowtie_path='../../bin/bowtie2/bowtie2',
    bowtie_index_path='../../bin/bowtie2/index/' + genome_name,
    fastq_path=FASTQ_fpath,
    out_sam_path=out_sam_fpath + '_2.bam',
    min_seq_len=25,
    len_step=5,
    seq_start=50,
    seq_end=99,
    nthreads=8,
    # max_reads_per_chunk=10000000,
    temp_dir=tmp_folder,
    bowtie_flags='--very-sensitive',
    bash_reader='../../bin/sra/bin/fastq-dump -Z')

# B. Parse the mapped sequences into a Python data structure and
#    assign the ultrasonic fragments to restriction fragments.
mapped_reads = h5dict.h5dict(maped_reads_filepath)
genome_db = genome.Genome('../../fasta/' + genome_name, readChrms=['#', 'X'])

mapping.parse_sam(sam_basename1=out_sam_fpath + '_1.bam',
                  sam_basename2=out_sam_fpath + '_2.bam',
                  out_dict=mapped_reads,
                  genome_db=genome_db,
                  enzyme_name='HindIII')
    temp_dir='tmp',  # optional, keep temporary files here
    bowtie_flags='--very-sensitive')

mapping.iterative_mapping(
    bowtie_path=bowtiePath,
    bowtie_index_path=bowtieIndex,
    fastq_path=file2,
    out_sam_path='sams/%s_2.bam' % expName,
    min_seq_len=10,
    len_step=3,
    seq_start=0,
    seq_end=40,
    nthreads=4,  # on Intel Core i7 CPUs 4 threads are as fast as 8,
                 # but leave some room for your other applications
    # max_reads_per_chunk=10000000,  # optional, for low-memory machines
    temp_dir='tmp',  # optional, keep temporary files here
    bowtie_flags='--very-sensitive')

# B. Parse the mapped sequences into a Python data structure and
#    assign the ultrasonic fragments to restriction fragments.
mapped_reads = h5dict.h5dict('caul/%s' % expName)
genome_db = genome.Genome('../data/caul', chrmFileTemplate="%s.fa", readChrms=[])

mapping.parse_sam(
    sam_basename1='sams/%s_1.bam' % expName,
    sam_basename2='sams/%s_2.bam' % expName,
    out_dict=mapped_reads,
    genome_db=genome_db,
    enzyme_name='BglII')
import os
import logging

from hiclib import mapping
from mirnylib import h5dict, genome

logging.basicConfig(level=logging.DEBUG)

# B. Parse the mapped sequences into a Python data structure and
#    assign the ultrasonic fragments to restriction fragments.
mapped_reads_Sp1 = h5dict.h5dict('../../data/serov/mapped_reads_Sp1.hdf5')
genome_db = genome.Genome('../../fasta/mm10', readChrms=['#', 'X'])

mapping.parse_sam(
    sam_basename1='../../data/serov/HiC_Sp1_1.bam',
    sam_basename2='../../data/serov/HiC_Sp1_2.bam',
    out_dict=mapped_reads_Sp1,
    genome_db=genome_db,
    enzyme_name='HindIII')
    # max_reads_per_chunk=10000000,  # optional, for low-memory machines
    temp_dir='../../data/sample/tmp',  # optional, keep temporary files here
    bowtie_flags='--very-sensitive',
    bash_reader='../../bin/sra/bin/fastq-dump -Z')

mapping.iterative_mapping(
    bowtie_path='../../bin/bowtie2/bowtie2',
    bowtie_index_path='../../bin/bowtie2/index/hg19',
    fastq_path='../../data/sample/SRR027956.sra',
    out_sam_path='../../data/sample/SRR027056_2.bam',
    min_seq_len=25,
    len_step=5,
    seq_start=76,
    seq_end=151,
    nthreads=4,
    # max_reads_per_chunk=10000000,
    temp_dir='../../data/sample/tmp',
    bowtie_flags='--very-sensitive',
    bash_reader='../../bin/sra/bin/fastq-dump -Z')

# B. Parse the mapped sequences into a Python data structure and
#    assign the ultrasonic fragments to restriction fragments.
mapped_reads = h5dict.h5dict('../../data/sample/mapped_reads.hdf5')
genome_db = genome.Genome('../../fasta/hg19', readChrms=['#', 'X'])

mapping.parse_sam(sam_basename1='../../data/sample/SRR027056_1.bam',
                  sam_basename2='../../data/sample/SRR027056_2.bam',
                  out_dict=mapped_reads,
                  genome_db=genome_db,
                  enzyme_name='HindIII')
import sys
import os

from hiclib import mapping, fragmentHiC
from mirnylib import h5dict, genome

fasta_dir, re_name, out_fname, in_dir = sys.argv[1:5]
in_prefices = sys.argv[5:]
basedir = os.path.split(os.path.abspath(out_fname))[0]

mapped_reads = []
for prefix in in_prefices:
    mapped_reads.append(h5dict.h5dict('%s/%s.hdf5' % (basedir, prefix)))

genome_db = genome.Genome(fasta_dir, readChrms=['#', 'X'], chrmFileTemplate="%s.fa")

for i, name in enumerate(mapped_reads):
    mapping.parse_sam(
        sam_basename1="%s/%s_1.bam" % (in_dir, in_prefices[i]),
        sam_basename2="%s/%s_2.bam" % (in_dir, in_prefices[i]),
        out_dict=name,
        genome_db=genome_db,
        enzyme_name=re_name)

for i, name in enumerate(mapped_reads):
    fragments = fragmentHiC.HiCdataset(
        filename='temp',
        genome=genome_db,
        maximumMoleculeLength=500,
        mode='w',
        enzymeName=re_name,
        inMemory=True)
    # use the prefix for this iteration, not the stale loop variable from above
    fragments.parseInputData(dictLike="%s/%s.hdf5" % (basedir, in_prefices[i]))
    if i != len(mapped_reads) - 1:
        fragments.save("%s/%s_data.hdf5" % (basedir, in_prefices[i]))
    else:
#!/usr/bin/env python
import logging

from hiclib import mapping
from mirnylib import h5dict, genome

logging.basicConfig(level=logging.DEBUG)

mapped_reads = h5dict.h5dict('./mapped_reads.hdf5')
genome_db = genome.Genome('../Ref/hg19', readChrms=['#', 'X'])

mapping.parse_sam(
    sam_basename1='../data/SRR1658595_10M_1.bam',
    sam_basename2='../data/SRR1658595_10M_2.bam',
    out_dict=mapped_reads,
    genome_db=genome_db,
    enzyme_name='MboI')
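# The parsed dictionary is typically loaded into a fragment-level dataset next,
# as in the HiCdataset snippets elsewhere in this collection. A minimal sketch
# under assumed values (output file name and maximumMoleculeLength=500 are
# illustrative), reusing genome_db from the script above:
from hiclib import fragmentHiC

fragments = fragmentHiC.HiCdataset(
    filename='./fragments.hdf5',   # hypothetical output path
    genome=genome_db,
    maximumMoleculeLength=500,
    mode='w')
# parseInputData applies the dangling-end filter using maximumMoleculeLength
fragments.parseInputData(dictLike='./mapped_reads.hdf5')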
import sys

from hiclib import mapping, fragmentHiC
from mirnylib import h5dict, genome

basedir = sys.argv[1]

mapped_reads1 = h5dict.h5dict('%s/Data/Timing/mapped_reads1.hdf5' % basedir)
mapped_reads2 = h5dict.h5dict('%s/Data/Timing/mapped_reads2.hdf5' % basedir)
mapped_reads3 = h5dict.h5dict('%s/Data/Timing/mapped_reads3.hdf5' % basedir)
genome_db = genome.Genome('%s/Data/Genome/mm9_fasta' % basedir,
                          readChrms=['1'],
                          chrmFileTemplate="%s.fa")

mapping.parse_sam(
    sam_basename1='%s/Data/Timing/SRR443886_sub_1.bam' % basedir,
    sam_basename2='%s/Data/Timing/SRR443886_sub_2.bam' % basedir,
    out_dict=mapped_reads1,
    genome_db=genome_db,
    enzyme_name='NcoI')
mapping.parse_sam(
    sam_basename1='%s/Data/Timing/SRR443887_sub_1.bam' % basedir,
    sam_basename2='%s/Data/Timing/SRR443887_sub_2.bam' % basedir,
    out_dict=mapped_reads2,
    genome_db=genome_db,
    enzyme_name='NcoI')
mapping.parse_sam(
    sam_basename1='%s/Data/Timing/SRR443888_sub_1.bam' % basedir,
    sam_basename2='%s/Data/Timing/SRR443888_sub_2.bam' % basedir,
    out_dict=mapped_reads3,
    genome_db=genome_db,
    enzyme_name='NcoI')
mapping.iterative_mapping(
    bowtie_path=bowtiePath,
    bowtie_index_path=bowtieIndex,
    fastq_path=file1,
    out_sam_path='{0}/{1}_2.bam'.format(samFolder, expName),
    min_seq_len=minlen,
    len_step=step,
    nthreads=threads,  # on Intel Core i7 CPUs 4 threads are as fast as 8,
                       # but leave some room for your other applications
    # max_reads_per_chunk=10000000,  # optional, for low-memory machines
    temp_dir=tmpDir,
    seq_start=length,
    seq_end=2 * length,
    bash_reader="fastq-dump -Z",
    bowtie_flags=" --very-sensitive ",
)

# Second step. Parse the mapped sequences into a Python data structure and
# assign the ultrasonic fragments to restriction fragments.
mapped_reads = h5dict.h5dict(finalName)

mapping.parse_sam(
    sam_basename1='{0}/{1}_1.bam'.format(samFolder, expName),
    sam_basename2='{0}/{1}_2.bam'.format(samFolder, expName),
    out_dict=mapped_reads,
    genome_db=genome_db,
    save_seqs=False)

os.remove(lockName)