def map_reads(cell_line, path_input, path_output, genome_version):
    """Iteratively map both mates (R1 and R2) of one paired-end sample.

    For each mate the file ``<path_input><cell_line>_<mate>.fastq.gz`` is
    handed to ``mapping.iterative_mapping`` and the result is written to
    ``<path_output>bam/<cell_line>/<cell_line>_<mate>.bam``.

    All paths are built by plain string concatenation, so ``path_input``
    and ``path_output`` must already end with a path separator.

    Args:
        cell_line: Sample name; used as FASTQ prefix, as the output
            sub-directory name and in the index / temp directory names.
        path_input: Directory prefix that holds the gzipped FASTQ files.
        path_output: Directory prefix that holds ``index_<cell_line>/``
            and under which ``bam/<cell_line>`` is created.
        genome_version: Basename of the bowtie2 index inside
            ``index_<cell_line>/``.
    """
    bam_dir = path_output + 'bam/' + cell_line
    # makedirs instead of mkdir: a missing intermediate 'bam/' directory is
    # created too, rather than aborting with OSError.
    if not os.path.exists(bam_dir):
        os.makedirs(bam_dir)

    index_path = path_output + 'index_' + cell_line + '/' + genome_version
    temp_dir = path_output + 'tmp_' + cell_line

    # Both mates are mapped with identical settings; only the file names
    # differ.
    for mate in ('R1', 'R2'):
        mapping.iterative_mapping(
            bowtie_path='/usr/bin/bowtie2',
            bowtie_index_path=index_path,
            fastq_path=path_input + cell_line + '_' + mate + '.fastq.gz',
            out_sam_path=bam_dir + '/' + cell_line + '_' + mate + '.bam',
            min_seq_len=25,
            seq_start=4,
            len_step=3,
            nthreads=8,
            temp_dir=temp_dir,
            bowtie_flags='--very-sensitive')
def step1(hiclib_path,  # the path of the hiclib folder on this machine
          dataset='Kalhor2012NB',
          sraid='SRR071231',
          readlen=40):  # length of each mate inside the .sra record
    '''
    Step 1: map the reads of one SRA run to hg19 and parse the result.

    Adapted from the hiclib tutorial:
    http://mirnylab.bitbucket.org/hiclib/tutorial/01_iterative_mapping.html

    The .sra file is read from ``../data/SRA/<dataset>/<sraid>/<sraid>.sra``.
    Bases ``[0, readlen)`` are mapped into ``../data/SRA/<sraid>_1.bam`` and
    bases ``[readlen, 2*readlen)`` into ``../data/SRA/<sraid>_2.bam``.
    Both BAM files are then parsed with ``mapping.parse_sam`` into
    ``<sraid>_mapped_reads.hdf5`` in the current working directory, using
    the HindIII enzyme. Nothing is returned.
    '''
    import os
    import logging
    from hiclib import mapping
    from mirnylib import h5dict, genome

    logging.basicConfig(level=logging.DEBUG)

    bowtie_dir = hiclib_path + '/bin/bowtie2'
    sra_file = '../data/SRA/' + dataset + '/' + sraid + '/' + sraid + '.sra'
    bam_prefix = '../data/SRA/' + sraid

    # A. Map the reads iteratively, one pass per mate. Each pass reads the
    #    same .sra file but a different window of bases.
    for mate, first_base in ((1, 0), (2, readlen)):
        mapping.iterative_mapping(
            bowtie_path=bowtie_dir + '/bowtie2',
            bowtie_index_path=bowtie_dir + '/index/hg19',
            fastq_path=sra_file,
            out_sam_path=bam_prefix + '_%d.bam' % mate,
            min_seq_len=25,
            len_step=5,
            seq_start=first_base,
            seq_end=first_base + readlen,
            nthreads=12,
            # max_reads_per_chunk=10000000,  # optional, for low-memory machines
            temp_dir='../data/SRA/',  # temporary files are kept here
            bowtie_flags='--very-sensitive',
            bash_reader=hiclib_path + '/bin/sra/bin/fastq-dump -Z')

    # B. Parse the mapped sequences into an h5dict and assign the reads to
    #    HindIII restriction fragments.
    mapped_reads = h5dict.h5dict(sraid + '_mapped_reads.hdf5')  # local folder
    genome_db = genome.Genome(hiclib_path + '/fasta/hg19',
                              readChrms=['#', 'X'])
    mapping.parse_sam(
        sam_basename1=bam_prefix + '_1.bam',
        sam_basename2=bam_prefix + '_2.bam',
        out_dict=mapped_reads,
        genome_db=genome_db,
        enzyme_name='HindIII')
def doOne(inData, saveSams=True):
    """Map one pair of gzipped FASTQ files and parse them into an h5dict.

    Relies on module-level settings defined elsewhere in this file:
    ``seqSkipStart``, ``minMapLen``, ``bowtiePath``, ``bowtieIndex``,
    ``samFolder``, ``threads``, ``tmpDir``, ``bowtieFlags``, ``genome_db``,
    ``chunkSize`` and the helper ``calculateStep``.

    Args:
        inData: Triple ``(file1, file2, outfile)`` - the two gzipped FASTQ
            files of a pair and the path of the h5dict to create.
        saveSams: When false, every entry of ``samFolder`` whose name
            contains the basename of ``file1`` or ``file2`` is deleted
            after parsing.

    Raises:
        ValueError: If the second line of either FASTQ file (the first
            sequence) is shorter than 10 characters.

    Side effects:
        ``file1`` and ``file2`` are deleted once both have been mapped.
    """
    file1, file2, outfile = inData
    print("Mapping {0} and {1} into {2}".format(*inData))

    for onefile in file1, file2:
        # Probe the read length from the first sequence line. The context
        # manager closes the gzip handle, which the earlier code leaked.
        with gzip.open(onefile, 'r') as handle:
            handle.readline()
            length = len(handle.readline()) - 1  # drop the trailing newline
        if length < 10:
            raise ValueError(
                "Length of your sequence is {0}. Something is wrong".
                format(length))
        minlen, step = calculateStep(length - seqSkipStart, minMapLen)

        # First step. Map the reads iteratively.
        mapping.iterative_mapping(
            bowtie_path=bowtiePath,
            bowtie_index_path=bowtieIndex,
            fastq_path=onefile,
            out_sam_path=os.path.join(samFolder,
                                      os.path.basename(onefile) + ".sam"),
            seq_start=seqSkipStart,
            min_seq_len=minlen,
            len_step=step,
            nthreads=threads,
            # max_reads_per_chunk=10000000,  # optional, low-memory machines
            temp_dir=tmpDir,
            bowtie_flags=bowtieFlags,
        )

    # The inputs are no longer needed once both mates are mapped.
    os.remove(file1)
    os.remove(file2)

    # Second step. Parse the mapped sequences into an h5dict.
    mapped_reads = h5dict.h5dict(outfile)
    sf1, sf2 = [
        os.path.join(samFolder, os.path.basename(onefile) + ".sam")
        for onefile in (file1, file2)
    ]
    mapping.parse_sam(sam_basename1=sf1,
                      sam_basename2=sf2,
                      out_dict=mapped_reads,
                      genome_db=genome_db,
                      save_seqs=False,
                      maxReads=int(chunkSize * 1.6),
                      IDLen=50)

    if saveSams:
        return

    # Clean up every mapping output that belongs to this pair.
    base1 = os.path.basename(file1)
    base2 = os.path.basename(file2)
    for i in os.listdir(samFolder):
        if base1 in i or base2 in i:
            print("deleting", i)
            os.remove(os.path.join(samFolder, i))
def map_reads(first_fq, second_fq, outfile, nice):
    """Map a pair of FASTQ files to SAM and parse them into an h5dict.

    Relies on module-level names defined elsewhere in this file:
    ``check_len``, ``calculate_step``, ``seq_skip_start``, ``min_map_len``,
    ``bowtie_path``, ``bowtie_index``, ``threads``, ``bowtie_flags``,
    ``genome_db`` and ``args.samdir``.

    Args:
        first_fq: First FASTQ file of the pair (expected to end in
            ``.fastq.gz``).
        second_fq: Second FASTQ file of the pair.
        outfile: Name of the h5dict file, created inside ``args.samdir``.
        nice: Increment passed to ``os.nice`` for this sub-process.
    """
    # Set the niceness of this sub-process.
    os.nice(nice)

    # SAM names are everything before ".fastq.gz" plus ".sam".
    # NOTE(review): the name is derived from the full fastq path, so a
    # path with directories ends up joined below args.samdir as-is (and an
    # absolute path makes os.path.join ignore args.samdir) - confirm that
    # callers pass bare file names.
    fastqs = (first_fq, second_fq)
    sam_names = [fq.split(".fastq.gz")[0] + ".sam" for fq in fastqs]

    # Map each fastq file -> sam file.
    for fastq, sam_name in zip(fastqs, sam_names):
        read_len = check_len(fastq)
        min_len, step_size = calculate_step(read_len - seq_skip_start,
                                            min_map_len)
        mapping.iterative_mapping(
            bowtie_path=bowtie_path,
            bowtie_index_path=bowtie_index,
            fastq_path=fastq,
            out_sam_path=os.path.join(args.samdir, sam_name),
            min_seq_len=min_len,
            len_step=step_size,
            seq_start=seq_skip_start,
            nthreads=threads,
            bowtie_flags=bowtie_flags)

    # Parse the mapped sequences into the hdf5 dict structure and assign
    # the reads to HindIII restriction fragments.
    mapped_reads = h5dict.h5dict(os.path.join(args.samdir, outfile))
    sf1, sf2 = [os.path.join(args.samdir, name) for name in sam_names]
    mapping.parse_sam(sam_basename1=sf1,
                      sam_basename2=sf2,
                      out_dict=mapped_reads,
                      genome_db=genome_db,
                      save_seqs=False,
                      maxReads=10000000,
                      IDLen=50,
                      enzyme_name='HindIII')
def func():
    """Iteratively map the FASTQ file named by ``args.file`` against mm9.

    The output is written to
    ``/exports/eddie/scratch/s1529682/bams/<basename of args.file>.bam``.
    Temporary files go to the system temp directory. ``args``, ``path``
    and ``tempfile`` are expected at module level.
    """
    from hiclib import mapping

    source_fastq = args.file
    bam_name = path.split(source_fastq)[1] + '.bam'

    # Map the reads iteratively.
    mapping.iterative_mapping(
        bowtie_path='bowtie2',
        bowtie_index_path='../genomes/mm9/index/mm9',
        fastq_path=source_fastq,
        out_sam_path=path.join('/exports/eddie/scratch/s1529682/bams/',
                               bam_name),
        min_seq_len=25,
        len_step=5,
        nthreads=4,
        # max_reads_per_chunk=10000000,  # optional, on low-memory machines
        temp_dir=tempfile.gettempdir(),  # temporary files are kept here
        bowtie_flags='--very-sensitive')
def mapFile(fastq, read):
    """Iteratively map one read file and return the path of the BAM output.

    Settings are read from the module-level ``options`` object
    (``outputDir``, ``bowtie``, ``index``, ``cpus``, ``tmpDir``,
    ``verbose`` and, for ``.sra`` input, ``sra`` and ``readLength``).

    Args:
        fastq: Path of the input file. A ``.sra`` extension selects the
            SRA branch; anything else is mapped as a plain FASTQ file.
        read: Mate number. It is appended to the output name as
            ``_R<read>`` and, for ``.sra`` input, selects the window
            ``[readLength*(read-1), readLength*read)`` of each record.

    Returns:
        ``options.outputDir`` + basename of ``fastq`` without extension +
        ``_R<read>.bam``. ``options.outputDir`` is concatenated as-is, so
        it has to end with a path separator.
    """
    fileName, fileExtension = os.path.splitext(fastq)
    bamOutput = (options.outputDir + fileName.split(os.sep)[-1] +
                 '_R' + str(read) + '.bam')

    # Arguments shared by both input kinds.
    mapping_kwargs = dict(
        bowtie_path=options.bowtie,
        bowtie_index_path=options.index,
        fastq_path=fastq,
        out_sam_path=bamOutput,
        min_seq_len=25,
        len_step=5,
        nthreads=options.cpus,
        temp_dir=options.tmpDir,
        bowtie_flags='--very-sensitive')

    if fileExtension == '.sra':
        if options.verbose:
            # sys.stdout.write replaces the Python-2-only "print >>" form,
            # which raises TypeError at runtime on Python 3.
            sys.stdout.write(
                "Map short read archive %s utilizing %s" %
                (fastq, options.sra) + "\n")
        # The archive is streamed through the SRA dump tool and only the
        # window of bases belonging to this mate is mapped.
        mapping_kwargs.update(
            seq_start=options.readLength * (read - 1),
            seq_end=options.readLength * (read),
            bash_reader=options.sra + ' -Z')
    elif options.verbose:
        sys.stdout.write("Map fastq %s" % (fastq) + "\n")

    mapping.iterative_mapping(**mapping_kwargs)
    return bamOutput
def mapFile(fastq, read):
    """Iteratively map one read file and return the path of the BAM output.

    Settings are read from the module-level ``options`` object
    (``outputDir``, ``bowtie``, ``index``, ``minSeqLength``, ``stepSize``,
    ``cpus``, ``tmpDir``, ``verbose`` and, for ``.sra`` input, ``sra`` and
    ``readLength``).

    Args:
        fastq: Path of the input file. A ``.sra`` extension selects the
            SRA branch; anything else is mapped as a plain FASTQ file.
        read: Mate number. Only used for ``.sra`` input, where it selects
            the window ``[readLength*(read-1), readLength*read)`` of each
            record. It does not appear in the output file name.

    Returns:
        ``options.outputDir`` + basename of ``fastq`` without extension +
        ``.bam``. ``options.outputDir`` is concatenated as-is, so it has
        to end with a path separator.
    """
    fileName, fileExtension = os.path.splitext(fastq)
    bamOutput = options.outputDir + fileName.split(os.sep)[-1] + '.bam'
    is_sra = (fileExtension == '.sra')

    if options.verbose:
        # sys.stdout.write replaces the Python-2-only "print >>" form,
        # which raises TypeError at runtime on Python 3.
        if is_sra:
            message = "Map short read archive %s utilizing %s" % (
                fastq, options.sra)
        else:
            message = "Map fastq %s" % (fastq)
        sys.stdout.write(message + "\n")

    # Extra arguments needed only when reading from a short read archive.
    sra_kwargs = {}
    if is_sra:
        sra_kwargs = {
            'seq_start': options.readLength * (read - 1),
            'seq_end': options.readLength * (read),
            'bash_reader': options.sra + ' -Z',
        }

    mapping.iterative_mapping(
        bowtie_path=options.bowtie,
        bowtie_index_path=options.index,
        fastq_path=fastq,
        out_sam_path=bamOutput,
        min_seq_len=options.minSeqLength,
        len_step=options.stepSize,
        nthreads=options.cpus,
        temp_dir=options.tmpDir,
        bowtie_flags='--very-sensitive',
        **sra_kwargs)
    return bamOutput
FASTQ_fpath = tmp_folder + '/' + base_filename + '.sra' out_sam_fpath = tmp_folder + '/' + base_filename genome_name = 'mm9' if not os.path.exists(tmp_folder): os.mkdir(tmp_folder) # A. Map the reads iteratively. mapping.iterative_mapping( bowtie_path='../../bin/bowtie2/bowtie2', bowtie_index_path='../../bin/bowtie2/index/' + genome_name, fastq_path=FASTQ_fpath, out_sam_path=out_sam_fpath + '_1.bam', min_seq_len=25, len_step=5, seq_start=0, seq_end=49, nthreads=8, # on intel corei7 CPUs 4 threads are as fast as # 8, but leave some room for you other applications #max_reads_per_chunk = 10000000, #optional, on low-memory machines temp_dir=tmp_folder, # optional, keep temporary files here bowtie_flags='--very-sensitive', bash_reader='../../bin/sra/bin/fastq-dump -Z') mapping.iterative_mapping( bowtie_path='../../bin/bowtie2/bowtie2', bowtie_index_path='../../bin/bowtie2/index/' + genome_name, fastq_path=FASTQ_fpath, out_sam_path=out_sam_fpath + '_2.bam', min_seq_len=25, len_step=5,
for i in sorted(os.listdir(fastqDir)): expName = i folder = os.path.join(fastqDir, expName) file1 = glob.glob(folder+"/*1.fastq")[0] file2 = glob.glob(folder+"/*2.fastq")[0] if not os.path.exists(file1): raise if not os.path.exists(file2): raise # A. Map the reads iteratively. mapping.iterative_mapping( bowtie_path=bowtiePath, bowtie_index_path=bowtieIndex, fastq_path=file1, out_sam_path='sams/%s_1.bam' % expName, min_seq_len=10, # for bacteria mimimal mappable length is slightly over 10bp, so I start with 10bp len_step=3, # and go with a smaller step seq_start=0, seq_end=40, nthreads=4, # on intel corei7 CPUs 4 threads are as fast as # 8, but leave some room for you other applications #max_reads_per_chunk = 10000000, #optional, on low-memory machines temp_dir='tmp', # optional, keep temporary files here bowtie_flags='--very-sensitive') mapping.iterative_mapping( bowtie_path=bowtiePath, bowtie_index_path=bowtieIndex, fastq_path=file2, out_sam_path='sams/%s_2.bam' % expName, min_seq_len=10, len_step=3, seq_start=0,
# Input reads and output naming for this run. tmp_folder and base_filename
# are defined earlier in the script.
FASTQ_fpath = "/mnt/storage/home/vsfishman/tmp/Distr/LA2008_NcoI/LA.fastq"
out_sam_fpath = '/'.join((tmp_folder, base_filename))
genome_name = 'mm9'

if not os.path.exists(tmp_folder):
    os.mkdir(tmp_folder)

# A. Map the reads iteratively.
#    Only bases [76, 151) of each record are mapped here and written to
#    the "_2" BAM file.
mapping.iterative_mapping(
    bowtie_path='../bin/bowtie2/bowtie2',
    bowtie_index_path='../bin/bowtie2/index/%s' % genome_name,
    fastq_path=FASTQ_fpath,
    out_sam_path='%s_2.bam' % out_sam_fpath,
    min_seq_len=25,
    len_step=5,
    seq_start=76,
    seq_end=151,
    nthreads=8,
    # max_reads_per_chunk=10000000,
    temp_dir=tmp_folder,
    bowtie_flags='--very-sensitive',
    bash_reader=None)  # was: ../../bin/sra/bin/fastq-dump -Z

# B. Parse the mapped sequences into a Python data structure and assign the
#    reads to restriction fragments. (Currently disabled.)
# mapped_reads = h5dict.h5dict(maped_reads_filepath)
# genome_db = genome.Genome('../fasta/'+genome_name, readChrms=['#', 'X'])
# mapping.parse_sam(
#     sam_basename1=out_sam_fpath+'_1.bam',
lock.close() atexit.register(cleanFile, lockName) os.system("rm -rf {0}/{1}*".format(samFolder, expName.replace(".sra", ""))) # First step. Map the reads iteratively. mapping.iterative_mapping( bowtie_path=bowtiePath, bowtie_index_path=bowtieIndex, fastq_path=file1, out_sam_path='{0}/{1}_1.bam'.format(samFolder, expName), min_seq_len= minlen, # for bacteria mimimal mappable length is 15 bp, so I start with something slightly longer len_step=step, # and go with a usualy step nthreads=threads, # on intel corei7 CPUs 4 threads are as fast as # 8, but leave some room for you other applications # max_reads_per_chunk = 10000000, #optional, on low-memory machines temp_dir=tmpDir, seq_start=0, seq_end=length, bash_reader="fastq-dump -Z", bowtie_flags=" --very-sensitive ", ) mapping.iterative_mapping( bowtie_path=bowtiePath, bowtie_index_path=bowtieIndex, fastq_path=file1, out_sam_path='{0}/{1}_2.bam'.format(samFolder, expName), min_seq_len=minlen,
import logging from hiclib import mapping from mirnylib import h5dict, genome logging.basicConfig(level=logging.DEBUG) if not os.path.exists('../data/tmp'): os.mkdir('../data/tmp') # Map the reads iteratively. mapping.iterative_mapping( bowtie_path='/usr/bin/bowtie2', bowtie_index_path='../Index/hg19', fastq_path='../data/SRR1658595_10M_1.fastq', out_sam_path='../data/SRR1658595_10M_1.bam', min_seq_len=25, len_step=5, seq_start=0, seq_end=35, nthreads=2, temp_dir='../data/tmp', bowtie_flags='--very-sensitive') mapping.iterative_mapping( bowtie_path='/usr/bin/bowtie2', bowtie_index_path='../Index/hg19', fastq_path='../data/SRR1658595_10M_2.fastq', out_sam_path='../data/SRR1658595_10M_2.bam', min_seq_len=25, len_step=5, seq_start=0, seq_end=35,
def step1(
        hiclib_path,  # the path of the hiclib folder on this machine
        dataset='Kalhor2012NB',
        sraid='SRR071231',
        readlen=40):  # each read has length 40
    '''
    1. Map reads to the genome
    http://mirnylab.bitbucket.org/hiclib/tutorial/01_iterative_mapping.html

    Reads ``../data/SRA/<dataset>/<sraid>/<sraid>.sra`` through fastq-dump,
    maps bases ``[0, readlen)`` and ``[readlen, 2*readlen)`` separately
    against the hg19 bowtie2 index, and writes ``../data/SRA/<sraid>_1.bam``
    and ``../data/SRA/<sraid>_2.bam``. Both files are then parsed with
    ``mapping.parse_sam`` (enzyme HindIII) into
    ``<sraid>_mapped_reads.hdf5`` in the current working directory.
    Returns None.
    '''
    # Adapted from the hiclib tutorial.
    import os
    import logging
    from hiclib import mapping
    from mirnylib import h5dict, genome

    logging.basicConfig(level=logging.DEBUG)

    first_bam = '../data/SRA/' + sraid + '_1.bam'
    second_bam = '../data/SRA/' + sraid + '_2.bam'

    # Settings common to both mapping passes.
    shared = dict(
        bowtie_path=hiclib_path + '/bin/bowtie2/bowtie2',
        bowtie_index_path=hiclib_path + '/bin/bowtie2/index/hg19',
        fastq_path='../data/SRA/' + dataset + '/' + sraid + '/' + sraid +
        '.sra',
        min_seq_len=25,
        len_step=5,
        nthreads=12,
        # max_reads_per_chunk=10000000,  # optional, on low-memory machines
        temp_dir='../data/SRA/',  # temporary files are kept here
        bowtie_flags='--very-sensitive',
        bash_reader=hiclib_path + '/bin/sra/bin/fastq-dump -Z')

    # A. Map the reads iteratively: first window, then second window.
    mapping.iterative_mapping(out_sam_path=first_bam,
                              seq_start=0,
                              seq_end=readlen,
                              **shared)
    mapping.iterative_mapping(out_sam_path=second_bam,
                              seq_start=readlen,
                              seq_end=2 * readlen,
                              **shared)

    # B. Parse the mapped sequences into a Python data structure and
    #    assign the reads to restriction fragments.
    mapped_reads = h5dict.h5dict(sraid + '_mapped_reads.hdf5')  # local folder
    genome_db = genome.Genome(hiclib_path + '/fasta/hg19',
                              readChrms=['#', 'X'])
    mapping.parse_sam(sam_basename1=first_bam,
                      sam_basename2=second_bam,
                      out_dict=mapped_reads,
                      genome_db=genome_db,
                      enzyme_name='HindIII')
from mirnylib import h5dict, genome logging.basicConfig(level=logging.DEBUG) if not os.path.exists('../../data/sample/tmp/'): os.mkdir('../../data/sample/tmp/') # A. Map the reads iteratively. mapping.iterative_mapping( bowtie_path='../../bin/bowtie2/bowtie2', bowtie_index_path='../../bin/bowtie2/index/hg19', fastq_path='../../data/sample/SRR027956.sra', out_sam_path='../../data/sample/SRR027056_1.bam', min_seq_len=25, len_step=5, seq_start=0, seq_end=75, nthreads=4, # on intel corei7 CPUs 4 threads are as fast as # 8, but leave some room for you other applications #max_reads_per_chunk = 10000000, #optional, on low-memory machines temp_dir='../../data/sample/tmp', # optional, keep temporary files here bowtie_flags='--very-sensitive', bash_reader='../../bin/sra/bin/fastq-dump -Z') mapping.iterative_mapping( bowtie_path='../../bin/bowtie2/bowtie2', bowtie_index_path='../../bin/bowtie2/index/hg19', fastq_path='../../data/sample/SRR027956.sra', out_sam_path='../../data/sample/SRR027056_2.bam', min_seq_len=25, len_step=5,
from hiclib import mapping from mirnylib import h5dict, genome logging.basicConfig(level=logging.DEBUG) if not os.path.exists('../../data/serov/tmp/'): os.mkdir('../../data/serov/tmp/') # A. Map the reads iteratively. mapping.iterative_mapping( bowtie_path='../../bin/bowtie2/bowtie2', bowtie_index_path='../../bin/bowtie2/index/mm10', fastq_path='../../data/serov/HiC_Sp.fastq', out_sam_path='../../data/serov/HiC_Sp_1.bam', min_seq_len=25, len_step=5, seq_start=0, seq_end=50, nthreads=4, # on intel corei7 CPUs 4 threads are as fast as # 8, but leave some room for you other applications #max_reads_per_chunk = 10000000, #optional, on low-memory machines temp_dir='../../data/serov/tmp', # optional, keep temporary files here bowtie_flags='--very-sensitive') # bash_reader='../../bin/sra/bin/fastq-dump -Z') mapping.iterative_mapping( bowtie_path='../../bin/bowtie2/bowtie2', bowtie_index_path='../../bin/bowtie2/index/mm10', fastq_path='../../data/serov/HiC_Sp.fastq', out_sam_path='../../data/serov/HiC_Sp_2.bam', min_seq_len=25, len_step=5,
os.system("rm -rf {0}/{1}*".format(samFolder, expName.replace(".sra", ""))) # First step. Map the reads iteratively. mapping.iterative_mapping( bowtie_path=bowtiePath, bowtie_index_path=bowtieIndex, fastq_path=file1, out_sam_path='{0}/{1}_1.bam'.format(samFolder, expName), min_seq_len=minlen, # for bacteria mimimal mappable length is 15 bp, so I start with something slightly longer len_step=step, # and go with a usualy step nthreads=threads, # on intel corei7 CPUs 4 threads are as fast as # 8, but leave some room for you other applications # max_reads_per_chunk = 10000000, #optional, on low-memory machines temp_dir=tmpDir, seq_start=0, seq_end=length, bash_reader="fastq-dump -Z", bowtie_flags=" --very-sensitive ", ) mapping.iterative_mapping( bowtie_path=bowtiePath, bowtie_index_path=bowtieIndex, fastq_path=file1, out_sam_path='{0}/{1}_2.bam'.format(samFolder, expName), min_seq_len=minlen, len_step=step,