def call_variants_mpileup_lsf(bams,ref,outroot,vcfbase,njobs=100,mpileup_args='',gatk_jar=gatk_jar,gatk_ram=8,tmpdir=None,queue='normal_serial',job_ram='30000',MAX_RETRY=MAX_RETRY,include_regions=None,fallback_queue=''):
    '''Call variants with samtools mpileup | bcftools over reference partitions, scheduled on a grid.

    Partitions the reference into njobs chunks (see partition_reference), builds one
    mpileup|bcftools command per contig within each chunk, submits them via
    schedule_jobs, then merges all per-contig part VCFs into a single output VCF
    (<outroot>/<vcfbase>-mpileup.vcf) with merge_vcf_parts_cmd.

    bams: list of BAM paths (joined with ' -I ' for the merge step and passed to mpileup)
    gatk_jar/gatk_ram: used only for the final GATK-based VCF merge
    job_ram: in MB, used in the LSF resource selection string
    Raises OSError if the final VCF merge command exits nonzero.
    NOTE(review): relies on module globals opts, JOB_MEM_OVERHEAD, schedule_jobs, run_safe.
    '''
    if tmpdir is None:
        tmpdir = os.path.join(outroot,'gatk_tmp')
    bamstr = ' -I '.join(bams)
    regions = partition_reference(ref,njobs,include_regions)
    # drop a trailing '.vcf' from vcfbase if present
    vcfbasename = vcfbase.endswith('.vcf') and vcfbase[:-4] or vcfbase
    mpoutvcfbase = '%s-mpileup' % (vcfbasename)
    mpoutvcf = os.path.join(outroot,mpoutvcfbase+'.vcf')
    vcf_parts_root = os.path.join(outroot,mpoutvcfbase+'-vcf_parts')
    try:
        os.makedirs(vcf_parts_root)
    except:
        pass  # directory may already exist
    to_run_dict = {}
    #merge_subparts_trd = {}
    subparts = []  # every per-contig part VCF; input to the final merge
    for i,reg in enumerate(regions):
        start,end = start_end_strs(reg)
        #regstr = ' -L '.join(reg)
        partdonebase = os.path.join(vcf_parts_root,'%s_%dof%d_%sto%s-parts' % (mpoutvcfbase,i,len(regions),start,end))
        partvcf = os.path.join(vcf_parts_root,'%s_%dof%d_%sto%s.vcf' % (mpoutvcfbase,i,len(regions),start,end))
        part_sh = os.path.join(vcf_parts_root,'%s_%dof%d_%sto%s.sh' % (mpoutvcfbase,i,len(regions),start,end))
        #cmd = 'java -Xmx%sg -Djava.io.tmpdir=%s -jar %s -R %s -T %s -o %s %s -I %s -L %s' % (gatk_ram,tmpdir,gatk_jar,ref,gatk_program,partvcf,gatk_args,bamstr,regstr)
        this_trd = {}
        # one mpileup|bcftools command per contig (region) within this partition
        for this_reg in reg:
            subpart = this_reg.split(':')[0]  # contig name from "contig:start-end"
            subpartvcf = os.path.join(vcf_parts_root,'%s_%dof%d_%sto%s-%s.vcf' % (mpoutvcfbase,i,len(regions),start,end,subpart))
            this_cmd = 'samtools mpileup -Dgu -r %s %s -f %s %s | bcftools view -cvg - > %s 2> %s.log' % (this_reg,mpileup_args,ref,bamstr,subpartvcf,subpartvcf)
            this_trd[subpartvcf] = run_safe.safe_script(this_cmd,subpartvcf)
        # only commands whose done-flags are missing; chain them as one grid job
        cmd_parts = unfinished_cmds(this_trd)
        cmd = '; '.join(cmd_parts)
        #open(part_sh,'w').write('#!/usr/bin/env bash\n'+cmd+'\n')
        #os.system('chmod +x %s' % part_sh)
        to_run_dict[partdonebase] = run_safe.safe_script(cmd,partdonebase)
        subparts.extend(this_trd.keys())
        #vcfparts = this_trd.keys() ### <---MAKE THIS WORK (merge in parts before merge all)
        #merge_subparts_trd[partvcf] = run_safe.safe_script(merge_vcf_parts_cmd(vcfparts,ref,partvcf,gatk_jar,gatk_ram,tmpdir,rod_type = ':VCF'),partvcf)
    #SLURM here
    logfile = os.path.join(vcf_parts_root,'logs','mpileup-parts')
    schedule_jobs(to_run_dict,opts.scheduler,'mpileup',logfile,queue,requeue=fallback_queue,njobs=njobs,duration=opts.max_job_duration,mem=(gatk_ram*1024)+JOB_MEM_OVERHEAD,flags='-R "select[mem>%s]"' % job_ram,MAX_RETRY=MAX_RETRY)
    #LSF.lsf_run_until_done(to_run_dict,logfile,queue,'-R "select[mem>%s]"' % job_ram, 'mpileup',njobs,MAX_RETRY)
    # merge all per-contig VCF parts into the single output VCF (runs locally)
    cmd = run_safe.safe_script(merge_vcf_parts_cmd(subparts,ref,mpoutvcf,gatk_jar,gatk_ram,tmpdir),mpoutvcf)
    ret = os.system(cmd)
    if ret != 0:
        raise OSError, 'VCF merge failed:\n%s' % cmd
def run_parallel_blat(subjects,queries,blattile,blatargstr='',num_cores='+0'): ''' runs blat commands using GUN parallel. ''' blatargstr += ' -tileSize=%s' % blattile blatargstr += ' -stepSize=%s' % (int(blattile)/2) cmds = [] labf = [] for q in queries: for subject in subjects: subjname = os.path.basename(subject).rstrip('.fa').rstrip('_subj') outbase = q.rstrip('.fa').rstrip('_query')+'_blat'+'-subj'+subjname+blatargstr.replace('=','').replace(' ','') labf.append(outbase+'.label.gz') cmd = '%s %s %s %s "%s" %s' % (sys.executable, os.path.join(radtag_denovo, 'mcl_id_triples_by_blat.py'),subject,q,blatargstr,outbase) cmds.append(run_safe.safe_script(cmd,outbase)) shscr = os.path.join(os.path.dirname(subjects[0]) , 'runblat.sh') smartopen(shscr, 'w').writelines([cmd+';\n' for cmd in cmds]) os.system('chmod +x '+shscr) ret = os.system('parallel --progress -j %s < %s' % (num_cores,shscr)) if ret != 0 or not all([os.path.exists(f) for f in labf]): raise OSError, 'blat failed with code %s' % ret return labf
def run_lsf_blat(subjects, queries, blattile, blatargstr='', num_batches=100, queue='normal_serial'):
    '''submits mcl_id_triples_by_blat.py jobs to LSF

    intended as an example of parallelization over a compute grid; uses a module
    LSF.py for interaction with scheduler.

    subjects, queries: fasta file paths; blattile sets blat -tileSize with
    -stepSize at half the tile. Returns the list of expected .label.gz outputs;
    raises OSError if any are missing after the LSF runs complete.
    '''
    import LSF, run_safe
    blatargstr += ' -tileSize=%s' % blattile
    blatargstr += ' -stepSize=%s' % (int(blattile) / 2)
    #cmds = []
    labf = []
    to_run_dict = {}
    for q in queries:
        for subject in subjects:
            # NOTE(review): rstrip strips a trailing character set, not a suffix
            # ('data.fa'.rstrip('.fa') -> 'dat') — confirm basenames never end in
            # extra '.', 'f', 'a' (or '_subj'/'_query' set) characters.
            subjname = os.path.basename(subject).rstrip('.fa').rstrip('_subj')
            outbase = q.rstrip('.fa').rstrip('_query') + '_blat' + '-subj' + subjname + blatargstr.replace('=', '').replace(' ', '')
            labf.append(outbase + '.label.gz')
            # ESCAPES UNNECESSARY WITH safe_script
            #cmds.append('%smcl_id_triples_by_blat.py %s %s \\"%s\\" %s' % (radtag_denovo,subject,q,blatargstr,outbase))
            cmd = '%s %s %s %s "%s" %s' % (sys.executable, os.path.join(radtag_denovo, 'mcl_id_triples_by_blat.py'), subject, q, blatargstr, outbase)
            to_run_dict[outbase] = run_safe.safe_script(cmd, outbase)
    logfile = os.path.join(os.path.dirname(subjects[0]), 'blat-log/blat-log')
    # submit and block until all jobs finish (max 3 retries per job)
    LSF.lsf_run_until_done(to_run_dict, logfile, queue, '-R "select[mem > 20000]"', 'blat2mat', num_batches, 3)
    # REPLACED BY lsf_run_until_done ABOVE
    #logfiles = glob(logfile+'*.lsflog')
    #for lf in logfiles:
    #    try:
    #        os.unlink(lf)
    #    except:
    #        pass
    #print >> sys.stderr, 'LSF %s\nlog: %s' % (cmds,logfile)
    #import time
    #while len(cmds) > 0:
    #    jobids,namedict = LSF.lsf_jobs_submit(cmds,logfile,'normal_serial',bsub_flags='-R "select[mem > 20000]"',jobname_base='blat2mat',num_batches=num_batches)
    #    time.sleep(20)
    #    LSF.lsf_wait_for_jobs(jobids,logfile,namedict=namedict)
    #    logfiles = glob(logfile+'*.lsflog')
    #    cmds = reduce(lambda x,y:x+y, [LSF.lsf_no_success_from_log(lf) for lf in logfiles])
    if not all([os.path.exists(f) for f in labf]):
        raise OSError, 'blat failed'
    return labf
def run_local_blat(subjects,queries,blattile,blatargstr='',num_cores=1): ''' runs blat commands using os.system() runs all jobs as a single batch, to run on multiple cores/computers, consider run_parallel_blat() ''' blatargstr += ' -tileSize=%s' % blattile blatargstr += ' -stepSize=%s' % (int(blattile)/2) cmds = [] labf = [] for q in queries: for subject in subjects: subjname = os.path.basename(subject).rstrip('.fa').rstrip('_subj') outbase = q.rstrip('.fa').rstrip('_query')+'_blat'+'-subj'+subjname+blatargstr.replace('=','').replace(' ','') labf.append(outbase+'.label.gz') cmd = '%smcl_id_triples_by_blat.py %s %s "%s" %s' % (radtag_denovo,subject,q,blatargstr,outbase) cmds.append(run_safe.safe_script(cmd,outbase)) shscr = os.path.join(os.path.dirname(subjects[0]) , 'runblat.sh') smartopen(shscr, 'w').writelines([cmd+';\n' for cmd in cmds]) os.system('chmod +x '+shscr) ret = os.system(shscr) if ret != 0 or not all([os.path.exists(f) for f in labf]): raise OSError, 'blat failed with code %s' % ret return labf
def run_parallel_blat(subjects, queries, blattile, blatargstr='', num_cores='+0'): ''' runs blat commands using GUN parallel. ''' blatargstr += ' -tileSize=%s' % blattile blatargstr += ' -stepSize=%s' % (int(blattile) / 2) cmds = [] labf = [] for q in queries: for subject in subjects: subjname = os.path.basename(subject).rstrip('.fa').rstrip('_subj') outbase = q.rstrip('.fa').rstrip( '_query') + '_blat' + '-subj' + subjname + blatargstr.replace( '=', '').replace(' ', '') labf.append(outbase + '.label.gz') cmd = '%smcl_id_triples_by_blat.py %s %s "%s" %s' % ( radtag_denovo, subject, q, blatargstr, outbase) cmds.append(run_safe.safe_script(cmd, outbase)) shscr = os.path.join(os.path.dirname(subjects[0]), 'runblat.sh') smartopen(shscr, 'w').writelines([cmd + ';\n' for cmd in cmds]) os.system('chmod +x ' + shscr) ret = os.system('parallel --progress -j %s < %s' % (num_cores, shscr)) if ret != 0 or not all([os.path.exists(f) for f in labf]): raise OSError, 'blat failed with code %s' % ret return labf
def run_local_blat(subjects, queries, blattile, blatargstr='', num_cores=1): ''' runs blat commands using os.system() runs all jobs as a single batch, to run on multiple cores/computers, consider run_parallel_blat() ''' blatargstr += ' -tileSize=%s' % blattile blatargstr += ' -stepSize=%s' % (int(blattile) / 2) cmds = [] labf = [] for q in queries: for subject in subjects: subjname = os.path.basename(subject).rstrip('.fa').rstrip('_subj') outbase = q.rstrip('.fa').rstrip( '_query') + '_blat' + '-subj' + subjname + blatargstr.replace( '=', '').replace(' ', '') labf.append(outbase + '.label.gz') cmd = '%s %s %s %s "%s" %s' % ( sys.executable, os.path.join(radtag_denovo, 'mcl_id_triples_by_blat.py'), subject, q, blatargstr, outbase) cmds.append(run_safe.safe_script(cmd, outbase)) shscr = os.path.join(os.path.dirname(subjects[0]), 'runblat.sh') smartopen(shscr, 'w').writelines([cmd + ';\n' for cmd in cmds]) os.system('chmod +x ' + shscr) ret = os.system(shscr) if ret != 0 or not all([os.path.exists(f) for f in labf]): raise OSError, 'blat failed with code %s' % ret return labf
def overlap_by_seqprep(r1, r2, outbase, pct_id=0.8, min_ol=10, adaptA='GATCGGAAGAGCACACG', adaptB='AGATCGGAAGAGCGTCGT'): '''adaptA is the adapter 1 sequence AS IT APPEARS IN READ 1, likewise adaptB is adapter 2 sequence AS IT APPEARS IN READ 2. in other words, DB_adapt_trim_seqs r1 (read 1 adapter read-through) is A, r2 is B ''' trim1 = outbase + '.R1.trim.fastq.gz' trim2 = outbase + '.R2.trim.fastq.gz' drop1 = outbase + '.R1.drop.fastq.gz' drop2 = outbase + '.R2.drop.fastq.gz' merge = outbase + '.merge.fastq.gz' aln = outbase + '.merge.aln.gz' cmd = 'SeqPrep -f %s -r %s -1 %s -2 %s -3 %s -4 %s -A %s -B %s -s %s -E %s -o %s -m %s -n %s' % ( r1, r2, trim1, trim2, drop1, drop2, adaptA, adaptB, merge, aln, min_ol, 1 - pct_id, pct_id) ss = run_safe.safe_script(cmd, outbase + '-seqprep', force_write=True) ret = os.system(ss) if ret != 0: raise OSError, 'seqprep run failed' return merge, trim1, trim2
def submit_runs(vidroot,default_start, \
                nframes,nparts,nstep,param_queue,seglen, \
                ground_improvement,ground_suppress, \
                outline_engine, \
                num_jobs_running_max,num_jobs_new_max,skip_fn=skip_fn):
    '''Scan vidroot for */*-config.dict configs and bsub summarize_segment_opencv.py runs.

    Submits at most num_jobs_new_max new jobs, and never exceeds
    num_jobs_running_max concurrently running jobs (per get_current_run_count()).
    Skips a video when: it was removed, a skip flag (skip_fn(vid)) exists, it is
    shorter than MIN_VID_DUR, its done-flag exists, it is already in `bjobs -w`
    output, or it has reached MAX_RETRY attempt flags for all rerun conditions.
    Raises OSError when a bsub submission fails.
    NOTE(review): relies on module globals QUEUE, MIN_VID_DUR, MAX_RETRY, RERUN_COEFFS.
    '''
    num_current = get_current_run_count()
    num_new = num_jobs_running_max - num_current
    if num_new < 1:
        print >> sys.stderr, 'number of currently running jobs (%s) meets or exceeds max concurrent (%s)' % (num_current, num_jobs_running_max)
        return None
    else:
        launched = 0
        for cfg in sorted(glob(os.path.join(vidroot,'*/*-config.dict'))):
            if launched == num_new or launched >= num_jobs_new_max:
                break
            # re-query the scheduler every iteration so just-launched jobs count
            currjobs = Popen('bjobs -w',shell=True,stdout=PIPE).stdout.read()
            print >> sys.stderr, cfg,'\t',
            vid = cfg.split('-config')[0]+'.mp4'
            if not os.path.exists(vid):
                print >> sys.stderr, 'video removed; skipping'
                continue
            if os.path.exists(skip_fn(vid)):
                print >> sys.stderr, 'skip flag found; skipping'
                continue
            if vidtools.vid_duration(vid) < MIN_VID_DUR: #only analyze videos longer than 8hrs
                print >> sys.stderr, 'too short; skip'
                continue
            donebase = '%s-l%snp%snf%sns%sgi%sgs%soe%s' % (vid[:-4],seglen,nparts,nframes,nstep,ground_improvement,ground_suppress,outline_engine)
            vs = 'np%snf%sns%sgi%sgs%soe%s' % (nparts,nframes,nstep,ground_improvement,ground_suppress,outline_engine)
            attempts_dir = donebase+'-ATTEMPTS'
            if os.path.exists(donebase+'.done'):
                print >> sys.stderr, 'done'
            elif donebase in currjobs:
                print >> sys.stderr, 'running'
            elif os.path.exists(attempts_dir) and len(glob(os.path.join(attempts_dir,'attempt*'))) >= MAX_RETRY:
                # out of retries under current parameters; look for an untried rerun condition
                nrrc = next_rerun_condition(cfg,RERUN_COEFFS, \
                                            nframes,nparts,nstep,seglen, \
                                            ground_improvement,ground_suppress, \
                                            outline_engine,return_state=True)
                if nrrc is None:
                    print >> sys.stderr, 'too many attempts (%s) for all conditions (%s); see %s' % (len(glob(os.path.join(attempts_dir,'attempt*'))),RERUN_COEFFS,attempts_dir)
                else:
                    thresh_coeff,state = nrrc
                    print >> sys.stderr, 'rerun %s %s' % (thresh_coeff,state)
            else:
                cmd = 'summarize_segment_opencv.py -l %s -s %s -nf %s -np %s -ns %s -q %s -gi %s -gs %s -oe %s -ac %s -vs %s %s' % (seglen,default_start,nframes,nparts,nstep,param_queue,ground_improvement,ground_suppress,outline_engine,cfg,vs, vid)
                logfile = donebase+'.lsflog'
                ss = run_safe.safe_script(cmd,donebase,force_write=True)
                subcmd = 'bsub -q %s -o %s %s' % (QUEUE,logfile,ss)
                #print >> sys.stderr, '\n\t',subcmd
                ret = os.system(subcmd)
                launched += 1
                if ret == 0:
                    if not os.path.exists(attempts_dir):
                        os.makedirs(attempts_dir)
                    # record an attempt flag for this submission
                    at_ret = os.system('touch %s' % os.path.join(attempts_dir,'attempt'+time.strftime('%Y%m%d-%H%M%S')))
                    # NOTE(review): `ret` is known to be 0 in this branch so this
                    # check never fires; presumably `at_ret` was intended — confirm.
                    if ret != 0:
                        print >> sys.stderr, 'WRITING ATTEMPT FLAG TO %s FAILED' % os.path.join(attempts_dir,'attempt'+time.strftime('%Y%m%d-%H%M%S'))
                else:
                    errstr = 'submission of job failed:\n%s' % subcmd
                    raise OSError, errstr
def map_by_stampy(reads,reference,mapped,stampy_args='--maxbasequal=60 --bwamark',make_index=True,force_index=False): if make_index: make_stampy_ref_idx(reference,force_index=force_index) cmd = 'stampy.py --overwrite %s -h %s -g %s -M %s -o %s.sam; samtools view -bS %s.sam > %s.bam' % (stampy_args,reference,reference, reads, mapped,mapped,mapped) ss = run_safe.safe_script(cmd,mapped,force_write=True) ret = os.system(ss) if ret == 0 and os.path.exists(mapped+'.bam'): print >> sys.stderr, '%s.bam created' % mapped else: errstr = 'mapping %s to %s failed' % (reads, reference) raise OSError, errstr return mapped+'.bam'
def run_lsf_blat(subjects,queries,blattile,blatargstr='',num_batches=100,queue='normal_serial'):
    '''submits mcl_id_triples_by_blat.py jobs to LSF

    intended as an example of parallelization over a compute grid; uses a module
    LSF.py for interaction with scheduler.

    subjects, queries: fasta file paths; blattile sets blat -tileSize with
    -stepSize at half the tile. Returns the list of expected .label.gz outputs;
    raises OSError if any are missing after the LSF runs complete.
    '''
    import LSF,run_safe
    blatargstr += ' -tileSize=%s' % blattile
    blatargstr += ' -stepSize=%s' % (int(blattile)/2)
    #cmds = []
    labf = []
    to_run_dict = {}
    for q in queries:
        for subject in subjects:
            # NOTE(review): rstrip strips a trailing character set, not a suffix
            # ('data.fa'.rstrip('.fa') -> 'dat') — confirm basenames never end in
            # extra '.', 'f', 'a' (or '_subj'/'_query' set) characters.
            subjname = os.path.basename(subject).rstrip('.fa').rstrip('_subj')
            outbase = q.rstrip('.fa').rstrip('_query')+'_blat'+'-subj'+subjname+blatargstr.replace('=','').replace(' ','')
            labf.append(outbase+'.label.gz')
            # ESCAPES UNNECESSARY WITH safe_script
            #cmds.append('%smcl_id_triples_by_blat.py %s %s \\"%s\\" %s' % (radtag_denovo,subject,q,blatargstr,outbase))
            cmd = '%s %s %s %s "%s" %s' % (sys.executable, os.path.join(radtag_denovo, 'mcl_id_triples_by_blat.py'),subject,q,blatargstr,outbase)
            to_run_dict[outbase] = run_safe.safe_script(cmd,outbase)
    logfile = os.path.join(os.path.dirname(subjects[0]),'blat-log/blat-log')
    # submit and block until all jobs finish (max 3 retries per job)
    LSF.lsf_run_until_done(to_run_dict, logfile, queue, '-R "select[mem > 20000]"', 'blat2mat', num_batches, 3)
    # REPLACED BY lsf_run_until_done ABOVE
    #logfiles = glob(logfile+'*.lsflog')
    #for lf in logfiles:
    #    try:
    #        os.unlink(lf)
    #    except:
    #        pass
    #print >> sys.stderr, 'LSF %s\nlog: %s' % (cmds,logfile)
    #import time
    #while len(cmds) > 0:
    #    jobids,namedict = LSF.lsf_jobs_submit(cmds,logfile,'normal_serial',bsub_flags='-R "select[mem > 20000]"',jobname_base='blat2mat',num_batches=num_batches)
    #    time.sleep(20)
    #    LSF.lsf_wait_for_jobs(jobids,logfile,namedict=namedict)
    #    logfiles = glob(logfile+'*.lsflog')
    #    cmds = reduce(lambda x,y:x+y, [LSF.lsf_no_success_from_log(lf) for lf in logfiles])
    if not all([os.path.exists(f) for f in labf]):
        raise OSError, 'blat failed'
    return labf
def map_by_bwa(reads,reference,mapped,bwa_args='',make_index=True,force_index=False): if make_index: make_bwa_ref_idx(reference,force_index=force_index) if reads.endswith('.bam'): cmd = 'samtools bam2fq %s | bwa mem %s %s - | samtools view -bS - > %s.bam' % (reads,bwa_args, reference, mapped) else: cmd = 'bwa mem %s %s %s | samtools view -bS - > %s.bam' % (bwa_args, reference, reads, mapped) ss = run_safe.safe_script(cmd,mapped,force_write=True) ret = os.system(ss) if ret == 0 and os.path.exists(mapped+'.bam'): print >> sys.stderr, '%s.bam created' % mapped else: errstr = 'mapping %s to %s failed' % (reads, reference) raise OSError, errstr return mapped+'.bam'
def overlap_by_seqprep(r1,r2,outbase,pct_id=0.8,min_ol=10,adaptA='GATCGGAAGAGCACACG',adaptB='AGATCGGAAGAGCGTCGT'): '''adaptA is the adapter 1 sequence AS IT APPEARS IN READ 1, likewise adaptB is adapter 2 sequence AS IT APPEARS IN READ 2. in other words, DB_adapt_trim_seqs r1 (read 1 adapter read-through) is A, r2 is B ''' trim1 = outbase+'.R1.trim.fastq.gz' trim2 = outbase+'.R2.trim.fastq.gz' drop1 = outbase+'.R1.drop.fastq.gz' drop2 = outbase+'.R2.drop.fastq.gz' merge = outbase+'.merge.fastq.gz' aln = outbase+'.merge.aln.gz' cmd = 'SeqPrep -f %s -r %s -1 %s -2 %s -3 %s -4 %s -A %s -B %s -s %s -E %s -o %s -m %s -n %s' % (r1,r2,trim1,trim2,drop1,drop2,adaptA,adaptB,merge,aln,min_ol,1-pct_id,pct_id) ss = run_safe.safe_script(cmd,outbase+'-seqprep',force_write=True) ret = os.system(ss) if ret != 0: raise OSError, 'seqprep run failed' return merge,trim1,trim2
def map_by_stampy(reads, reference, mapped, stampy_args='--maxbasequal=60 --bwamark', make_index=True, force_index=False): if make_index: make_stampy_ref_idx(reference, force_index=force_index) cmd = 'stampy.py --overwrite %s -h %s -g %s -M %s -o %s.sam; samtools view -bS %s.sam > %s.bam' % ( stampy_args, reference, reference, reads, mapped, mapped, mapped) ss = run_safe.safe_script(cmd, mapped, force_write=True) ret = os.system(ss) if ret == 0 and os.path.exists(mapped + '.bam'): print >> sys.stderr, '%s.bam created' % mapped else: errstr = 'mapping %s to %s failed' % (reads, reference) raise OSError, errstr return mapped + '.bam'
def submit_one(cfg, default_start, \
               nframes,nparts,nstep,param_queue,seglen, \
               ground_improvement,ground_suppress,\
               outline_engine, \
               thresh_coeff=None, \
               run_local=False,**kwargs):
    '''Submit (or locally run) one summarize_segment_opencv.py job for the given config.

    Builds the analysis command from the video derived from cfg, wraps it with
    run_safe.safe_script, and either runs it locally (run_local=True) or submits
    via bsub to the module-global QUEUE. On successful launch an attempt flag is
    touched under <donebase>-ATTEMPTS; on failed submission raises OSError.
    thresh_coeff, when set, adds a -tc argument. **kwargs is accepted but unused.
    '''
    vid = vid_from_cfg(cfg)
    donebase,vs = donebase_from_param(vid, \
                                      nframes,nparts,nstep,seglen, \
                                      ground_improvement,ground_suppress,\
                                      outline_engine, \
                                      thresh_coeff=thresh_coeff,return_vs=True)
    attempts_dir = donebase + '-ATTEMPTS'
    if thresh_coeff:
        cmd = 'summarize_segment_opencv.py -l %s -s %s -nf %s -np %s -ns %s -q %s -tc %s -gi %s -gs %s -oe %s -ac %s -vs %s --max_itertime %s %s' % (
            seglen, default_start, nframes, nparts, nstep, param_queue,
            thresh_coeff, ground_improvement, ground_suppress, outline_engine,
            cfg, vs, MAX_ITERTIME, vid)
    else:
        cmd = 'summarize_segment_opencv.py -l %s -s %s -nf %s -np %s -ns %s -q %s -gi %s -gs %s -oe %s -ac %s -vs %s --max_itertime %s %s' % (
            seglen, default_start, nframes, nparts, nstep, param_queue,
            ground_improvement, ground_suppress, outline_engine, cfg, vs,
            MAX_ITERTIME, vid)
    logfile = donebase + '.lsflog'
    ss = run_safe.safe_script(cmd, donebase, force_write=True)
    subcmd = 'bsub -q %s -o %s %s' % (QUEUE, logfile, ss)
    #print >> sys.stderr, '\n\t',subcmd
    if run_local:
        ret = os.system(ss)
    else:
        ret = os.system(subcmd)
    if ret == 0:
        if not os.path.exists(attempts_dir):
            os.makedirs(attempts_dir)
        # record an attempt flag for this launch
        at_ret = os.system('touch %s' % os.path.join(
            attempts_dir, 'attempt' + time.strftime('%Y%m%d-%H%M%S')))
        # NOTE(review): `ret` is known to be 0 in this branch so this check
        # never fires; presumably `at_ret` was intended — confirm.
        if ret != 0:
            print >> sys.stderr, 'WRITING ATTEMPT FLAG TO %s FAILED' % os.path.join(
                attempts_dir, 'attempt' + time.strftime('%Y%m%d-%H%M%S'))
    else:
        errstr = 'submission of job failed:\n%s' % subcmd
        raise OSError, errstr
def get_crop_config_write(vid,cropsdict,start_tup=None,end_tup=None,queue='hoekstra'): '''given video and cropsdict (see get_crop_config_show) and optional start and end (should be (h,m,s) tuples or None) write config and submit cropping run.''' cdf = os.path.join(os.path.dirname(vid),'cropsdict.dict') open(cdf,'w').write(cropsdict.__repr__()) if start_tup is None and end_tup is None: cmd = 'vid2crop.py %s %s' % (vid,cdf) else: offset = start_tup and sec_from_tup(start_tup) or 0 endtime = end_tup and sec_from_tup(end_tup) or vidtools.vid_duration(vid) dur = endtime - offset cmd = 'vid2crop.py %s %s %s %s' % (vid,offset,dur,cdf) donebase = os.path.splitext(vid)[0]+'-vid2crop' ss = run_safe.safe_script(cmd,donebase,force_write=True) bsub_cmd = 'bsub -q %s -o %s.lsflog %s' % (queue,donebase,ss) os.system(bsub_cmd)
def map_by_bwa(reads, reference, mapped, bwa_args='', make_index=True, force_index=False): if make_index: make_bwa_ref_idx(reference, force_index=force_index) if reads.endswith('.bam'): cmd = 'samtools bam2fq %s | bwa mem %s %s - | samtools view -bS - > %s.bam' % ( reads, bwa_args, reference, mapped) else: cmd = 'bwa mem %s %s %s | samtools view -bS - > %s.bam' % ( bwa_args, reference, reads, mapped) ss = run_safe.safe_script(cmd, mapped, force_write=True) ret = os.system(ss) if ret == 0 and os.path.exists(mapped + '.bam'): print >> sys.stderr, '%s.bam created' % mapped else: errstr = 'mapping %s to %s failed' % (reads, reference) raise OSError, errstr return mapped + '.bam'
def submit_one(cfg, default_start, \
               nframes,nparts,nstep,param_queue,seglen, \
               ground_improvement,ground_suppress,\
               outline_engine, \
               thresh_coeff=None, \
               run_local=False,**kwargs):
    '''Submit (or locally run) one summarize_segment_opencv.py job for the given config.

    Builds the analysis command for the video derived from cfg, wraps it with
    run_safe.safe_script, and either runs it locally (run_local=True) or submits
    via bsub to the module-global QUEUE. On successful launch an attempt flag is
    touched under <donebase>-ATTEMPTS; on failed submission raises OSError.
    thresh_coeff, when set, adds a -tc argument. **kwargs is accepted but unused.
    '''
    vid = vid_from_cfg(cfg)
    donebase,vs = donebase_from_param(vid, \
                                      nframes,nparts,nstep,seglen, \
                                      ground_improvement,ground_suppress,\
                                      outline_engine, \
                                      thresh_coeff=thresh_coeff,return_vs=True)
    attempts_dir = donebase+'-ATTEMPTS'
    if thresh_coeff:
        cmd = 'summarize_segment_opencv.py -l %s -s %s -nf %s -np %s -ns %s -q %s -tc %s -gi %s -gs %s -oe %s -ac %s -vs %s --max_itertime %s %s' % (seglen,default_start,nframes,nparts,nstep,param_queue,thresh_coeff,ground_improvement,ground_suppress,outline_engine,cfg,vs, MAX_ITERTIME, vid)
    else:
        cmd = 'summarize_segment_opencv.py -l %s -s %s -nf %s -np %s -ns %s -q %s -gi %s -gs %s -oe %s -ac %s -vs %s --max_itertime %s %s' % (seglen,default_start,nframes,nparts,nstep,param_queue,ground_improvement,ground_suppress,outline_engine,cfg,vs,MAX_ITERTIME, vid)
    logfile = donebase+'.lsflog'
    ss = run_safe.safe_script(cmd,donebase,force_write=True)
    subcmd = 'bsub -q %s -o %s %s' % (QUEUE,logfile,ss)
    #print >> sys.stderr, '\n\t',subcmd
    if run_local:
        ret = os.system(ss)
    else:
        ret = os.system(subcmd)
    if ret == 0:
        if not os.path.exists(attempts_dir):
            os.makedirs(attempts_dir)
        # record an attempt flag for this launch
        at_ret = os.system('touch %s' % os.path.join(attempts_dir,'attempt'+time.strftime('%Y%m%d-%H%M%S')))
        # NOTE(review): `ret` is known to be 0 in this branch so this check
        # never fires; presumably `at_ret` was intended — confirm.
        if ret != 0:
            print >> sys.stderr, 'WRITING ATTEMPT FLAG TO %s FAILED' % os.path.join(attempts_dir,'attempt'+time.strftime('%Y%m%d-%H%M%S'))
    else:
        errstr = 'submission of job failed:\n%s' % subcmd
        raise OSError, errstr
def submit_runs(vidroot,default_start, \
                nframes,nparts,nstep,param_queue,seglen, \
                ground_improvement,ground_suppress, \
                outline_engine, \
                num_jobs_running_max,num_jobs_new_max,skip_fn=skip_fn):
    '''Scan vidroot for */*-config.dict configs and bsub summarize_segment_opencv.py runs.

    Submits at most num_jobs_new_max new jobs, and never exceeds
    num_jobs_running_max concurrently running jobs (per get_current_run_count()).
    Skips a video when: it was removed, a skip flag (skip_fn(vid)) exists, it is
    shorter than MIN_VID_DUR, its done-flag exists, it is already in `bjobs -w`
    output, or it has reached MAX_RETRY attempt flags for all rerun conditions.
    Raises OSError when a bsub submission fails.
    NOTE(review): relies on module globals QUEUE, MIN_VID_DUR, MAX_RETRY, RERUN_COEFFS.
    '''
    num_current = get_current_run_count()
    num_new = num_jobs_running_max - num_current
    if num_new < 1:
        print >> sys.stderr, 'number of currently running jobs (%s) meets or exceeds max concurrent (%s)' % (
            num_current, num_jobs_running_max)
        return None
    else:
        launched = 0
        for cfg in sorted(glob(os.path.join(vidroot, '*/*-config.dict'))):
            if launched == num_new or launched >= num_jobs_new_max:
                break
            # re-query the scheduler every iteration so just-launched jobs count
            currjobs = Popen('bjobs -w', shell=True, stdout=PIPE).stdout.read()
            print >> sys.stderr, cfg, '\t',
            vid = cfg.split('-config')[0] + '.mp4'
            if not os.path.exists(vid):
                print >> sys.stderr, 'video removed; skipping'
                continue
            if os.path.exists(skip_fn(vid)):
                print >> sys.stderr, 'skip flag found; skipping'
                continue
            if vidtools.vid_duration(vid) < MIN_VID_DUR: #only analyze videos longer than 8hrs
                print >> sys.stderr, 'too short; skip'
                continue
            donebase = '%s-l%snp%snf%sns%sgi%sgs%soe%s' % (
                vid[:-4], seglen, nparts, nframes, nstep, ground_improvement,
                ground_suppress, outline_engine)
            vs = 'np%snf%sns%sgi%sgs%soe%s' % (nparts, nframes, nstep,
                                               ground_improvement,
                                               ground_suppress, outline_engine)
            attempts_dir = donebase + '-ATTEMPTS'
            if os.path.exists(donebase + '.done'):
                print >> sys.stderr, 'done'
            elif donebase in currjobs:
                print >> sys.stderr, 'running'
            elif os.path.exists(attempts_dir) and len(
                    glob(os.path.join(attempts_dir, 'attempt*'))) >= MAX_RETRY:
                # out of retries under current parameters; look for an untried rerun condition
                nrrc = next_rerun_condition(cfg,RERUN_COEFFS, \
                                            nframes,nparts,nstep,seglen, \
                                            ground_improvement,ground_suppress, \
                                            outline_engine,return_state=True)
                if nrrc is None:
                    print >> sys.stderr, 'too many attempts (%s) for all conditions (%s); see %s' % (
                        len(glob(os.path.join(attempts_dir, 'attempt*'))),
                        RERUN_COEFFS, attempts_dir)
                else:
                    thresh_coeff, state = nrrc
                    print >> sys.stderr, 'rerun %s %s' % (thresh_coeff, state)
            else:
                cmd = 'summarize_segment_opencv.py -l %s -s %s -nf %s -np %s -ns %s -q %s -gi %s -gs %s -oe %s -ac %s -vs %s %s' % (
                    seglen, default_start, nframes, nparts, nstep, param_queue,
                    ground_improvement, ground_suppress, outline_engine, cfg,
                    vs, vid)
                logfile = donebase + '.lsflog'
                ss = run_safe.safe_script(cmd, donebase, force_write=True)
                subcmd = 'bsub -q %s -o %s %s' % (QUEUE, logfile, ss)
                #print >> sys.stderr, '\n\t',subcmd
                ret = os.system(subcmd)
                launched += 1
                if ret == 0:
                    if not os.path.exists(attempts_dir):
                        os.makedirs(attempts_dir)
                    # record an attempt flag for this submission
                    at_ret = os.system('touch %s' % os.path.join(
                        attempts_dir, 'attempt' + time.strftime('%Y%m%d-%H%M%S')))
                    # NOTE(review): `ret` is known to be 0 in this branch so this
                    # check never fires; presumably `at_ret` was intended — confirm.
                    if ret != 0:
                        print >> sys.stderr, 'WRITING ATTEMPT FLAG TO %s FAILED' % os.path.join(
                            attempts_dir, 'attempt' + time.strftime('%Y%m%d-%H%M%S'))
                else:
                    errstr = 'submission of job failed:\n%s' % subcmd
                    raise OSError, errstr
cmd_by_sam[sampart] = cmdstr if opts.debug: print 'COMMANDS FOLLOW:\n'+'\n'.join(cmds) else: #SLURM example logfile = os.path.join(outroot,'%slog' % opts.scheduler,'stampy-%s-%s-log' % (bp,tb)) schedule_jobs(cmd_by_sam,opts.scheduler,'stampy',logfile,opts.lsf_queue,requeue=opts.fallback_queue,njobs=njobs,duration=opts.max_job_duration,mem=(opts.gatk_ram*1024)+JOB_MEM_OVERHEAD,flags='-R "select[mem>20000]"',MAX_RETRY=MAX_RETRY) #MERGE SAM PARTS FROM STAMPY cmds = [] mergecmds_by_bam = {} for bam,sams in samparts_by_bam.items(): cmd = 'merge_sams_with_validation.py %s %s' % (bam,' '.join(sams)) cmds.append(cmd) mergecmds_by_bam[bam] = run_safe.safe_script(cmd,bam) if opts.debug: print 'COMMANDS FOLLOW:\n'+'\n'.join(cmds) else: #SLURM here logfile = os.path.join(outroot,'%slog' % opts.scheduler,'merge-%s-%s-log' % (bp,tb)) schedule_jobs(mergecmds_by_bam,opts.scheduler,'stampy-merge',logfile,opts.lsf_queue,requeue=opts.fallback_queue,njobs=njobs,duration=opts.max_job_duration,mem=(opts.gatk_ram*1024)+JOB_MEM_OVERHEAD,flags='-R "select[mem>20000]"',MAX_RETRY=MAX_RETRY) #LSF.lsf_run_until_done(mergecmds_by_bam,logfile,opts.lsf_queue,'-R "select[mem>20000]"','stampy-merge',njobs,MAX_RETRY) if opts.cleanup: print >> sys.stderr, 'remove %s .sam part files' % len(cmd_by_sam) for i,f in enumerate(cmd_by_sam.keys()): os.unlink(f) os.unlink(f+'.done') print >> sys.stderr,'\r%s' % (i+1),
def realign_bams_lsf(bams,ref,outroot,njobs,min_ind_realign,queue='normal_serial',job_ram='20000',targetcreator_opts='--maxIntervalSize 5000',gatk_jar=gatk_jar,gatk_ram=8,force_links=False,MAX_RETRY=MAX_RETRY,fallback_queue=''):
    '''Run GATK indel realignment on bams over a compute grid, returning realigned BAM paths.

    force_links replaces existing symlinks
    job_ram in MB
    gatk_ram in GB
    duration hardcoded to opts.max_job_duration; should be an argument

    Workflow: symlink each input bam under <outroot>/realign/<flowcell>/, build a
    genome-wide RealignerTargetCreator intervals file in njobs partitions (skipped
    if its .done flag exists), batch the linked bams by working directory (at least
    min_ind_realign per batch), run IndelRealigner per batch via schedule_jobs, and
    link the realigned BAM/index back next to each source bam.
    Raises OSError on failed `ln` or `cat` shell commands.
    NOTE(review): relies on module globals opts, schedule_jobs, JOB_MEM_OVERHEAD, run_safe.
    '''
    realign_root = os.path.join(outroot,'realign')
    intervals_parts_root = os.path.join(realign_root,'parts')
    intervals_file = os.path.join(intervals_parts_root,'all_part.RealignTargets.intervals')
    try:
        os.makedirs(intervals_parts_root)
    except:
        pass  # directory may already exist
    os.chdir(realign_root)
    bams_to_link = {}       # source bam -> symlink under realign_root
    link_to_realign = {}    # symlink -> expected realigned bam (work-side)
    realign_to_srcroot = {} # realigned bam (work-side) -> realigned link next to source
    bams_to_workdir = {}    # source bam -> working directory of its symlink
    realigned_bams = []     # all realigned-bam link paths (return value)
    for bam in bams:
        bamsrcroot,bambase = os.path.split(bam)
        fcroot = os.path.basename(bamsrcroot)  # flowcell/source dir name
        bamlink = os.path.join(realign_root,fcroot,bambase)
        realigned = bamlink[:-4]+'.realigned.bam'
        realignedlink = bam[:-4]+'.realigned.bam'
        realignedidx = bamlink[:-4]+'.realigned.bai'
        realignedlinkidx = bam[:-4]+'.realigned.bai'
        realigned_bams.append(realignedlink)
        if force_links:
            # drop stale links so the work is redone
            try:
                os.unlink(realignedlink)
            except:
                pass
            try:
                os.unlink(realignedlinkidx)
            except:
                pass
        if not os.path.exists(realignedlink):
            bams_to_link[bam] = bamlink
            link_to_realign[bamlink] = realigned
            realign_to_srcroot[realigned] = realignedlink
            bams_to_workdir[bam] = os.path.dirname(bamlink)
            if not os.path.exists(bamlink):
                try:
                    os.makedirs(os.path.dirname(bamlink))
                except:
                    pass  # directory may already exist
                ret = os.system('ln -s %s %s' % (bam,bamlink))
                if ret != 0:
                    raise OSError, 'ln failed: %s -> %s' % (bam,bamlink)
    if len(bams_to_link) == 0: #nothing to do here; return links
        print >> sys.stderr, 'realigned bams all present'
        return realigned_bams
    #otherwise get down to business
    print >> sys.stderr, 'Perform realignment:'
    if os.path.exists(intervals_file+'.done'):
        print >> sys.stderr, 'using %s as intervals_file' % intervals_file
    else:
        # build realignment target intervals over njobs reference partitions
        bamstr = ' -I '.join(bams)
        intervals_parts_regions = partition_reference(ref,njobs)
        to_run_dict = {}
        for i,part in enumerate(intervals_parts_regions):
            reg_str = ' -L '.join(part)
            intervals_parts_file = os.path.join(intervals_parts_root,'part%s.RealignTargets.intervals' % (i))
            cmd = 'java -Xmx%sg -jar %s -T RealignerTargetCreator -I %s -R %s -L %s %s -o %s' % (gatk_ram,gatk_jar,bamstr,ref,reg_str,targetcreator_opts,intervals_parts_file)
            to_run_dict[intervals_parts_file] = run_safe.safe_script(cmd,intervals_parts_file)
        #SLURM HERE
        logfile = os.path.join(intervals_parts_root,'logs','RealignerTargetCreator')
        schedule_jobs(to_run_dict,opts.scheduler,'targetcreator',logfile,queue,requeue=fallback_queue,njobs=njobs,duration=opts.max_job_duration,mem=(gatk_ram*1024)+JOB_MEM_OVERHEAD,flags='-R "select[mem>%s]"' % job_ram,MAX_RETRY=MAX_RETRY)
        #LSF.lsf_run_until_done(to_run_dict,logfile,queue,'-R "select[mem>%s]"' % job_ram, 'targetcreator',njobs,MAX_RETRY)
        #if fallback_queue:
        #    LSF.lsf_run_until_done(to_run_dict,logfile,fallback_queue,'-R "select[mem>%s]"' % job_ram, 'targetcreator',njobs,MAX_RETRY)
        # concatenate the per-partition intervals into the single intervals file
        catcmd = 'cat %s > %s' % (' '.join(to_run_dict.keys()),intervals_file)
        ret = os.system(run_safe.safe_script(catcmd,intervals_file))
        if ret != 0:
            raise OSError, 'cat failed: %s' % catcmd
    # group bams into batches sharing a working directory, each at least
    # min_ind_realign samples (integer division: roughly njobs batches)
    samples_per_batch = max(min_ind_realign, len(bams_to_link)/njobs)
    bam_batches_by_dir = []
    this_batch = []
    lastroot = None
    for bam in sorted(bams_to_link.keys()):
        if lastroot != bams_to_workdir[bam] or len(this_batch) == samples_per_batch:
            if this_batch:
                bam_batches_by_dir.append((lastroot,this_batch))
            this_batch = []
            lastroot = bams_to_workdir[bam]
        this_batch.append(bam)
    if this_batch: #process last batch
        bam_batches_by_dir.append((lastroot,this_batch))
    if len(bam_batches_by_dir) == 0:
        print >> sys.stderr, 'realignments present'
    else:
        print >> sys.stderr, 'REALIGNMENT BATCH SUMMARY:'
        for workdir,bam_batch in bam_batches_by_dir:
            print >> sys.stderr, '\tbams: %s working: %s' % (len(bam_batch),workdir)
            #for bam in bam_batch:
            #    print >> sys.stderr, '\t\t%s' % bam
        to_run_dict = {}
        for i,(workdir,bam_batch) in enumerate(bam_batches_by_dir):
            bamstr = ' -I '.join(bam_batch)
            donefile = os.path.join(realign_root,'realign_batch%sof%s' % (i,len(bam_batches_by_dir)))
            # -nWayOut writes one .realigned.bam per input bam in workdir
            cmd = 'cd %s; java -Xmx%sg -jar %s -T IndelRealigner -model USE_SW -I %s -R %s --targetIntervals %s -nWayOut .realigned.bam' % (workdir,gatk_ram,gatk_jar,bamstr,ref,intervals_file)
            to_run_dict[donefile] = run_safe.safe_script(cmd,donefile)
        #SLURM here
        logfile = os.path.join(realign_root,'logs','IndelRealigner')
        schedule_jobs(to_run_dict,opts.scheduler,'realigner',logfile,queue,requeue=fallback_queue,njobs=njobs,duration=opts.max_job_duration,mem=(gatk_ram*1024)+JOB_MEM_OVERHEAD,flags='-R "select[mem>%s]"' % job_ram,MAX_RETRY=MAX_RETRY)
        #LSF.lsf_run_until_done(to_run_dict,logfile,queue,'-R "select[mem>%s]"' % job_ram, 'realigner',njobs,MAX_RETRY)
        #if fallback_queue:
        #    LSF.lsf_run_until_done(to_run_dict,logfile,fallback_queue,'-R "select[mem>%s]"' % job_ram, 'realigner',njobs,MAX_RETRY)
        # link realigned BAMs (and .bai indexes; [:-1]+'i' turns .bam into .bai)
        # back next to their source bams
        for realigned,realignedlink in realign_to_srcroot.items():
            if not os.path.exists(realignedlink):
                ret = os.system('ln -s %s %s' % (realigned,realignedlink))
                if ret != 0:
                    raise OSError, 'ln failed: %s -> %s' % (realigned,realignedlink)
            if not os.path.exists(realignedlink[:-1]+'i'):
                ret = os.system('ln -s %s %s' % (realigned[:-1]+'i',realignedlink[:-1]+'i'))
                if ret != 0:
                    raise OSError, 'ln failed: %s -> %s' % (realigned[:-1]+'i',realignedlink[:-1]+'i')
    return realigned_bams
def call_variants_gatk_lsf(bams,ref,outroot,vcfbase,njobs=100,gatk_program='UnifiedGenotyper',gatk_args='-out_mode EMIT_ALL_CONFIDENT_SITES -dcov 200 -glm BOTH',gatk_jar=gatk_jar,gatk_ram=4,tmpdir=None,queue='normal_serial',job_ram='30000',MAX_RETRY=MAX_RETRY,include_regions=None,compress_vcf=True,fallback_queue='',scheduler=None,duration=None):
    '''Run a GATK walker (default UnifiedGenotyper) over a partitioned reference as
    cluster jobs, then merge the per-partition VCFs into a single output VCF.

    The reference is split into njobs region sets (partition_reference); one GATK
    command per region set is scheduled via schedule_jobs. Under the slurm
    scheduler, partitions that have already failed MAX_RETRY submissions are
    resubmitted as multithreaded (-nt/-nct) jobs with proportionally adjusted
    per-core RAM. Finally all part VCFs are merged with merge_vcf_parts_cmd.

    bams            list of BAM paths; joined with ' -I ' into the GATK command
    ref             reference fasta
    outroot         output directory; parts go in <outroot>/<base>-vcf_parts
    vcfbase         output VCF basename (trailing '.vcf' stripped if present)
    gatk_ram        JVM heap in GB per (serial) job
    job_ram         LSF memory-select string value (passed through in flags)
    compress_vcf    write .vcf.gz parts/output instead of .vcf
    scheduler       'slurm' (default) or other value accepted by schedule_jobs
    duration        per-job walltime; defaults to opts.max_job_duration if set,
                    else DURATION_DFAULT

    Raises OSError if the final merge command exits nonzero.
    Relies on module globals: opts, skip_contigs, SLURM, run_safe, schedule_jobs,
    GATK_PAR_NT, GATK_PAR_NCT, MAX_DURATION, JOB_MEM_OVERHEAD.
    '''
    if duration is None:
        try:
            duration = opts.max_job_duration
        except:
            # opts may not exist when called as a library function
            duration = DURATION_DFAULT
    if scheduler is None:
        scheduler = 'slurm'
    if tmpdir is None:
        tmpdir = os.path.join(outroot,'gatk_tmp')
    bamstr = ' -I '.join(bams)
    regions = partition_reference(ref,njobs,include_regions)
    # strip a trailing '.vcf' so the extension is applied uniformly below
    vcfbasename = vcfbase.endswith('.vcf') and vcfbase[:-4] or vcfbase
    gatkoutvcfbase = '%s-GATK-%s' % (vcfbasename,gatk_program)
    if compress_vcf:
        vcfext = '.vcf.gz'
    else:
        vcfext = '.vcf'
    gatkoutvcf = os.path.join(outroot,gatkoutvcfbase+vcfext)
    vcf_parts_root = os.path.join(outroot,gatkoutvcfbase+'-vcf_parts')
    try:
        os.makedirs(vcf_parts_root)
    except:
        pass # already exists
    logfile = os.path.join(vcf_parts_root,'logs',gatk_program)
    ser_to_run_dict = {}
    if scheduler == 'slurm':
        par_to_run_dict = {}
    print >> sys.stderr, 'Calculate %s runs: ' % gatk_program
    for i,reg in enumerate(regions):
        print >> sys.stderr, '\r\t%s / %s' % (i+1,len(regions)),
        # drop any contigs the pipeline is configured to skip
        reg = [r for r in reg if not r.split(':')[0] in skip_contigs]
        if len(reg) == 0:
            continue
        start,end = start_end_strs(reg)
        regstr = ' -L '.join(reg)
        partvcf = os.path.join(vcf_parts_root,'%s_%dof%d_%sto%s%s' % (gatkoutvcfbase,i,len(regions),start,end,vcfext))
        part_sh = os.path.join(vcf_parts_root,'%s_%dof%d_%sto%s.sh' % (gatkoutvcfbase,i,len(regions),start,end))
        cmd = 'java -Xmx%sg -Djava.io.tmpdir=%s -jar %s -R %s -T %s -o %s %s -I %s -L %s' % (gatk_ram,tmpdir,gatk_jar,ref,gatk_program,partvcf,gatk_args,bamstr,regstr)
        #open(part_sh,'w').write('#!/usr/bin/env bash\n'+cmd+'\n')
        #os.system('chmod +x %s' % part_sh)
        if scheduler == 'slurm':
            # count previous submissions of this part; after MAX_RETRY failures,
            # escalate to a multithreaded GATK invocation
            nprevsub = len(SLURM.previous_submissions(logfile,partvcf+'.sh'))
            if nprevsub < MAX_RETRY:
                ser_to_run_dict[partvcf] = run_safe.safe_script(cmd,partvcf,force_write=True)
            else:
                # NOTE(review): this raises the walltime for ALL subsequent jobs
                # in this call, not just the escalated one — confirm intended
                duration=MAX_DURATION
                print >> sys.stderr, '\n%s failed %s previous runs; %s thread X %s core invoked' % (partvcf,nprevsub,GATK_PAR_NT,GATK_PAR_NCT)
                cmd += ' -nt %s -nct %s' % (GATK_PAR_NT,GATK_PAR_NCT)
                par_to_run_dict[partvcf] = run_safe.safe_script(cmd,partvcf,force_write=True)
        else:
            ser_to_run_dict[partvcf] = run_safe.safe_script(cmd,partvcf,force_write=True)
    #SLURM here
    #SERIAL (one core) RUNS
    schedule_jobs(ser_to_run_dict,scheduler,gatk_program,logfile,queue,requeue=fallback_queue,njobs=njobs,duration=duration,mem=(gatk_ram*1024)+JOB_MEM_OVERHEAD,flags='-R "select[mem>%s]"' % job_ram,MAX_RETRY=MAX_RETRY)
    trd_keys = ser_to_run_dict.keys()
    #PARALLEL (multithread) RUNS
    if scheduler == 'slurm':
        mt_cores = GATK_PAR_NT*GATK_PAR_NCT
        # spread total (heap + overhead) across the requested cores
        mt_ram = ( (GATK_PAR_NT*gatk_ram*1024)+(JOB_MEM_OVERHEAD*GATK_PAR_NT) ) / float(mt_cores)
        mt_ram = int(mt_ram)
        print >> sys.stderr, '\nrun multithreaded %s: %s jobs; ram-per-core: %s cores: %s' % (gatk_program,len(par_to_run_dict),mt_ram,mt_cores)
        schedule_jobs(par_to_run_dict,scheduler,gatk_program,logfile,queue,requeue=fallback_queue,njobs=njobs,duration=duration,mem=mt_ram,flags='-R "select[mem>%s]"' % job_ram,MAX_RETRY=MAX_RETRY,slurm_cores=mt_cores)
        trd_keys.extend(par_to_run_dict.keys())
    #LSF.lsf_run_until_done(to_run_dict,logfile,queue,'-R "select[mem>%s]"' % job_ram, 'gatk',njobs,MAX_RETRY)
    #if fallback_queue:
    #    LSF.lsf_run_until_done(to_run_dict,logfile,fallback_queue,'-R "select[mem>%s]"' % job_ram, 'gatk',njobs,MAX_RETRY)
    # merge all part VCFs (both serial and escalated) into the final output
    cmd = merge_vcf_parts_cmd(trd_keys,ref,gatkoutvcf,gatk_jar,gatk_ram,tmpdir)
    ret = os.system(run_safe.safe_script(cmd,gatkoutvcf))
    if ret != 0:
        raise OSError, 'VCF merge failed:\n%s' % cmd
prev_fq = '%s.fq%s-%s%s' % (fqbase,lnum,baseQ,fqext) print >> sys.stderr, 'must be 4-line, base 33 fastq to proceed; convert\nnew file will be %s\noriginal kept as %s\n' % (fq,prev_fq) save_previous_and_covert(prev_fq,fq) adapterstype = get_adapterstype(opts.flowcell,opts.lane,opts.index) adaptseq = get_adaptseq() adaptA,adaptB = adaptseq[adapterstype]['r1'],adaptseq[adapterstype]['r2'] print >> sys.stderr, 'use adapterstype: %s\nadaptA: %s\nadaptB: %s' % (adapterstype,adaptA,adaptB) #run seqprep if opts.seqprep_base: sp_base = opts.seqprep_base else: sp_base = 'Sample_lane%s_%s' % (opts.lane, opts.index and opts.index or 'noidx') sp_fullbase = os.path.join(os.path.dirname(opts.infiles[0]),sp_base) merge,trim1,trim2 = overlap_by_seqprep(opts.infiles[0],opts.infiles[1],sp_fullbase,pct_id=opts.percent_id,min_ol=opts.overlap_length,adaptA=adaptA,adaptB=adaptB) cmd = 'preprocess_radtag_lane.py -iq 33 -suf merge %s -fc %s -l %s %s %s' % (opts.preprocess_argstr,opts.flowcell,opts.lane,(opts.index and '-idx %s' % opts.index or ''),merge) print >> sys.stderr, cmd ss = run_safe.safe_script(cmd,sp_fullbase+'-preprocess_merge',force_write=True) ret = os.system(ss) if ret != 0 or not os.path.exists(sp_fullbase+'-preprocess_merge.done'): raise OSError, 'merge preprocess failed' cmd = 'preprocess_radtag_lane.py -iq 33 -suf trim %s -fc %s -l %s %s %s %s' % (opts.preprocess_argstr,opts.flowcell,opts.lane,(opts.index and '-idx %s' % opts.index or ''),trim1,trim2) print >> sys.stderr, cmd ss = run_safe.safe_script(cmd,sp_fullbase+'-preprocess_trim',force_write=True) ret = os.system(ss) if ret != 0 or not os.path.exists(sp_fullbase+'-preprocess_merge.done'): raise OSError, 'trim preprocess failed'
outvid = '%s_%s_%s-%s%s' % (outbase,clab,offset,dur,outext) if os.path.exists(outvid) and not os.path.getsize(outvid) == 0 and ( vidtools.vid_duration(outvid) == dur ): print >> sys.stderr, '%s present and expected size, skip' % outvid else: if FORCE_PAR: h,w = vidtools.extract_keyframe(vid).shape th = h - (crops[1]+crops[3]) tw = w - (crops[0]+crops[2]) pixw = 255 pixh = int((float(th)/tw)*pixw) parstr = '-aspect %s:%s' % (pixw,pixh) else: parstr = '' cropstr = '-vf crop=in_w-%s:in_h-%s:%s:%s' % (crops[0]+crops[2],crops[1]+crops[3],crops[0],crops[1]) cmd = 'ffmpeg -ss %s -t %s -i %s -y %s -r 29.97 -b 20000k %s %s' % (offset,dur,vid,cropstr,parstr,outvid) to_run_dict[outvid] = run_safe.safe_script(cmd,outvid,force_write=True) logfile = os.path.join(os.path.dirname(vid),'logs','crop-log') LSF.lsf_run_until_done(to_run_dict,logfile,queue,'-R "select[mem>%s]"' % job_ram, 'crop-ffmpeg',10, MAX_RETRY) #cmds = [] #rerun = True #while rerun: # for clab,crops in cropsdict.items(): # outbase,outext = os.path.splitext(vid) # outvid = '%s_%s_%s-%s%s' % (outbase,clab,offset,dur,outext) # if os.path.exists(outvid) and ( vidtools.vid_duration(outvid) == dur ): # print >> sys.stderr, '%s present and expected size, skip' % outvid # else: # cropstr = '-vf crop=in_w-%s:in_h-%s:%s:%s' % (crops[0]+crops[2],crops[1]+crops[3],crops[0],crops[1]) # cmd = 'ffmpeg -ss %s -t %s -i %s -y %s -b 20000k %s' % (offset,dur,vid,cropstr,outvid)
intervals_file = bam_base + '.RealignTargets.intervals' realigned_bam = bam_base + '.realigned.bam' reduced_bam = bam_base + '.realigned.reduced.bam' intervals_parts_regions = partition_reference(ref,njobs) intervals_parts = [] for i,part in enumerate(intervals_parts_regions): #reg_str = ' -L '.join(part) reg_parts_file = os.path.join(workdir,'part%s.intervals' % (i)) open(reg_parts_file,'w').writelines([p+'\n' for p in part if not p.split(':')[0] in skip_contigs]) intervals_parts_file = os.path.join(workdir,'part%s.RealignTargets.intervals' % (i)) rtc_part_cmd = 'java -Xmx%sg -jar %s -T RealignerTargetCreator -I %s -R %s -L %s %s -o %s' % \ (gatk_ram,gatk_jar,bam,ref,reg_parts_file,targetcreator_opts,intervals_parts_file) rtc_ss = run_safe.safe_script(rtc_part_cmd,intervals_parts_file,force_write=True) ret = os.system(rtc_ss) if ret == 0 and os.path.exists(intervals_parts_file): print >> sys.stderr, '%s / %s complete' % (i+1,len(intervals_parts_regions)) else: errstr = 'failed on %s' % intervals_parts_file raise OSError,errstr intervals_parts.append(intervals_parts_file) cat_cmd = 'cat %s > %s' % (' '.join(intervals_parts),intervals_file) cat_ss = run_safe.safe_script(cat_cmd,intervals_file,force_write=True) ret = os.system(cat_ss) if ret == 0: print >> sys.stderr, 'intervals parts concatenation finished' else:
# Submit the queued preprocessing jobs via SLURM, block until all complete,
# then build and launch the map_reads command over the resulting fastqs.
jobname_base = 'preprocess'
logbase = os.path.join(opts.outroot,'slurmlog','preprocess')
print >> sys.stderr, 'run %s logs in %s' % (jobname_base,logbase)
# (opts.job_ram+1)*1024: presumably GB -> MB with 1 GB headroom — TODO confirm units
SLURM.run_until_done(to_run_dict,jobname_base,logbase,opts.max_job_duration,(opts.job_ram+1)*1024,opts.num_batches,opts.queue,MAX_RETRY=MAX_RETRY)

#collect individual fastq/fastq pairs
#(LATER: GET INDIVIDUAL FILES BY DB LOOKUP; REQUIRES HANDLING MOUSE DB ID LOOKUP IF SET)
fq_to_run = sample_fq_from_expected(expected_fq_d)
#print fq_to_run

# Forward scheduler/resource options to map_reads; the \'"%s"\' wrapping keeps
# compound argument strings (e.g. stampy/gatk arg strings) intact through the shell.
map_reads_cmd = map_reads_exec + ' -gr %s -n %s -q %s -sched %s -mjd %s -v %s -s \'"%s"\' -g \'"%s"\' -gh \'"%s"\' -mp \'"%s"\' %s %s %s ' % \
    (opts.job_ram, \
     opts.num_batches, \
     opts.queue, \
     opts.scheduler, \
     opts.max_job_duration, \
     vcfname, \
     opts.stampy_argstr, \
     opts.gatk_argstr, \
     opts.gatkhaplo_argstr, \
     opts.mpileup_argstr, \
     opts.mapreads_argstr, \
     opts.reference_fasta, \
     opts.outroot)
map_reads_cmd += ' '.join(fq_to_run)
print >> sys.stderr, 'run map_reads in %s' % (os.path.join(opts.outroot,vcfname))
# wrapped in safe_script so a completed run leaves a .done sentinel and is not rerun
map_reads_ss = run_safe.safe_script(map_reads_cmd,os.path.join(opts.outroot,vcfname),force_write=True)
ret = os.system(map_reads_ss)
# Merge many SAM/BAM files into one BAM with Picard MergeSamFiles, then index it.
picardRAM = 2           # JVM heap (GB) for Picard
max_temp = 1000         # NOTE(review): assigned but unused in this fragment — confirm downstream use
max_records = 1500000   # NOTE(review): assigned but unused in this fragment — confirm downstream use
MAX_PER_RUN = 100       # above this many inputs, pre-merge in two halves first
RM_SAMS = False #always overridden to True for sam/bams created as intermediates in large merge sets
#revolting hack
# With more than MAX_PER_RUN inputs, recursively pre-merge each half via this
# same script (presumably to keep the Picard command/input list manageable —
# TODO confirm), then merge just the two intermediate BAMs.
if len(sams) > MAX_PER_RUN:
    sams1 = sams[:len(sams)/2]
    sams2 = sams[len(sams)/2:]
    bam1 = bam+'-1.bam'
    bam2 = bam+'-2.bam'
    for b,s in [(bam1,sams1),(bam2,sams2)]:
        print >> sys.stderr, '\npre-merge %s (%s parts)\n' % (b,len(s))
        cmd = 'merge_sams_with_validation.py %s %s' % (b,' '.join(s))
        ss = run_safe.safe_script(cmd,b,force_write=True)
        print >> sys.stderr, ss
        ret = os.system(ss)
        if ret != 0:
            raise OSError, 'pre-merge failed'
    sams = [bam1,bam2]
    RM_SAMS = True
# ' INPUT='.join plus the leading INPUT= yields one INPUT=<sam> per file
mergecmd = 'java -Xmx%sg -jar %sMergeSamFiles.jar INPUT=%s OUTPUT=%s MERGE_SEQUENCE_DICTIONARIES=true VALIDATION_STRINGENCY=LENIENT; samtools index %s' % (picardRAM,picard_root,' INPUT='.join(sams), bam, bam)
ret = os.system(mergecmd)
if ret == 0:
    print >> sys.stderr, '\nmerge complete\n'
else:
    print >> sys.stderr, '\nfailed:\n',mergecmd
    raise OSError, 'merge failed for %s' % bam
# SUBMIT RUNS
# Submit summarize_segment_opencv.py analysis jobs to LSF for every video
# (>= 8 h) that has a config dict, skipping runs already done or running.
import os,sys,re
from subprocess import Popen, PIPE
from glob import glob

# BUGFIX: these two modules are used below but were never imported
import vidtools
import run_safe

seglen = 1800                # analysis segment length (s)
q = 'unrestricted_serial'    # LSF queue

for cfg in sorted(glob('*/*-config.dict')):
    # refresh the running-job list each iteration so just-submitted jobs count
    currjobs = Popen('bjobs -w',shell=True,stdout=PIPE).stdout.read()
    print >> sys.stderr, cfg,'\t',
    vid = cfg.split('-config')[0]+'.mp4'
    if vidtools.vid_duration(vid) < 8*60*60: #only analyze videos longer than 8hrs
        print >> sys.stderr, 'too short; skip'
        continue
    donebase = '%s-l%snp60nf300ns4' % (vid[:-4],seglen)
    if os.path.exists(donebase+'.done'):
        print >> sys.stderr, 'done'
    elif donebase in currjobs:
        print >> sys.stderr, 'running'
    else:
        cmd = 'summarize_segment_opencv.py -l %s -s 60 -nf 300 -np 60 -ns 4 -gi 0.03 -oe shapely -ac %s -vs np60nf300ns4shapely %s' % (seglen,cfg,vid)
        logfile = donebase+'.lsflog'
        ss = run_safe.safe_script(cmd,donebase,force_write=True)
        subcmd = 'bsub -q %s -o %s %s' % (q,logfile,ss)
        ret = os.system(subcmd)
        # BUGFIX: submission result was silently dropped; report failures
        if ret != 0:
            print >> sys.stderr, 'SUBMIT FAILED: %s' % subcmd
sp_fullbase = os.path.join(os.path.dirname(opts.infiles[0]), sp_base) merge, trim1, trim2 = overlap_by_seqprep(opts.infiles[0], opts.infiles[1], sp_fullbase, pct_id=opts.percent_id, min_ol=opts.overlap_length, adaptA=adaptA, adaptB=adaptB) cmd = 'preprocess_radtag_lane.py -iq 33 -suf merge %s -fc %s -l %s %s %s' % ( opts.preprocess_argstr, opts.flowcell, opts.lane, (opts.index and '-idx %s' % opts.index or ''), merge) print >> sys.stderr, cmd ss = run_safe.safe_script(cmd, sp_fullbase + '-preprocess_merge', force_write=True) ret = os.system(ss) if ret != 0 or not os.path.exists(sp_fullbase + '-preprocess_merge.done'): raise OSError, 'merge preprocess failed' cmd = 'preprocess_radtag_lane.py -iq 33 -suf trim %s -fc %s -l %s %s %s %s' % ( opts.preprocess_argstr, opts.flowcell, opts.lane, (opts.index and '-idx %s' % opts.index or ''), trim1, trim2) print >> sys.stderr, cmd ss = run_safe.safe_script(cmd, sp_fullbase + '-preprocess_trim', force_write=True) ret = os.system(ss) if ret != 0 or not os.path.exists(sp_fullbase + '-preprocess_merge.done'): raise OSError, 'trim preprocess failed'