def call_variants_mpileup_lsf(bams,ref,outroot,vcfbase,njobs=100,mpileup_args='',gatk_jar=gatk_jar,gatk_ram=8,tmpdir=None,queue='normal_serial',job_ram='30000',MAX_RETRY=MAX_RETRY,include_regions=None,fallback_queue=''):
    if tmpdir is None:
        tmpdir = os.path.join(outroot,'gatk_tmp')

    bamstr = ' -I '.join(bams)
    regions = partition_reference(ref,njobs,include_regions)
    vcfbasename = vcfbase.endswith('.vcf') and vcfbase[:-4] or vcfbase
    mpoutvcfbase = '%s-mpileup' % (vcfbasename)
    mpoutvcf = os.path.join(outroot,mpoutvcfbase+'.vcf')
    vcf_parts_root = os.path.join(outroot,mpoutvcfbase+'-vcf_parts')
    try:
        os.makedirs(vcf_parts_root)
    except OSError:
        pass

    to_run_dict = {}
    #merge_subparts_trd = {}
    subparts = []
    for i,reg in enumerate(regions):
        start,end = start_end_strs(reg)
        #regstr = ' -L '.join(reg)
        partdonebase = os.path.join(vcf_parts_root,'%s_%dof%d_%sto%s-parts' % (mpoutvcfbase,i,len(regions),start,end))
        partvcf = os.path.join(vcf_parts_root,'%s_%dof%d_%sto%s.vcf' % (mpoutvcfbase,i,len(regions),start,end))
        part_sh = os.path.join(vcf_parts_root,'%s_%dof%d_%sto%s.sh' % (mpoutvcfbase,i,len(regions),start,end))
        #cmd = 'java -Xmx%sg -Djava.io.tmpdir=%s -jar  %s -R %s -T %s -o %s %s -I %s -L %s' % (gatk_ram,tmpdir,gatk_jar,ref,gatk_program,partvcf,gatk_args,bamstr,regstr)

        this_trd = {}
        for this_reg in reg:
            subpart = this_reg.split(':')[0]
            subpartvcf = os.path.join(vcf_parts_root,'%s_%dof%d_%sto%s-%s.vcf' % (mpoutvcfbase,i,len(regions),start,end,subpart))
            this_cmd = 'samtools mpileup -Dgu -r %s %s -f %s %s | bcftools view -cvg  - > %s 2>  %s.log' % (this_reg,mpileup_args,ref,bamstr,subpartvcf,subpartvcf)
            this_trd[subpartvcf] = run_safe.safe_script(this_cmd,subpartvcf)
            
        cmd_parts = unfinished_cmds(this_trd)
        cmd = '; '.join(cmd_parts)
        #open(part_sh,'w').write('#!/usr/bin/env bash\n'+cmd+'\n')
        #os.system('chmod +x %s' % part_sh)
        to_run_dict[partdonebase] = run_safe.safe_script(cmd,partdonebase)
        subparts.extend(this_trd.keys())
        #vcfparts = this_trd.keys() ### <---MAKE THIS WORK (merge in parts before merge all)
        #merge_subparts_trd[partvcf] = run_safe.safe_script(merge_vcf_parts_cmd(vcfparts,ref,partvcf,gatk_jar,gatk_ram,tmpdir,rod_type = ':VCF'),partvcf)

    #SLURM here
    logfile = os.path.join(vcf_parts_root,'logs','mpileup-parts')
    schedule_jobs(to_run_dict,opts.scheduler,'mpileup',logfile,queue,requeue=fallback_queue,njobs=njobs,duration=opts.max_job_duration,mem=(gatk_ram*1024)+JOB_MEM_OVERHEAD,flags='-R "select[mem>%s]"' % job_ram,MAX_RETRY=MAX_RETRY)
    #LSF.lsf_run_until_done(to_run_dict,logfile,queue,'-R "select[mem>%s]"' % job_ram, 'mpileup',njobs,MAX_RETRY)

    cmd = run_safe.safe_script(merge_vcf_parts_cmd(subparts,ref,mpoutvcf,gatk_jar,gatk_ram,tmpdir),mpoutvcf)
    ret = os.system(cmd)
    if ret != 0:
        raise OSError, 'VCF merge failed:\n%s' % cmd
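
Everything on this page leans on run_safe.safe_script plus '.done' sentinel files for idempotent restarts; unfinished_cmds above filters sub-commands the same way. Below is a minimal sketch of the contract these helpers are assumed to implement; the `_sketch` names are illustrative, not the real run_safe module.

import os

def safe_script_sketch(cmd, donebase, force_write=False):
    # wrap cmd in a script that skips itself if <donebase>.done exists and
    # writes the sentinel only when cmd exits cleanly (assumed behavior)
    scr = donebase + '.sh'
    if force_write or not os.path.exists(scr):
        open(scr, 'w').write(
            '#!/usr/bin/env bash\n'
            'if [ -e %s.done ]; then exit 0; fi\n'
            '%s && touch %s.done\n' % (donebase, cmd, donebase))
        os.system('chmod +x %s' % scr)
    return scr

def unfinished_cmds_sketch(to_run_dict):
    # keep only commands whose '.done' sentinel is absent
    return [cmd for donebase, cmd in to_run_dict.items()
            if not os.path.exists(donebase + '.done')]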
Example #2
def run_parallel_blat(subjects,queries,blattile,blatargstr='',num_cores='+0'):
    '''
    runs blat commands using GNU parallel.
    '''

    blatargstr += ' -tileSize=%s' % blattile
    blatargstr += ' -stepSize=%s' % (int(blattile)/2)

    cmds = []
    labf = []
    for q in queries:
        for subject in subjects:
            # strip literal suffixes; str.rstrip() strips a character set, not a suffix
            subjname = os.path.basename(subject)
            for suf in ('.fa','_subj'):
                if subjname.endswith(suf): subjname = subjname[:-len(suf)]
            qbase = q
            for suf in ('.fa','_query'):
                if qbase.endswith(suf): qbase = qbase[:-len(suf)]
            outbase = qbase+'_blat'+'-subj'+subjname+blatargstr.replace('=','').replace(' ','')
            labf.append(outbase+'.label.gz')
            cmd = '%s %s %s %s "%s" %s' % (sys.executable, os.path.join(radtag_denovo, 'mcl_id_triples_by_blat.py'),subject,q,blatargstr,outbase)
            cmds.append(run_safe.safe_script(cmd,outbase))

    shscr = os.path.join(os.path.dirname(subjects[0]) , 'runblat.sh')
    smartopen(shscr, 'w').writelines([cmd+';\n' for cmd in cmds])
    os.system('chmod +x '+shscr)
    ret = os.system('parallel --progress -j %s < %s' % (num_cores,shscr))
    if ret != 0 or not all([os.path.exists(f) for f in labf]):
        raise OSError, 'blat failed with code %s' % ret

    return labf
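
smartopen, used above to write the parallel script (and to read the .label.gz outputs elsewhere), is assumed to be a gzip-aware open(); a minimal sketch:

import gzip

def smartopen_sketch(filename, mode='r'):
    # transparently open gzip-compressed files based on extension
    if filename.endswith('.gz'):
        return gzip.open(filename, mode)
    return open(filename, mode)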
Example #3
def run_lsf_blat(subjects,
                 queries,
                 blattile,
                 blatargstr='',
                 num_batches=100,
                 queue='normal_serial'):
    '''submits mcl_id_triples_by_blat.py jobs to LSF

    intended as an example of parallelization over a compute grid;
    uses the LSF.py module to interact with the scheduler

    '''
    import LSF, run_safe

    blatargstr += ' -tileSize=%s' % blattile
    blatargstr += ' -stepSize=%s' % (int(blattile) / 2)

    #cmds = []
    labf = []
    to_run_dict = {}
    for q in queries:
        for subject in subjects:
            # strip literal suffixes; str.rstrip() strips a character set, not a suffix
            subjname = os.path.basename(subject)
            for suf in ('.fa', '_subj'):
                if subjname.endswith(suf): subjname = subjname[:-len(suf)]
            qbase = q
            for suf in ('.fa', '_query'):
                if qbase.endswith(suf): qbase = qbase[:-len(suf)]
            outbase = qbase + '_blat' + '-subj' + subjname + blatargstr.replace(
                '=', '').replace(' ', '')
            labf.append(outbase + '.label.gz')
            # ESCAPES UNNECESSARY WITH safe_script
            #cmds.append('%smcl_id_triples_by_blat.py %s %s \\"%s\\" %s' % (radtag_denovo,subject,q,blatargstr,outbase))
            cmd = '%s %s %s %s "%s" %s' % (
                sys.executable,
                os.path.join(radtag_denovo, 'mcl_id_triples_by_blat.py'),
                subject, q, blatargstr, outbase)
            to_run_dict[outbase] = run_safe.safe_script(cmd, outbase)

    logfile = os.path.join(os.path.dirname(subjects[0]), 'blat-log/blat-log')
    LSF.lsf_run_until_done(to_run_dict, logfile, queue,
                           '-R "select[mem > 20000]"', 'blat2mat', num_batches,
                           3)

    # REPLACED BY lsf_run_until_done ABOVE
    #logfiles = glob(logfile+'*.lsflog')
    #for lf in logfiles:
    #    try:
    #        os.unlink(lf)
    #    except:
    #        pass
    #print >> sys.stderr, 'LSF %s\nlog: %s' % (cmds,logfile)
    #import time
    #while len(cmds) > 0:
    #    jobids,namedict = LSF.lsf_jobs_submit(cmds,logfile,'normal_serial',bsub_flags='-R "select[mem > 20000]"',jobname_base='blat2mat',num_batches=num_batches)
    #    time.sleep(20)
    #    LSF.lsf_wait_for_jobs(jobids,logfile,namedict=namedict)
    #    logfiles = glob(logfile+'*.lsflog')
    #    cmds = reduce(lambda x,y:x+y, [LSF.lsf_no_success_from_log(lf) for lf in logfiles])

    if not all([os.path.exists(f) for f in labf]):
        raise OSError, 'blat failed'

    return labf
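
LSF.lsf_run_until_done is the workhorse here: submit every command in to_run_dict, wait for the queue to drain, and resubmit anything whose '.done' sentinel never appeared, up to MAX_RETRY rounds. A rough sketch of that contract follows; this is an assumption about LSF.py, not its actual code.

import os, time
from subprocess import Popen, PIPE

def lsf_run_until_done_sketch(to_run_dict, logfile, queue, bsub_flags,
                              jobname_base, num_batches, max_retry):
    # num_batches (grouping commands into batched submissions) is omitted here
    for attempt in range(max_retry):
        pending = [(db, cmd) for db, cmd in to_run_dict.items()
                   if not os.path.exists(db + '.done')]
        if not pending:
            return
        for donebase, cmd in pending:
            os.system('bsub -q %s %s -J %s -o %s.lsflog %s'
                      % (queue, bsub_flags, jobname_base, logfile, cmd))
        # crude completion check: wait until no jobs with this name remain
        while jobname_base in Popen('bjobs -w', shell=True,
                                    stdout=PIPE).stdout.read():
            time.sleep(60)
    if any(not os.path.exists(db + '.done') for db in to_run_dict):
        raise OSError('jobs unfinished after %s rounds' % max_retry)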
Example #4
def run_local_blat(subjects,queries,blattile,blatargstr='',num_cores=1):
    '''
    runs blat commands serially using os.system();
    runs all jobs as a single batch. To use multiple cores/computers, consider run_parallel_blat()
    '''

    blatargstr += ' -tileSize=%s' % blattile
    blatargstr += ' -stepSize=%s' % (int(blattile)/2)

    cmds = []
    labf = []
    for q in queries:
        for subject in subjects:
            # strip literal suffixes; str.rstrip() strips a character set, not a suffix
            subjname = os.path.basename(subject)
            for suf in ('.fa','_subj'):
                if subjname.endswith(suf): subjname = subjname[:-len(suf)]
            qbase = q
            for suf in ('.fa','_query'):
                if qbase.endswith(suf): qbase = qbase[:-len(suf)]
            outbase = qbase+'_blat'+'-subj'+subjname+blatargstr.replace('=','').replace(' ','')
            labf.append(outbase+'.label.gz')
            cmd = '%s %s %s %s "%s" %s' % (sys.executable, os.path.join(radtag_denovo, 'mcl_id_triples_by_blat.py'),subject,q,blatargstr,outbase)
            cmds.append(run_safe.safe_script(cmd,outbase))

    shscr = os.path.join(os.path.dirname(subjects[0]) , 'runblat.sh')
    smartopen(shscr, 'w').writelines([cmd+';\n' for cmd in cmds])
    os.system('chmod +x '+shscr)
    ret = os.system(shscr)
    if ret != 0 or not all([os.path.exists(f) for f in labf]):
        raise OSError, 'blat failed with code %s' % ret
    return labf
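
For orientation, a hypothetical invocation of the three runners above (file names are illustrative only; each returns the list of .label.gz outputs):

subjects = ['lane1_subj.fa']   # hypothetical inputs
queries  = ['lane1_query.fa']
labf = run_local_blat(subjects, queries, blattile=12)               # one batch via os.system
#labf = run_parallel_blat(subjects, queries, 12, num_cores='8')     # GNU parallel, 8 jobs
#labf = run_lsf_blat(subjects, queries, 12, queue='normal_serial')  # LSF grid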
Example #7
def overlap_by_seqprep(r1,
                       r2,
                       outbase,
                       pct_id=0.8,
                       min_ol=10,
                       adaptA='GATCGGAAGAGCACACG',
                       adaptB='AGATCGGAAGAGCGTCGT'):
    '''adaptA is the adapter 1 sequence AS IT APPEARS IN READ 1,
    likewise adaptB is adapter 2 sequence AS IT APPEARS IN READ 2.
    in other words, DB_adapt_trim_seqs r1 (read 1 adapter read-through) is A, r2 is B
    '''
    trim1 = outbase + '.R1.trim.fastq.gz'
    trim2 = outbase + '.R2.trim.fastq.gz'
    drop1 = outbase + '.R1.drop.fastq.gz'
    drop2 = outbase + '.R2.drop.fastq.gz'
    merge = outbase + '.merge.fastq.gz'
    aln = outbase + '.merge.aln.gz'
    cmd = 'SeqPrep -f %s -r %s -1 %s -2 %s -3 %s -4 %s -A %s -B %s -s %s -E %s -o %s -m %s -n %s' % (
        r1, r2, trim1, trim2, drop1, drop2, adaptA, adaptB, merge, aln, min_ol,
        1 - pct_id, pct_id)
    ss = run_safe.safe_script(cmd, outbase + '-seqprep', force_write=True)
    ret = os.system(ss)
    if ret != 0:
        raise OSError, 'seqprep run failed'
    return merge, trim1, trim2
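
A hypothetical call; note how the arguments map onto SeqPrep flags in the command above (-o min overlap from min_ol, -m mismatch fraction from 1 - pct_id, -n min identity from pct_id):

# file names are illustrative only
merge, trim1, trim2 = overlap_by_seqprep('lane1.R1.fastq.gz', 'lane1.R2.fastq.gz',
                                         'lane1_sp', pct_id=0.9, min_ol=15)
# merge: overlapped (merged) pairs; trim1/trim2: adapter-trimmed unmerged reads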
Example #8
def submit_runs(vidroot,default_start, \
                nframes,nparts,nstep,param_queue,seglen, \
                ground_improvement,ground_suppress, \
                outline_engine, \
                num_jobs_running_max,num_jobs_new_max,skip_fn=skip_fn):
    num_current = get_current_run_count()
    num_new = num_jobs_running_max - num_current
    if num_new < 1:
        print >> sys.stderr, 'number of currently running jobs (%s) meets or exceeds max concurrent (%s)' % (num_current, num_jobs_running_max)
        return None
    else:
        launched = 0
        for cfg in sorted(glob(os.path.join(vidroot,'*/*-config.dict'))):
            if launched == num_new or launched >= num_jobs_new_max: break
            currjobs = Popen('bjobs -w',shell=True,stdout=PIPE).stdout.read()
            print >> sys.stderr, cfg,'\t',
            vid = cfg.split('-config')[0]+'.mp4'
            if not os.path.exists(vid):
                print >> sys.stderr, 'video removed; skipping'
                continue
            if os.path.exists(skip_fn(vid)):
                print >> sys.stderr, 'skip flag found; skipping'
                continue
            if vidtools.vid_duration(vid) < MIN_VID_DUR: #only analyze videos longer than 8hrs
                print >> sys.stderr, 'too short; skip'
                continue
            donebase = '%s-l%snp%snf%sns%sgi%sgs%soe%s' % (vid[:-4],seglen,nparts,nframes,nstep,ground_improvement,ground_suppress,outline_engine)
            vs = 'np%snf%sns%sgi%sgs%soe%s' % (nparts,nframes,nstep,ground_improvement,ground_suppress,outline_engine)
            attempts_dir = donebase+'-ATTEMPTS'
            if os.path.exists(donebase+'.done'):
                print >> sys.stderr, 'done'
            elif donebase in currjobs:
                print >> sys.stderr, 'running'
            elif os.path.exists(attempts_dir) and len(glob(os.path.join(attempts_dir,'attempt*'))) >= MAX_RETRY:
                nrrc = next_rerun_condition(cfg,RERUN_COEFFS, \
                                            nframes,nparts,nstep,seglen, \
                                            ground_improvement,ground_suppress, \
                                            outline_engine,return_state=True)
                if nrrc is None:
                    print >> sys.stderr, 'too many attempts (%s) for all conditions (%s); see %s' % (len(glob(os.path.join(attempts_dir,'attempt*'))),RERUN_COEFFS,attempts_dir)
                else:
                    thresh_coeff,state = nrrc
                    print >> sys.stderr, 'rerun %s %s' % (thresh_coeff,state)
            else:
                cmd = 'summarize_segment_opencv.py -l %s -s %s -nf %s -np %s -ns %s -q %s -gi %s -gs %s -oe %s -ac %s -vs %s %s' % (seglen,default_start,nframes,nparts,nstep,param_queue,ground_improvement,ground_suppress,outline_engine,cfg,vs, vid)
                logfile = donebase+'.lsflog'
                ss = run_safe.safe_script(cmd,donebase,force_write=True)
                subcmd = 'bsub -q %s -o %s %s' % (QUEUE,logfile,ss)
                #print >> sys.stderr, '\n\t',subcmd
                ret = os.system(subcmd)
                launched += 1
                if ret == 0:
                    if not os.path.exists(attempts_dir): os.makedirs(attempts_dir)
                    at_ret = os.system('touch %s' % os.path.join(attempts_dir,'attempt'+time.strftime('%Y%m%d-%H%M%S')))
                    if at_ret != 0:
                        print >> sys.stderr, 'WRITING ATTEMPT FLAG TO %s FAILED' % os.path.join(attempts_dir,'attempt'+time.strftime('%Y%m%d-%H%M%S'))
                else:
                    errstr = 'submission of job failed:\n%s' % subcmd
                    raise OSError, errstr
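
get_current_run_count is assumed to count this user's jobs currently in the LSF queue, mirroring the bjobs call used in the loop above; a plausible sketch:

from subprocess import Popen, PIPE

def get_current_run_count_sketch():
    # number of non-header lines from bjobs = pending + running jobs
    out = Popen('bjobs -w', shell=True, stdout=PIPE).stdout.read()
    lines = [l for l in out.splitlines() if l.strip()]
    return max(0, len(lines) - 1)  # first line is the header when jobs exist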
Example #9
def map_by_stampy(reads,reference,mapped,stampy_args='--maxbasequal=60 --bwamark',make_index=True,force_index=False):
    if make_index: make_stampy_ref_idx(reference,force_index=force_index)
    cmd = 'stampy.py --overwrite %s -h %s -g %s -M %s -o %s.sam; samtools view -bS %s.sam > %s.bam' % (stampy_args,reference,reference, reads, mapped,mapped,mapped)
    ss = run_safe.safe_script(cmd,mapped,force_write=True)
    ret = os.system(ss)
    if ret == 0 and os.path.exists(mapped+'.bam'):
        print >> sys.stderr, '%s.bam created' % mapped
    else:
        errstr = 'mapping %s to %s failed' % (reads, reference)
        raise OSError, errstr

    return mapped+'.bam'
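
make_stampy_ref_idx is assumed to build stampy's genome (.stidx) and hash (.sthash) files once per reference; a sketch of that guard:

import os

def make_stampy_ref_idx_sketch(reference, force_index=False):
    # stampy wants a genome index (-G) and a hash (-H) sharing the reference prefix
    if force_index or not os.path.exists(reference + '.stidx'):
        os.system('stampy.py -G %s %s' % (reference, reference))
    if force_index or not os.path.exists(reference + '.sthash'):
        os.system('stampy.py -g %s -H %s' % (reference, reference))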
Example #11
def map_by_bwa(reads,reference,mapped,bwa_args='',make_index=True,force_index=False):
    if make_index: make_bwa_ref_idx(reference,force_index=force_index)
    if reads.endswith('.bam'):
        cmd = 'samtools bam2fq %s | bwa mem %s %s - | samtools view -bS - > %s.bam' % (reads,bwa_args, reference, mapped)
    else:
        cmd = 'bwa mem %s %s %s | samtools view -bS - > %s.bam' % (bwa_args, reference, reads, mapped)

    ss = run_safe.safe_script(cmd,mapped,force_write=True)
    ret = os.system(ss)
    if ret == 0 and os.path.exists(mapped+'.bam'):
        print >> sys.stderr, '%s.bam created' % mapped
    else:
        errstr = 'mapping %s to %s failed' % (reads, reference)
        raise OSError, errstr

    return mapped+'.bam'
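
make_bwa_ref_idx is assumed to run 'bwa index' once per reference, keyed on one of its output files; a sketch:

import os

def make_bwa_ref_idx_sketch(reference, force_index=False):
    # 'bwa index' writes reference.bwt (among other files); skip if present
    if force_index or not os.path.exists(reference + '.bwt'):
        os.system('bwa index %s' % reference)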
Example #14
def submit_one(cfg, default_start, \
               nframes,nparts,nstep,param_queue,seglen, \
               ground_improvement,ground_suppress,\
               outline_engine, \
               thresh_coeff=None, \
               run_local=False,**kwargs):

    vid = vid_from_cfg(cfg)
    donebase,vs = donebase_from_param(vid, \
                        nframes,nparts,nstep,seglen, \
                        ground_improvement,ground_suppress,\
                        outline_engine, \
                        thresh_coeff=thresh_coeff,return_vs=True)
    attempts_dir = donebase + '-ATTEMPTS'

    if thresh_coeff:
        cmd = 'summarize_segment_opencv.py -l %s -s %s -nf %s -np %s -ns %s -q %s -tc %s -gi %s -gs %s -oe %s -ac %s -vs %s --max_itertime %s %s' % (
            seglen, default_start, nframes, nparts, nstep, param_queue,
            thresh_coeff, ground_improvement, ground_suppress, outline_engine,
            cfg, vs, MAX_ITERTIME, vid)
    else:
        cmd = 'summarize_segment_opencv.py -l %s -s %s -nf %s -np %s -ns %s -q %s -gi %s -gs %s -oe %s -ac %s -vs %s --max_itertime %s %s' % (
            seglen, default_start, nframes, nparts, nstep, param_queue,
            ground_improvement, ground_suppress, outline_engine, cfg, vs,
            MAX_ITERTIME, vid)

    logfile = donebase + '.lsflog'
    ss = run_safe.safe_script(cmd, donebase, force_write=True)
    subcmd = 'bsub -q %s -o %s %s' % (QUEUE, logfile, ss)
    #print >> sys.stderr, '\n\t',subcmd
    if run_local:
        ret = os.system(ss)
    else:
        ret = os.system(subcmd)
    if ret == 0:
        if not os.path.exists(attempts_dir): os.makedirs(attempts_dir)
        at_ret = os.system('touch %s' % os.path.join(
            attempts_dir, 'attempt' + time.strftime('%Y%m%d-%H%M%S')))
        if at_ret != 0:
            print >> sys.stderr, 'WRITING ATTEMPT FLAG TO %s FAILED' % os.path.join(
                attempts_dir, 'attempt' + time.strftime('%Y%m%d-%H%M%S'))
    else:
        errstr = 'submission of job failed:\n%s' % subcmd
        raise OSError, errstr
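
vid_from_cfg and donebase_from_param are assumed to mirror the naming scheme written out in submit_runs above; the sketches below are consistent with that pattern (the 'tc' suffix for thresh_coeff is a guess):

def vid_from_cfg_sketch(cfg):
    # '<base>-config.dict' -> '<base>.mp4', as in submit_runs above
    return cfg.split('-config')[0] + '.mp4'

def donebase_from_param_sketch(vid, nframes, nparts, nstep, seglen,
                               ground_improvement, ground_suppress,
                               outline_engine, thresh_coeff=None,
                               return_vs=False):
    donebase = '%s-l%snp%snf%sns%sgi%sgs%soe%s' % (
        vid[:-4], seglen, nparts, nframes, nstep, ground_improvement,
        ground_suppress, outline_engine)
    vs = 'np%snf%sns%sgi%sgs%soe%s' % (nparts, nframes, nstep,
                                       ground_improvement, ground_suppress,
                                       outline_engine)
    if thresh_coeff is not None:
        donebase += 'tc%s' % thresh_coeff  # guess: coefficient folded into name
    if return_vs:
        return donebase, vs
    return donebase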
Example #15
def get_crop_config_write(vid,cropsdict,start_tup=None,end_tup=None,queue='hoekstra'):
	'''given a video and cropsdict (see get_crop_config_show)
	and optional start and end (each an (h,m,s) tuple or None),
	write the config and submit the cropping run.'''
	cdf = os.path.join(os.path.dirname(vid),'cropsdict.dict')
	open(cdf,'w').write(cropsdict.__repr__())

	if start_tup is None and end_tup is None:
		cmd = 'vid2crop.py %s %s' % (vid,cdf)
	else:
		offset = start_tup and sec_from_tup(start_tup) or 0
		endtime = end_tup and sec_from_tup(end_tup) or vidtools.vid_duration(vid)
		dur = endtime - offset
		cmd = 'vid2crop.py %s %s %s %s' % (vid,offset,dur,cdf)
	
	donebase = os.path.splitext(vid)[0]+'-vid2crop'
	ss = run_safe.safe_script(cmd,donebase,force_write=True)
	bsub_cmd = 'bsub -q %s -o %s.lsflog %s' % (queue,donebase,ss)
	os.system(bsub_cmd)
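
sec_from_tup is assumed to flatten an (h,m,s) tuple into total seconds; a sketch:

def sec_from_tup_sketch(tup):
    h, m, s = tup  # (hours, minutes, seconds)
    return h * 3600 + m * 60 + s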
                cmd_by_sam[sampart] = cmdstr

    if opts.debug:
        print 'COMMANDS FOLLOW:\n'+'\n'.join(cmds)
    else:
        #SLURM example
        logfile = os.path.join(outroot,'%slog' % opts.scheduler,'stampy-%s-%s-log' % (bp,tb))
        schedule_jobs(cmd_by_sam,opts.scheduler,'stampy',logfile,opts.lsf_queue,requeue=opts.fallback_queue,njobs=njobs,duration=opts.max_job_duration,mem=(opts.gatk_ram*1024)+JOB_MEM_OVERHEAD,flags='-R "select[mem>20000]"',MAX_RETRY=MAX_RETRY)

    #MERGE SAM PARTS FROM STAMPY
    cmds = []
    mergecmds_by_bam = {}
    for bam,sams in samparts_by_bam.items():
        cmd = 'merge_sams_with_validation.py %s %s' % (bam,' '.join(sams))
        cmds.append(cmd)
        mergecmds_by_bam[bam] = run_safe.safe_script(cmd,bam)

    if opts.debug:
        print 'COMMANDS FOLLOW:\n'+'\n'.join(cmds)
    else:
        #SLURM here
        logfile = os.path.join(outroot,'%slog' % opts.scheduler,'merge-%s-%s-log' % (bp,tb))
        schedule_jobs(mergecmds_by_bam,opts.scheduler,'stampy-merge',logfile,opts.lsf_queue,requeue=opts.fallback_queue,njobs=njobs,duration=opts.max_job_duration,mem=(opts.gatk_ram*1024)+JOB_MEM_OVERHEAD,flags='-R "select[mem>20000]"',MAX_RETRY=MAX_RETRY)
        #LSF.lsf_run_until_done(mergecmds_by_bam,logfile,opts.lsf_queue,'-R "select[mem>20000]"','stampy-merge',njobs,MAX_RETRY)

    if opts.cleanup:
        print >> sys.stderr, 'remove %s .sam part files' % len(cmd_by_sam)
        for i,f in enumerate(cmd_by_sam.keys()):
            os.unlink(f)
            os.unlink(f+'.done')
            print >> sys.stderr,'\r%s' % (i+1),
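
schedule_jobs, called throughout these examples, is assumed to dispatch a to_run_dict to SLURM or LSF depending on the scheduler argument; the argument order below is inferred from the SLURM.run_until_done and LSF.lsf_run_until_done call sites on this page (a sketch, not the real helper):

import LSF, SLURM  # the page's scheduler wrapper modules

def schedule_jobs_sketch(to_run_dict, scheduler, jobname, logfile, queue,
                         requeue='', njobs=100, duration=None, mem=None,
                         flags='', MAX_RETRY=3, slurm_cores=1):
    # both backends run until every '.done' sentinel appears or retries run out;
    # requeue/slurm_cores handling is omitted in this sketch
    if scheduler == 'slurm':
        SLURM.run_until_done(to_run_dict, jobname, logfile, duration, mem,
                             njobs, queue, MAX_RETRY=MAX_RETRY)
    else:
        LSF.lsf_run_until_done(to_run_dict, logfile, queue, flags, jobname,
                               njobs, MAX_RETRY)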
def realign_bams_lsf(bams,ref,outroot,njobs,min_ind_realign,queue='normal_serial',job_ram='20000',targetcreator_opts='--maxIntervalSize 5000',gatk_jar=gatk_jar,gatk_ram=8,force_links=False,MAX_RETRY=MAX_RETRY,fallback_queue=''):
    '''force_links replaces existing symlinks
    job_ram in MB
    gatk_ram in GB

    duration hardcoded to opts.max_job_duration; should be an argument
    '''
    realign_root = os.path.join(outroot,'realign')
    intervals_parts_root = os.path.join(realign_root,'parts')
    intervals_file = os.path.join(intervals_parts_root,'all_part.RealignTargets.intervals')
    try:
        os.makedirs(intervals_parts_root)
    except OSError:
        pass


    os.chdir(realign_root)
    bams_to_link = {}
    link_to_realign = {}
    realign_to_srcroot = {}
    bams_to_workdir = {}
    realigned_bams = []
    
    for bam in bams:
        bamsrcroot,bambase = os.path.split(bam)
        fcroot = os.path.basename(bamsrcroot)
        bamlink = os.path.join(realign_root,fcroot,bambase)
        realigned = bamlink[:-4]+'.realigned.bam'
        realignedlink = bam[:-4]+'.realigned.bam'
        realignedidx = bamlink[:-4]+'.realigned.bai'
        realignedlinkidx = bam[:-4]+'.realigned.bai'
        
        realigned_bams.append(realignedlink)
        if force_links:
            try:
                os.unlink(realignedlink)
            except OSError:
                pass
            try:
                os.unlink(realignedlinkidx)
            except OSError:
                pass
        if not os.path.exists(realignedlink):
            bams_to_link[bam] = bamlink
            link_to_realign[bamlink] = realigned
            realign_to_srcroot[realigned] = realignedlink
            bams_to_workdir[bam] = os.path.dirname(bamlink)
            if not os.path.exists(bamlink):
                try:
                    os.makedirs(os.path.dirname(bamlink))
                except OSError:
                    pass
                ret = os.system('ln -s %s %s' % (bam,bamlink))
                if ret != 0:
                    raise OSError, 'ln failed: %s -> %s' % (bam,bamlink)

    if len(bams_to_link) == 0: #nothing to do here; return links
        print >> sys.stderr, 'realigned bams all present'
        return realigned_bams

    #otherwise get down to business
    print >> sys.stderr, 'Perform realignment:'
    if os.path.exists(intervals_file+'.done'):
        print >> sys.stderr, 'using %s as intervals_file' % intervals_file
    else:
        bamstr = ' -I '.join(bams)
        intervals_parts_regions = partition_reference(ref,njobs)
        to_run_dict = {}
        for i,part in enumerate(intervals_parts_regions):
            reg_str = ' -L '.join(part)
            intervals_parts_file = os.path.join(intervals_parts_root,'part%s.RealignTargets.intervals' % (i))
            cmd = 'java -Xmx%sg -jar %s -T RealignerTargetCreator -I %s -R %s -L %s %s -o %s' % (gatk_ram,gatk_jar,bamstr,ref,reg_str,targetcreator_opts,intervals_parts_file)
            to_run_dict[intervals_parts_file] = run_safe.safe_script(cmd,intervals_parts_file)

        #SLURM HERE
        logfile = os.path.join(intervals_parts_root,'logs','RealignerTargetCreator')
        schedule_jobs(to_run_dict,opts.scheduler,'targetcreator',logfile,queue,requeue=fallback_queue,njobs=njobs,duration=opts.max_job_duration,mem=(gatk_ram*1024)+JOB_MEM_OVERHEAD,flags='-R "select[mem>%s]"' % job_ram,MAX_RETRY=MAX_RETRY)
        #LSF.lsf_run_until_done(to_run_dict,logfile,queue,'-R "select[mem>%s]"' % job_ram, 'targetcreator',njobs,MAX_RETRY)
        #if fallback_queue:
        #    LSF.lsf_run_until_done(to_run_dict,logfile,fallback_queue,'-R "select[mem>%s]"' % job_ram, 'targetcreator',njobs,MAX_RETRY)
        
        catcmd = 'cat %s > %s' % (' '.join(to_run_dict.keys()),intervals_file)
        ret = os.system(run_safe.safe_script(catcmd,intervals_file))
        if ret != 0:
            raise OSError, 'cat failed: %s' % catcmd

            
    samples_per_batch = max(min_ind_realign, len(bams_to_link)/njobs)
    bam_batches_by_dir = []
    this_batch = []
    lastroot = None
    for bam in sorted(bams_to_link.keys()):
        if lastroot != bams_to_workdir[bam] or len(this_batch) == samples_per_batch:
            if this_batch:
                bam_batches_by_dir.append((lastroot,this_batch))
            this_batch = []
            lastroot = bams_to_workdir[bam]
        this_batch.append(bam)
    if this_batch: #process last batch
        bam_batches_by_dir.append((lastroot,this_batch))

    if len(bam_batches_by_dir) == 0:
        print >> sys.stderr, 'realignments present'
    else:
        print >> sys.stderr, 'REALIGNMENT BATCH SUMMARY:'
        for workdir,bam_batch in bam_batches_by_dir:
            print >> sys.stderr, '\tbams: %s working: %s' % (len(bam_batch),workdir)
            #for bam in bam_batch:
            #    print >> sys.stderr, '\t\t%s' % bam

        to_run_dict = {}
        for i,(workdir,bam_batch) in enumerate(bam_batches_by_dir):
            bamstr = ' -I '.join(bam_batch)
            donefile = os.path.join(realign_root,'realign_batch%sof%s' % (i,len(bam_batches_by_dir)))
            cmd = 'cd %s; java -Xmx%sg -jar %s -T IndelRealigner -model USE_SW -I %s -R %s --targetIntervals %s -nWayOut .realigned.bam' % (workdir,gatk_ram,gatk_jar,bamstr,ref,intervals_file)
            to_run_dict[donefile] = run_safe.safe_script(cmd,donefile)

        #SLURM here
        logfile = os.path.join(realign_root,'logs','IndelRealigner')
        schedule_jobs(to_run_dict,opts.scheduler,'realigner',logfile,queue,requeue=fallback_queue,njobs=njobs,duration=opts.max_job_duration,mem=(gatk_ram*1024)+JOB_MEM_OVERHEAD,flags='-R "select[mem>%s]"' % job_ram,MAX_RETRY=MAX_RETRY)
        #LSF.lsf_run_until_done(to_run_dict,logfile,queue,'-R "select[mem>%s]"' % job_ram, 'realigner',njobs,MAX_RETRY)
        #if fallback_queue:
        #    LSF.lsf_run_until_done(to_run_dict,logfile,fallback_queue,'-R "select[mem>%s]"' % job_ram, 'realigner',njobs,MAX_RETRY)

    for realigned,realignedlink in realign_to_srcroot.items():
        if not os.path.exists(realignedlink):
            ret = os.system('ln -s %s %s' % (realigned,realignedlink))
            if ret != 0:
                raise OSError, 'ln failed: %s -> %s' % (realigned,realignedlink)
        if not os.path.exists(realignedlink[:-1]+'i'):
            ret = os.system('ln -s %s %s' % (realigned[:-1]+'i',realignedlink[:-1]+'i'))
            if ret != 0:
                raise OSError, 'ln failed: %s -> %s' % (realigned[:-1]+'i',realignedlink[:-1]+'i')

    return realigned_bams
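
partition_reference is assumed to split the reference into njobs roughly equal batches of 'chrom:start-end' strings, presumably from the samtools faidx index; a simplified sketch that only splits at contig boundaries (the real version may also split within long contigs):

def partition_reference_sketch(ref, njobs, include_regions=None):
    # contig names and lengths from the .fai index
    contigs = []
    for line in open(ref + '.fai'):
        fields = line.split('\t')
        contigs.append((fields[0], int(fields[1])))
    if include_regions is not None:
        contigs = [(c, l) for c, l in contigs if c in include_regions]
    target = max(1, sum(l for c, l in contigs) // njobs)
    regions, part, part_len = [], [], 0
    for chrom, length in contigs:
        part.append('%s:1-%s' % (chrom, length))
        part_len += length
        if part_len >= target and len(regions) < njobs - 1:
            regions.append(part)
            part, part_len = [], 0
    if part:
        regions.append(part)
    return regions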
def call_variants_gatk_lsf(bams,ref,outroot,vcfbase,njobs=100,gatk_program='UnifiedGenotyper',gatk_args='-out_mode EMIT_ALL_CONFIDENT_SITES -dcov 200 -glm BOTH',gatk_jar=gatk_jar,gatk_ram=4,tmpdir=None,queue='normal_serial',job_ram='30000',MAX_RETRY=MAX_RETRY,include_regions=None,compress_vcf=True,fallback_queue='',scheduler=None,duration=None):

    if duration is None:
        try:
            duration = opts.max_job_duration
        except (NameError, AttributeError):
            duration = DURATION_DFAULT
            
    if scheduler is None:
        scheduler = 'slurm'
    if tmpdir is None:
        tmpdir = os.path.join(outroot,'gatk_tmp')
    bamstr = ' -I '.join(bams)
    regions = partition_reference(ref,njobs,include_regions)
    vcfbasename = vcfbase.endswith('.vcf') and vcfbase[:-4] or vcfbase
    gatkoutvcfbase = '%s-GATK-%s' % (vcfbasename,gatk_program)
    if compress_vcf:
        vcfext = '.vcf.gz'
    else:
        vcfext = '.vcf'
        
    gatkoutvcf = os.path.join(outroot,gatkoutvcfbase+vcfext)
    vcf_parts_root = os.path.join(outroot,gatkoutvcfbase+'-vcf_parts')
    try:
        os.makedirs(vcf_parts_root)
    except OSError:
        pass

    logfile = os.path.join(vcf_parts_root,'logs',gatk_program)
    ser_to_run_dict = {}
    if scheduler == 'slurm':
        par_to_run_dict = {}

    print >> sys.stderr, 'Calculate %s runs: ' % gatk_program
    for i,reg in enumerate(regions):
        print >> sys.stderr, '\r\t%s / %s' % (i+1,len(regions)),
        reg = [r for r in reg if not r.split(':')[0] in skip_contigs]
        if len(reg) == 0:
            continue
        start,end = start_end_strs(reg)
        regstr = ' -L '.join(reg)
        partvcf = os.path.join(vcf_parts_root,'%s_%dof%d_%sto%s%s' % (gatkoutvcfbase,i,len(regions),start,end,vcfext))
        part_sh = os.path.join(vcf_parts_root,'%s_%dof%d_%sto%s.sh' % (gatkoutvcfbase,i,len(regions),start,end))
        cmd = 'java -Xmx%sg -Djava.io.tmpdir=%s -jar  %s -R %s -T %s -o %s %s -I %s -L %s' % (gatk_ram,tmpdir,gatk_jar,ref,gatk_program,partvcf,gatk_args,bamstr,regstr)
        #open(part_sh,'w').write('#!/usr/bin/env bash\n'+cmd+'\n')
        #os.system('chmod +x %s' % part_sh)
        if scheduler == 'slurm':
            nprevsub = len(SLURM.previous_submissions(logfile,partvcf+'.sh'))
            if nprevsub < MAX_RETRY:
                ser_to_run_dict[partvcf] = run_safe.safe_script(cmd,partvcf,force_write=True)
            else:
                duration=MAX_DURATION
                print >> sys.stderr, '\n%s failed %s previous runs; %s thread X %s core invoked' % (partvcf,nprevsub,GATK_PAR_NT,GATK_PAR_NCT)
                cmd += ' -nt %s -nct %s' % (GATK_PAR_NT,GATK_PAR_NCT)
                par_to_run_dict[partvcf] = run_safe.safe_script(cmd,partvcf,force_write=True)
        else:
            ser_to_run_dict[partvcf] = run_safe.safe_script(cmd,partvcf,force_write=True)

    #SLURM here
    #SERIAL (one core) RUNS
    schedule_jobs(ser_to_run_dict,scheduler,gatk_program,logfile,queue,requeue=fallback_queue,njobs=njobs,duration=duration,mem=(gatk_ram*1024)+JOB_MEM_OVERHEAD,flags='-R "select[mem>%s]"' % job_ram,MAX_RETRY=MAX_RETRY)
    trd_keys = ser_to_run_dict.keys()
    #PARALLEL (multithread) RUNS
    if scheduler == 'slurm':
        mt_cores = GATK_PAR_NT*GATK_PAR_NCT
        mt_ram = ( (GATK_PAR_NT*gatk_ram*1024)+(JOB_MEM_OVERHEAD*GATK_PAR_NT) ) / float(mt_cores)
        mt_ram = int(mt_ram)
        print >> sys.stderr, '\nrun multithreaded %s: %s jobs; ram-per-core: %s cores: %s' % (gatk_program,len(par_to_run_dict),mt_ram,mt_cores)
        schedule_jobs(par_to_run_dict,scheduler,gatk_program,logfile,queue,requeue=fallback_queue,njobs=njobs,duration=duration,mem=mt_ram,flags='-R "select[mem>%s]"' % job_ram,MAX_RETRY=MAX_RETRY,slurm_cores=mt_cores)
        trd_keys.extend(par_to_run_dict.keys())
    
    #LSF.lsf_run_until_done(to_run_dict,logfile,queue,'-R "select[mem>%s]"' % job_ram, 'gatk',njobs,MAX_RETRY)
    #if fallback_queue:
    #    LSF.lsf_run_until_done(to_run_dict,logfile,fallback_queue,'-R "select[mem>%s]"' % job_ram, 'gatk',njobs,MAX_RETRY)

    cmd = merge_vcf_parts_cmd(trd_keys,ref,gatkoutvcf,gatk_jar,gatk_ram,tmpdir)
    ret = os.system(run_safe.safe_script(cmd,gatkoutvcf))
    if ret != 0:
        raise OSError, 'VCF merge failed:\n%s' % cmd
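
merge_vcf_parts_cmd is assumed to build a single java command that concatenates the sorted part VCFs; the commented rod_type=':VCF' hint earlier suggests typed -V bindings. A sketch using GATK's CatVariants (the tool choice is an assumption):

def merge_vcf_parts_cmd_sketch(vcfparts, ref, outvcf, gatk_jar, gatk_ram,
                               tmpdir, rod_type=''):
    # one -V binding per part, optionally typed (e.g. '-V:VCF part.vcf')
    vstr = ' '.join(['-V%s %s' % (rod_type, v) for v in vcfparts])
    return ('java -Xmx%sg -Djava.io.tmpdir=%s -cp %s '
            'org.broadinstitute.gatk.tools.CatVariants '
            '-R %s %s -out %s -assumeSorted'
            % (gatk_ram, tmpdir, gatk_jar, ref, vstr, outvcf))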
Example #22
            prev_fq = '%s.fq%s-%s%s' % (fqbase,lnum,baseQ,fqext)
            print >> sys.stderr, 'must be 4-line, base 33 fastq to proceed; convert\nnew file will be %s\noriginal kept as %s\n' % (fq,prev_fq)
            save_previous_and_covert(prev_fq,fq)

    adapterstype = get_adapterstype(opts.flowcell,opts.lane,opts.index)
    adaptseq = get_adaptseq()
    adaptA,adaptB = adaptseq[adapterstype]['r1'],adaptseq[adapterstype]['r2']
    print >> sys.stderr, 'use adapterstype: %s\nadaptA: %s\nadaptB: %s' % (adapterstype,adaptA,adaptB)

    #run seqprep
    if opts.seqprep_base:
        sp_base = opts.seqprep_base
    else:
        sp_base = 'Sample_lane%s_%s' % (opts.lane, opts.index and opts.index or 'noidx')
    sp_fullbase = os.path.join(os.path.dirname(opts.infiles[0]),sp_base)

    merge,trim1,trim2 = overlap_by_seqprep(opts.infiles[0],opts.infiles[1],sp_fullbase,pct_id=opts.percent_id,min_ol=opts.overlap_length,adaptA=adaptA,adaptB=adaptB)

    cmd = 'preprocess_radtag_lane.py -iq 33 -suf merge %s -fc %s -l %s %s %s' % (opts.preprocess_argstr,opts.flowcell,opts.lane,(opts.index and '-idx %s' % opts.index or ''),merge)
    print >> sys.stderr, cmd
    ss = run_safe.safe_script(cmd,sp_fullbase+'-preprocess_merge',force_write=True)
    ret = os.system(ss)
    if ret != 0 or not os.path.exists(sp_fullbase+'-preprocess_merge.done'):
        raise OSError, 'merge preprocess failed'
    cmd = 'preprocess_radtag_lane.py -iq 33 -suf trim %s -fc %s -l %s %s %s %s' % (opts.preprocess_argstr,opts.flowcell,opts.lane,(opts.index and '-idx %s' % opts.index or ''),trim1,trim2)
    print >> sys.stderr, cmd
    ss = run_safe.safe_script(cmd,sp_fullbase+'-preprocess_trim',force_write=True)
    ret = os.system(ss)
    if ret != 0 or not os.path.exists(sp_fullbase+'-preprocess_trim.done'):
        raise OSError, 'trim preprocess failed'
Example #23
	outvid = '%s_%s_%s-%s%s' % (outbase,clab,offset,dur,outext)
	if os.path.exists(outvid) and not os.path.getsize(outvid) == 0 and ( vidtools.vid_duration(outvid) == dur ):
		print >> sys.stderr, '%s present and expected size, skip' % outvid
	else:
		if FORCE_PAR:
			h,w = vidtools.extract_keyframe(vid).shape
			th = h - (crops[1]+crops[3])
			tw = w - (crops[0]+crops[2])
			pixw = 255
			pixh = int((float(th)/tw)*pixw)
			parstr = '-aspect %s:%s' % (pixw,pixh)
		else:
			parstr = ''
		cropstr = '-vf crop=in_w-%s:in_h-%s:%s:%s' % (crops[0]+crops[2],crops[1]+crops[3],crops[0],crops[1])
		cmd = 'ffmpeg -ss %s -t %s -i %s -y %s -r 29.97 -b 20000k %s %s' % (offset,dur,vid,cropstr,parstr,outvid)
		to_run_dict[outvid] = run_safe.safe_script(cmd,outvid,force_write=True)

logfile = os.path.join(os.path.dirname(vid),'logs','crop-log')
LSF.lsf_run_until_done(to_run_dict,logfile,queue,'-R "select[mem>%s]"' % job_ram, 'crop-ffmpeg',10, MAX_RETRY)

#cmds = []
#rerun = True
#while rerun:
#	for clab,crops in cropsdict.items():
#		outbase,outext = os.path.splitext(vid)
#		outvid = '%s_%s_%s-%s%s' % (outbase,clab,offset,dur,outext)
#		if os.path.exists(outvid) and ( vidtools.vid_duration(outvid) == dur ):
#			print >> sys.stderr, '%s present and expected size, skip' % outvid
#		else:
#			cropstr = '-vf crop=in_w-%s:in_h-%s:%s:%s' % (crops[0]+crops[2],crops[1]+crops[3],crops[0],crops[1])
#			cmd = 'ffmpeg -ss %s -t %s -i %s -y %s -b 20000k %s' % (offset,dur,vid,cropstr,outvid)
intervals_file = bam_base + '.RealignTargets.intervals'
realigned_bam = bam_base + '.realigned.bam'
reduced_bam = bam_base + '.realigned.reduced.bam'

intervals_parts_regions = partition_reference(ref,njobs)
intervals_parts = []

for i,part in enumerate(intervals_parts_regions):
    #reg_str = ' -L '.join(part)
    reg_parts_file = os.path.join(workdir,'part%s.intervals' % (i))
    open(reg_parts_file,'w').writelines([p+'\n' for p in part if not p.split(':')[0] in skip_contigs])
    
    intervals_parts_file = os.path.join(workdir,'part%s.RealignTargets.intervals' % (i))
    rtc_part_cmd = 'java -Xmx%sg -jar %s -T RealignerTargetCreator -I %s -R %s -L %s %s -o %s' % \
                   (gatk_ram,gatk_jar,bam,ref,reg_parts_file,targetcreator_opts,intervals_parts_file)
    rtc_ss = run_safe.safe_script(rtc_part_cmd,intervals_parts_file,force_write=True)
    ret = os.system(rtc_ss)
    if ret == 0 and os.path.exists(intervals_parts_file):
        print >> sys.stderr, '%s / %s complete' % (i+1,len(intervals_parts_regions))
    else:
        errstr = 'failed on %s' % intervals_parts_file
        raise OSError,errstr
    intervals_parts.append(intervals_parts_file)


cat_cmd = 'cat %s > %s' % (' '.join(intervals_parts),intervals_file)
cat_ss = run_safe.safe_script(cat_cmd,intervals_file,force_write=True)
ret = os.system(cat_ss)
if ret == 0:
    print >> sys.stderr, 'intervals parts concatenation finished'
else:
    jobname_base = 'preprocess'
    logbase = os.path.join(opts.outroot,'slurmlog','preprocess')
    print >> sys.stderr, 'run %s logs in %s' % (jobname_base,logbase)
    SLURM.run_until_done(to_run_dict,jobname_base,logbase,opts.max_job_duration,(opts.job_ram+1)*1024,opts.num_batches,opts.queue,MAX_RETRY=MAX_RETRY)

    #collect individual fastq/fastq pairs
    #(LATER: GET INDIVIDUAL FILES BY DB LOOKUP; REQUIRES HANDLING MOUSE DB ID LOOKUP IF SET)
    fq_to_run = sample_fq_from_expected(expected_fq_d)
    #print fq_to_run

    map_reads_cmd = map_reads_exec + ' -gr %s -n %s -q %s -sched %s -mjd %s -v %s -s \'"%s"\' -g \'"%s"\' -gh \'"%s"\' -mp \'"%s"\' %s %s %s ' % \
                    (opts.job_ram, \
                     opts.num_batches, \
                     opts.queue, \
                     opts.scheduler, \
                     opts.max_job_duration, \
                     vcfname, \
                     opts.stampy_argstr, \
                     opts.gatk_argstr, \
                     opts.gatkhaplo_argstr, \
                     opts.mpileup_argstr, \
                     opts.mapreads_argstr, \
                     opts.reference_fasta, \
                     opts.outroot)

    map_reads_cmd += ' '.join(fq_to_run)
    
    print >> sys.stderr, 'run map_reads in %s' % (os.path.join(opts.outroot,vcfname))
    map_reads_ss = run_safe.safe_script(map_reads_cmd,os.path.join(opts.outroot,vcfname),force_write=True)
    ret = os.system(map_reads_ss)
picardRAM = 2
max_temp = 1000
max_records = 1500000
MAX_PER_RUN = 100
RM_SAMS = False #always overridden to True for sam/bams created as intermediates in large merge sets

#revolting hack
if len(sams) > MAX_PER_RUN:
    sams1 = sams[:len(sams)/2]
    sams2 = sams[len(sams)/2:]
    bam1 = bam+'-1.bam'
    bam2 = bam+'-2.bam'
    for b,s in [(bam1,sams1),(bam2,sams2)]:
        print >> sys.stderr, '\npre-merge %s (%s parts)\n' % (b,len(s))
        cmd = 'merge_sams_with_validation.py %s %s' % (b,' '.join(s))
        ss = run_safe.safe_script(cmd,b,force_write=True)
        print >> sys.stderr, ss
        ret = os.system(ss)
        if ret != 0:
            raise OSError, 'pre-merge failed'
    sams = [bam1,bam2]
    RM_SAMS = True
    
mergecmd = 'java -Xmx%sg -jar %sMergeSamFiles.jar INPUT=%s OUTPUT=%s MERGE_SEQUENCE_DICTIONARIES=true VALIDATION_STRINGENCY=LENIENT; samtools index %s' % (picardRAM,picard_root,' INPUT='.join(sams), bam, bam)
ret = os.system(mergecmd)
if ret == 0:
    print >> sys.stderr, '\nmerge complete\n'
else:
    print >> sys.stderr, '\nfailed:\n',mergecmd
    raise OSError, 'merge failed for %s' % bam
Example #27


# SUBMIT RUNS
import os,sys,re
import vidtools, run_safe
from subprocess import Popen, PIPE
from glob import glob

seglen = 1800
q = 'unrestricted_serial'

for cfg in sorted(glob('*/*-config.dict')):
    currjobs = Popen('bjobs -w',shell=True,stdout=PIPE).stdout.read()
    print >> sys.stderr, cfg,'\t',
    vid = cfg.split('-config')[0]+'.mp4'
    if vidtools.vid_duration(vid) < 8*60*60: #only analyze videos longer than 8hrs
        print >> sys.stderr, 'too short; skip'
        continue
    donebase = '%s-l%snp60nf300ns4' % (vid[:-4],seglen)
    if os.path.exists(donebase+'.done'):
        print >> sys.stderr, 'done'
    elif donebase in currjobs:
        print >> sys.stderr, 'running'
    else:
        cmd = 'summarize_segment_opencv.py -l %s -s 60 -nf 300 -np 60 -ns 4 -gi 0.03 -oe shapely -ac %s -vs np60nf300ns4shapely %s' % (seglen,cfg,vid)
        logfile = donebase+'.lsflog'
        ss = run_safe.safe_script(cmd,donebase,force_write=True)
        subcmd = 'bsub -q %s -o %s %s' % (q,logfile,ss)
        ret = os.system(subcmd)
