def make_pooldirs(data, parentdir):
    """Create one directory per unique pool name, each with a 'shfiles' subdir.

    Positional arguments:
    data - table indexable by 'pool_name'; that column must support .tolist()
           (presumably a pandas DataFrame - confirm against caller)
    parentdir - directory under which the pool directories are created

    Returns:
    list of the values returned by makedir() for each pool directory
    """
    print(Bcolors.BOLD + "\nmaking pool dirs" + Bcolors.ENDC)
    created = []
    for pool_name in uni(data['pool_name'].tolist()):
        pool_path = op.join(parentdir, pool_name)
        if op.exists(pool_path):
            # ask the user before reusing a directory that is already there
            print("The pooldir already exists, this could overwrite previous data: %s" % pool_path)
            askforinput()
        created.append(makedir(pool_path))
        makedir(op.join(pool_path, 'shfiles'))
    return created
def make_pooldirs(data, parentdir):
    """Create a fresh directory for every unique pool name under parentdir.

    A pool directory that already exists is removed (after the user confirms
    through askforinput) and then recreated.

    Positional arguments:
    data - table indexable by 'pool_name'; that column must support .tolist()
           (presumably a pandas DataFrame - confirm against caller)
    parentdir - directory under which the pool directories are created

    Returns:
    list of the values returned by makedir() for each pool directory
    """
    print(Bcolors.BOLD + "\nmaking pool dirs" + Bcolors.ENDC)
    unique_pools = uni(data['pool_name'].tolist())
    created = []
    for pool_name in unique_pools:
        pool_path = op.join(parentdir, pool_name)
        if op.exists(pool_path):
            warning = "\tWARN: The pooldir already exists, this WILL overwrite and/or delete previous data: %s" % pool_path
            print(Bcolors.WARNING + warning + Bcolors.ENDC)
            askforinput(tab='\t', newline='')
            # unlink every '.gz' entry reported by fs() before removing the tree
            # (presumably these are fastq files/symlinks - confirm against fs())
            for entry in fs(pool_path):
                if not f_is_gz(entry):
                    continue
                os.unlink(entry)
            # whatever remains goes away with the directory itself
            shutil.rmtree(pool_path)
        created.append(makedir(pool_path))
    return created


def f_is_gz(path):
    """Return True when path ends with '.gz'."""
    return path.endswith('.gz')
def get_prereqs(bedfile, pooldir, parentdir, pool, program):
    """Work out the bedfile number, reference and output vcf path for a job.

    Positional arguments:
    bedfile - path whose name ends in '_<num>.bed'
    pooldir - pool directory; a subdirectory named after program is made in it
    parentdir - directory containing 'poolref.pkl'
    pool - key used to look up the reference in poolref.pkl
    program - program name, used for the output dir and the vcf file name

    Returns:
    tuple (num, ref, vcf)
    """
    # text between the last underscore and the first '.bed' that follows it
    num = bedfile.rsplit("_", 1)[-1].partition(".bed")[0]
    poolref = pklload(op.join(parentdir, 'poolref.pkl'))
    ref = poolref[pool]
    program_dir = makedir(op.join(pooldir, program))
    vcf_name = f'{pool}_{program}_bedfile_{num}.vcf'
    return (num, ref, op.join(program_dir, vcf_name))
def get_prereqs(num):
    """Build the path of the num'th bedfile next to the reference fasta.

    Reads the module-level name `ref` (path to the reference). The bedfile
    directory 'bedfiles_<bname>' is passed through makedir() in ref's directory.

    Positional arguments:
    num - int; the num'th bedfile (zero-padded to 4 characters in the name)

    Returns:
    path of the form <beddir>/<bname>_bedfile_<num>.bed
    """
    ref_dir, ref_file = op.split(ref)
    # everything before the first '.fa' in the reference file name
    bname = ref_file.partition(".fa")[0]
    beddir = makedir(op.join(ref_dir, 'bedfiles_%s' % bname))
    padded = str(num).zfill(4)
    return op.join(beddir, "%s_bedfile_%s.bed" % (bname, padded))
def get_varscan_cmd(bamfiles, bedfile, bednum, vcf, ref, pooldir, program):
    """Assemble the shell text that runs samtools mpileup piped into VarScan.

    Reads the module-level names `parentdir` and `pool` to look up the pool's
    ploidy in 'ploidy.pkl'.

    Positional arguments:
    bamfiles, bednum, bedfile - forwarded to get_small_bam_cmds()
    vcf - path the VarScan output is redirected to
    ref - reference fasta handed to samtools mpileup
    pooldir - pool directory; a subdirectory named after program is made in it
    program - name of the subdirectory of pooldir for the final vcf

    Returns:
    tuple (cmds, finalvcf): the small-bam commands followed by the varscan
    command, and the path of the vcf inside pooldir/program
    """
    bam_paths, bam_cmds = get_small_bam_cmds(bamfiles, bednum, bedfile)
    smallbams = ' '.join(bam_paths)
    pool_ploidy = pklload(op.join(parentdir, 'ploidy.pkl'))[pool]
    # single-sample pools get a min var freq of 0; otherwise use the smallest
    # possible allele frequency given the summed ploidy
    if len(pool_ploidy.keys()) > 1:
        minfreq = 1 / sum(pool_ploidy.values())
    else:
        minfreq = 0
    # NOTE(review): the trailing backslashes are string-literal line
    # continuations, so the pipeline below is emitted as a single shell line
    cmd = f'''samtools mpileup -B -f {ref} {smallbams} | java -Xmx15g -jar \
$VARSCAN_DIR/VarScan.v2.4.3.jar mpileup2cns --min-coverage 8 --p-value 0.05 \
--min-var-freq {minfreq} --strand-filter 1 --min-freq-for-hom 0.80 \
--min-avg-qual 20 --output-vcf 1 > {vcf}
module unload samtools

'''
    # final vcf lives in the program's subdirectory of the pool dir
    program_dir = makedir(op.join(pooldir, program))
    finalvcf = op.join(program_dir, op.basename(vcf))
    return (bam_cmds + cmd, finalvcf)
def create_reservation(pooldir, exitneeded=False):
    """Create a file so that other realign jobs can't start crisp and varscan too.

    The reservation file holds the SLURM job id of the job that created it.
    It is created with exclusive mode ('x') so that checking for the file and
    creating it is one atomic step; a separate exists-check followed by a
    write lets two jobs both believe they made the reservation.

    Reads the module-level name `pool` for the reservation file name.

    Positional arguments:
    pooldir - pool directory; the reservation lives in
              <pooldir>/shfiles/crispANDvarscan

    Keyword arguments:
    exitneeded - when True the job exits even if it creates the file itself

    Returns:
    the value makedir() returned for the shfiles/crispANDvarscan directory

    Raises:
    KeyError - if SLURM_JOB_ID is not set in the environment
    SystemExit - (via exit()) when another job holds the reservation or
                 exitneeded is True
    """
    print('creating reservation')
    shdir = makedir(op.join(pooldir, 'shfiles/crispANDvarscan'))
    resfile = op.join(shdir, '%s_crispANDvarscan_reservation.sh' % pool)
    jobid = os.environ['SLURM_JOB_ID']
    try:
        # 'x' fails with FileExistsError instead of truncating another job's file
        with open(resfile, 'x') as o:
            o.write("%s" % jobid)
    except FileExistsError:
        exitneeded = True
    # keep the original random back-off and read-back as a second line of defence
    time.sleep(random.random()*15)
    with open(resfile, 'r') as o:
        fields = o.read().split()
    # an empty file cannot be this job's reservation; treat it as a mismatch
    fjobid = fields[0] if fields else None
    if fjobid != jobid or exitneeded is True:
        # just in case two jobs try at nearly the same time
        print('another job has already created crispANDvarscan_reservation.sh for %s' % pool)
        exit()
    return shdir
def make_pooldirs(data, parentdir):
    """Create one directory under parentdir for every unique pool name.

    When a pool directory already exists the user is warned and asked, via
    askforinput(), whether to proceed.

    Positional arguments:
    data - table indexable by 'pool_name'; that column must support .tolist()
           (presumably a pandas DataFrame - confirm against caller)
    parentdir - directory under which the pool directories are created

    Returns:
    list of the values returned by makedir() for each pool directory
    """
    print(Bcolors.BOLD + "\nmaking pool dirs" + Bcolors.ENDC)
    unique_pools = uni(data['pool_name'].tolist())
    made = []
    for name in unique_pools:
        target = op.join(parentdir, name)
        already_there = op.exists(target)
        if already_there:
            print("The pooldir already exists, this could overwrite previous data: %s" % target)
            print("Do you want to proceed?")
            askforinput()
        made.append(makedir(target))
    return made
# get more dup stats module load samtools/1.9 samtools flagstat {dupfile} > {dupflag} module unload samtools # call next step source $HOME/.bashrc export PYTHONPATH="${{PYTHONPATH}}:$HOME/gatk_pipeline" export SQUEUE_FORMAT="%.8i %.8u %.12a %.68j %.3t %16S %.10L %.5D %.4C %.6b %.7m %N (%r)" python $HOME/gatk_pipeline/04_scatter-gvcf.py {dupfile} {pooldir} {samp} """ # create shdir and file shdir = op.join(pooldir, 'shfiles/03_mark_build_shfiles') for d in [shdir, dupdir]: makedir(d) file = op.join(shdir, '%(pool)s-%(samp)s-mark.sh' % locals()) with open(file, 'w') as o: o.write("%s" % text) # sbatch file os.chdir(shdir) print('shdir = ', shdir) subprocess.call([shutil.which('sbatch'), file]) # balance queue balance_queue.main('balance_queue.py', 'mark') balance_queue.main('balance_queue.py', 'bwa')
module load gatk/3.8 module load java export _JAVA_OPTIONS="-Xms256m -Xmx7g" java -Djava.io.tmpdir=$SLURM_TMPDIR -jar $EBROOTGATK/GenomeAnalysisTK.jar \ -T IndelRealigner -R %(ref)s -I %(dupfile)s -targetIntervals %(listfile)s -o %(realbam)s module unload gatk # sbatch CRISP job if all pooled bamfiles have been created source $HOME/.bashrc export PYTHONPATH="${PYTHONPATH}:$HOME/pipeline" export SQUEUE_FORMAT="%%.8i %%.8u %%.12a %%.68j %%.3t %%16S %%.10L %%.5D %%.4C %%.6b %%.7m %%N (%%r)" python $HOME/pipeline/start_crispANDvarscan.py %(parentdir)s %(pool)s python $HOME/pipeline/balance_queue.py bedfile ''' % locals() # create shdir and shfile shdir = op.join(pooldir, 'shfiles/05_indelRealign_shfiles') makedir(shdir) file = op.join(shdir, '%(pool)s-%(samp)s-indelRealign.sh' % locals()) with open(file, 'w') as o: o.write("%s" % text) os.chdir(shdir) print('shdir = ', shdir) subprocess.call([shutil.which('sbatch'), file]) balance_queue.main('balance_queue.py', 'indelRealign') balance_queue.main('balance_queue.py', 'realign')
def make_beddir():
    """Create the bedfile directory that sits beside the reference fasta.

    Reads the module-level name `ref` (path to the reference).

    Returns:
    tuple (bname, beddir): the reference file name up to the first '.fa', and
    the value makedir() returned for <dirname(ref)>/bedfiles_<bname>
    """
    ref_name = op.basename(ref)
    stem = ref_name.split(".fa")[0]
    bed_path = op.join(op.dirname(ref), 'bedfiles_%s' % stem)
    return stem, makedir(bed_path)