def _get_r_wrapper_lines(phase):
    """
    Return the lines of the shell wrapper that launches the R pipeline for one phase.

    phase must be "cons" or "trees", otherwise the assertion fails. The wrapper
    copies its positional arguments ($1..$10) into named shell variables, sets
    CMD from the module-level SCRIPT and ends with the run command for the phase.
    """
    assert phase in ("cons", "trees"), "The phase parameter can only be trees or cons"

    # The two phases share everything up to BINSIZE; only the trailing arguments differ
    if phase == "cons":
        trailing_args = "\" ${PARALLEL} ${THREADS} ${BINSIZE} \"1\" ${NBLOCKS}"
    else:
        trailing_args = "\" ${PARALLEL} ${THREADS} ${BINSIZE} $LSB_JOBINDEX"
    run_cmd = merge_items([
        "${CMD} ${DATASET} ${ITERS} ${BURNIN} ${DATPATH} ${PURITYFILE} \"",
        phase,
        trailing_args
    ], sep="")

    # Positional arguments 1-9 are mapped onto these names, in this order
    positional_names = ("DATASET", "ITERS", "BURNIN", "DATPATH", "PURITYFILE",
                        "ANALYSIS", "PARALLEL", "THREADS", "BINSIZE")

    wrapper = ["#$LSB_JOBINDEX"]
    for position, name in enumerate(positional_names, 1):
        wrapper.append("%s=$%d" % (name, position))
    # The tenth argument needs braces in shell syntax
    wrapper.append("NBLOCKS=${10}")
    wrapper.append(merge_items(["CMD=", SCRIPT], sep=""))
    wrapper.append(run_cmd)
    return wrapper
def createConcatSplitFilesCmd(samplename, infile_list, outfile, haveHeader, run_dir):
    """
    Build the DPP_SCRIPT "concatSplitFiles" command.

    The entries of infile_list are merged with a comma separator and passed via
    --files; --haveHeader is appended only when haveHeader is truthy.
    """
    joined_inputs = merge_items(infile_list, sep=",")
    args = [
        DPP_SCRIPT, "-c concatSplitFiles",
        "-s", samplename,
        "--files", joined_inputs,
        "-o", outfile,
        "-r", run_dir
    ]
    optional_flags = ["--haveHeader"] if haveHeader else []
    return merge_items(args + optional_flags)
def createSplitLociCmd(samplename, loci_file, prefix, postfix, fai_file, ignore_file, run_dir):
    """
    Build the DPP_SCRIPT "splitLociFile" command.

    fai_file and ignore_file are passed through -f and -i respectively. Earlier
    versions appended a second "-f"/"-i" pair carrying the module-level
    CHROMS_FAI / IGNORE_FILE values, which duplicated the options and could
    override the values supplied by the caller; each option is now emitted
    exactly once, using the arguments of this function.
    """
    return merge_items([
        DPP_SCRIPT, "-c splitLociFile",
        "-s", samplename,
        "--loci", loci_file,
        "-f", fai_file,
        "-i", ignore_file,
        "--prefix", prefix,
        "--postfix", postfix,
        "-r", run_dir
    ])
def generateQcrunScript(dp_master_file, dp_in_dir, qc_dir):
    """
    Creates the qc runscript for easy qc generation.

    Writes runQc.sh into qc_dir. Its first line invokes SCRIPT with
    dp_master_file, dp_in_dir and qc_dir; the remaining lines call "convert" to
    bundle the generated png files into one pdf per plot type. The script is
    made executable for its owner afterwards.
    """
    scriptfile = path.joinpath(qc_dir, "runQc.sh")

    script_lines = [
        merge_items([SCRIPT, dp_master_file, dp_in_dir, qc_dir]) + "\n",
        "convert *alleleFreq*png alleleFrequency.pdf\n",
        "convert *copyNumberAdjustment*png copyNumberAdjustment.pdf\n",
        "convert *depth*png depth.pdf\n",
        "convert *kappa*png kappa.pdf\n",
        "convert *mutation.copy.num*png mutation.copy.number.pdf\n",
        "convert *totalCopy*png totalCopyNumber.pdf\n",
        "convert *_fractionOfCells*png fractionOfCells.pdf\n",
        "convert *subclonalFractionPerChromosome*png subclonalFractionPerChromosome.pdf\n",
        "convert *large.subclonal.fraction.by.chrom*png large.subclonal.fraction.by.chrom.pdf\n",
        "convert *depth.vs.frac.mutCount.png depth.vs.frac.mutCount.pdf\n",
        "convert *_cellularityCorrectedAF.png cellularityCorrectedAF.pdf\n",
    ]

    # The context manager closes the handle even when a write fails
    with open(scriptfile, "w") as outf:
        for line in script_lines:
            outf.write(line)

    # Make executable
    st = os.stat(scriptfile)
    os.chmod(scriptfile, st.st_mode | stat.S_IEXEC)
def createDpInputCmd(samplename, loci_file, allele_freq_file, subclone_file, rho_psi_file, mut_mut_phase_file, mut_cn_phase_file, gender, bb_dir, run_dir):
    """
    Build the DPP_SCRIPT "dpInput" command.

    The output file name is derived from the sample name:
    <samplename>_allDirichletProcessInfo.txt.
    """
    flagged_values = (
        ("-s", samplename),
        ("--loci", loci_file),
        ("--all_freq", allele_freq_file),
        ("--subclones", subclone_file),
        ("--rhopsi", rho_psi_file),
        ("--mut_mut", mut_mut_phase_file),
        ("--mut_cn", mut_cn_phase_file),
        ("-x", gender),
        ("-o", samplename + "_allDirichletProcessInfo.txt"),
        ("-b", bb_dir),
        ("-r", run_dir),
    )
    args = [DPP_SCRIPT, "-c dpInput"]
    for flag, value in flagged_values:
        args.extend((flag, value))
    return merge_items(args)
def createGetAlleleFrequencyCmd(samplename, loci_file_prefix, bam_file, out_file_prefix, run_dir, split_chroms):
    """
    Build the DPP_SCRIPT "getAlleleFrequency" command.

    When split_chroms is truthy the loci and output file names embed the
    ${LSB_JOBINDEX} shell variable before the .txt extension; otherwise only
    ".txt" is appended to both prefixes.
    """
    filename_suffix = "${LSB_JOBINDEX}.txt" if split_chroms else ".txt"
    loci_file = loci_file_prefix + filename_suffix
    out_file = out_file_prefix + filename_suffix
    return merge_items([
        DPP_SCRIPT, "-c getAlleleFrequency",
        "-s", samplename,
        "--bam", bam_file,
        "--loci", loci_file,
        "-o", out_file,
        "-r", run_dir
    ])
def generateSamplesheet(samplename_file, tumour_file, tumourid_file, normal_file, normalid_file, gender_file, bb_file, variants_file, output_file):
    """
    Takes various single column files and generates a samplesheet. The i'th row of each of these
    files will be joined together (as if the Unix command line paste was called).

    bb_file may be None, in which case the bb_dir column is filled with 'NA'.
    The number of rows written equals the number of sample names; an input that
    has fewer entries than that raises IndexError. All rows are assembled before
    the output file is opened, so such a failure no longer leaves a truncated
    samplesheet behind.
    """
    # Read in the various files
    samplenames = read_list_of_items(samplename_file)
    tumours = read_list_of_items(tumour_file)
    tumour_ids = read_list_of_items(tumourid_file)
    normals = read_list_of_items(normal_file)
    normal_ids = read_list_of_items(normalid_file)
    genders = read_list_of_items(gender_file)

    # BB is optional (it could be run after creation of this project), use a placeholder if absent
    if bb_file is not None:
        bb = read_list_of_items(bb_file)
    else:
        bb = ['NA'] * len(samplenames)

    variants = read_list_of_items(variants_file)

    header = merge_items([
        "#sample", "tumour_id", "tumour", "normal_id", "normal", "bb_dir",
        "gender", "variants"
    ], sep="\t")

    # Join line i from all vectors together; indexing (rather than zip) keeps the
    # IndexError when one of the inputs is shorter than the list of sample names
    rows = [
        merge_items([
            samplename, tumour_ids[i], tumours[i], normal_ids[i], normals[i],
            bb[i], genders[i], variants[i]
        ], sep="\t")
        for i, samplename in enumerate(samplenames)
    ]

    # The context manager guarantees the handle is closed, also on write errors
    with open(output_file, 'w') as outf:
        outf.write(header + "\n")
        for row in rows:
            outf.write(row + "\n")
def createMutMutPhasingCmd(samplename, loci_file_prefix, out_file_prefix, bam_file, bai_file, max_distance, bb_dir, run_dir, split_chroms):
    """
    Build the DPP_SCRIPT "mutMutPhasing" command.

    When split_chroms is truthy the loci and output file names embed the
    ${LSB_JOBINDEX} shell variable before the .txt extension. max_distance is
    converted with str() before it is placed on the command line.
    """
    filename_suffix = ".txt"
    if split_chroms:
        filename_suffix = "${LSB_JOBINDEX}" + filename_suffix

    args = [DPP_SCRIPT, "-c mutMutPhasing"]
    args += ["-s", samplename]
    args += ["--loci", loci_file_prefix + filename_suffix]
    args += ["-o", out_file_prefix + filename_suffix]
    args += ["--bam", bam_file]
    args += ["--bai", bai_file]
    args += ["--max_distance", str(max_distance)]
    args += ["-b", bb_dir]
    args += ["-r", run_dir]
    return merge_items(args)
def createMutCnPhasingCmd(samplename, loci_file_prefix, baf_file, hap_info_prefix, hap_info_suffix, outfile_prefix, bam_file, bai_file, max_distance, bb_dir, run_dir, split_chroms):
    """
    Build the DPP_SCRIPT "mutCNPhasing" command.

    The command is always generated per chromosome: the loci, haplotype info and
    output file names all embed the ${LSB_JOBINDEX} shell variable, regardless
    of split_chroms.
    """
    # Running this split per chromosome always, as the internal R function cannot handle all
    # data at once because the impute output doesn't contain chromosome info. split_chroms is
    # therefore deliberately ignored.
    # NOTE(review): max_distance is accepted but never put on the command line (unlike in
    # createMutMutPhasingCmd) - confirm whether mutCNPhasing should receive it.
    chrom_index = "${LSB_JOBINDEX}"
    loci_file = loci_file_prefix + chrom_index + ".txt"
    hap_info_file = hap_info_prefix + chrom_index + hap_info_suffix
    out_file = outfile_prefix + chrom_index + ".txt"

    return merge_items([
        DPP_SCRIPT, "-c mutCNPhasing",
        "-s", samplename,
        "--loci", loci_file,
        "--phased_baf", baf_file,
        "--hap_info", hap_info_file,
        "-o", out_file,
        "--bam", bam_file,
        "--bai", bai_file,
        "-b", bb_dir,
        "-r", run_dir
    ])
def generateBsubCmd(jobname, logdir, cmd, queue="normal", mem=1, depends=None, isArray=False, threads=None):
    '''
    Transforms the cmd into a bsub command with the supplied parameters.

    jobname  : value for -J, also used as the stem of the log file names
    logdir   : directory that receives the .out and .err log files
    cmd      : the command to submit, appended wrapped in single quotes
    queue    : value for -q
    mem      : memory request; "000" is appended to its string form and the result
               is used for -M and for the select/rusage resource string
    depends  : optional iterable of job names; each becomes a done(<name>) term and
               the terms are joined with && into a -w expression. None or an empty
               iterable adds no -w option (an empty list used to yield -w"", an
               empty dependency expression)
    isArray  : when true the log file names also contain the array index (%I)
    threads  : optional value for -n
    '''
    bcmd = merge_items(["bsub", "-q", queue, "-J \"" + jobname + "\""])

    # Array jobs get the array index in their log file names next to the job id
    log_suffix = ".%J.%I" if isArray else ".%J"
    bcmd = merge_items([
        bcmd,
        "-o", path.joinpath(logdir, jobname) + (log_suffix + ".out"),
        "-e", path.joinpath(logdir, jobname + log_suffix + ".err")
    ])

    mem = str(mem) + "000"
    bcmd = merge_items([
        bcmd, "-M", mem, "-R",
        "'span[hosts=1] select[mem>" + mem + "] rusage[mem=" + mem + "]'"
    ])

    # Truthiness test: skips None as well as an empty list of dependencies
    if depends:
        depends_str = "&&".join(["done(" + x + ")" for x in depends])
        bcmd = merge_items([bcmd, "-w\"" + depends_str + "\""])

    if threads is not None:
        bcmd = merge_items([bcmd, "-n", str(threads)])

    bcmd = merge_items([bcmd, "'" + cmd + "'"])
    return bcmd
def createPlotHaplotypesCmd(bb_conf):
    """
    Build the BBSCRIPT command for the "plothaplotypes" phase, using the
    standard and threads options supplied by bb_conf.
    """
    standard_opts = bb_conf.getStandardOptions_cgpBB()
    threads_opt = bb_conf.getThreadsOption_cgpBB()
    return merge_items([BBSCRIPT, standard_opts, threads_opt, "-p", "plothaplotypes"])
def createCleanupPostBafCmd(bb_conf):
    """
    Build the BBSCRIPT command for the "cleanuppostbaf" phase, using the
    standard and threads options supplied by bb_conf.
    """
    standard_opts = bb_conf.getStandardOptions_cgpBB()
    threads_opt = bb_conf.getThreadsOption_cgpBB()
    return merge_items([BBSCRIPT, standard_opts, threads_opt, "-p", "cleanuppostbaf"])
def createCombineImputeCmd(bb_conf):
    """
    Build the BBSCRIPT command for the "combineimpute" phase, using the
    standard and threads options supplied by bb_conf.
    """
    standard_opts = bb_conf.getStandardOptions_cgpBB()
    threads_opt = bb_conf.getThreadsOption_cgpBB()
    return merge_items([BBSCRIPT, standard_opts, threads_opt, "-p", "combineimpute"])
def createImputeFromBafCmd(bb_conf):
    """
    Build the BBSCRIPT command for the "imputefromaf" phase, using the
    standard and threads options supplied by bb_conf.
    """
    standard_opts = bb_conf.getStandardOptions_cgpBB()
    threads_opt = bb_conf.getThreadsOption_cgpBB()
    return merge_items([BBSCRIPT, standard_opts, threads_opt, "-p", "imputefromaf"])
def createBafLogCmd(bb_conf):
    """
    Build the BBSCRIPT command for the "baflog" phase, using the standard and
    threads options supplied by bb_conf.
    """
    standard_opts = bb_conf.getStandardOptions_cgpBB()
    threads_opt = bb_conf.getThreadsOption_cgpBB()
    return merge_items([BBSCRIPT, standard_opts, threads_opt, "-p", "baflog"])
def createAlleleCountCmd(bb_conf):
    """
    Build the BBSCRIPT command for the "allelecount" phase, using the standard
    and threads options supplied by bb_conf.
    """
    standard_opts = bb_conf.getStandardOptions_cgpBB()
    threads_opt = bb_conf.getThreadsOption_cgpBB()
    return merge_items([BBSCRIPT, standard_opts, threads_opt, "-p", "allelecount"])
#!/usr/bin/env python import argparse, sys, os, stat from path import path from util import merge_items LIBPATH = "/nfs/users/nfs_s/sd11/repo/dirichlet/dp_combined/" SCRIPT = merge_items(["Rscript", LIBPATH + "/RunDP_pipeline.R"]) def _write_script(filename, lines): outf = open(filename, "w") for line in lines: outf.write(line + "\n") outf.close() # Make executable st = os.stat(filename) os.chmod(filename, st.st_mode | stat.S_IEXEC) def _get_r_wrapper_lines(phase): assert phase == "cons" or phase == "trees", "The phase parameter can only be trees or cons" # Create the run command, which differs per phase if phase == "cons": run_cmd = merge_items([ "${CMD} ${DATASET} ${ITERS} ${BURNIN} ${DATPATH} ${PURITYFILE} \"", phase, "\" ${PARALLEL} ${THREADS} ${BINSIZE} \"1\" ${NBLOCKS}" ], sep="") else:
def createSubclonesCmd(bb_conf):
    """
    Build the BBSCRIPT command for the "subclones" phase, using the standard
    options supplied by bb_conf (no threads option is added).
    """
    standard_opts = bb_conf.getStandardOptions_cgpBB()
    return merge_items([BBSCRIPT, standard_opts, "-p", "subclones"])
def createCombineBafsCmd(bb_conf):
    """
    Build the BBSCRIPT command for the "combinebafs" phase, using the standard
    options supplied by bb_conf (no threads option is added).
    """
    standard_opts = bb_conf.getStandardOptions_cgpBB()
    return merge_items([BBSCRIPT, standard_opts, "-p", "combinebafs"])
def generateDPrunScript(run_dir, dp_in_dir, dp_master_file, projectname):
    """
    Write the LSF submission scripts for the Dirichlet clustering pipelines into run_dir.

    Three shell scripts are created through _write_script, each taking the
    sample as its first positional argument ($1):
        - submit.block.sh   : block parallel tree based method; submits an array job for the
                              trees phase and a cons job that waits for that array to end
        - resubmit.block.sh : submits only the cons job of the tree based method
        - submit.nd.sh      : submits a single job that runs the nD clustering

    The block scripts call ${LIBPATH}/RunBlockTreeDP_trees.sh and
    ${LIBPATH}/RunBlockTreeDP_cons.sh. Those R wrapper scripts are not written
    by this function (see _get_r_wrapper_lines for their content).
    """
    # Settings shared by both block parallel scripts
    tree_jobname = merge_items(["JOBNAME=\"tree_", projectname, "\""], sep="")
    libpath_line = merge_items(["LIBPATH=\"", LIBPATH, "\""], sep="")
    tree_params = merge_items([
        "PARAMS=\"${LIBPATH} ${SAMPLE} 200 30", dp_in_dir, dp_master_file,
        "tree_dp true 5 0.05\""
    ])
    block_preamble = [
        "SAMPLE=$1",
        'QUEUE="basement"',
        tree_jobname,
        libpath_line,
        tree_params,
        'NBLOCKS="10"',
    ]

    # Array job with one element per block for the trees phase
    submit_trees = merge_items([
        "bsub -M ${MEMTREE} -R \"select[mem>${MEMTREE}] rusage[mem=${MEMTREE}] span[hosts=1]\" -n 5 -J \"${JOBNAME}_t[1-${NBLOCKS}]\" -q \"${QUEUE}\" -o $PWD/logs/${JOBNAME}_t.%J.%I.out -e $PWD/logs/${JOBNAME}_t.%J.%I.err \"${LIBPATH}/RunBlockTreeDP_trees.sh ${PARAMS}\""
    ])
    # Cons job, held back until all elements of the trees array have ended
    submit_cons = merge_items([
        "bsub -w\"ended(${JOBNAME}_t[1-${NBLOCKS}])\" -M ${MEMCONS} -R \"select[mem>${MEMCONS}] rusage[mem=${MEMCONS}]\" -J \"${JOBNAME}_c\" -q \"${QUEUE}\" -o $PWD/logs/${JOBNAME}_c.%J.out -e $PWD/logs/${JOBNAME}_c.%J.err \"${LIBPATH}/RunBlockTreeDP_cons.sh ${PARAMS} ${NBLOCKS}\""
    ])

    # Block parallel run script: trees phase followed by the cons phase
    submit_block = block_preamble + [
        'MEMTREE="15000"',
        'MEMCONS="15000"',
        submit_trees,
        submit_cons,
    ]
    _write_script(path.joinpath(run_dir, "submit.block.sh"), submit_block)

    # Resubmit script: cons phase only
    # NOTE(review): this reuses the cons command including its -w"ended(...)" dependency on
    # the trees array job, whereas the example that used to document this script submitted
    # the cons job without -w - confirm which of the two is intended.
    resubmit_block = block_preamble + [
        'MEMCONS="15000"',
        submit_cons,
    ]
    _write_script(path.joinpath(run_dir, "resubmit.block.sh"), resubmit_block)

    # nD run script: a single job that runs SCRIPT directly
    nd_jobname = merge_items(["JOBNAME=\"nd_", projectname, "\""], sep="")
    nd_cmd = merge_items(["CMD=\"", SCRIPT, "\""], sep="")
    nd_params = merge_items([
        "PARAMS=\"${SAMPLE} 1250 250", dp_in_dir, dp_master_file,
        "nd_dp false 1 NA 1 1\""
    ])
    submit_nd = [
        "SAMPLE=$1",
        'QUEUE="normal"',
        nd_jobname,
        nd_cmd,
        nd_params,
        'MEMORY="15000"',
        "bsub -J ${JOBNAME} -q ${QUEUE} -M ${MEMORY} -R 'span[hosts=1] select[mem>'${MEMORY}'] rusage[mem='${MEMORY}']' -o $PWD/logs/${JOBNAME}.%J.out -e $PWD/logs/${JOBNAME}.%J.err \"${CMD} ${PARAMS}\"",
    ]
    _write_script(path.joinpath(run_dir, "submit.nd.sh"), submit_nd)
def createGenerateAFLociCmd(samplename, outfile_postfix, vcf_file, run_dir):
    """
    Build the DPP_SCRIPT "generateAFLoci" command.

    The output file name is samplename + outfile_postfix; the fai and ignore
    files come from the module-level CHROMS_FAI and IGNORE_FILE.
    """
    outfile = samplename + outfile_postfix
    args = [DPP_SCRIPT, "-c generateAFLoci"]
    args += ["-s", samplename]
    args += ["-o", outfile]
    args += ["-v", vcf_file]
    args += ["-f", CHROMS_FAI]
    args += ["-i", IGNORE_FILE]
    args += ["-r", run_dir]
    return merge_items(args)
def createDpIn2VcfCmd(vcf_file, dpIn_file, outfile, fai_file, ignore_file):
    """
    Build the DPPVCF_SCRIPT command from the vcf file (-v), the DP input file
    (-i), the fai file (-f), the ignore file (--ig) and the output file (-o).
    """
    flagged_values = (
        ("-v", vcf_file),
        ("-i", dpIn_file),
        ("-f", fai_file),
        ("--ig", ignore_file),
        ("-o", outfile),
    )
    args = [DPPVCF_SCRIPT]
    for flag, value in flagged_values:
        args.extend((flag, value))
    return merge_items(args)
def createSegmentPhasedCmd(bb_conf):
    """
    Build the BBSCRIPT command for the "segmentphased" phase, using the
    standard options supplied by bb_conf (no threads option is added).
    """
    standard_opts = bb_conf.getStandardOptions_cgpBB()
    return merge_items([BBSCRIPT, standard_opts, "-p", "segmentphased"])
def createDumpCountsSangerCmd(samplename, vcf_file, run_dir):
    """
    Build the DPP_SCRIPT "dumpCountsSanger" command.

    The output file name is derived from the sample name:
    <samplename>_alleleFrequency.txt.
    """
    outfile = samplename + "_alleleFrequency.txt"
    args = [DPP_SCRIPT, "-c dumpCountsSanger"]
    args += ["-s", samplename]
    args += ["-v", vcf_file]
    args += ["-r", run_dir]
    args += ["-o", outfile]
    return merge_items(args)
def createFinaliseCmd(bb_conf):
    """
    Build the BBSCRIPT command for the "finalise" phase, using the standard
    options supplied by bb_conf (no threads option is added).
    """
    standard_opts = bb_conf.getStandardOptions_cgpBB()
    return merge_items([BBSCRIPT, standard_opts, "-p", "finalise"])
def generateAlleleCountCommand(bam, loci, outfile):
    """
    Build the ALLELECOUNTER command for one bam file and one loci file.

    The result has the shape: <ALLELECOUNTER> -b <bam> -l <loci> -o <outfile>
    """
    args = [ALLELECOUNTER]
    args += ["-b", bam]
    args += ["-l", loci]
    args += ["-o", outfile]
    return merge_items(args)