def build_CombineGVCFs_sbatch(working_dir, batch, current_batch, scratch=False, interval=None):
    """Write the sbatch script that merges one batch of gVCFs with GATK CombineGVCFs.

    NOTE(review): a second definition of this function appears later in this
    file and replaces this one when the module is loaded.

    :param str working_dir: directory whose ``sbatch/`` and ``VCF/`` sub-directories are used
    :param int batch: batch number, used to name the job and the output file
    :param list current_batch: paths of the gVCF files to be combined
    :param bool scratch: if True the inputs are rsynced to ``$SNIC_TMP``, GATK
        writes there, and the result is rsynced back to ``working_dir/VCF/``
    :param str interval: optional interval file; passed to GATK with ``-L`` and
        its basename (up to the first dot) is appended to the job/output names
    :returns: path to the sbatch file
    """
    header = CONFIG["output_header"]
    if interval is None:
        job_name = "CombineGVCFs_batch{}".format(batch)
        output_file = "{}_batch{}.g.vcf.gz".format(header, batch)
    else:
        # interval name = basename of the interval file up to its first dot
        interval_name = os.path.basename(interval).split(".")[0]
        job_name = "CombineGVCFs_batch{}_{}".format(batch, interval_name)
        output_file = "{}_batch{}_{}.g.vcf.gz".format(header, batch, interval_name)

    sbatch_file = os.path.join(working_dir, "sbatch", "{}.sbatch".format(job_name))
    with open(sbatch_file, "w") as script:
        script.write(slurm_header(CONFIG["uppmax_project"], job_name, working_dir))
        script.write("\n")
        if scratch:
            # temporary directories on the node-local scratch area
            script.write("mkdir -p $SNIC_TMP/{} \n".format(job_name))
            script.write("mkdir -p $SNIC_TMP/{}/VCF/ \n".format(job_name))

        # one "-V <path>" line per sample; with scratch the sample and its
        # .tbi index are copied first and the scratch copy is used
        variant_args = []
        for gvcf in current_batch:
            gvcf_path = gvcf
            if scratch:
                script.write("rsync -rptoDLv {} $SNIC_TMP/{}/\n".format(gvcf, job_name))
                script.write("rsync -rptoDLv {}.tbi $SNIC_TMP/{}/\n".format(gvcf, job_name))
                gvcf_path = "$SNIC_TMP/{}/{}".format(job_name, os.path.basename(gvcf))
            variant_args.append("-V {} \\\n".format(gvcf_path))

        pieces = ["java -Xmx120g -jar {} -T CombineGVCFs \\\n".format(CONFIG["GATK"])]
        for walker_option in CONFIG["walkers"]["CombineGVCFs"]:
            pieces.append("{} \\\n".format(walker_option))
        pieces.append("{} ".format("".join(variant_args)))
        if interval is not None:
            pieces.append("-L {} \\\n".format(interval))
        if scratch:
            pieces.append("-o $SNIC_TMP/{}/VCF/{}\n".format(job_name, output_file))
            # copy the result (and anything sharing its prefix) back from scratch
            pieces.append("rsync $SNIC_TMP/{}/VCF/{}* {}/VCF/\n".format(job_name, output_file, working_dir))
        else:
            pieces.append("-o {}/VCF/{}\n\n".format(working_dir, output_file))
        script.write("".join(pieces))
    return sbatch_file
def build_GenotypeGVCFs_sbatch(working_dir, combined_gvcf_files, scratch=False, interval=None):
    """Write the sbatch script that joint-genotypes combined gVCFs with GATK GenotypeGVCFs.

    The suffix used in the job and output names is taken from the name of the
    batch-1 file: whatever follows ``batch1`` up to the first dot (empty when
    the file is ``<header>_batch1.g.vcf.gz``). ``batch1`` must not be followed
    by another digit, so ``batch10``..``batch19`` are never mistaken for batch 1.

    NOTE(review): this function is defined a second time later in this file;
    the later definition is the one in effect after the module is loaded.

    :param str working_dir: directory whose ``sbatch/`` and ``VCF/`` sub-directories are used
    :param list combined_gvcf_files: paths of the per-batch combined gVCF files;
        one of them must be the batch-1 file
    :param bool scratch: if True the inputs are rsynced to ``$SNIC_TMP``, GATK
        writes there, and the result is rsynced back to ``working_dir/VCF/``
    :param str interval: optional interval file, passed to GATK with ``-L``
    :returns: path to the sbatch file
    :raises IndexError: if no batch-1 file is present in ``combined_gvcf_files``
    """
    import re  # local import: the module-level import block is not part of this function

    batch1_tag = re.compile(r"batch1(?!\d)")
    interval_name = None
    for candidate in combined_gvcf_files:
        candidate_name = os.path.basename(candidate)
        tag = batch1_tag.search(candidate_name)
        if tag is not None:
            interval_name = candidate_name[tag.end():].split(".")[0]
            break
    if interval_name is None:
        # IndexError is what the previous list-indexing lookup raised in this situation
        raise IndexError("no batch1 file found in combined_gvcf_files")

    job_name = "GenotypeGVCFs{}".format(interval_name)
    output_file = "{}_joincalled{}.g.vcf.gz".format(CONFIG["output_header"], interval_name)
    sbatch_file = os.path.join(working_dir, "sbatch", "{}.sbatch".format(job_name))
    with open(sbatch_file, "w") as GenotypeGVCFs:
        slurm = slurm_header(CONFIG["uppmax_project"], job_name, working_dir)
        GenotypeGVCFs.write(slurm)
        GenotypeGVCFs.write("\n")
        if scratch:
            # temporary directories on the node-local scratch area
            GenotypeGVCFs.write("mkdir -p $SNIC_TMP/{} \n".format(job_name))
            GenotypeGVCFs.write("mkdir -p $SNIC_TMP/{}/VCF/ \n".format(job_name))

        # one "-V <path>" line per combined gVCF; with scratch the file (and
        # anything sharing its prefix, e.g. its index) is copied first
        combined_gvcf_string_input = ""
        for combined_gvcf in combined_gvcf_files:
            combined_gvcf_path_dir = combined_gvcf
            if scratch:
                GenotypeGVCFs.write("rsync -rptoDLv {}* $SNIC_TMP/{}/\n".format(combined_gvcf, job_name))
                combined_gvcf_name = os.path.basename(combined_gvcf)
                combined_gvcf_path_dir = "$SNIC_TMP/{}/{}".format(job_name, combined_gvcf_name)
            combined_gvcf_string_input += "-V {} \\\n".format(combined_gvcf_path_dir)

        GATK_command = "java -Xmx250g -jar {} -T GenotypeGVCFs \\\n".format(CONFIG["GATK"])
        for option in CONFIG["walkers"]["GenotypeGVCFs"]:
            GATK_command += "{} \\\n".format(option)
        GATK_command += "{} ".format(combined_gvcf_string_input)
        if interval is not None:
            GATK_command += "-L {} \\\n".format(interval)
        if scratch:
            GATK_command += "-o $SNIC_TMP/{}/VCF/{}\n".format(job_name, output_file)
            # copy the result back from scratch once GATK is done
            GATK_command += "rsync $SNIC_TMP/{}/VCF/{}* {}/VCF/\n".format(job_name, output_file, working_dir)
        else:
            GATK_command += "-o {}/VCF/{}\n\n".format(working_dir, output_file)
        GenotypeGVCFs.write(GATK_command)
    return sbatch_file
def build_CatVariants_sbatch(working_dir, variants_dir, scratch=False):
    """Write the sbatch script that concatenates per-interval VCFs with GATK CatVariants.

    When ``CONFIG["intervals_list"]`` is empty there is a single joint-called
    file already, and the script only copies it into ``working_dir/VCF/``.
    Otherwise ``CONFIG["intervals_list"]`` is sorted in place with
    ``natural_keys`` and one ``-V`` argument is emitted per interval.

    :param str working_dir: directory whose ``sbatch/`` and ``VCF/`` sub-directories are used
    :param str variants_dir: directory holding the VCF file(s) to merge
    :param bool scratch: if True the inputs are rsynced to ``$SNIC_TMP``, GATK
        writes there, and the result is rsynced back to ``working_dir/VCF/``
    :returns: path to the sbatch file
    """
    job_name = "CatVariants"
    output_file = "{}_joincalled.g.vcf.gz".format(CONFIG["output_header"])
    sbatch_file = os.path.join(working_dir, "sbatch", "{}.sbatch".format(job_name))
    with open(sbatch_file, "w") as script:
        script.write(slurm_header(CONFIG["uppmax_project"], job_name, working_dir))
        script.write("\n")
        if len(CONFIG["intervals_list"]) == 0:
            # nothing to concatenate: copy the single joint-called file
            source = os.path.join(
                variants_dir, "{}_joincalled.g.vcf.gz".format(CONFIG["output_header"]))
            dest = os.path.join(
                working_dir, "VCF", "{}_joincalled.g.vcf.gz".format(CONFIG["output_header"]))
            script.write("cp {} {}\n".format(source, dest))
        else:
            if scratch:
                # temporary directories on the node-local scratch area
                script.write("mkdir -p $SNIC_TMP/{} \n".format(job_name))
                script.write("mkdir -p $SNIC_TMP/{}/VCF/ \n".format(job_name))

            # intervals carry a number in their name that defines their order
            CONFIG["intervals_list"].sort(key=natural_keys)
            variant_args = []
            for interval in CONFIG["intervals_list"]:
                interval_name = os.path.basename(interval).split(".")[0]
                interval_vcf = os.path.join(
                    variants_dir,
                    "{}_joincalled_{}.g.vcf.gz".format(CONFIG["output_header"], interval_name))
                if scratch:
                    script.write("rsync -rptoDLv {}* $SNIC_TMP/{}/\n".format(interval_vcf, job_name))
                    interval_vcf = "$SNIC_TMP/{}/{}".format(job_name, os.path.basename(interval_vcf))
                variant_args.append("-V {} \\\n".format(interval_vcf))

            pieces = ["java -cp {} org.broadinstitute.gatk.tools.CatVariants \\\n".format(CONFIG["GATK"])]
            for walker_option in CONFIG["walkers"]["CatVariants"]:
                pieces.append("{} \\\n".format(walker_option))
            pieces.append("{} ".format("".join(variant_args)))
            if scratch:
                pieces.append("-out $SNIC_TMP/{}/VCF/{}\n".format(job_name, output_file))
                # copy the result back from scratch once GATK is done
                pieces.append("rsync $SNIC_TMP/{}/VCF/{}* {}/VCF/\n".format(job_name, output_file, working_dir))
            else:
                pieces.append("-out {}/VCF/{}\n\n".format(working_dir, output_file))
            script.write("".join(pieces))
    return sbatch_file
def build_GenotypeGVCFs_sbatch(working_dir, combined_gvcf_files, scratch=False, interval=None):
    """Write the sbatch script that joint-genotypes combined gVCFs with GATK GenotypeGVCFs.

    The suffix used in the job and output names is taken from the name of the
    batch-1 file: whatever follows ``batch1`` up to the first dot (empty when
    the file is ``<header>_batch1.g.vcf.gz``). ``batch1`` must not be followed
    by another digit, so ``batch10``..``batch19`` are never mistaken for batch 1.

    NOTE(review): an earlier definition with the same name exists in this
    file; this later one is the definition in effect after the module is loaded.

    :param str working_dir: directory whose ``sbatch/`` and ``VCF/`` sub-directories are used
    :param list combined_gvcf_files: paths of the per-batch combined gVCF files;
        one of them must be the batch-1 file
    :param bool scratch: if True the inputs are rsynced to ``$SNIC_TMP``, GATK
        writes there, and the result is rsynced back to ``working_dir/VCF/``
    :param str interval: optional interval file, passed to GATK with ``-L``
    :returns: path to the sbatch file
    :raises IndexError: if no batch-1 file is present in ``combined_gvcf_files``
    """
    import re  # local import: the module-level import block is not part of this function

    batch1_tag = re.compile(r"batch1(?!\d)")
    interval_name = None
    for candidate in combined_gvcf_files:
        candidate_name = os.path.basename(candidate)
        tag = batch1_tag.search(candidate_name)
        if tag is not None:
            interval_name = candidate_name[tag.end():].split(".")[0]
            break
    if interval_name is None:
        # IndexError is what the previous list-indexing lookup raised in this situation
        raise IndexError("no batch1 file found in combined_gvcf_files")

    job_name = "GenotypeGVCFs{}".format(interval_name)
    output_file = "{}_joincalled{}.g.vcf.gz".format(CONFIG["output_header"], interval_name)
    sbatch_file = os.path.join(working_dir, "sbatch", "{}.sbatch".format(job_name))
    with open(sbatch_file, "w") as GenotypeGVCFs:
        slurm = slurm_header(CONFIG["uppmax_project"], job_name, working_dir)
        GenotypeGVCFs.write(slurm)
        GenotypeGVCFs.write("\n")
        if scratch:
            # temporary directories on the node-local scratch area
            GenotypeGVCFs.write("mkdir -p $SNIC_TMP/{} \n".format(job_name))
            GenotypeGVCFs.write("mkdir -p $SNIC_TMP/{}/VCF/ \n".format(job_name))

        # one "-V <path>" line per combined gVCF; with scratch the file (and
        # anything sharing its prefix, e.g. its index) is copied first
        combined_gvcf_string_input = ""
        for combined_gvcf in combined_gvcf_files:
            combined_gvcf_path_dir = combined_gvcf
            if scratch:
                GenotypeGVCFs.write("rsync -rptoDLv {}* $SNIC_TMP/{}/\n".format(combined_gvcf, job_name))
                combined_gvcf_name = os.path.basename(combined_gvcf)
                combined_gvcf_path_dir = "$SNIC_TMP/{}/{}".format(job_name, combined_gvcf_name)
            combined_gvcf_string_input += "-V {} \\\n".format(combined_gvcf_path_dir)

        GATK_command = "java -Xmx250g -jar {} -T GenotypeGVCFs \\\n".format(CONFIG["GATK"])
        for option in CONFIG["walkers"]["GenotypeGVCFs"]:
            GATK_command += "{} \\\n".format(option)
        GATK_command += "{} ".format(combined_gvcf_string_input)
        if interval is not None:
            GATK_command += "-L {} \\\n".format(interval)
        if scratch:
            GATK_command += "-o $SNIC_TMP/{}/VCF/{}\n".format(job_name, output_file)
            # copy the result back from scratch once GATK is done
            GATK_command += "rsync $SNIC_TMP/{}/VCF/{}* {}/VCF/\n".format(job_name, output_file, working_dir)
        else:
            GATK_command += "-o {}/VCF/{}\n\n".format(working_dir, output_file)
        GenotypeGVCFs.write(GATK_command)
    return sbatch_file
def _vqsr_option_lines(walker, variant_type):
    """Return the ``CONFIG["walkers"][walker]`` options as command-line continuation lines.

    Plain-string entries are emitted first. Entries that are not strings and
    contain ``variant_type`` as a key then contribute the options stored under
    that key. A warning is printed when no such type-specific entry exists.
    """
    walker_options = CONFIG["walkers"][walker]
    lines = ""
    for option in walker_options:
        if isinstance(option, basestring):
            lines += "{} \\\n".format(option)
    added = False
    for option in walker_options:
        if not isinstance(option, basestring) and variant_type in option:
            added = True
            for specific_option in option[variant_type]:
                lines += "{} \\\n".format(specific_option)
    if not added:
        print("WARNING: I did not inserted any specifc option in VQSR step, there should be either a SNP or an INDEL specific option")
    return lines


def build_VQSR_sbatch(working_dir, variant_raw, scratch=False):
    """Write the sbatch script that runs the whole VQSR procedure in one job.

    Four GATK commands are written, in this order: VariantRecalibrator for
    SNPs, ApplyRecalibration for SNPs, VariantRecalibrator for INDELs (on the
    output of the previous step) and ApplyRecalibration for INDELs.

    Every VariantRecalibrator command is terminated after ``-tranchesFile``.
    A trailing backslash there would make the shell treat the following
    ``rsync`` line (scratch mode) as extra arguments of the java command.

    :param str working_dir: directory whose ``sbatch/`` and ``VCF/`` sub-directories are used
    :param str variant_raw: VCF containing the raw variants
    :param bool scratch: if True the input is rsynced to ``$SNIC_TMP``, GATK
        works there, and every result is rsynced back to ``working_dir/VCF/``
    :returns: path to the sbatch file
    """
    job_name = "VQSR"
    header = CONFIG["output_header"]
    # (variant type, recal file, tranches file, VCF produced by ApplyRecalibration)
    steps = (
        ("SNP",
         "{}_joincalled.snp.recal".format(header),
         "{}_joincalled.snp.tranches".format(header),
         "{}_joincalled.recal_snp_raw_indels.vcf.gz".format(header)),
        ("INDEL",
         "{}_joincalled.indel.recal".format(header),
         "{}_joincalled.indel.tranches".format(header),
         "{}_joincalled.recal_snp_recal_indels.vcf.gz".format(header)),
    )
    # directory in which GATK reads and writes recal/tranches/VCF files
    if scratch:
        vcf_dir = "$SNIC_TMP/{}/VCF".format(job_name)
    else:
        vcf_dir = "{}/VCF".format(working_dir)

    sbatch_file = os.path.join(working_dir, "sbatch", "{}.sbatch".format(job_name))
    with open(sbatch_file, "w") as VQSR:
        slurm = slurm_header(CONFIG["uppmax_project"], job_name, working_dir)
        VQSR.write(slurm)
        VQSR.write("\n")

        GATK_input = "-input {} \\\n".format(variant_raw)
        if scratch:
            # temporary directories on the node-local scratch area
            VQSR.write("mkdir -p $SNIC_TMP/{} \n".format(job_name))
            VQSR.write("mkdir -p $SNIC_TMP/{}/VCF/ \n".format(job_name))
            VQSR.write("rsync -rptoDLv {}* $SNIC_TMP/{}/\n".format(variant_raw, job_name))
            GATK_input = "-input $SNIC_TMP/{}/{} \\\n".format(
                job_name, os.path.basename(variant_raw))

        for variant_type, recal_name, tranches_name, recalibrated_vcf in steps:
            recal_arg = "-recalFile {}/{} \\\n".format(vcf_dir, recal_name)
            tranches_path = "{}/{}".format(vcf_dir, tranches_name)

            # build the recalibration model for this variant type
            GATK_command = "java -Xmx64g -jar {} -T VariantRecalibrator \\\n".format(CONFIG["GATK"])
            GATK_command += _vqsr_option_lines("VariantRecalibrator", variant_type)
            GATK_command += GATK_input
            GATK_command += recal_arg
            # last argument: no line continuation here
            GATK_command += "-tranchesFile {} \n\n".format(tranches_path)
            if scratch:
                GATK_command += "rsync $SNIC_TMP/{}/VCF/{}* {}/VCF/ \n".format(
                    job_name, recal_name, working_dir)
                GATK_command += "rsync $SNIC_TMP/{}/VCF/{}* {}/VCF/ \n".format(
                    job_name, tranches_name, working_dir)
            VQSR.write(GATK_command)
            VQSR.write("\n")

            # apply the model just built (same input as the model step)
            GATK_command = "java -Xmx64g -jar {} -T ApplyRecalibration \\\n".format(CONFIG["GATK"])
            GATK_command += recal_arg
            GATK_command += "-tranchesFile {} \\\n".format(tranches_path)
            GATK_command += GATK_input
            GATK_command += _vqsr_option_lines("ApplyRecalibration", variant_type)
            GATK_command += "-o {}/{} \n\n".format(vcf_dir, recalibrated_vcf)
            if scratch:
                GATK_command += "rsync $SNIC_TMP/{}/VCF/{}* {}/VCF/ \n".format(
                    job_name, recalibrated_vcf, working_dir)
            VQSR.write(GATK_command)
            VQSR.write("\n")

            # the next variant type is recalibrated on top of this step's output
            GATK_input = "-input {}/{} \\\n".format(vcf_dir, recalibrated_vcf)
    return sbatch_file
def build_SelectVariants_sbatch(working_dir, variant_file, scratch=False):
    """Write the sbatch script that splits a VCF into SNPs and INDELs and evaluates both.

    Two GATK SelectVariants commands (``-selectType SNP`` / ``-selectType
    INDEL``) are written, followed by two GATK VariantEval commands that read
    the selected files from ``working_dir/VCF/``.

    NOTE(review): this function is defined a second time later in this file;
    the later definition is the one in effect after the module is loaded.

    :param str working_dir: directory whose ``sbatch/`` and ``VCF/`` sub-directories are used
    :param str variant_file: VCF file to select variants from
    :param bool scratch: if True the input is rsynced to ``$SNIC_TMP``,
        SelectVariants writes there, and the results are rsynced back
    :returns: path to the sbatch file
    """
    job_name = "SelectVariants"
    header = CONFIG["output_header"]
    output_file_snp = "{}_joincalled.snp.g.vcf.gz".format(header)
    output_file_snp_eval = "{}_joincalled.snp.eval".format(header)
    output_file_indel = "{}_joincalled.indel.g.vcf.gz".format(header)
    output_file_indel_eval = "{}_joincalled.indel.eval".format(header)

    sbatch_file = os.path.join(working_dir, "sbatch", "{}.sbatch".format(job_name))
    with open(sbatch_file, "w") as script:
        script.write(slurm_header(CONFIG["uppmax_project"], job_name, working_dir))
        script.write("\n")
        if scratch:
            # temporary directories on scratch, then stage the input there
            script.write("mkdir -p $SNIC_TMP/{} \n".format(job_name))
            script.write("mkdir -p $SNIC_TMP/{}/VCF/ \n".format(job_name))
            script.write("rsync -rptoDLv {}* $SNIC_TMP/{}/\n".format(variant_file, job_name))
            input_arg = "-V $SNIC_TMP/{}/{} \\\n".format(job_name, os.path.basename(variant_file))
        else:
            input_arg = "-V {} \\\n".format(variant_file)

        # part shared by the SNP and the INDEL selection commands
        select_base = "java -Xmx250g -jar {} -T SelectVariants \\\n".format(CONFIG["GATK"])
        for walker_option in CONFIG["walkers"]["SelectVariants"]:
            select_base += "{} \\\n".format(walker_option)
        select_base += input_arg

        select_commands = []
        for select_arg, selected_vcf in (("-selectType SNP \\\n", output_file_snp),
                                         ("-selectType INDEL \\\n", output_file_indel)):
            command = select_base + select_arg
            if scratch:
                command += "-o $SNIC_TMP/{}/VCF/{}\n".format(job_name, selected_vcf)
                command += "rsync $SNIC_TMP/{}/VCF/{}* {}/VCF/\n".format(job_name, selected_vcf, working_dir)
            else:
                command += "-o {}/VCF/{}\n\n".format(working_dir, selected_vcf)
            select_commands.append(command)
        script.write("\n\n".join(select_commands))

        # evaluation of the two selected files
        eval_base = "java -Xmx250g -jar {} -T VariantEval -nt 16 \\\n".format(CONFIG["GATK"])
        for walker_option in CONFIG["walkers"]["VariantEval"]:
            eval_base += "{} \\\n".format(walker_option)
        eval_commands = []
        for selected_vcf, eval_report in ((output_file_snp, output_file_snp_eval),
                                          (output_file_indel, output_file_indel_eval)):
            command = eval_base + "--eval {}/VCF/{} \\\n".format(working_dir, selected_vcf)
            command += "-o {}/VCF/{} \n".format(working_dir, eval_report)
            eval_commands.append(command)
        script.write("\n\n".join(eval_commands))
    return sbatch_file
def build_SelectVariants_sbatch(working_dir, variant_file, scratch=False):
    """Write the sbatch script that splits a VCF into SNPs and INDELs and evaluates both.

    Two GATK SelectVariants commands (``-selectType SNP`` / ``-selectType
    INDEL``) are written, followed by two GATK VariantEval commands that read
    the selected files from ``working_dir/VCF/``.

    NOTE(review): an earlier definition with the same name exists in this
    file; this later one is the definition in effect after the module is loaded.

    :param str working_dir: directory whose ``sbatch/`` and ``VCF/`` sub-directories are used
    :param str variant_file: VCF file to select variants from
    :param bool scratch: if True the input is rsynced to ``$SNIC_TMP``,
        SelectVariants writes there, and the results are rsynced back
    :returns: path to the sbatch file
    """
    job_name = "SelectVariants"
    header = CONFIG["output_header"]
    output_file_snp = "{}_joincalled.snp.g.vcf.gz".format(header)
    output_file_snp_eval = "{}_joincalled.snp.eval".format(header)
    output_file_indel = "{}_joincalled.indel.g.vcf.gz".format(header)
    output_file_indel_eval = "{}_joincalled.indel.eval".format(header)

    sbatch_file = os.path.join(working_dir, "sbatch", "{}.sbatch".format(job_name))
    with open(sbatch_file, "w") as script:
        script.write(slurm_header(CONFIG["uppmax_project"], job_name, working_dir))
        script.write("\n")
        if scratch:
            # temporary directories on scratch, then stage the input there
            script.write("mkdir -p $SNIC_TMP/{} \n".format(job_name))
            script.write("mkdir -p $SNIC_TMP/{}/VCF/ \n".format(job_name))
            script.write("rsync -rptoDLv {}* $SNIC_TMP/{}/\n".format(variant_file, job_name))
            input_arg = "-V $SNIC_TMP/{}/{} \\\n".format(job_name, os.path.basename(variant_file))
        else:
            input_arg = "-V {} \\\n".format(variant_file)

        # part shared by the SNP and the INDEL selection commands
        select_base = "java -Xmx250g -jar {} -T SelectVariants \\\n".format(CONFIG["GATK"])
        for walker_option in CONFIG["walkers"]["SelectVariants"]:
            select_base += "{} \\\n".format(walker_option)
        select_base += input_arg

        select_commands = []
        for select_arg, selected_vcf in (("-selectType SNP \\\n", output_file_snp),
                                         ("-selectType INDEL \\\n", output_file_indel)):
            command = select_base + select_arg
            if scratch:
                command += "-o $SNIC_TMP/{}/VCF/{}\n".format(job_name, selected_vcf)
                command += "rsync $SNIC_TMP/{}/VCF/{}* {}/VCF/\n".format(job_name, selected_vcf, working_dir)
            else:
                command += "-o {}/VCF/{}\n\n".format(working_dir, selected_vcf)
            select_commands.append(command)
        script.write("\n\n".join(select_commands))

        # evaluation of the two selected files
        eval_base = "java -Xmx250g -jar {} -T VariantEval -nt 16 \\\n".format(CONFIG["GATK"])
        for walker_option in CONFIG["walkers"]["VariantEval"]:
            eval_base += "{} \\\n".format(walker_option)
        eval_commands = []
        for selected_vcf, eval_report in ((output_file_snp, output_file_snp_eval),
                                          (output_file_indel, output_file_indel_eval)):
            command = eval_base + "--eval {}/VCF/{} \\\n".format(working_dir, selected_vcf)
            command += "-o {}/VCF/{} \n".format(working_dir, eval_report)
            eval_commands.append(command)
        script.write("\n\n".join(eval_commands))
    return sbatch_file
def build_CombineGVCFs_sbatch(working_dir, batch, current_batch, scratch=False, interval=None):
    """Write the sbatch script that merges one batch of gVCFs with GATK CombineGVCFs.

    NOTE(review): an earlier definition with the same name exists in this
    file; this later one is the definition in effect after the module is loaded.

    :param str working_dir: directory whose ``sbatch/`` and ``VCF/`` sub-directories are used
    :param int batch: batch number, used to name the job and the output file
    :param list current_batch: paths of the gVCF files to be combined
    :param bool scratch: if True the inputs are rsynced to ``$SNIC_TMP``, GATK
        writes there, and the result is rsynced back to ``working_dir/VCF/``
    :param str interval: optional interval file; passed to GATK with ``-L`` and
        its basename (up to the first dot) is appended to the job/output names
    :returns: path to the sbatch file
    """
    header = CONFIG["output_header"]
    if interval is None:
        job_name = "CombineGVCFs_batch{}".format(batch)
        output_file = "{}_batch{}.g.vcf.gz".format(header, batch)
    else:
        # interval name = basename of the interval file up to its first dot
        interval_name = os.path.basename(interval).split(".")[0]
        job_name = "CombineGVCFs_batch{}_{}".format(batch, interval_name)
        output_file = "{}_batch{}_{}.g.vcf.gz".format(header, batch, interval_name)

    sbatch_file = os.path.join(working_dir, "sbatch", "{}.sbatch".format(job_name))
    with open(sbatch_file, "w") as script:
        script.write(slurm_header(CONFIG["uppmax_project"], job_name, working_dir))
        script.write("\n")
        if scratch:
            # temporary directories on the node-local scratch area
            script.write("mkdir -p $SNIC_TMP/{} \n".format(job_name))
            script.write("mkdir -p $SNIC_TMP/{}/VCF/ \n".format(job_name))

        # one "-V <path>" line per sample; with scratch the sample and its
        # .tbi index are copied first and the scratch copy is used
        variant_args = []
        for gvcf in current_batch:
            gvcf_path = gvcf
            if scratch:
                script.write("rsync -rptoDLv {} $SNIC_TMP/{}/\n".format(gvcf, job_name))
                script.write("rsync -rptoDLv {}.tbi $SNIC_TMP/{}/\n".format(gvcf, job_name))
                gvcf_path = "$SNIC_TMP/{}/{}".format(job_name, os.path.basename(gvcf))
            variant_args.append("-V {} \\\n".format(gvcf_path))

        pieces = ["java -Xmx120g -jar {} -T CombineGVCFs \\\n".format(CONFIG["GATK"])]
        for walker_option in CONFIG["walkers"]["CombineGVCFs"]:
            pieces.append("{} \\\n".format(walker_option))
        pieces.append("{} ".format("".join(variant_args)))
        if interval is not None:
            pieces.append("-L {} \\\n".format(interval))
        if scratch:
            pieces.append("-o $SNIC_TMP/{}/VCF/{}\n".format(job_name, output_file))
            # copy the result (and anything sharing its prefix) back from scratch
            pieces.append("rsync $SNIC_TMP/{}/VCF/{}* {}/VCF/\n".format(job_name, output_file, working_dir))
        else:
            pieces.append("-o {}/VCF/{}\n\n".format(working_dir, output_file))
        script.write("".join(pieces))
    return sbatch_file
def build_VariantRecalibrator_sbatch(working_dir, variant_raw, type, scratch=False):
    """Write the sbatch script that runs GATK VariantRecalibrator for one variant type.

    :param str working_dir: directory whose ``sbatch/`` and ``VCF/`` sub-directories are used
    :param str variant_raw: VCF containing the raw variants
    :param str type: ``"SNP"`` selects the SNP file names; any other value
        selects the INDEL file names. It is also the key looked up in the
        type-specific entries of ``CONFIG["walkers"]["VariantRecalibrator"]``
    :param bool scratch: if True the input is rsynced to ``$SNIC_TMP``, GATK
        writes there, and the recal/tranches files are rsynced back
    :returns: path to the sbatch file
    """
    job_name = "VQSR_{}".format(type)
    header = CONFIG["output_header"]
    if type == "SNP":
        recal_name = "{}_joincalled.snp.recal".format(header)
        tranches_name = "{}_joincalled.snp.tranches".format(header)
    else:
        recal_name = "{}_joincalled.indel.recal".format(header)
        tranches_name = "{}_joincalled.indel.tranches".format(header)

    sbatch_file = os.path.join(working_dir, "sbatch", "{}.sbatch".format(job_name))
    with open(sbatch_file, "w") as script:
        script.write(slurm_header(CONFIG["uppmax_project"], job_name, working_dir))
        script.write("\n")
        if scratch:
            # temporary directories on scratch, then stage the input there
            script.write("mkdir -p $SNIC_TMP/{} \n".format(job_name))
            script.write("mkdir -p $SNIC_TMP/{}/VCF/ \n".format(job_name))
            script.write("rsync -rptoDLv {}* $SNIC_TMP/{}/\n".format(variant_raw, job_name))
            input_arg = "-input $SNIC_TMP/{}/{} \\\n".format(job_name, os.path.basename(variant_raw))
        else:
            input_arg = "-input {} \\\n".format(variant_raw)

        pieces = ["java -Xmx64g -jar {} -T VariantRecalibrator \\\n".format(CONFIG["GATK"])]
        # options given as plain strings apply to every variant type
        for walker_option in CONFIG["walkers"]["VariantRecalibrator"]:
            if isinstance(walker_option, basestring):
                pieces.append("{} \\\n".format(walker_option))
        # the other entries hold the options of one variant type under its name
        added = False
        for walker_option in CONFIG["walkers"]["VariantRecalibrator"]:
            if isinstance(walker_option, basestring) or type not in walker_option:
                continue
            added = True
            for typed_option in walker_option[type]:
                pieces.append("{} \\\n".format(typed_option))
        if not added:
            print("WARNING: I did not inserted any specifc option in VQSR step, there should be either a SNP or an INDEL specific option")

        pieces.append(input_arg)
        if scratch:
            pieces.append("-recalFile $SNIC_TMP/{}/VCF/{} \\\n".format(job_name, recal_name))
            pieces.append("-tranchesFile $SNIC_TMP/{}/VCF/{} \n\n".format(job_name, tranches_name))
            # copy both model files back from scratch
            pieces.append("rsync $SNIC_TMP/{}/VCF/{}* {}/VCF/ \n".format(job_name, recal_name, working_dir))
            pieces.append("rsync $SNIC_TMP/{}/VCF/{}* {}/VCF/ \n".format(job_name, tranches_name, working_dir))
        else:
            pieces.append("-recalFile {}/VCF/{} \\\n".format(working_dir, recal_name))
            pieces.append("-tranchesFile {}/VCF/{} \n\n".format(working_dir, tranches_name))
        script.write("".join(pieces))
    return sbatch_file
def build_ApplyRecalibration_sbatch(working_dir, variant_raw, recal, tranches, type, scratch=False):
    """Write the sbatch script that runs GATK ApplyRecalibration for one variant type.

    :param str working_dir: directory whose ``sbatch/`` and ``VCF/`` sub-directories are used
    :param str variant_raw: VCF containing the variants to recalibrate
    :param str recal: recalibration file, passed with ``-recalFile``
    :param str tranches: tranches file, passed with ``-tranchesFile``
    :param str type: ``"SNP"`` selects the SNP output name; any other value
        selects the INDEL output name. It is also the key looked up in the
        type-specific entries of ``CONFIG["walkers"]["ApplyRecalibration"]``
    :param bool scratch: if True the three inputs are rsynced to ``$SNIC_TMP``,
        GATK writes there, and the result is rsynced back
    :returns: path to the sbatch file
    """
    job_name = "ApplyRecalibration_{}".format(type)
    if type == "SNP":
        output_name = "{}_joincalled.snp.recalibrated.filtered.vcf.gz".format(CONFIG["output_header"])
    else:
        output_name = "{}_joincalled.indel.recalibrated.filtered.vcf.gz".format(CONFIG["output_header"])

    sbatch_file = os.path.join(working_dir, "sbatch", "{}.sbatch".format(job_name))
    with open(sbatch_file, "w") as script:
        script.write(slurm_header(CONFIG["uppmax_project"], job_name, working_dir))
        script.write("\n")
        if scratch:
            # temporary directories on the node-local scratch area
            script.write("mkdir -p $SNIC_TMP/{} \n".format(job_name))
            script.write("mkdir -p $SNIC_TMP/{}/VCF/ \n".format(job_name))

        pieces = ["java -Xmx64g -jar {} -T ApplyRecalibration \\\n".format(CONFIG["GATK"])]
        if scratch:
            # stage the three inputs on scratch and point GATK at the copies
            for staged_file in (variant_raw, recal, tranches):
                script.write("rsync -rptoDLv {}* $SNIC_TMP/{}/\n".format(staged_file, job_name))
            pieces.append("-input $SNIC_TMP/{}/{} \\\n".format(job_name, os.path.basename(variant_raw)))
            pieces.append("-recalFile $SNIC_TMP/{}/{} \\\n".format(job_name, os.path.basename(recal)))
            pieces.append("-tranchesFile $SNIC_TMP/{}/{} \\\n".format(job_name, os.path.basename(tranches)))
        else:
            pieces.append("-input {} \\\n".format(variant_raw))
            pieces.append("-recalFile {} \\\n".format(recal))
            pieces.append("-tranchesFile {} \\\n".format(tranches))

        # options given as plain strings apply to every variant type
        for walker_option in CONFIG["walkers"]["ApplyRecalibration"]:
            if isinstance(walker_option, basestring):
                pieces.append("{} \\\n".format(walker_option))
        # the other entries hold the options of one variant type under its name
        added = False
        for walker_option in CONFIG["walkers"]["ApplyRecalibration"]:
            if isinstance(walker_option, basestring) or type not in walker_option:
                continue
            added = True
            for typed_option in walker_option[type]:
                pieces.append("{} \\\n".format(typed_option))
        if not added:
            print("WARNING: I did not inserted any specifc option in VQSR step, there should be either a SNP or an INDEL specific option")

        if scratch:
            pieces.append("-o $SNIC_TMP/{}/VCF/{} \n\n".format(job_name, output_name))
            # copy the result back from scratch once GATK is done
            pieces.append("rsync $SNIC_TMP/{}/VCF/{}* {}/VCF/ \n".format(job_name, output_name, working_dir))
        else:
            pieces.append("-o {}/VCF/{} \n\n".format(working_dir, output_name))
        script.write("".join(pieces))
    return sbatch_file