Esempio n. 1
0
def build_CombineGVCFs_sbatch(working_dir, batch, current_batch, scratch=False, interval=None):
    """Write the sbatch script that merges one batch of g.vcf samples with GATK CombineGVCFs.

    The script is created as ``<working_dir>/sbatch/<job_name>.sbatch``. When
    ``scratch`` is set, every sample and its ``.tbi`` index is first rsynced
    under ``$SNIC_TMP/<job_name>/``, GATK writes into ``$SNIC_TMP/<job_name>/VCF/``
    and the result is rsynced back to ``<working_dir>/VCF/``.

    :param str working_dir: directory where files will be created
    :param int batch: incremental number identifying the batch being processed
    :param list current_batch: paths of the g.vcf samples to be combined
    :param bool scratch: if True the job works under $SNIC_TMP
    :param string interval: if not None, file with the interval(s) passed to GATK via -L

    :returns: path to the sbatch file
    """
    if interval is None:
        job_name = "CombineGVCFs_batch{}".format(batch)
        output_file = "{}_batch{}.g.vcf.gz".format(CONFIG["output_header"], batch)
    else:
        # the interval file name (up to its first dot) tags both job and output
        interval_name = os.path.basename(interval).split(".")[0]
        job_name = "CombineGVCFs_batch{}_{}".format(batch, interval_name)
        output_file = "{}_batch{}_{}.g.vcf.gz".format(CONFIG["output_header"], batch, interval_name)

    scratch_dir = "$SNIC_TMP/{}".format(job_name)
    sbatch_file = os.path.join(working_dir, "sbatch", "{}.sbatch".format(job_name))
    with open(sbatch_file, "w") as handle:
        handle.write(slurm_header(CONFIG["uppmax_project"], job_name, working_dir))
        handle.write("\n")
        if scratch:
            # temporary directories for the inputs and for the GATK output
            handle.write("mkdir -p {} \n".format(scratch_dir))
            handle.write("mkdir -p {}/VCF/ \n".format(scratch_dir))

        # one "-V <sample>" line per sample; on scratch the sample is staged first
        variant_flags = []
        for gvcf in current_batch:
            gvcf_path = gvcf
            if scratch:
                handle.write("rsync -rptoDLv {} {}/\n".format(gvcf, scratch_dir))
                handle.write("rsync -rptoDLv {}.tbi {}/\n".format(gvcf, scratch_dir))
                gvcf_path = "{}/{}".format(scratch_dir, os.path.basename(gvcf))
            variant_flags.append("-V {} \\\n".format(gvcf_path))

        pieces = ["java -Xmx120g -jar {} -T CombineGVCFs \\\n".format(CONFIG["GATK"])]
        pieces.extend("{} \\\n".format(option) for option in CONFIG["walkers"]["CombineGVCFs"])
        pieces.append("".join(variant_flags) + " ")
        if interval is not None:
            pieces.append("-L {} \\\n".format(interval))
        if scratch:
            pieces.append("-o {}/VCF/{}\n".format(scratch_dir, output_file))
            # copy the result (and anything sharing its prefix) back to working_dir
            pieces.append("rsync {}/VCF/{}* {}/VCF/\n".format(scratch_dir, output_file, working_dir))
        else:
            pieces.append("-o {}/VCF/{}\n\n".format(working_dir, output_file))
        handle.write("".join(pieces))
    return sbatch_file
def build_GenotypeGVCFs_sbatch(working_dir, combined_gvcf_files, scratch=False, interval=None):
    """Builds the sbatch file that jointly genotypes the combined g.vcf files with GATK GenotypeGVCFs.

    Job and output names reuse the suffix found in the batch-1 file name:
    whatever follows ``batch1`` up to the first dot. For example
    ``<header>_batch1_chr1.g.vcf.gz`` gives the job name ``GenotypeGVCFs_chr1``,
    while ``<header>_batch1.g.vcf.gz`` gives plain ``GenotypeGVCFs``.

    :param str working_dir: directory where files will be created
    :param list combined_gvcf_files: paths of the combined g.vcf files to be genotyped together
    :param bool scratch: if True works on scratch
    :param string interval: if not none specifies a file containing the interval(s) to be genotyped

    :returns: path to the sbatch file
    :raises ValueError: if no file name in combined_gvcf_files belongs to batch 1
    """

    #there must be at least one batch: its file name carries the (optional) interval suffix
    interval_name = None
    for combined_gvcf in combined_gvcf_files:
        _, marker, suffix = os.path.basename(combined_gvcf).partition("batch1")
        #a digit right after the marker means batch10, batch11, ... and not batch 1
        if marker and not suffix[:1].isdigit():
            interval_name = suffix.split(".")[0]
            break
    if interval_name is None:
        raise ValueError(
            "no batch1 file found among the combined g.vcf files: {}".format(combined_gvcf_files))
    job_name      = "GenotypeGVCFs{}".format(interval_name)
    output_file   = "{}_joincalled{}.g.vcf.gz".format(CONFIG["output_header"], interval_name)
    #create the sbatch file to genotype the combined files
    sbatch_file = os.path.join(working_dir, "sbatch", "{}.sbatch".format(job_name))
    with open(sbatch_file, "w") as GenotypeGVCFs:
        slurm = slurm_header(CONFIG["uppmax_project"],  job_name, working_dir)
        GenotypeGVCFs.write(slurm)
        GenotypeGVCFs.write("\n")
        if scratch:
            GenotypeGVCFs.write("mkdir -p $SNIC_TMP/{} \n".format(job_name)) # create tmp directory
            GenotypeGVCFs.write("mkdir -p $SNIC_TMP/{}/VCF/ \n".format(job_name)) # create tmp directory for the output
        #now cycle over the combined files, build the GATK command
        combined_gvcf_string_input = ""
        for combined_gvcf in combined_gvcf_files:
            combined_gvcf_path_dir = combined_gvcf
            if scratch:
                #the trailing * also copies the files sharing the prefix (e.g. the index)
                GenotypeGVCFs.write("rsync -rptoDLv {}* $SNIC_TMP/{}/\n".format(combined_gvcf, job_name))
                combined_gvcf_name = os.path.basename(combined_gvcf)
                combined_gvcf_path_dir = "$SNIC_TMP/{}/{}".format(job_name, combined_gvcf_name)
            combined_gvcf_string_input += "-V {} \\\n".format(combined_gvcf_path_dir)

        GATK_command= "java -Xmx250g -jar {} -T GenotypeGVCFs  \\\n".format(CONFIG["GATK"])
        for option in CONFIG["walkers"]["GenotypeGVCFs"]:
            GATK_command += "{} \\\n".format(option)
        GATK_command += "{} ".format(combined_gvcf_string_input)
        if interval is not None:
            GATK_command += "-L {} \\\n".format(interval)

        if scratch:
            GATK_command +=  "-o $SNIC_TMP/{}/VCF/{}\n".format(job_name, output_file)
            #once this is done rsync the result back to working_dir
            GATK_command += "rsync $SNIC_TMP/{}/VCF/{}* {}/VCF/\n".format(job_name, output_file , working_dir)
        else:
            GATK_command += "-o {}/VCF/{}\n\n".format(working_dir, output_file)
        GenotypeGVCFs.write(GATK_command)
    #return path to sbatch file
    return sbatch_file
Esempio n. 3
0
def build_CatVariants_sbatch(working_dir, variants_dir, scratch=False):
    """Write the sbatch script that concatenates the per-interval joint-called files.

    With an empty ``CONFIG["intervals_list"]`` there is a single
    ``<header>_joincalled.g.vcf.gz`` in ``variants_dir`` and the script only
    copies it to ``<working_dir>/VCF/``. Otherwise the intervals are sorted
    in place with ``natural_keys`` and GATK CatVariants is run over the
    matching ``<header>_joincalled_<interval>.g.vcf.gz`` files.

    :param str working_dir: directory where files will be created
    :param str variants_dir: directory holding the vcf files to be merged
    :param bool scratch: if True the job works under $SNIC_TMP

    :returns: path to the sbatch file
    """
    job_name = "CatVariants"
    merged_name = "{}_joincalled.g.vcf.gz".format(CONFIG["output_header"])
    scratch_dir = "$SNIC_TMP/{}".format(job_name)
    sbatch_file = os.path.join(working_dir, "sbatch", "{}.sbatch".format(job_name))
    with open(sbatch_file, "w") as handle:
        handle.write(slurm_header(CONFIG["uppmax_project"], job_name, working_dir))
        handle.write("\n")

        if len(CONFIG["intervals_list"]) == 0:
            # nothing to merge: the single joint-called file is only copied
            handle.write("cp {} {}\n".format(
                os.path.join(variants_dir, merged_name),
                os.path.join(working_dir, "VCF", merged_name)))
            return sbatch_file

        if scratch:
            # temporary directories for the inputs and for the merged output
            handle.write("mkdir -p {} \n".format(scratch_dir))
            handle.write("mkdir -p {}/VCF/ \n".format(scratch_dir))

        # intervals carry a number in their name that fixes the merge order
        CONFIG["intervals_list"].sort(key=natural_keys)
        variant_flags = []
        for interval in CONFIG["intervals_list"]:
            chunk_name = "{}_joincalled_{}.g.vcf.gz".format(
                CONFIG["output_header"], os.path.basename(interval).split(".")[0])
            chunk_path = os.path.join(variants_dir, chunk_name)
            if scratch:
                handle.write("rsync -rptoDLv {}* {}/\n".format(chunk_path, scratch_dir))
                chunk_path = "{}/{}".format(scratch_dir, os.path.basename(chunk_path))
            variant_flags.append("-V {} \\\n".format(chunk_path))

        pieces = ["java -cp {} org.broadinstitute.gatk.tools.CatVariants \\\n".format(CONFIG["GATK"])]
        pieces.extend("{} \\\n".format(option) for option in CONFIG["walkers"]["CatVariants"])
        pieces.append("".join(variant_flags) + " ")
        if scratch:
            pieces.append("-out {}/VCF/{}\n".format(scratch_dir, merged_name))
            # copy the merged file back to working_dir
            pieces.append("rsync {}/VCF/{}* {}/VCF/\n".format(scratch_dir, merged_name, working_dir))
        else:
            pieces.append("-out {}/VCF/{}\n\n".format(working_dir, merged_name))
        handle.write("".join(pieces))
    return sbatch_file
Esempio n. 4
0
def build_GenotypeGVCFs_sbatch(working_dir,
                               combined_gvcf_files,
                               scratch=False,
                               interval=None):
    """Builds the sbatch file that jointly genotypes the combined g.vcf files with GATK GenotypeGVCFs.

    Job and output names reuse the suffix found in the batch-1 file name:
    whatever follows ``batch1`` up to the first dot. For example
    ``<header>_batch1_chr1.g.vcf.gz`` gives the job name ``GenotypeGVCFs_chr1``,
    while ``<header>_batch1.g.vcf.gz`` gives plain ``GenotypeGVCFs``.

    :param str working_dir: directory where files will be created
    :param list combined_gvcf_files: paths of the combined g.vcf files to be genotyped together
    :param bool scratch: if True works on scratch
    :param string interval: if not none specifies a file containing the interval(s) to be genotyped

    :returns: path to the sbatch file
    :raises ValueError: if no file name in combined_gvcf_files belongs to batch 1
    """

    #there must be at least one batch: its file name carries the (optional) interval suffix
    interval_name = None
    for candidate in combined_gvcf_files:
        _, marker, suffix = os.path.basename(candidate).partition("batch1")
        #a digit right after the marker means batch10, batch11, ... and not batch 1
        if marker and not suffix[:1].isdigit():
            interval_name = suffix.split(".")[0]
            break
    if interval_name is None:
        raise ValueError(
            "no batch1 file found among the combined g.vcf files: {}".format(
                combined_gvcf_files))
    job_name = "GenotypeGVCFs{}".format(interval_name)
    output_file = "{}_joincalled{}.g.vcf.gz".format(CONFIG["output_header"],
                                                    interval_name)
    #create the sbatch file to genotype the combined files
    sbatch_file = os.path.join(working_dir, "sbatch",
                               "{}.sbatch".format(job_name))
    with open(sbatch_file, "w") as GenotypeGVCFs:
        slurm = slurm_header(CONFIG["uppmax_project"], job_name, working_dir)
        GenotypeGVCFs.write(slurm)
        GenotypeGVCFs.write("\n")
        if scratch:
            GenotypeGVCFs.write("mkdir -p $SNIC_TMP/{} \n".format(
                job_name))  # create tmp directory
            GenotypeGVCFs.write("mkdir -p $SNIC_TMP/{}/VCF/ \n".format(
                job_name))  # create tmp directory for the output
        #now cycle over the combined files, build the GATK command
        combined_gvcf_string_input = ""
        for combined_gvcf in combined_gvcf_files:
            combined_gvcf_path_dir = combined_gvcf
            if scratch:
                #the trailing * also copies the files sharing the prefix (e.g. the index)
                GenotypeGVCFs.write(
                    "rsync -rptoDLv {}* $SNIC_TMP/{}/\n".format(
                        combined_gvcf, job_name))
                combined_gvcf_name = os.path.basename(combined_gvcf)
                combined_gvcf_path_dir = "$SNIC_TMP/{}/{}".format(
                    job_name, combined_gvcf_name)
            combined_gvcf_string_input += "-V {} \\\n".format(
                combined_gvcf_path_dir)

        GATK_command = "java -Xmx250g -jar {} -T GenotypeGVCFs  \\\n".format(
            CONFIG["GATK"])
        for option in CONFIG["walkers"]["GenotypeGVCFs"]:
            GATK_command += "{} \\\n".format(option)
        GATK_command += "{} ".format(combined_gvcf_string_input)
        if interval is not None:
            GATK_command += "-L {} \\\n".format(interval)

        if scratch:
            GATK_command += "-o $SNIC_TMP/{}/VCF/{}\n".format(
                job_name, output_file)
            #once this is done rsync the result back to working_dir
            GATK_command += "rsync $SNIC_TMP/{}/VCF/{}* {}/VCF/\n".format(
                job_name, output_file, working_dir)
        else:
            GATK_command += "-o {}/VCF/{}\n\n".format(working_dir, output_file)
        GenotypeGVCFs.write(GATK_command)
    #return path to sbatch file
    return sbatch_file
Esempio n. 5
0
def _vqsr_walker_options(walker, variant_type):
    """Return the option lines configured for a GATK walker, ready to append to a command.

    ``CONFIG["walkers"][walker]`` mixes plain strings (options used for every
    variant type) with entries indexed by variant type (e.g. ``entry["SNP"]``
    is a list of options). Generic options come first, then the ones listed
    under ``variant_type``; every line ends with a shell line continuation.
    A warning is printed when no type-specific option is found.

    :param str walker: key of CONFIG["walkers"], e.g. "VariantRecalibrator"
    :param str variant_type: "SNP" or "INDEL"

    :returns: the concatenated option lines
    """
    try:
        string_types = basestring  # Python 2
    except NameError:
        string_types = str  # Python 3 has no basestring
    walker_options = CONFIG["walkers"][walker]
    #add standard options
    lines = ["{} \\\n".format(option) for option in walker_options
             if isinstance(option, string_types)]
    #now add specific options for the variant type
    added = False
    for option in walker_options:
        if not isinstance(option, string_types) and variant_type in option:
            added = True
            lines.extend("{} \\\n".format(specific_option)
                         for specific_option in option[variant_type])
    if not added:
        print("WARNING: I did not inserted any specifc option in VQSR step, there should be either a SNP or an INDEL specific option")
    return "".join(lines)


def build_VQSR_sbatch(working_dir, variant_raw, scratch=False):
    """Builds the sbatch file in order to run VQSR

    The script runs four GATK commands in sequence: VariantRecalibrator and
    ApplyRecalibration for SNPs on ``variant_raw``, then the same two walkers
    for INDELs on the SNP-recalibrated file. When ``scratch`` is set the raw
    file is staged under ``$SNIC_TMP/VQSR/``, every product is written to
    ``$SNIC_TMP/VQSR/VCF/`` and rsynced back to ``<working_dir>/VCF/``.

    Each VariantRecalibrator command ends after ``-tranchesFile``. A line
    continuation there would make the shell pass the following ``rsync``
    lines to GATK as arguments.

    :param str working_dir: directory where files will be created
    :param str variant_raw: vcf containing the raw variants
    :param bool scratch: if True works on scratch

    :returns: path to the sbatch file
    """

    job_name = "VQSR"
    header = CONFIG["output_header"]
    #where the products must end up
    final_vcf_dir = "{}/VCF".format(working_dir)
    #where GATK reads and writes the products while the job runs
    if scratch:
        vcf_dir = "$SNIC_TMP/{}/VCF".format(job_name)
        raw_input = "$SNIC_TMP/{}/{}".format(job_name,
                                             os.path.basename(variant_raw))
    else:
        vcf_dir = final_vcf_dir
        raw_input = variant_raw
    variant_recal_snp_raw_indels = "{}_joincalled.recal_snp_raw_indels.vcf.gz".format(
        header)
    variant_recal_snp_recal_indels = "{}_joincalled.recal_snp_recal_indels.vcf.gz".format(
        header)
    #(variant type, input vcf, recal file, tranches file, recalibrated vcf)
    #the INDEL pass starts from the vcf produced by the SNP pass
    passes = [
        ("SNP", raw_input,
         "{}_joincalled.snp.recal".format(header),
         "{}_joincalled.snp.tranches".format(header),
         variant_recal_snp_raw_indels),
        ("INDEL", "{}/{}".format(vcf_dir, variant_recal_snp_raw_indels),
         "{}_joincalled.indel.recal".format(header),
         "{}_joincalled.indel.tranches".format(header),
         variant_recal_snp_recal_indels),
    ]
    #create the sbatch file running the whole VQSR procedure
    sbatch_file = os.path.join(working_dir, "sbatch",
                               "{}.sbatch".format(job_name))
    with open(sbatch_file, "w") as VQSR:
        slurm = slurm_header(CONFIG["uppmax_project"], job_name, working_dir)
        VQSR.write(slurm)
        VQSR.write("\n")
        if scratch:
            VQSR.write("mkdir -p $SNIC_TMP/{} \n".format(
                job_name))  # create tmp directory
            VQSR.write("mkdir -p $SNIC_TMP/{}/VCF/ \n".format(
                job_name))  # create tmp directory
            #the trailing * also copies the files sharing the prefix (e.g. the index)
            VQSR.write("rsync -rptoDLv {}* $SNIC_TMP/{}/\n".format(
                variant_raw, job_name))

        for variant_type, input_vcf, recal_file, tranches_file, output_vcf in passes:
            GATK_input = "-input {} \\\n".format(input_vcf)
            recal_argument = "-recalFile {}/{} \\\n".format(vcf_dir, recal_file)
            tranches_argument = "-tranchesFile {}/{}".format(
                vcf_dir, tranches_file)
            #######################################
            #### compute recalibration tables  ####
            #######################################
            GATK_command = "java -Xmx64g -jar {} -T VariantRecalibrator  \\\n".format(
                CONFIG["GATK"])
            GATK_command += _vqsr_walker_options("VariantRecalibrator",
                                                 variant_type)
            GATK_command += GATK_input
            GATK_command += recal_argument
            #last argument of the command: no line continuation here
            GATK_command += tranches_argument + " \n\n"
            if scratch:
                for produced in (recal_file, tranches_file):
                    GATK_command += "rsync {}/{}* {}/ \n".format(
                        vcf_dir, produced, final_vcf_dir)
            VQSR.write(GATK_command)
            VQSR.write("\n")
            ###################################
            ##### now apply recalibration  ####
            ###################################
            GATK_command = "java -Xmx64g -jar {} -T ApplyRecalibration  \\\n".format(
                CONFIG["GATK"])
            GATK_command += recal_argument
            GATK_command += tranches_argument + " \\\n"
            #### GATK_input is the same used to build the model
            GATK_command += GATK_input
            GATK_command += _vqsr_walker_options("ApplyRecalibration",
                                                 variant_type)
            GATK_command += "-o {}/{} \n\n".format(vcf_dir, output_vcf)
            if scratch:
                GATK_command += "rsync {}/{}* {}/ \n".format(
                    vcf_dir, output_vcf, final_vcf_dir)
            VQSR.write(GATK_command)
            VQSR.write("\n")

    return sbatch_file
def build_SelectVariants_sbatch(working_dir, variant_file, scratch=False):
    """Write the sbatch script that splits a vcf into SNPs and INDELs and evaluates both.

    GATK SelectVariants is run twice (``-selectType SNP`` and
    ``-selectType INDEL``) on ``variant_file``, then GATK VariantEval is run
    on each of the two files found in ``<working_dir>/VCF/``. When ``scratch``
    is set the input is staged under ``$SNIC_TMP/SelectVariants/`` and the
    selected files are rsynced back to ``<working_dir>/VCF/``.

    :param str working_dir: directory where files will be created
    :param str variant_file: vcf file the SNPs and INDELs are selected from
    :param bool scratch: if True the job works under $SNIC_TMP

    :returns: path to the sbatch file
    """
    job_name = "SelectVariants"
    header = CONFIG["output_header"]
    # (GATK -selectType value, selected vcf name, VariantEval report name)
    targets = (
        ("SNP",
         "{}_joincalled.snp.g.vcf.gz".format(header),
         "{}_joincalled.snp.eval".format(header)),
        ("INDEL",
         "{}_joincalled.indel.g.vcf.gz".format(header),
         "{}_joincalled.indel.eval".format(header)),
    )
    scratch_dir = "$SNIC_TMP/{}".format(job_name)
    sbatch_file = os.path.join(working_dir, "sbatch",
                               "{}.sbatch".format(job_name))
    with open(sbatch_file, "w") as handle:
        handle.write(
            slurm_header(CONFIG["uppmax_project"], job_name, working_dir))
        handle.write("\n")

        source_vcf = variant_file
        if scratch:
            # temporary directories, then stage the input next to them
            handle.write("mkdir -p {} \n".format(scratch_dir))
            handle.write("mkdir -p {}/VCF/ \n".format(scratch_dir))
            handle.write("rsync -rptoDLv {}* {}/\n".format(
                variant_file, scratch_dir))
            source_vcf = "{}/{}".format(scratch_dir,
                                        os.path.basename(variant_file))

        # part of the SelectVariants command shared by SNPs and INDELs
        select_prefix = "java -Xmx250g -jar {} -T SelectVariants  \\\n".format(
            CONFIG["GATK"])
        select_prefix += "".join(
            "{} \\\n".format(option)
            for option in CONFIG["walkers"]["SelectVariants"])
        select_prefix += "-V {} \\\n".format(source_vcf)

        select_commands = []
        for select_type, selected_vcf, _ in targets:
            command = select_prefix + "-selectType {} \\\n".format(select_type)
            if scratch:
                command += "-o {}/VCF/{}\n".format(scratch_dir, selected_vcf)
                command += "rsync {}/VCF/{}* {}/VCF/\n".format(
                    scratch_dir, selected_vcf, working_dir)
            else:
                command += "-o {}/VCF/{}\n\n".format(working_dir, selected_vcf)
            select_commands.append(command)
        handle.write("\n\n".join(select_commands))

        # VariantEval always reads the selected files from working_dir
        eval_prefix = "java -Xmx250g -jar {} -T VariantEval -nt 16 \\\n".format(
            CONFIG["GATK"])
        eval_prefix += "".join(
            "{} \\\n".format(option)
            for option in CONFIG["walkers"]["VariantEval"])
        eval_commands = []
        for _, selected_vcf, report in targets:
            command = eval_prefix + "--eval {}/VCF/{} \\\n".format(
                working_dir, selected_vcf)
            command += "-o {}/VCF/{} \n".format(working_dir, report)
            eval_commands.append(command)
        handle.write("\n\n".join(eval_commands))
    return sbatch_file
def build_SelectVariants_sbatch(working_dir, variant_file, scratch=False):
    """Builds the sbatch file that splits a vcf into SNPs and INDELs and evaluates both.

    GATK SelectVariants is run twice (``-selectType SNP`` and
    ``-selectType INDEL``) on variant_file, then GATK VariantEval is run on
    each of the two files found in ``<working_dir>/VCF/``. When scratch is
    True the input is staged under ``$SNIC_TMP/SelectVariants/`` and the
    selected files are rsynced back to ``<working_dir>/VCF/``.

    :param str working_dir: directory where files will be created
    :param str variant_file: vcf file the SNPs and INDELs are selected from
    :param bool scratch: if True works on scratch

    :returns: path to the sbatch file
    """
    job_name       = "SelectVariants"
    output_file_snp        = "{}_joincalled.snp.g.vcf.gz".format(CONFIG["output_header"])
    output_file_snp_eval   = "{}_joincalled.snp.eval".format(CONFIG["output_header"])
    output_file_indel      = "{}_joincalled.indel.g.vcf.gz".format(CONFIG["output_header"])
    output_file_indel_eval = "{}_joincalled.indel.eval".format(CONFIG["output_header"])
    #create the sbatch file running the selection and the evaluation
    sbatch_file = os.path.join(working_dir, "sbatch", "{}.sbatch".format(job_name))
    with open(sbatch_file, "w") as SelectVariants:
        # indented with spaces only: a tab here is a TabError on Python 3
        slurm = slurm_header(CONFIG["uppmax_project"],  job_name, working_dir)
        SelectVariants.write(slurm)
        SelectVariants.write("\n")

        if scratch:
            SelectVariants.write("mkdir -p $SNIC_TMP/{} \n".format(job_name)) # create tmp directory
            SelectVariants.write("mkdir -p $SNIC_TMP/{}/VCF/ \n".format(job_name)) # create tmp directory

        GATK_input = "-V {} \\\n".format(variant_file)
        if scratch:
            #the trailing * also copies the files sharing the prefix (e.g. the index)
            SelectVariants.write("rsync -rptoDLv {}* $SNIC_TMP/{}/\n".format(variant_file, job_name))
            variant_file_name = os.path.basename(variant_file)
            GATK_input  = "-V $SNIC_TMP/{}/{} \\\n".format(job_name, variant_file_name)

        GATK_command  = "java -Xmx250g -jar {} -T SelectVariants  \\\n".format(CONFIG["GATK"])
        for option in CONFIG["walkers"]["SelectVariants"]:
            GATK_command += "{} \\\n".format(option)
        GATK_command += GATK_input
        #create command for SNPs
        GATK_command_snp = GATK_command
        GATK_command_snp += "-selectType SNP \\\n"
        #create command for indels
        GATK_command_indel = GATK_command
        GATK_command_indel += "-selectType INDEL \\\n"
        if scratch:
            GATK_command_snp +=  "-o $SNIC_TMP/{}/VCF/{}\n".format(job_name, output_file_snp)
            GATK_command_snp += "rsync $SNIC_TMP/{}/VCF/{}* {}/VCF/\n".format(job_name, output_file_snp , working_dir)
            GATK_command_indel +=  "-o $SNIC_TMP/{}/VCF/{}\n".format(job_name, output_file_indel)
            GATK_command_indel += "rsync $SNIC_TMP/{}/VCF/{}* {}/VCF/\n".format(job_name, output_file_indel , working_dir)
        else:
            GATK_command_snp   += "-o {}/VCF/{}\n\n".format(working_dir, output_file_snp)
            GATK_command_indel += "-o {}/VCF/{}\n\n".format(working_dir, output_file_indel)
        SelectVariants.write(GATK_command_snp)
        SelectVariants.write("\n\n")
        SelectVariants.write(GATK_command_indel)
        #now we can run EVAL: it always reads the selected files from working_dir
        GATK_command = "java -Xmx250g -jar {} -T VariantEval -nt 16 \\\n".format(CONFIG["GATK"])
        for option in CONFIG["walkers"]["VariantEval"]:
            GATK_command += "{} \\\n".format(option)
        GATK_command_snp = GATK_command + "--eval {}/VCF/{} \\\n".format(working_dir, output_file_snp)
        GATK_command_snp += "-o {}/VCF/{} \n".format(working_dir, output_file_snp_eval)
        GATK_command_indel = GATK_command + "--eval {}/VCF/{} \\\n".format(working_dir, output_file_indel)
        GATK_command_indel += "-o {}/VCF/{} \n".format(working_dir, output_file_indel_eval)
        SelectVariants.write(GATK_command_snp)
        SelectVariants.write("\n\n")
        SelectVariants.write(GATK_command_indel)
    #return path to sbatch file
    return sbatch_file
def build_CombineGVCFs_sbatch(working_dir,
                              batch,
                              current_batch,
                              scratch=False,
                              interval=None):
    """Write the sbatch script that merges one batch of g.vcf files with GATK CombineGVCFs.

    :param str working_dir: root directory; the script is written under ``sbatch/`` and the output under ``VCF/``
    :param int batch: incremental number identifying the batch being processed
    :param list current_batch: paths of the g.vcf files to be combined
    :param bool scratch: if True, inputs are copied to ``$SNIC_TMP`` and the result is rsynced back
    :param string interval: if not None, a file with the interval(s) to combine (passed as ``-L``)

    :returns: path to the sbatch file
    """
    # the interval (when given) is appended to both the job name and the output name
    if interval is None:
        batch_tag = "batch{}".format(batch)
    else:
        batch_tag = "batch{}_{}".format(
            batch, os.path.basename(interval).split(".")[0])
    job_name = "CombineGVCFs_" + batch_tag
    output_file = "{}_{}.g.vcf.gz".format(CONFIG["output_header"], batch_tag)

    sbatch_file = os.path.join(working_dir, "sbatch", job_name + ".sbatch")
    with open(sbatch_file, "w") as script:
        script.write(
            slurm_header(CONFIG["uppmax_project"], job_name, working_dir))
        script.write("\n")
        if scratch:
            # temporary directories on the node-local scratch area
            script.write("mkdir -p $SNIC_TMP/{} \n".format(job_name))
            script.write("mkdir -p $SNIC_TMP/{}/VCF/ \n".format(job_name))

        # one "-V" argument per sample; on scratch each sample (and its .tbi)
        # is staged first and the argument points at the staged copy
        variant_args = []
        for gvcf in current_batch:
            if scratch:
                script.write("rsync -rptoDLv {} $SNIC_TMP/{}/\n".format(
                    gvcf, job_name))
                script.write("rsync -rptoDLv {}.tbi $SNIC_TMP/{}/\n".format(
                    gvcf, job_name))
                gvcf_location = "$SNIC_TMP/{}/{}".format(
                    job_name, os.path.basename(gvcf))
            else:
                gvcf_location = gvcf
            variant_args.append("-V {} \\\n".format(gvcf_location))

        command_parts = [
            "java -Xmx120g -jar {} -T CombineGVCFs \\\n".format(CONFIG["GATK"])
        ]
        command_parts.extend(
            "{} \\\n".format(walker_option)
            for walker_option in CONFIG["walkers"]["CombineGVCFs"])
        command_parts.append("{} ".format("".join(variant_args)))
        if interval is not None:
            command_parts.append("-L {} \\\n".format(interval))
        if scratch:
            command_parts.append("-o $SNIC_TMP/{}/VCF/{}\n".format(
                job_name, output_file))
            # copy the result (and its index) back to the working directory
            command_parts.append("rsync $SNIC_TMP/{}/VCF/{}* {}/VCF/\n".format(
                job_name, output_file, working_dir))
        else:
            command_parts.append("-o {}/VCF/{}\n\n".format(
                working_dir, output_file))
        script.write("".join(command_parts))
    return sbatch_file
def build_VariantRecalibrator_sbatch(working_dir,  variant_raw, type, scratch=False):
    """Write the sbatch script that runs the GATK VariantRecalibrator step of VQSR.

    :param str working_dir: root directory; the script is written under ``sbatch/`` and outputs under ``VCF/``
    :param str variant_raw: vcf containing the raw variants
    :param str type: SNP or INDEL; selects the type-specific options in CONFIG and the output file names
    :param bool scratch: if True, the input is staged on ``$SNIC_TMP`` and the outputs are rsynced back

    :returns: path to the sbatch file
    """
    job_name = "VQSR_{}".format(type)
    # any value other than "SNP" ends up with the indel file names
    if type != "SNP":
        recal_file = "{}_joincalled.indel.recal".format(CONFIG["output_header"])
        tranches_file = "{}_joincalled.indel.tranches".format(CONFIG["output_header"])
    else:
        recal_file = "{}_joincalled.snp.recal".format(CONFIG["output_header"])
        tranches_file = "{}_joincalled.snp.tranches".format(CONFIG["output_header"])

    sbatch_file = os.path.join(working_dir, "sbatch", job_name + ".sbatch")
    with open(sbatch_file, "w") as script:
        script.write(slurm_header(CONFIG["uppmax_project"], job_name, working_dir))
        script.write("\n")

        if scratch:
            # create the temporary directories and stage the input (plus index)
            script.write("mkdir -p $SNIC_TMP/{} \n".format(job_name))
            script.write("mkdir -p $SNIC_TMP/{}/VCF/ \n".format(job_name))
            script.write("rsync -rptoDLv {}* $SNIC_TMP/{}/\n".format(variant_raw, job_name))
            input_arg = "-input $SNIC_TMP/{}/{} \\\n".format(
                job_name, os.path.basename(variant_raw))
        else:
            input_arg = "-input {} \\\n".format(variant_raw)

        pieces = ["java -Xmx64g -jar {} -T VariantRecalibrator  \\\n".format(CONFIG["GATK"])]
        walker_options = CONFIG["walkers"]["VariantRecalibrator"]
        # plain-string entries are the options shared by SNP and INDEL runs
        pieces.extend("{} \\\n".format(entry)
                      for entry in walker_options
                      if isinstance(entry, basestring))
        # non-string entries carry the options specific to one variant type
        found_specific = False
        for entry in walker_options:
            if isinstance(entry, basestring) or type not in entry:
                continue
            found_specific = True
            pieces.extend("{} \\\n".format(item) for item in entry[type])
        if not found_specific:
            print("WARNING: I did not inserted any specifc option in VQSR step, there should be either a SNP or an INDEL specific option")

        pieces.append(input_arg)
        if scratch:
            pieces.append("-recalFile $SNIC_TMP/{}/VCF/{} \\\n".format(job_name, recal_file))
            pieces.append("-tranchesFile $SNIC_TMP/{}/VCF/{} \n\n".format(job_name, tranches_file))
            # bring both result files back to the working directory
            pieces.append("rsync $SNIC_TMP/{}/VCF/{}* {}/VCF/ \n".format(job_name, recal_file, working_dir))
            pieces.append("rsync $SNIC_TMP/{}/VCF/{}* {}/VCF/ \n".format(job_name, tranches_file, working_dir))
        else:
            pieces.append("-recalFile {}/VCF/{} \\\n".format(working_dir, recal_file))
            pieces.append("-tranchesFile {}/VCF/{} \n\n".format(working_dir, tranches_file))

        script.write("".join(pieces))
    return sbatch_file
def build_ApplyRecalibration_sbatch(working_dir, variant_raw,  recal, tranches, type, scratch=False):
    """Builds the sbatch file that runs GATK ApplyRecalibration on the raw variants.

    :param str working_dir: directory where files will be created (script under ``sbatch/``, output under ``VCF/``)
    :param str variant_raw: vcf containing the raw variants
    :param str recal: recalibration file, passed to GATK as ``-recalFile``
    :param str tranches: tranches file, passed to GATK as ``-tranchesFile``
    :param str type: can be SNP or INDEL and specifies which options need to be used
                     (any value other than "SNP" gets the indel output name)
    :param bool scratch: if True works on scratch

    :returns: path to the sbatch file
    """
    job_name = "ApplyRecalibration_{}".format(type)
    if type == "SNP":
        output_file_name = "{}_joincalled.snp.recalibrated.filtered.vcf.gz".format(CONFIG["output_header"])
    else:
        output_file_name = "{}_joincalled.indel.recalibrated.filtered.vcf.gz".format(CONFIG["output_header"])

    sbatch_file = os.path.join(working_dir, "sbatch", "{}.sbatch".format(job_name))
    with open(sbatch_file, "w") as ApplyRecalibration:
        slurm = slurm_header(CONFIG["uppmax_project"], job_name, working_dir)
        ApplyRecalibration.write(slurm)
        ApplyRecalibration.write("\n")

        if scratch:
            # create the temporary directories on the scratch area
            ApplyRecalibration.write("mkdir -p $SNIC_TMP/{} \n".format(job_name))
            ApplyRecalibration.write("mkdir -p $SNIC_TMP/{}/VCF/ \n".format(job_name))

        GATK_command = "java -Xmx64g -jar {} -T ApplyRecalibration  \\\n".format(CONFIG["GATK"])
        if scratch:
            # stage the three inputs (the trailing * also picks up index files)
            # and point GATK at the staged copies
            for staged in (variant_raw, recal, tranches):
                ApplyRecalibration.write("rsync -rptoDLv {}* $SNIC_TMP/{}/\n".format(staged, job_name))
            GATK_input = "-input $SNIC_TMP/{}/{} \\\n".format(job_name, os.path.basename(variant_raw))
            GATK_recal = "-recalFile $SNIC_TMP/{}/{} \\\n".format(job_name, os.path.basename(recal))
            GATK_tranches = "-tranchesFile $SNIC_TMP/{}/{} \\\n".format(job_name, os.path.basename(tranches))
        else:
            GATK_input = "-input {} \\\n".format(variant_raw)
            GATK_recal = "-recalFile {} \\\n".format(recal)
            GATK_tranches = "-tranchesFile {} \\\n".format(tranches)
        GATK_command += GATK_input
        GATK_command += GATK_recal
        GATK_command += GATK_tranches

        walker_options = CONFIG["walkers"]["ApplyRecalibration"]
        # add standard options: plain-string entries apply to every variant type
        # (basestring: this module is written for Python 2)
        for option in walker_options:
            if isinstance(option, basestring):
                GATK_command += "{} \\\n".format(option)
        # now add the options specific to this type: non-string entries are
        # looked up by the variant type
        added = False
        for option in walker_options:
            if not isinstance(option, basestring) and type in option:
                added = True
                for specific_option in option[type]:
                    GATK_command += "{} \\\n".format(specific_option)
        if not added:
            # parenthesised single-argument form prints the same text on Python 2
            print("WARNING: I did not inserted any specifc option in VQSR step, there should be either a SNP or an INDEL specific option")

        if scratch:
            GATK_command += "-o $SNIC_TMP/{}/VCF/{} \n\n".format(job_name, output_file_name)
            # copy the result (and its index) back to the working directory
            GATK_command += "rsync $SNIC_TMP/{}/VCF/{}* {}/VCF/ \n".format(job_name, output_file_name, working_dir)
        else:
            GATK_command += "-o {}/VCF/{} \n\n".format(working_dir, output_file_name)

        ApplyRecalibration.write(GATK_command)
    # return path to sbatch file
    return sbatch_file