def merge_snps_and_indels(step, jobs_id=None):
    """Write and submit the sbatch script that merges the SNP and INDEL VCFs.

    The script runs one GATK CombineVariants command over
    CONFIG["popSNPs"] and CONFIG["popINDELs"] (merge option PRIORITIZE,
    priority "snps,indels"), restricted to CONFIG["intervals"], and writes
    ``SGP_joincalled.snp.indels.vcf.gz`` inside the step folder.

    Args:
        step: name of the step; used as the folder created under the current
            working directory, as the sbatch name and as the script file name.
        jobs_id: forwarded unchanged to ``submit_jobs``.

    Returns:
        Whatever ``submit_jobs`` returns, or None when the step folder already
        exists (the step is then assumed to be done and nothing is submitted).
    """
    step_dir = os.path.join(os.getcwd(), step)
    if os.path.isdir(step_dir):
        # Single-argument print(...) behaves the same as the print statement.
        print("WARNING: {} already present, assuming this step has been completed with success.".format(step))
        return
    # create the folder structure
    os.mkdir(step_dir)
    sbatch_command = sbatch_header(sbatch_name=step, cwd=step_dir)
    sbatch_command += "java -Xmx250g -jar {} -T CombineVariants \\\n".format(CONFIG["GATK"])
    sbatch_command += "-R {} \\\n".format(CONFIG["reference"])
    sbatch_command += "-nt 16 \\\n"
    sbatch_command += "--variant:snps {} \\\n".format(CONFIG["popSNPs"])
    sbatch_command += "--variant:indels {} \\\n".format(CONFIG["popINDELs"])
    sbatch_command += "-L {} \\\n".format(CONFIG["intervals"])
    # The output lives in the folder created for this step. It used to be
    # hard-coded to "01_merge_snp_indels", which only exists when the step
    # happens to carry that exact name.
    output = os.path.join(step_dir, "SGP_joincalled.snp.indels.vcf.gz")
    sbatch_command += "-o {} \\\n".format(output)
    sbatch_command += "-genotypeMergeOptions PRIORITIZE \\\n"
    sbatch_command += "-priority snps,indels \\\n"
    sbatch_command += "\n"
    sbatch_file = os.path.join(step_dir, "{}.sbatch".format(step))
    with open(sbatch_file, "w") as MERGE:
        MERGE.write(sbatch_command)
    slurm_jobs_id = submit_jobs([sbatch_file], jobs_id)
    return slurm_jobs_id
def select(step, vcf_in, vcf_out, options, jobs_id=None):
    """Write and submit an sbatch script running GATK SelectVariants.

    Args:
        step: name of the step; used as the folder created under the current
            working directory, as the sbatch name and as the script file name.
        vcf_in: value given to the ``-V`` argument.
        vcf_out: file name of the output VCF, created inside the step folder.
        options: iterable of extra command-line fragments, each appended on
            its own continuation line.
        jobs_id: forwarded unchanged to ``submit_jobs``.

    Returns:
        Whatever ``submit_jobs`` returns, or None when the step folder already
        exists (the step is then assumed to be done and nothing is submitted).
    """
    step_dir = os.path.join(os.getcwd(), step)
    if os.path.isdir(step_dir):
        print("WARNING: {} already present, assuming this step has been completed with success.".format(step))
        return

    # create the folder structure
    os.mkdir(step_dir)

    script = sbatch_header(sbatch_name=step, cwd=step_dir)
    arguments = [
        "java -Xmx250g -jar {} -T SelectVariants \\\n".format(CONFIG["GATK"]),
        "-R {} \\\n".format(CONFIG["reference"]),
        "-nt 16 \\\n",
        "-V {} \\\n".format(vcf_in),
        "-L {} \\\n".format(CONFIG["intervals"]),
        "-o {} \\\n".format(os.path.join(step_dir, vcf_out)),
    ]
    # caller-supplied options, one per continuation line
    arguments.extend("{} \\\n".format(extra) for extra in options)
    arguments.append("\n")
    script += "".join(arguments)

    sbatch_path = os.path.join(step_dir, "{}.sbatch".format(step))
    with open(sbatch_path, "w") as handle:
        handle.write(script)
    return submit_jobs([sbatch_path], jobs_id)
def merge_snps_and_indels(step, jobs_id=None):
    """Write and submit the sbatch script that merges the SNP and INDEL VCFs.

    The script runs one GATK CombineVariants command over
    CONFIG["popSNPs"] and CONFIG["popINDELs"] (merge option PRIORITIZE,
    priority "snps,indels"), restricted to CONFIG["intervals"], and writes
    ``SGP_joincalled.snp.indels.vcf.gz`` inside the step folder.

    Args:
        step: name of the step; used as the folder created under the current
            working directory, as the sbatch name and as the script file name.
        jobs_id: forwarded unchanged to ``submit_jobs``.

    Returns:
        Whatever ``submit_jobs`` returns, or None when the step folder already
        exists (the step is then assumed to be done and nothing is submitted).
    """
    step_dir = os.path.join(os.getcwd(), step)
    if os.path.isdir(step_dir):
        # Single-argument print(...) behaves the same as the print statement.
        print("WARNING: {} already present, assuming this step has been completed with success.".format(step))
        return
    # create the folder structure
    os.mkdir(step_dir)
    sbatch_command = sbatch_header(sbatch_name=step, cwd=step_dir)
    sbatch_command += "java -Xmx250g -jar {} -T CombineVariants \\\n".format(CONFIG["GATK"])
    sbatch_command += "-R {} \\\n".format(CONFIG["reference"])
    sbatch_command += "-nt 16 \\\n"
    sbatch_command += "--variant:snps {} \\\n".format(CONFIG["popSNPs"])
    sbatch_command += "--variant:indels {} \\\n".format(CONFIG["popINDELs"])
    sbatch_command += "-L {} \\\n".format(CONFIG["intervals"])
    # The output lives in the folder created for this step. It used to be
    # hard-coded to "01_merge_snp_indels", which only exists when the step
    # happens to carry that exact name.
    output = os.path.join(step_dir, "SGP_joincalled.snp.indels.vcf.gz")
    sbatch_command += "-o {} \\\n".format(output)
    sbatch_command += "-genotypeMergeOptions PRIORITIZE \\\n"
    sbatch_command += "-priority snps,indels \\\n"
    sbatch_command += "\n"
    sbatch_file = os.path.join(step_dir, "{}.sbatch".format(step))
    with open(sbatch_file, "w") as MERGE:
        MERGE.write(sbatch_command)
    slurm_jobs_id = submit_jobs([sbatch_file], jobs_id)
    return slurm_jobs_id
def runPCA(folder, output, VCF, populations, jobs_id=None):
    """Write and submit the sbatch script that runs the PCA on a VCF.

    The generated script chains three commands:
      1. vcftools with ``--plink-tped`` on ``VCF``;
      2. the program in CONFIG["PLINK"] with ``--pca`` on that file set;
      3. the script in CONFIG["PCA_to_plink"], given the resulting
         ``.eigenvec`` file and ``populations``, with its standard output
         redirected to ``<output>_PCA.pop.eigenvec``.

    Args:
        folder: name of the folder created under the current working
            directory; also used as the sbatch name.
        output: prefix of every file produced inside ``folder``.
        VCF: value given to the ``--gzvcf`` argument of vcftools.
        populations: iterable of values appended after ``--populations``.
        jobs_id: forwarded unchanged to ``submit_jobs``.

    Returns:
        Whatever ``submit_jobs`` returns, or None when ``folder`` already
        exists (the step is then assumed to be done and nothing is submitted).
    """
    output_folder = os.path.join(os.getcwd(), folder)
    if os.path.isdir(output_folder):
        # Single-argument print(...) behaves the same as the print statement.
        print("WARNING: {} already present, assuming this step has been completed with success.".format(folder))
        return
    # create the folder structure
    os.mkdir(output_folder)
    sbatch_command = sbatch_header(sbatch_name=folder, cwd=output_folder)
    # create the tped file set
    sbatch_command += "vcftools --gzvcf {} --plink-tped --out {}/{} \n".format(
        VCF, output_folder, output)
    # run plink on this set
    sbatch_command += "{} -tfile {}/{} --pca --out {}/{}_PCA \n".format(
        CONFIG["PLINK"], output_folder, output, output_folder, output)
    # create the PCA table with the populations
    sbatch_command += "python {} --pca {}/{}_PCA.eigenvec --populations ".format(
        CONFIG["PCA_to_plink"], output_folder, output)
    sbatch_command += "".join(" {} ".format(population) for population in populations)
    sbatch_command += " > {}/{}_PCA.pop.eigenvec \n".format(output_folder, output)
    sbatch_command += "\n"
    sbatch_file = os.path.join(output_folder, "00_runPCA.sbatch")
    with open(sbatch_file, "w") as PCA:
        PCA.write(sbatch_command)
    # Hand the submission result back to the caller, as the other step
    # functions do; it used to be computed and then dropped.
    slurm_jobs_id = submit_jobs([sbatch_file], jobs_id)
    return slurm_jobs_id
def main(args):
    """Drive the joint-calling workflow, one step after the other.

    Each step function builds its sbatch files; unless CONFIG["dry_run"] is
    set the files are submitted through ``submit_jobs``, passing along the
    result of the previous submission.

    Steps: CombineGVCFs (or CombineGVCFs_resume with ``args.resume``),
    GenotypeGVCFs, CatVariants, then either VQSR (``args.mixed_positions``)
    or SelectVariants, VariantRecalibrator and ApplyRecalibration.
    See https://www.broadinstitute.org/gatk/guide/article?id=3893

    Args:
        args: parsed command line; ``configuration``, ``resume`` and
            ``mixed_positions`` are read.
    """
    # The returned object is not used below; every step reads the
    # module-level CONFIG.
    config = conf.load_yaml_config(args.configuration)
    if not check_configuration():
        sys.exit("ERROR: configuration file was malformed, please edit it and retry")

    def submit_unless_dry_run(files, depends_on):
        # In dry-run mode nothing is submitted and the previous id is kept.
        if CONFIG["dry_run"]:
            return depends_on
        return submit_jobs(files, depends_on)

    # 00_samples.txt records what is being analysed; it guards against
    # deleting an analysis by mistake.
    samples_path = "00_samples.txt"
    if args.resume:
        if os.path.exists(samples_path):
            sys.exit("ERROR: -- resume specified, however 00_samples.txt found. Please if you want to resume analysis, remove/move 00_samples.txt, 02_GenotypeGVCF, 03_ ... ")
    else:
        # IMPORTANT: samples_JC contains the samples to be joint-called.
        with open(samples_path, "w") as samples_file:
            for sample in CONFIG["samples_JC"]:
                samples_file.write("{}\n".format(sample))

    # Join batches of samples: when resuming only the last batch (plus any
    # extra one) is recomputed, otherwise everything starts from scratch.
    combine_step = CombineGVCFs_resume if args.resume else CombineGVCFs
    sbatch_files = combine_step()
    slurm_jobs_id = None
    if not CONFIG["dry_run"]:
        slurm_jobs_id = submit_jobs(sbatch_files)

    # genotype the combined gVCFs
    slurm_jobs_id = submit_unless_dry_run(GenotypeGVCFs(), slurm_jobs_id)
    # merge the per-chromosome results into a single one
    slurm_jobs_id = submit_unless_dry_run(CatVariants(), slurm_jobs_id)

    # now perform VQSR
    if args.mixed_positions:
        slurm_jobs_id = submit_unless_dry_run(VQSR(), slurm_jobs_id)
    else:
        # start with variant selection and evaluation ...
        slurm_jobs_id = submit_unless_dry_run(SelectVariants(), slurm_jobs_id)
        # ... then recalibrate ...
        slurm_jobs_id = submit_unless_dry_run(VariantRecalibrator(), slurm_jobs_id)
        # ... and finally apply the recalibration
        slurm_jobs_id = submit_unless_dry_run(ApplyRecalibration(), slurm_jobs_id)
def main(args):
    """Drive the joint-calling workflow, one step after the other.

    Each step function builds its sbatch files; unless CONFIG["dry_run"] is
    set the files are submitted through ``submit_jobs``, passing along the
    result of the previous submission.

    Steps: CombineGVCFs (or CombineGVCFs_resume with ``args.resume``),
    GenotypeGVCFs, CatVariants, then either VQSR (``args.mixed_positions``)
    or SelectVariants, VariantRecalibrator and ApplyRecalibration.
    See https://www.broadinstitute.org/gatk/guide/article?id=3893

    Args:
        args: parsed command line; ``configuration``, ``resume`` and
            ``mixed_positions`` are read.
    """
    # The returned object is not used below; every step reads the
    # module-level CONFIG.
    config = conf.load_yaml_config(args.configuration)
    if not check_configuration():
        sys.exit("ERROR: configuration file was malformed, please edit it and retry")

    def submit_unless_dry_run(files, depends_on):
        # In dry-run mode nothing is submitted and the previous id is kept.
        if CONFIG["dry_run"]:
            return depends_on
        return submit_jobs(files, depends_on)

    # 00_samples.txt records what is being analysed; it guards against
    # deleting an analysis by mistake.
    samples_path = "00_samples.txt"
    if args.resume:
        if os.path.exists(samples_path):
            sys.exit("ERROR: -- resume specified, however 00_samples.txt found. Please if you want to resume analysis, remove/move 00_samples.txt, 02_GenotypeGVCF, 03_ ... ")
    else:
        # IMPORTANT: samples_JC contains the samples to be joint-called.
        with open(samples_path, "w") as samples_file:
            for sample in CONFIG["samples_JC"]:
                samples_file.write("{}\n".format(sample))

    # Join batches of samples: when resuming only the last batch (plus any
    # extra one) is recomputed, otherwise everything starts from scratch.
    combine_step = CombineGVCFs_resume if args.resume else CombineGVCFs
    sbatch_files = combine_step()
    slurm_jobs_id = None
    if not CONFIG["dry_run"]:
        slurm_jobs_id = submit_jobs(sbatch_files)

    # genotype the combined gVCFs
    slurm_jobs_id = submit_unless_dry_run(GenotypeGVCFs(), slurm_jobs_id)
    # merge the per-chromosome results into a single one
    slurm_jobs_id = submit_unless_dry_run(CatVariants(), slurm_jobs_id)

    # now perform VQSR
    if args.mixed_positions:
        slurm_jobs_id = submit_unless_dry_run(VQSR(), slurm_jobs_id)
    else:
        # start with variant selection and evaluation ...
        slurm_jobs_id = submit_unless_dry_run(SelectVariants(), slurm_jobs_id)
        # ... then recalibrate ...
        slurm_jobs_id = submit_unless_dry_run(VariantRecalibrator(), slurm_jobs_id)
        # ... and finally apply the recalibration
        slurm_jobs_id = submit_unless_dry_run(ApplyRecalibration(), slurm_jobs_id)
def merge_with_1KGP(step, vcf_one, vcf_two, jobs_id):
    """Write and submit the sbatch script merging two VCFs and intersecting them.

    The script contains two GATK commands, both restricted to
    CONFIG["intervals"]:
      1. CombineVariants of ``vcf_one`` (tagged SGP) and ``vcf_two``
         (tagged 1KGP) into ``1KGP_SGP.vcf.gz``;
      2. SelectVariants on that file with ``set == "Intersection"``,
         written to ``1KGP_SGP.intersection.vcf.gz``.

    Args:
        step: name of the step; used as the folder created under the current
            working directory, as the sbatch name and as the script file name.
        vcf_one: VCF passed as ``--variant:SGP``.
        vcf_two: VCF passed as ``--variant:1KGP``.
        jobs_id: forwarded unchanged to ``submit_jobs``.

    Returns:
        Whatever ``submit_jobs`` returns, or None when the step folder already
        exists (the step is then assumed to be done and nothing is submitted).
    """
    step_dir = os.path.join(os.getcwd(), step)
    if os.path.isdir(step_dir):
        print("WARNING: {} already present, assuming this step has been completed with success.".format(step))
        return

    # create the folder structure
    os.mkdir(step_dir)
    merged_vcf = os.path.join(step_dir, "1KGP_SGP.vcf.gz")

    script = sbatch_header(sbatch_name=step, cwd=step_dir)
    # first command: merge the two call sets
    combine_lines = [
        "java -Xmx250g -jar {} -T CombineVariants \\\n".format(CONFIG["GATK"]),
        "-R {} \\\n".format(CONFIG["reference"]),
        "-nt 16 \\\n",
        "--variant:SGP {} \\\n".format(vcf_one),
        "--variant:1KGP {} \\\n".format(vcf_two),
        "-L {} \\\n".format(CONFIG["intervals"]),
        "-o {} \n".format(merged_vcf),
        "\n",
    ]
    # second command: keep only the records flagged as Intersection
    intersect_lines = [
        "java -Xmx250g -jar {} -T SelectVariants \\\n".format(CONFIG["GATK"]),
        "-R {} \\\n".format(CONFIG["reference"]),
        "-nt 16 \\\n",
        "-L {} \\\n".format(CONFIG["intervals"]),
        "-V {} \\\n".format(merged_vcf),
        "-select \'set == \"Intersection\"\' \\\n",
        "-o {} \\\n".format(os.path.join(step_dir, "1KGP_SGP.intersection.vcf.gz")),
        "\n",
    ]
    script += "".join(combine_lines + intersect_lines)

    sbatch_path = os.path.join(step_dir, "{}.sbatch".format(step))
    with open(sbatch_path, "w") as handle:
        handle.write(script)
    return submit_jobs([sbatch_path], jobs_id)
def select_EU_samples(step, vcf_all, jobs_id):
    """Write and submit the sbatch script that subsets a VCF by sample file.

    The script runs GATK SelectVariants on ``vcf_all``, restricted to
    CONFIG["intervals"], passing CONFIG["EU_samples"] through ``-sf`` and
    writing ``EU_1KGP_SGP.vcf.gz`` inside the step folder.

    Args:
        step: name of the step; used as the folder created under the current
            working directory, as the sbatch name and as the script file name.
        vcf_all: value given to the ``-V`` argument.
        jobs_id: forwarded unchanged to ``submit_jobs``.

    Returns:
        Whatever ``submit_jobs`` returns, or None when the step folder already
        exists (the step is then assumed to be done and nothing is submitted).
    """
    step_dir = os.path.join(os.getcwd(), step)
    if os.path.isdir(step_dir):
        print("WARNING: {} already present, assuming this step has been completed with success.".format(step))
        return

    # create the folder structure
    os.mkdir(step_dir)

    script = sbatch_header(sbatch_name=step, cwd=step_dir)
    arguments = [
        "java -Xmx250g -jar {} -T SelectVariants \\\n".format(CONFIG["GATK"]),
        "-R {} \\\n".format(CONFIG["reference"]),
        "-nt 16 \\\n",
        "-V {} \\\n".format(vcf_all),
        "-L {} \\\n".format(CONFIG["intervals"]),
        "-o {} \\\n".format(os.path.join(step_dir, "EU_1KGP_SGP.vcf.gz")),
        "-sf {}\\\n".format(CONFIG["EU_samples"]),
        "\n",
    ]
    script += "".join(arguments)

    sbatch_path = os.path.join(step_dir, "{}.sbatch".format(step))
    with open(sbatch_path, "w") as handle:
        handle.write(script)
    return submit_jobs([sbatch_path], jobs_id)
def merge_with_1KGP(step, vcf_one, vcf_two, jobs_id):
    """Write and submit the sbatch script merging two VCFs and intersecting them.

    The script contains two GATK commands, both restricted to
    CONFIG["intervals"]:
      1. CombineVariants of ``vcf_one`` (tagged SGP) and ``vcf_two``
         (tagged 1KGP) into ``1KGP_SGP.vcf.gz``;
      2. SelectVariants on that file with ``set == "Intersection"``,
         written to ``1KGP_SGP.intersection.vcf.gz``.

    Args:
        step: name of the step; used as the folder created under the current
            working directory, as the sbatch name and as the script file name.
        vcf_one: VCF passed as ``--variant:SGP``.
        vcf_two: VCF passed as ``--variant:1KGP``.
        jobs_id: forwarded unchanged to ``submit_jobs``.

    Returns:
        Whatever ``submit_jobs`` returns, or None when the step folder already
        exists (the step is then assumed to be done and nothing is submitted).
    """
    step_dir = os.path.join(os.getcwd(), step)
    if os.path.isdir(step_dir):
        print("WARNING: {} already present, assuming this step has been completed with success.".format(step))
        return

    # create the folder structure
    os.mkdir(step_dir)
    merged_vcf = os.path.join(step_dir, "1KGP_SGP.vcf.gz")

    script = sbatch_header(sbatch_name=step, cwd=step_dir)
    # first command: merge the two call sets
    combine_lines = [
        "java -Xmx250g -jar {} -T CombineVariants \\\n".format(CONFIG["GATK"]),
        "-R {} \\\n".format(CONFIG["reference"]),
        "-nt 16 \\\n",
        "--variant:SGP {} \\\n".format(vcf_one),
        "--variant:1KGP {} \\\n".format(vcf_two),
        "-L {} \\\n".format(CONFIG["intervals"]),
        "-o {} \n".format(merged_vcf),
        "\n",
    ]
    # second command: keep only the records flagged as Intersection
    intersect_lines = [
        "java -Xmx250g -jar {} -T SelectVariants \\\n".format(CONFIG["GATK"]),
        "-R {} \\\n".format(CONFIG["reference"]),
        "-nt 16 \\\n",
        "-L {} \\\n".format(CONFIG["intervals"]),
        "-V {} \\\n".format(merged_vcf),
        "-select \'set == \"Intersection\"\' \\\n",
        "-o {} \\\n".format(os.path.join(step_dir, "1KGP_SGP.intersection.vcf.gz")),
        "\n",
    ]
    script += "".join(combine_lines + intersect_lines)

    sbatch_path = os.path.join(step_dir, "{}.sbatch".format(step))
    with open(sbatch_path, "w") as handle:
        handle.write(script)
    return submit_jobs([sbatch_path], jobs_id)
def runPCA(folder, output, VCF, populations, jobs_id=None):
    """Write and submit the sbatch script that runs the PCA on a VCF.

    The generated script chains three commands:
      1. vcftools with ``--plink-tped`` on ``VCF``;
      2. the program in CONFIG["PLINK"] with ``--pca`` on that file set;
      3. the script in CONFIG["PCA_to_plink"], given the resulting
         ``.eigenvec`` file and ``populations``, with its standard output
         redirected to ``<output>_PCA.pop.eigenvec``.

    Args:
        folder: name of the folder created under the current working
            directory; also used as the sbatch name.
        output: prefix of every file produced inside ``folder``.
        VCF: value given to the ``--gzvcf`` argument of vcftools.
        populations: iterable of values appended after ``--populations``.
        jobs_id: forwarded unchanged to ``submit_jobs``.

    Returns:
        Whatever ``submit_jobs`` returns, or None when ``folder`` already
        exists (the step is then assumed to be done and nothing is submitted).
    """
    output_folder = os.path.join(os.getcwd(), folder)
    if os.path.isdir(output_folder):
        # Single-argument print(...) behaves the same as the print statement.
        print("WARNING: {} already present, assuming this step has been completed with success.".format(folder))
        return
    # create the folder structure
    os.mkdir(output_folder)
    sbatch_command = sbatch_header(sbatch_name=folder, cwd=output_folder)
    # create the tped file set
    sbatch_command += "vcftools --gzvcf {} --plink-tped --out {}/{} \n".format(
        VCF, output_folder, output)
    # run plink on this set
    sbatch_command += "{} -tfile {}/{} --pca --out {}/{}_PCA \n".format(
        CONFIG["PLINK"], output_folder, output, output_folder, output)
    # create the PCA table with the populations
    sbatch_command += "python {} --pca {}/{}_PCA.eigenvec --populations ".format(
        CONFIG["PCA_to_plink"], output_folder, output)
    sbatch_command += "".join(" {} ".format(population) for population in populations)
    sbatch_command += " > {}/{}_PCA.pop.eigenvec \n".format(output_folder, output)
    sbatch_command += "\n"
    sbatch_file = os.path.join(output_folder, "00_runPCA.sbatch")
    with open(sbatch_file, "w") as PCA:
        PCA.write(sbatch_command)
    # Hand the submission result back to the caller, as the other step
    # functions do; it used to be computed and then dropped.
    slurm_jobs_id = submit_jobs([sbatch_file], jobs_id)
    return slurm_jobs_id