def _gvcf_variant_args(directory, skip_path=None):
    """Return ``["--variant", path, ...]`` for every ``*.g.vcf`` found under *directory*.

    skip_path, when given, is a file that must never be listed (compared by
    absolute path).
    """
    skip = os.path.abspath(skip_path) if skip_path else None
    variant_args = []
    for root, _, files in os.walk(directory):
        for name in files:
            filename = os.path.join(root, name)
            if not filename.endswith(".g.vcf"):
                continue
            if skip is not None and os.path.abspath(filename) == skip:
                continue
            variant_args.extend(["--variant", filename])
    return variant_args


def combine_gvcf(args, recalibrate=False, all_gvcf=False):
    """
    Merge every per-sample gVCF of the group into one cohort gVCF.

    https://software.broadinstitute.org/gatk/documentation/tooldocs/4.1.2.0/org_broadinstitute_hellbender_tools_walkers_CombineGVCFs.php
    #combined multi-sample gVCF:
    gatk CombineGVCFs -R reference.fasta --variant sample1.g.vcf.gz --variant sample2.g.vcf.gz -O cohort.g.vcf.gz

    args: namespace providing ``output``, ``reference`` and ``memory``.
    recalibrate: read from / write to the "GVCF_recal" folder instead of "GVCF".
    all_gvcf: optional path to an extra folder whose ``*.g.vcf`` files are
        added to the cohort; ``False`` disables it.
    """
    output = os.path.abspath(args.output)
    input_reference = os.path.abspath(args.reference)

    group_name = os.path.basename(output)  # group_name

    if recalibrate:
        gvcf_input_dir = obtain_output_dir(args, "GVCF_recal")
    else:
        gvcf_input_dir = obtain_output_dir(args, "GVCF")

    gvcf_output_file = group_name + ".cohort.g.vcf"
    gvcf_output_full = os.path.join(gvcf_input_dir, gvcf_output_file)

    check_create_dir(gvcf_input_dir)

    memory_param = "-Xmx" + str(args.memory) + "g"

    cmd = [
        "gatk", "CombineGVCFs",
        "--java-options", memory_param,
        "--reference", input_reference,
        "--output", gvcf_output_full,
    ]

    # The cohort file lives in the folder being scanned and also ends in
    # ".g.vcf": without the skip, a re-run would feed the previous output
    # back in as an input while overwriting it.
    cmd.extend(_gvcf_variant_args(gvcf_input_dir, skip_path=gvcf_output_full))

    if all_gvcf:
        if os.path.isdir(all_gvcf):
            all_gvcf = os.path.abspath(all_gvcf)
            print("Using gvcf from enricment folder:" + all_gvcf)
            cmd.extend(_gvcf_variant_args(all_gvcf, skip_path=gvcf_output_full))
        else:
            print("GVCF enrichment folder does not exist")

    execute_subprocess(cmd)
def bbduk_trimming(args):
    """
    Run bbduk.sh on the R1/R2 pair and write the cleaned reads plus a stats
    file into the "Trimmed" output folder.

    TODO : handle params
    """
    fastq_r1 = os.path.abspath(args.r1_file)
    fastq_r2 = os.path.abspath(args.r2_file)

    trimmed_dir = obtain_output_dir(args, "Trimmed")
    sample = extract_sample(fastq_r1, fastq_r2)

    # All three outputs share the same "<folder>/<sample>" prefix
    out_prefix = trimmed_dir + "/" + sample

    adapters_flag = "ref=" + get_bbduk_adapters()
    heap_flag = "-Xmx" + str(args.memory) + "g"
    threads_flag = "threads=" + str(args.threads)

    check_create_dir(trimmed_dir)

    # bbduk.sh
    trim_cmd = ["bbduk.sh", heap_flag]
    trim_cmd += ["in1=" + fastq_r1, "in2=" + fastq_r2]
    trim_cmd += [
        "out1=" + out_prefix + "_R1.clean.fastq.gz",
        "out2=" + out_prefix + "_R2.clean.fastq.gz",
    ]
    trim_cmd += [
        adapters_flag,
        "trimq=15",
        "qtrim=rl",
        "minlen=40",
        "ktrim=r",
        "k=21",
        "mink=11",
        "hammingdistance=2",
        threads_flag,
        "tpe",
        "tbo",
        "stats=" + out_prefix + "_trim.stats",
    ]

    execute_subprocess(trim_cmd)
def bwa_mapping(args):
    """
    Index the reference with ``bwa index`` and map the read pair with
    ``bwa mem``, writing ``<sample>.sam`` into the "Bam" output folder.

    #Store output in a file when it is outputted in stdout
    https://stackoverflow.com/questions/4965159/how-to-redirect-output-with-subprocess-in-python
    """
    fastq_r1 = os.path.abspath(args.r1_file)
    fastq_r2 = os.path.abspath(args.r2_file)
    ref_fasta = os.path.abspath(args.reference)

    sample = extract_sample(fastq_r1, fastq_r2)
    bam_dir = obtain_output_dir(args, "Bam")
    sam_path = os.path.join(bam_dir, sample + ".sam")

    check_create_dir(bam_dir)

    # The index is (re)built on every call, before mapping
    execute_subprocess(["bwa", "index", ref_fasta])

    # bwa mem writes the alignment itself through -o
    mem_cmd = ["bwa", "mem", "-t", str(args.threads), "-o", sam_path]
    mem_cmd += [ref_fasta, fastq_r1, fastq_r2]
    execute_subprocess(mem_cmd)
def bowtie2_mapping(args):
    """
    Index the reference with ``bowtie2-build`` and map the read pair with
    ``bowtie2``, writing ``<sample>.sam`` into the "Bam" output folder.

    args: namespace providing ``r1_file``, ``r2_file``, ``reference``,
        ``threads`` and ``extensive_mapping`` (when truthy, ``-a`` is passed
        to bowtie2).
    """
    r1 = os.path.abspath(args.r1_file)
    r2 = os.path.abspath(args.r2_file)
    reference = os.path.abspath(args.reference)

    sample = extract_sample(r1, r2)
    output_dir = obtain_output_dir(args, "Bam")
    sample_name = sample + ".sam"
    output_file = os.path.join(output_dir, sample_name)

    check_create_dir(output_dir)

    # bowtie2 index (the reference path doubles as the index basename)
    cmd_index = ["bowtie2-build", reference, reference]
    execute_subprocess(cmd_index)

    # bowtie map
    cmd_map = [
        "bowtie2",
        "-1", r1,
        "-2", r2,
        "-S", output_file,
        "-q",
        "--very-sensitive-local",
        "-p", str(args.threads),
        "-x", reference,
    ]
    # Append the flag only when requested: an empty string in an argv list
    # is still passed to the program as a (bogus) extra argument.
    if args.extensive_mapping:
        cmd_map.append("-a")

    execute_subprocess(cmd_map)
def call_variants(args, recalibrate=False, group=True):
    """
    Genotype a gVCF (cohort or single sample) into a raw VCF with GATK.

    https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_GenotypeGVCFs.php
    #Call variants:
    gatk --java-options "-Xmx4g" GenotypeGVCFs -R Homo_sapiens_assembly38.fasta -V input.g.vcf.gz -O output.vcf.gz

    Note: ``args.sample`` is set to "nosample" in place when it is empty.
    """
    cohort_root = os.path.abspath(args.output)
    reference_fasta = os.path.abspath(args.reference)

    if not args.sample:
        args.sample = "nosample"

    # The "_recal" folders hold the files of the recalibration pass
    folder_suffix = "_recal" if recalibrate else ""
    gvcf_dir = obtain_output_dir(args, "GVCF" + folder_suffix)
    vcf_dir = obtain_output_dir(args, "VCF" + folder_suffix)

    # Input and output share one stem: "<group>.cohort" or "<sample>"
    if group:
        stem = cohort_root.rsplit("/", 1)[-1] + ".cohort"
    else:
        stem = args.sample

    gvcf_in = os.path.join(gvcf_dir, stem + ".g.vcf")
    vcf_out = os.path.join(vcf_dir, stem + ".raw.vcf")

    for folder in (gvcf_dir, vcf_dir):
        check_create_dir(folder)

    genotype_cmd = ["gatk", "GenotypeGVCFs"]
    genotype_cmd += ["--java-options", "-Xmx" + str(args.memory) + "g"]
    genotype_cmd += ["--reference", reference_fasta]
    genotype_cmd += ["--variant", gvcf_in]
    genotype_cmd += ["--output", vcf_out]

    execute_subprocess(genotype_cmd)
def sam_to_index_bam(args):
    """
    Convert ``<sample>.sam`` in the "Bam" folder to BAM, hand the BAM to
    ``add_SG`` to produce ``<sample>.rg.sorted.bam``, and delete the
    intermediate SAM and unsorted BAM files.
    """
    fastq_r1 = os.path.abspath(args.r1_file)
    fastq_r2 = os.path.abspath(args.r2_file)

    sample = extract_sample(fastq_r1, fastq_r2)
    bam_dir = obtain_output_dir(args, "Bam")
    sam_path = os.path.join(bam_dir, sample + ".sam")

    # File name without its last extension
    stem = os.path.basename(sam_path).rsplit(".", 1)[0]

    unsorted_bam = os.path.join(bam_dir, stem + ".bam")
    rg_sorted_bam = os.path.join(bam_dir, stem + ".rg.sorted.bam")

    check_create_dir(bam_dir)

    # sam to bam: samtools view -Sb $input_file -o $output_dir/$sample.bam
    # (samtools writes the file itself through -o; no stdout redirection)
    view_cmd = ["samtools", "view", "-Sb", sam_path]
    view_cmd += ["-o", unsorted_bam]
    view_cmd += ["--threads", str(args.threads)]
    execute_subprocess(view_cmd)

    check_remove_file(sam_path)

    # presumably adds the read group and sorts, judging by the output name
    add_SG(args, unsorted_bam, rg_sorted_bam)

    check_remove_file(unsorted_bam)
def picard_markdup(args):
    """
    Mark duplicates with Picard, then sort and index the result with samtools.

    #java -jar picard.jar MarkDuplicates \\
    # I=input.bam O=marked_duplicates.bam M=marked_dup_metrics.txt

    Outputs are written next to ``args.input_bam`` as
    ``<name>.rg.markdup.sorted.bam`` (plus its index); metrics go to the
    "Stats" folder. The input BAM and the unsorted markdup BAM are removed
    at the end.
    """
    picard_jar = get_picard_path()

    input_bam = os.path.abspath(args.input_bam)
    in_param = "I=" + input_bam

    # Take the sample name from the file name only: splitting the whole
    # absolute path at the first "." breaks when a directory contains a dot.
    input_dir, input_base = os.path.split(input_bam)
    file_name = input_base.split(".")[0]
    path_file_name = os.path.join(input_dir, file_name)

    output_markdup = path_file_name + ".rg.markdup.bam"
    output_markdup_sorted = path_file_name + ".rg.markdup.sorted.bam"
    out_param = "O=" + output_markdup

    stat_output_dir = obtain_output_dir(args, "Stats")
    stat_output_file = file_name + ".markdup.metrics.txt"
    stat_output_full = os.path.join(stat_output_dir, stat_output_file)
    stats_param = "M=" + stat_output_full

    check_create_dir(stat_output_dir)

    cmd_markdup = [
        "java", "-jar", picard_jar, "MarkDuplicates",
        in_param, out_param, stats_param,
    ]
    execute_subprocess(cmd_markdup)

    # samtools sort: samtools sort $output_dir/$sample".sorted.bam" -o $output_dir/$sample".sorted.bam"
    cmd_sort = ["samtools", "sort", output_markdup, "-o", output_markdup_sorted]
    execute_subprocess(cmd_sort)

    # samtools index: samtools index $output_dir/$sample".sorted.bam"
    # check=True raises CalledProcessError if indexing fails
    subprocess.run(
        ["samtools", "index", output_markdup_sorted],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        check=True,
    )

    check_remove_file(input_bam)
    check_remove_file(output_markdup)
def mash_screen(
        args,
        winner=True,
        r2=False,
        mash_database="/home/laura/DATABASES/Mash/refseq.genomes.k21s1000.msh"):
    """
    Screen the reads against a Mash sketch database and save the table as
    ``<sample>.screen.tab`` in the "Species" output folder.

    https://mash.readthedocs.io/en/latest/index.html
    https://gembox.cbcb.umd.edu/mash/refseq.genomes.k21s1000.msh #MASH refseq database
    mash screen -w -p 4 ../refseq.genomes.k21s1000.msh 4_R1.fastq.gz 4_R2.fastq.gz > 4.winner.screen.tab
    Output columns: identity, shared-hashes, median-multiplicity, p-value,
    query-ID, query-comment

    winner: pass ``-w`` to mash screen.
    r2: also screen the R2 file (R1 alone is faster).
    mash_database: path to the ``.msh`` database; the process exits with
        status 1 if it does not exist.

    A non-zero exit code from mash is reported on stdout but does not raise.
    """
    if not os.path.isfile(mash_database):
        print(RED + BOLD + "Mash database can't be found\n" + END_FORMATTING +
              "You can download it typing:\n"
              "wget https://gembox.cbcb.umd.edu/mash/refseq.genomes.k21s1000.msh")
        sys.exit(1)

    threads = args.threads

    # Keep the boolean flag separate from the R2 path: reusing the name
    # "r2" for the path made the flag test always false.
    include_r2 = r2
    r1_path = os.path.abspath(args.r1_file)
    r2_path = os.path.abspath(args.r2_file)

    sample = extract_sample(r1_path, r2_path)

    species_output_dir = obtain_output_dir(args, "Species")
    check_create_dir(species_output_dir)
    species_output_name = sample + ".screen.tab"
    species_output_file = os.path.join(species_output_dir, species_output_name)

    cmd = ["mash", "screen"]
    if winner:
        cmd.append("-w")
    cmd.extend(["-p", str(threads), mash_database, r1_path])
    # Use both r1 and r2 instead of just r1(faster)
    if include_r2:
        cmd.append(r2_path)

    prog = cmd[0]
    param = cmd[1:]

    try:
        # mash prints the table on stdout, so redirect it into the output file
        with open(species_output_file, "w+") as outfile:
            command = subprocess.run(cmd,
                                     stdout=outfile,
                                     stderr=subprocess.PIPE,
                                     universal_newlines=True)
        if command.returncode == 0:
            print(GREEN + "Program %s successfully executed" % prog +
                  END_FORMATTING)
        else:
            print(RED + BOLD + "Command %s FAILED\n" % prog + END_FORMATTING +
                  BOLD + "WITH PARAMETERS: " + END_FORMATTING +
                  " ".join(param) + "\n" + BOLD +
                  "EXIT-CODE: %d\n" % command.returncode + "ERROR:\n" +
                  END_FORMATTING + command.stderr)
    except OSError as e:
        sys.exit(RED + BOLD + "failed to execute program '%s': %s" %
                 (prog, str(e)) + END_FORMATTING)
def recalibrate_bam(args, tb=False):
    """
    Recalibrate base qualities of ``<sample>.rg.markdup.sorted.bam`` and
    write ``<sample>.bqsr.bam`` into the "Bam" folder.

    BaseRecalibrator
    https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_bqsr_BaseRecalibrator.php
    #Recalibrate bam:
    gatk BaseRecalibrator --input my_reads.bam --reference reference.fasta --known-sites sites_of_variation.vcf \\
    --known-sites another/optional/setOfSitesToMask.vcf --output recal_data.table

    ApplyBQSR
    https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_bqsr_ApplyBQSR.php
    gatk ApplyBQSR --reference reference.fasta --input input.bam --bqsr-recal-file recalibration.table --output output.bam

    args: namespace providing ``reference``, ``sample`` and ``memory``.
    tb: add the bundled M. tuberculosis known-sites file. Switched on
        automatically when the reference path contains "NC_000962.3",
        "h37rv" (any case) or "ancestor".

    Raises ValueError when ``tb`` is requested but the reference matches
    none of the bundled known-sites files.
    """
    input_reference = os.path.abspath(args.reference)

    # Automate M. tuberculosis reference for additional recalibration positions
    script_dir = os.path.dirname(os.path.realpath(__file__))
    reference_dir = os.path.join(script_dir, "reference")
    reference_file = None
    if ("NC_000962.3" in input_reference) or ("h37rv" in input_reference.lower()):
        tb = True
        reference_file = os.path.join(
            reference_dir, "190508_ddtb.NC_000962.3.BQSR.table")
    elif "ancestor" in input_reference:
        tb = True
        reference_file = os.path.join(reference_dir, "190508_ddtb.BQSR.table")

    # Previously this situation surfaced later as an UnboundLocalError
    if tb and reference_file is None:
        raise ValueError(
            "tb recalibration requested but no bundled known-sites file "
            "matches reference: " + input_reference)

    sample_name = args.sample
    bam_input_dir = obtain_output_dir(args, "Bam")
    vcf_input_dir = obtain_output_dir(args, "VCF_recal")

    # The recalibration table is written here, so the folder must exist
    check_create_dir(vcf_input_dir)

    bam_input_file_name = sample_name + ".rg.markdup.sorted.bam"
    bam_input_file = os.path.join(bam_input_dir, bam_input_file_name)

    table_output_file_name = sample_name + ".recall.table"
    table_output_file = os.path.join(vcf_input_dir, table_output_file_name)

    memory_param = "-Xmx" + str(args.memory) + "g"

    # BaseRecalibrator
    cmd_bqsr = [
        "gatk", "BaseRecalibrator",
        "--java-options", memory_param,
        "--reference", input_reference,
        "--input", bam_input_file,
        "--output", table_output_file,
    ]

    if tb:
        cmd_bqsr.extend(["--known-sites", reference_file])

    # Every filtered call set found in VCF_recal is used as known sites
    for root, _, files in os.walk(vcf_input_dir):
        for name in files:
            filename = os.path.join(root, name)
            if filename.endswith(".hf.pass.vcf"):
                cmd_bqsr.extend(["--known-sites", filename])

    execute_subprocess(cmd_bqsr)

    # ApplyBQSR
    bam_output_file_name = sample_name + ".bqsr.bam"
    bam_output_file = os.path.join(bam_input_dir, bam_output_file_name)

    cmd_apply = [
        "gatk", "ApplyBQSR",
        "--reference", input_reference,
        "--input", bam_input_file,
        "--bqsr-recal-file", table_output_file,
        "--output", bam_output_file,
    ]

    execute_subprocess(cmd_apply)
def haplotype_caller(args,
                     recalibrate=False,
                     ploidy=2,
                     bamout=False,
                     forceactive=False,
                     intervals=False):
    """
    Run GATK HaplotypeCaller in GVCF mode on the sample BAM.

    #No excuses
    https://software.broadinstitute.org/gatk/documentation/article?id=11081

    recalibrate: when true, call on ``<sample>.rg.markdup.sorted.bam`` and
        write to "GVCF_recal"; otherwise call on ``<sample>.bqsr.bam`` and
        write to "GVCF".
    ploidy: value passed to ``--sample-ploidy``.
    bamout: also write the realigned BAM into the "Bamout" folder.
    forceactive: add ``--force-active --disable-optimizations``.
    intervals: accepted but not used by this function.
    """
    reference_fasta = os.path.abspath(args.reference)
    bam_dir = obtain_output_dir(args, "Bam")

    sample = args.sample

    # Input BAM and output folder depend on the pass; the gVCF name does not
    if recalibrate:
        bam_name = sample + ".rg.markdup.sorted.bam"
        gvcf_dir = obtain_output_dir(args, "GVCF_recal")
    else:
        bam_name = sample + ".bqsr.bam"
        gvcf_dir = obtain_output_dir(args, "GVCF")
    gvcf_name = sample + ".g.vcf"

    check_create_dir(gvcf_dir)

    bam_to_call = os.path.join(bam_dir, bam_name)
    gvcf_path = os.path.join(gvcf_dir, gvcf_name)

    caller_cmd = ["gatk", "HaplotypeCaller"]
    caller_cmd += ["--java-options", "-Xmx" + str(args.memory) + "g"]
    caller_cmd += ["--reference", reference_fasta]
    caller_cmd += ["--input", bam_to_call]
    caller_cmd += ["--output", gvcf_path]
    caller_cmd += ["--emit-ref-confidence", "GVCF"]
    caller_cmd += ["--annotation-group", "AS_StandardAnnotation"]
    caller_cmd += ["--sample-ploidy", str(ploidy)]

    if bamout:
        bamout_dir = obtain_output_dir(args, "Bamout")
        bamout_path = os.path.join(
            bamout_dir, sample + ".p" + str(ploidy) + ".out.bam")
        check_create_dir(bamout_dir)
        caller_cmd += ["--bam-output", bamout_path]

    if forceactive:
        caller_cmd += ["--force-active", "--disable-optimizations"]

    execute_subprocess(caller_cmd)