def bowtie2_mapping(args):
    """Index the reference with bowtie2-build and map a read pair against it.

    Reads ``args.r1_file``, ``args.r2_file``, ``args.reference``,
    ``args.threads`` and ``args.extensive_mapping``. The alignment is written
    as ``<sample>.sam`` inside the directory returned by
    ``obtain_output_dir(args, "Bam")``.
    """
    r1 = os.path.abspath(args.r1_file)
    r2 = os.path.abspath(args.r2_file)
    reference = os.path.abspath(args.reference)

    sample = extract_sample(r1, r2)
    output_dir = obtain_output_dir(args, "Bam")
    output_file = os.path.join(output_dir, sample + ".sam")

    check_create_dir(output_dir)

    # bowtie2 index: the reference path is reused as the index prefix.
    cmd_index = ["bowtie2-build", reference, reference]
    execute_subprocess(cmd_index)

    # bowtie2 map
    cmd_map = [
        "bowtie2", "-1", r1, "-2", r2, "-S", output_file, "-q",
        "--very-sensitive-local", "-p", str(args.threads), "-x", reference,
    ]
    # Only add the flag when requested: an unconditional "" placeholder would
    # reach bowtie2 as a stray empty command-line argument.
    if args.extensive_mapping:
        cmd_map.append("-a")

    execute_subprocess(cmd_map)
def bbduk_trimming(args):
    """Trim a read pair with bbduk.sh.

    TODO: the trimming parameters are hard-coded; handle them as arguments.

    Uses ``args.r1_file``, ``args.r2_file``, ``args.memory`` and
    ``args.threads``. The cleaned reads and the stats file are written into
    the directory returned by ``obtain_output_dir(args, "Trimmed")``.
    """
    r1 = os.path.abspath(args.r1_file)
    r2 = os.path.abspath(args.r2_file)
    output_dir = obtain_output_dir(args, "Trimmed")
    sample = extract_sample(r1, r2)

    # Every output file shares the "<output_dir>/<sample>" stem.
    stem = output_dir + "/" + sample
    adapters = "ref=" + get_bbduk_adapters()

    check_create_dir(output_dir)

    # bbduk.sh
    cmd = [
        "bbduk.sh",
        "-Xmx" + str(args.memory) + "g",
        "in1=" + r1,
        "in2=" + r2,
        "out1=" + stem + "_R1.clean.fastq.gz",
        "out2=" + stem + "_R2.clean.fastq.gz",
        adapters,
        "trimq=15",
        "qtrim=rl",
        "minlen=40",
        "ktrim=r",
        "k=21",
        "mink=11",
        "hammingdistance=2",
        "threads=" + str(args.threads),
        "tpe",
        "tbo",
        "stats=" + stem + "_trim.stats",
    ]
    execute_subprocess(cmd)
def select_variants(raw_vcf, select_type='SNP'):
    """Extract SNP or INDEL records (plus MIXED) from a VCF with GATK.

    https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_variantutils_SelectVariants.php
    gatk SelectVariants -V cohort.vcf.gz -select-type SNP -O snps.vcf.gz

    The output path is the input path with its last two dot-separated
    components replaced by ``.snp.vcf`` or ``.indel.vcf``.

    Args:
        raw_vcf: path to the VCF to filter.
        select_type: ``"SNP"`` or ``"INDEL"``.

    Raises:
        ValueError: if ``select_type`` is neither ``"SNP"`` nor ``"INDEL"``.
    """
    extensions = {"SNP": ".snp.vcf", "INDEL": ".indel.vcf"}
    # Fail early with a clear error; previously an unknown type only printed a
    # message and then crashed on the undefined output extension.
    if select_type not in extensions:
        raise ValueError(
            "Choose a correct type to filter: expected 'SNP' or 'INDEL', got "
            + repr(select_type))
    extension = extensions[select_type]

    input_vcf = os.path.abspath(raw_vcf)
    check_file_exists(input_vcf)

    raw_vcf_file_name = (".").join(input_vcf.split(".")[:-2])
    vcf_selected_output_file = raw_vcf_file_name + extension

    cmd = [
        "gatk", "SelectVariants",
        "--variant", input_vcf,
        "--select-type-to-include", select_type,
        "--select-type-to-include", "MIXED",
        "--output", vcf_selected_output_file,
    ]
    execute_subprocess(cmd)
def select_pass_variants(raw_vcf, nocall_fr=0.1):
    """Write the unfiltered (PASS) records of a VCF to ``<name>.pass.vcf``.

    Runs GATK SelectVariants with ``--exclude-filtered``,
    ``--remove-unused-alternates`` and ``--max-nocall-fraction nocall_fr``.
    According to the original note it is used because it also creates the
    necessary VCF index.

    https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_variantutils_SelectVariants.php
    https://gatkforums.broadinstitute.org/gatk/discussion/13127/do-gatk4-tools-ignore-vcf-sites-marked-as-filtered-or-must-they-be-removed-from-the-file
    """
    input_vcf = os.path.abspath(raw_vcf)
    check_file_exists(input_vcf)

    # Swap the last dot-separated component of the path for ".pass.vcf".
    stem_parts = input_vcf.split(".")[:-1]
    vcf_selected_output_file = ".".join(stem_parts) + ".pass.vcf"

    execute_subprocess([
        "gatk", "SelectVariants",
        "--variant", input_vcf,
        "--max-nocall-fraction", str(nocall_fr),
        "--exclude-filtered",
        "--remove-unused-alternates",
        "--output", vcf_selected_output_file,
    ])
def make_blast(query_fasta, database, sample, output_folder, db_type="nucl", query_type="nucl", evalue=0.0001, threads=8):
    """Build a temporary BLAST database and search ``query_fasta`` against it.

    ``blastn`` is used when ``query_type`` is ``"nucl"``, ``blastp`` otherwise.
    The database is written to ``<output_folder>/<db>.blast.tmp`` and the
    tabular hits to ``<output_folder>/<sample>.<db>.blast``, where ``<db>`` is
    the database file name up to its first dot. Both commands are logged
    (comma-joined) before being executed.
    """
    if query_type == "nucl":
        blast_program = 'blastn'
    else:
        blast_program = 'blastp'

    db_name = database.split("/")[-1].split(".")[0]
    db_prefix = os.path.join(output_folder, db_name + ".blast.tmp")
    blast_result = os.path.join(output_folder, sample + "." + db_name + ".blast")

    build_db = [
        'makeblastdb',
        '-in', database,
        '-out', db_prefix,
        '-dbtype', db_type,
    ]
    logger.info(",".join(build_db))
    execute_subprocess(build_db)

    # Tabular output (format 6) with an explicit column list.
    columns = "6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen"
    run_blast = [
        blast_program,
        "-query", query_fasta,
        "-db", db_prefix,
        "-out", blast_result,
        "-evalue", str(evalue),
        "-num_threads", str(threads),
        "-outfmt", columns,
    ]
    logger.info(",".join(run_blast))
    execute_subprocess(run_blast)
def bwa_mapping(args):
    """Index the reference with ``bwa index`` and map a read pair with ``bwa mem``.

    The SAM output is written through bwa's ``-o`` option to
    ``<sample>.sam`` in the directory returned by
    ``obtain_output_dir(args, "Bam")``.

    Background on capturing stdout into a file instead:
    https://stackoverflow.com/questions/4965159/how-to-redirect-output-with-subprocess-in-python
    """
    r1 = os.path.abspath(args.r1_file)
    r2 = os.path.abspath(args.r2_file)
    reference = os.path.abspath(args.reference)

    sample = extract_sample(r1, r2)
    output_dir = obtain_output_dir(args, "Bam")
    output_file = os.path.join(output_dir, sample + ".sam")

    check_create_dir(output_dir)

    execute_subprocess(["bwa", "index", reference])

    execute_subprocess([
        "bwa", "mem",
        "-t", str(args.threads),
        "-o", output_file,
        reference, r1, r2,
    ])
def fastp_trimming(r1, r2, sample, output_dir, threads=6, min_qual=20, window_size=10, min_len=35):
    """Trim a read pair with fastp.

    Trimmed reads go to ``<output_dir>/<sample>.trimmed_R{1,2}.fastq.gz``;
    the HTML and JSON reports go to the ``html`` and ``json`` subfolders of
    ``output_dir``, which are created if needed.
    """
    check_create_dir(output_dir)

    reports = {}
    for kind in ('html', 'json'):
        report_dir = os.path.join(output_dir, kind)
        check_create_dir(report_dir)
        reports[kind] = os.path.join(report_dir, sample + '_fastp.' + kind)

    cmd = ['fastp', '--in1', r1, '--in2', r2]
    cmd += ['--out1', os.path.join(output_dir, sample + ".trimmed_R1.fastq.gz")]
    cmd += ['--out2', os.path.join(output_dir, sample + ".trimmed_R2.fastq.gz")]
    cmd += ['--detect_adapter_for_pe', '--cut_tail']
    cmd += ['--cut_window_size', str(window_size)]
    cmd += ['--cut_mean_quality', str(min_qual)]
    cmd += ['--length_required', str(min_len)]
    cmd += ['--json', reports['json'], '--html', reports['html']]
    cmd += ['--thread', str(threads)]

    execute_subprocess(cmd)
def add_SG(args, input_bam, output_bg_sorted):
    """Add read-group tags to a BAM with Picard AddOrReplaceReadGroups.

    The read-group ID and platform unit are both built from the first header
    line of the gzipped R1 FASTQ: its third and fourth colon-separated fields
    and its last field, joined with dots. Example headers:

    @MN00227:45:000H255J3:1:11102:21214:1110 1:N:0:18
    @NS500454:48:HKG57BGXX:1:11101:17089:1032 2:N:0:TCCTGAGC+TCTTACGC
    @NS500454:27:HJJ32BGXX:1:11101:12392:1099 1:N:0:2
    @<instrument>:<run number>:<flowcell ID>:<lane>:<tile>:<x-pos>:<y-pos>
    <read>:<is filtered>:<control number>:<sample number | barcode1'+barcode2'>

    ID = Read group identifier {FLOWCELL_BARCODE}.{LANE}.{SAMPLE_BARCODE}
    PU = Platform Unit #optional
    SM = Sample
    PL = Platform/technology used to produce the read
         (ILLUMINA, SOLID, LS454, HELICOS and PACBIO)
    LB = DNA preparation library identifier

    Equivalent command line:
    java -jar picard.jar AddOrReplaceReadGroups INPUT=reads.bam
        OUTPUT=reads_addRG.bam RGID=H0164.2 RGLB=library1 RGPL=illumina
        RGPU=H0164ALXX140820.2 RGSM=sample1 SORT_ORDER=coordinate
    """
    r1 = os.path.abspath(args.r1_file)
    r2 = os.path.abspath(args.r2_file)
    sample = extract_sample(r1, r2)

    with gzip.open(r1) as fastq:
        header = fastq.readline().strip().decode()

    fields = header.split(":")
    # Third field, fourth field and last field of the header.
    read_group = ".".join([fields[2], fields[3], fields[-1]])

    picard_jar = get_picard_path()

    cmd = [
        "java", "-jar", picard_jar, "AddOrReplaceReadGroups",
        "INPUT=" + input_bam,
        "OUTPUT=" + output_bg_sorted,
        "RGID=" + read_group,
        "RGLB=" + "lib_" + sample,
        "RGPL=" + "ILLUMINA",
        "RGPU=" + read_group,
        "RGSM=" + sample,
        "SORT_ORDER=coordinate",
    ]
    execute_subprocess(cmd)
def samtools_faidx(args):
    """Create the ``.fai`` index of ``args.reference`` unless it already exists.

    Equivalent command: samtools faidx reference.fa
    """
    input_reference = os.path.abspath(args.reference)
    fai_file_name = input_reference + ".fai"

    if os.path.exists(fai_file_name):
        # Nothing to do: just report that the index is already there.
        logger.info(fai_file_name + " already EXIST")
        return

    execute_subprocess(["samtools", "faidx", input_reference])
def hard_filter(selected_vcf, select_type='SNP'):
    """Annotate a SNP or INDEL VCF with GATK VariantFiltration hard filters.

    https://software.broadinstitute.org/gatk/documentation/article.php?id=6925
    https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_filters_VariantFiltration.php
    https://software.broadinstitute.org/gatk/documentation/article?id=23216

    One ``--filter-expression``/``--filter-name`` pair is passed per filter.
    The output path is the input path with its last two dot-separated
    components replaced by ``.snp.hf.vcf`` or ``.indel.hf.vcf``.

    Args:
        selected_vcf: path to the VCF produced for ``select_type``.
        select_type: ``"SNP"`` or ``"INDEL"``.

    Raises:
        ValueError: if ``select_type`` is neither ``"SNP"`` nor ``"INDEL"``.
    """
    # (output extension, ((expression, filter name), ...)) per variant type.
    # NOTE: the SNP filter named "SOR3" uses a 3.5 threshold; kept as found.
    hard_filters = {
        "SNP": (".snp.hf.vcf", (
            ("QD < 2.0", "QD2"),
            ("QUAL < 30.0", "QUAL30"),
            ("SOR > 3.5", "SOR3"),
            ("FS > 60.0", "FS60"),
            ("MQ < 40.0", "MQ40"),
            ("DP < 10", "DP10"),
            ("MQRankSum < -12.5", "MQRankSum-12.5"),
            ("ReadPosRankSum < -8.0", "ReadPosRankSum-8"),
        )),
        "INDEL": (".indel.hf.vcf", (
            ("QD < 2.0", "QD2"),
            ("QUAL < 30.0", "QUAL30"),
            ("SOR > 10.0", "SOR10"),
            ("FS > 200.0", "FS200"),
            ("ReadPosRankSum < -20.0", "ReadPosRankSum-20"),
        )),
    }

    # Fail early with a clear error; previously an unknown type only printed a
    # message and then crashed because the command was never built.
    if select_type not in hard_filters:
        raise ValueError(
            "Choose a correct type to filter: expected 'SNP' or 'INDEL', got "
            + repr(select_type))
    extension, filters = hard_filters[select_type]

    input_vcf = os.path.abspath(selected_vcf)
    check_file_exists(input_vcf)

    selected_vcf_file_name = (".").join(input_vcf.split(".")[:-2])
    vcf_hard_filtered_output_file = selected_vcf_file_name + extension

    cmd = ["gatk", "VariantFiltration", "--variant", input_vcf]
    for expression, filter_name in filters:
        cmd.extend(["--filter-expression", expression,
                    "--filter-name", filter_name])
    cmd.extend(["--output", vcf_hard_filtered_output_file])

    execute_subprocess(cmd)
def annotate_pangolin(input_file, output_folder, output_filename, threads=8, max_ambig=0.6):
    """Run ``pangolin`` on ``input_file``.

    The output directory, output file name, thread count and ``--max-ambig``
    value are forwarded as command-line options.
    """
    options = (
        ("--outdir", output_folder),
        ("--outfile", output_filename),
        ("--threads", str(threads)),
        ("--max-ambig", str(max_ambig)),
    )

    cmd = ["pangolin", input_file]
    for flag, value in options:
        cmd.append(flag)
        cmd.append(value)

    execute_subprocess(cmd)
def _collect_gvcf_variant_args(directory, exclude=None):
    """Return ``["--variant", path, ...]`` for every ``*.g.vcf`` under ``directory``.

    ``exclude`` is a path that must never be listed (compared as absolute
    paths), used to keep the command's own output out of its inputs.
    """
    excluded = os.path.abspath(exclude) if exclude is not None else None
    variant_args = []
    for root, _, files in os.walk(directory):
        for name in files:
            filename = os.path.join(root, name)
            if not filename.endswith(".g.vcf"):
                continue
            if excluded is not None and os.path.abspath(filename) == excluded:
                continue
            variant_args.extend(["--variant", filename])
    return variant_args


def combine_gvcf(args, recalibrate=False, all_gvcf=False):
    """Merge per-sample gVCF files into one cohort gVCF with GATK CombineGVCFs.

    https://software.broadinstitute.org/gatk/documentation/tooldocs/4.1.2.0/org_broadinstitute_hellbender_tools_walkers_CombineGVCFs.php
    gatk CombineGVCFs -R reference.fasta --variant sample1.g.vcf.gz
        --variant sample2.g.vcf.gz -O cohort.g.vcf.gz

    Every ``*.g.vcf`` found under the "GVCF" (or "GVCF_recal" when
    ``recalibrate`` is set) output directory is added as ``--variant``; the
    cohort file ``<group>.cohort.g.vcf`` is written into that same directory,
    where ``<group>`` is the last component of ``args.output``.

    Args:
        args: namespace providing ``output``, ``reference`` and ``memory``.
        recalibrate: use the "GVCF_recal" directory instead of "GVCF".
        all_gvcf: optional extra folder whose ``*.g.vcf`` files are added too;
            ``False`` disables it.
    """
    output = os.path.abspath(args.output)
    input_reference = os.path.abspath(args.reference)

    group_name = output.split("/")[-1]

    if recalibrate:
        gvcf_input_dir = obtain_output_dir(args, "GVCF_recal")
    else:
        gvcf_input_dir = obtain_output_dir(args, "GVCF")

    gvcf_output_file = group_name + ".cohort.g.vcf"
    gvcf_output_full = os.path.join(gvcf_input_dir, gvcf_output_file)

    check_create_dir(gvcf_input_dir)

    memory_param = "-Xmx" + str(args.memory) + "g"

    cmd = [
        "gatk", "CombineGVCFs",
        "--java-options", memory_param,
        "--reference", input_reference,
        "--output", gvcf_output_full,
    ]

    # The cohort file lives in the scanned directory and also ends in
    # ".g.vcf": skip it so a rerun does not feed the old output back in.
    cmd.extend(_collect_gvcf_variant_args(gvcf_input_dir,
                                          exclude=gvcf_output_full))

    if all_gvcf != False:
        if os.path.isdir(all_gvcf):
            all_gvcf = os.path.abspath(all_gvcf)
            print("Using gvcf from enricment folder:" + all_gvcf)
            cmd.extend(_collect_gvcf_variant_args(all_gvcf,
                                                  exclude=gvcf_output_full))
        else:
            print("GVCF enrichment folder does not exist")

    execute_subprocess(cmd)
def run_snippy_core(input_dir, output_dir, reference, filter_sample=None):
    """Run ``snippy-core`` over the sample folders found directly in ``input_dir``.

    Args:
        input_dir: folder whose immediate subdirectories are the per-sample
            folders passed to snippy-core.
        output_dir: base folder; the snippy-core prefix is ``output_dir + "/core"``.
        reference: value passed to ``--ref``.
        filter_sample: names of subdirectories to leave out; each skipped name
            is reported at debug level. ``None`` (default) skips nothing.
    """
    # None instead of a mutable [] default, which would be shared by all calls.
    excluded = [] if filter_sample is None else filter_sample

    output_dir = output_dir + "/core"

    # Only the top level is needed, so take the first os.walk() entry instead
    # of recursing through every sample folder. A missing input_dir yields no
    # entries, hence the empty default.
    _, sample_dirs, _ = next(os.walk(input_dir), (input_dir, [], []))

    samples_snippy = []
    for name in sample_dirs:
        if name in excluded:
            logger.debug(name + " discarded from core FAULTY")
        else:
            samples_snippy.append(os.path.join(input_dir, name))

    cmd = ["snippy-core", "-p", output_dir, "--ref", reference] + samples_snippy
    execute_subprocess(cmd)
def call_variants(args, recalibrate=False, group=True):
    """Genotype a gVCF into a raw VCF with GATK GenotypeGVCFs.

    https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_GenotypeGVCFs.php
    gatk --java-options "-Xmx4g" GenotypeGVCFs -R Homo_sapiens_assembly38.fasta
        -V input.g.vcf.gz -O output.vcf.gz

    With ``group`` set, ``<group>.cohort.g.vcf`` is turned into
    ``<group>.cohort.raw.vcf`` (``<group>`` being the last component of
    ``args.output``); otherwise ``<sample>.g.vcf`` becomes
    ``<sample>.raw.vcf``. ``recalibrate`` switches the "GVCF"/"VCF" output
    directories to their "_recal" counterparts.

    Side effect: an empty ``args.sample`` is replaced by ``"nosample"``.
    """
    output = os.path.abspath(args.output)
    input_reference = os.path.abspath(args.reference)

    if not args.sample:
        args.sample = "nosample"

    dir_suffix = "_recal" if recalibrate else ""
    gvcf_input_dir = obtain_output_dir(args, "GVCF" + dir_suffix)
    vcf_output_dir = obtain_output_dir(args, "VCF" + dir_suffix)

    # Cohort files are named after the output folder, single-sample files
    # after the sample.
    if group:
        stem = output.split("/")[-1] + ".cohort"
    else:
        stem = args.sample

    gvcf_input_full = os.path.join(gvcf_input_dir, stem + ".g.vcf")
    vcf_output_full = os.path.join(vcf_output_dir, stem + ".raw.vcf")

    for folder in (gvcf_input_dir, vcf_output_dir):
        check_create_dir(folder)

    execute_subprocess([
        "gatk", "GenotypeGVCFs",
        "--java-options", "-Xmx" + str(args.memory) + "g",
        "--reference", input_reference,
        "--variant", gvcf_input_full,
        "--output", vcf_output_full,
    ])
def refseq_masher(r1_file, r2_file, output_file, threads=16, max_results=50):
    """Run ``refseq_masher contains`` on a read pair.

    Example:
    refseq_masher contains --top-n-results 50 -p 16 -o HPR3641322-50.contains2.tsv
        HPR3641322-50_S27_L000_R1_001.fastq.gz HPR3641322-50_S27_L000_R2_001.fastq.gz

    All three paths are made absolute before being passed on.
    """
    reads = [os.path.abspath(fastq) for fastq in (r1_file, r2_file)]
    output_file = os.path.abspath(output_file)

    cmd = [
        "refseq_masher", "contains",
        "--top-n-results", str(max_results),
        "-p", str(threads),
        "-o", output_file,
    ]
    cmd.extend(reads)

    execute_subprocess(cmd)
def picard_dictionary(args):
    """Create the sequence dictionary of ``args.reference`` unless it exists.

    Equivalent command:
    picard CreateSequenceDictionary R=reference.fasta O=reference.dict

    The dictionary path is the reference path with its extension replaced by
    ``.dict``.
    """
    input_reference = os.path.abspath(args.reference)

    # splitext only looks at the file name, so dots in directory names or a
    # reference without extension no longer corrupt the output path.
    dict_file_name = os.path.splitext(input_reference)[0] + ".dict"

    if os.path.exists(dict_file_name):
        logger.info(dict_file_name + " already EXIST")
        return

    cmd = [
        "picard", "CreateSequenceDictionary",
        "R=" + input_reference,
        "O=" + dict_file_name,
    ]
    execute_subprocess(cmd)
def sam_to_index_bam(sample, output_dir, r1, threads):
    """Convert ``<sample>.sam`` to a sorted BAM and hand it to ``add_SG``.

    Steps, all inside ``output_dir``; each intermediate file is removed once
    the next one has been produced:
      1. ``samtools view``  <sample>.sam        -> <sample>.bam
      2. ``samtools sort``  <sample>.bam        -> <sample>.sorted.bam
      3. ``add_SG``         <sample>.sorted.bam -> <sample>.rg.sorted.bam
    """
    input_sam_path = os.path.join(output_dir, sample + ".sam")
    # File name without its trailing ".sam".
    input_name = os.path.basename(input_sam_path).rsplit(".", 1)[0]

    output_bam_path = os.path.join(output_dir, input_name + ".bam")
    output_sorted_path = os.path.join(output_dir, input_name + ".sorted.bam")
    output_bg_sorted_path = os.path.join(output_dir, input_name + ".rg.sorted.bam")

    execute_subprocess([
        "samtools", "view", "-Sb", input_sam_path,
        "--threads", str(threads),
        "-o", output_bam_path,
    ])
    check_remove_file(input_sam_path)

    execute_subprocess(
        ["samtools", "sort", output_bam_path, "-o", output_sorted_path])
    check_remove_file(output_bam_path)

    # NOTE(review): add_SG is called here with four arguments
    # (sample, input, output, r1) - confirm this matches the add_SG in scope.
    add_SG(sample, output_sorted_path, output_bg_sorted_path, r1)
    check_remove_file(output_sorted_path)
def sam_to_index_bam(args):
    """Convert the sample's SAM to BAM and hand it to ``add_SG``.

    Works in the directory returned by ``obtain_output_dir(args, "Bam")``:
    ``<sample>.sam`` is converted to ``<sample>.bam`` with ``samtools view``
    and removed, then ``add_SG`` writes ``<sample>.rg.sorted.bam`` and the
    intermediate BAM is removed.
    """
    r1 = os.path.abspath(args.r1_file)
    r2 = os.path.abspath(args.r2_file)

    sample = extract_sample(r1, r2)
    output_dir = obtain_output_dir(args, "Bam")

    input_sam_path = os.path.join(output_dir, sample + ".sam")
    # File name without its trailing ".sam".
    input_name = os.path.basename(input_sam_path).rsplit(".", 1)[0]

    output_bam_path = os.path.join(output_dir, input_name + ".bam")
    output_bg_sorted_path = os.path.join(output_dir, input_name + ".rg.sorted.bam")

    check_create_dir(output_dir)

    # sam to bam: samtools view -Sb $input_file -o $output_dir/$sample.bam
    # (the -o option replaces an earlier variant that redirected the stdout of
    # subprocess.run into the output file)
    execute_subprocess([
        "samtools", "view", "-Sb", input_sam_path,
        "-o", output_bam_path,
        "--threads", str(args.threads),
    ])
    check_remove_file(input_sam_path)

    add_SG(args, output_bam_path, output_bg_sorted_path)
    check_remove_file(output_bam_path)
def ivar_consensus(input_bam, output_consensus, sample, min_quality=20, min_frequency_threshold=0.8, min_depth=20, uncovered_character='N'):
    """Build a consensus sequence by piping ``samtools mpileup`` into ``ivar consensus``.

    ivar consensus usage:
        samtools mpileup -aa -A -d 0 -Q 0 <input.bam> | ivar consensus -p <prefix>
    Note: samtools mpileup output must be piped into ivar consensus.

    ivar options set here:
        -q  Minimum quality score threshold to count base
        -t  Minimum frequency threshold (0 - 1) to call consensus
              0   | Majority or most common base
              0.2 | Bases that make up at least 20% of the depth at a position
              0.5 | Bases that make up at least 50% of the depth at a position
              0.9 | Bases that make up at least 90% of the depth at a position
              1   | Bases that make up 100% of the depth at a position
        -m  Minimum depth to call consensus
        -n  (N/-) Character to print in regions with less than minimum coverage
        -p  (Required) Prefix for the output fasta file and quality file

    The output prefix is ``output_consensus + '/' + sample``.
    """
    # Local import so the block does not depend on a file-level import.
    import shlex

    prefix = output_consensus + '/' + sample

    # The pipeline needs a shell, so every interpolated value is quoted:
    # paths containing spaces or shell metacharacters stay a single argument.
    params = {
        'input_bam': shlex.quote(str(input_bam)),
        'prefix': shlex.quote(str(prefix)),
        'min_quality': shlex.quote(str(min_quality)),
        'min_frequency_threshold': shlex.quote(str(min_frequency_threshold)),
        'min_depth': shlex.quote(str(min_depth)),
        'uncovered_character': shlex.quote(str(uncovered_character)),
    }

    cmd = (
        "samtools mpileup -aa -A -d 0 -B -Q 0 {input_bam} | "
        "ivar consensus -p {prefix} -q {min_quality} "
        "-t {min_frequency_threshold} -m {min_depth} -n {uncovered_character}"
    ).format(**params)

    execute_subprocess(cmd, isShell=True)
def picard_markdup(args):
    """Mark duplicates with Picard, then sort and index the result.

    Equivalent Picard command:
    java -jar picard.jar MarkDuplicates I=input.bam O=marked_duplicates.bam
        M=marked_dup_metrics.txt

    ``<name>`` below is the file name of ``args.input_bam`` up to its first
    dot. Produces ``<name>.rg.markdup.sorted.bam`` (plus its samtools index)
    next to the input, writes ``<name>.markdup.metrics.txt`` to the "Stats"
    output directory, and removes both the input BAM and the unsorted
    ``<name>.rg.markdup.bam``.
    """
    picard_jar = get_picard_path()

    input_bam = os.path.abspath(args.input_bam)

    # Split only the file name at its first dot: splitting the whole absolute
    # path would truncate it at a dot inside a directory name.
    bam_dir, bam_name = os.path.split(input_bam)
    file_name = bam_name.split(".")[0]
    path_file_name = os.path.join(bam_dir, file_name)

    output_markdup = path_file_name + ".rg.markdup.bam"
    output_markdup_sorted = path_file_name + ".rg.markdup.sorted.bam"

    stat_output_dir = obtain_output_dir(args, "Stats")
    stat_output_full = os.path.join(stat_output_dir,
                                    file_name + ".markdup.metrics.txt")

    check_create_dir(stat_output_dir)

    cmd_markdup = [
        "java", "-jar", picard_jar, "MarkDuplicates",
        "I=" + input_bam,
        "O=" + output_markdup,
        "M=" + stat_output_full,
    ]
    execute_subprocess(cmd_markdup)

    # samtools sort
    cmd_sort = ["samtools", "sort", output_markdup, "-o", output_markdup_sorted]
    execute_subprocess(cmd_sort)

    # samtools index; check=True raises CalledProcessError on failure.
    subprocess.run(["samtools", "index", output_markdup_sorted],
                   stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)

    check_remove_file(input_bam)
    check_remove_file(output_markdup)
def ivar_variants(reference, input_bam, output_variant, sample, annotation, min_quality=20, min_frequency_threshold=0.8, min_depth=20):
    """Call variants by piping ``samtools mpileup`` into ``ivar variants``.

    Usage:
        samtools mpileup -aa -A -d 0 -B -Q 0 --reference [<reference-fasta>] <input.bam> |
        ivar variants -p <prefix> [-q <min-quality>] [-t <min-frequency-threshold>]
                      [-m <minimum depth>] [-r <reference-fasta>] [-g GFF file]
    Note: samtools mpileup output must be piped into ivar variants.

    ivar options set here:
        -q  Minimum quality score threshold to count base
        -t  Minimum frequency threshold (0 - 1) to call variants
        -m  Minimum read depth to call variants
        -r  Reference file used for alignment
        -g  GFF file (GFF3 format) with the coordinates of open reading frames
        -p  (Required) Prefix for the output tsv variant file

    Output goes to the ``ivar_raw`` subfolder of ``output_variant`` (created
    if needed), using ``<ivar_raw>/<sample>`` as prefix.
    """
    # Local import so the block does not depend on a file-level import.
    import shlex

    ivar_folder = os.path.join(output_variant, 'ivar_raw')
    check_create_dir(ivar_folder)
    prefix = ivar_folder + '/' + sample

    # The pipeline needs a shell, so every interpolated value is quoted:
    # paths containing spaces or shell metacharacters stay a single argument.
    params = {
        'reference': shlex.quote(str(reference)),
        'input_bam': shlex.quote(str(input_bam)),
        'prefix': shlex.quote(str(prefix)),
        'min_quality': shlex.quote(str(min_quality)),
        'min_frequency_threshold': shlex.quote(str(min_frequency_threshold)),
        'min_depth': shlex.quote(str(min_depth)),
        'annotation': shlex.quote(str(annotation)),
    }

    cmd = (
        "samtools mpileup -aa -A -d 0 -B -Q 0 --reference {reference} {input_bam} | "
        "ivar variants -p {prefix} -q {min_quality} "
        "-t {min_frequency_threshold} -m {min_depth} "
        "-r {reference} -g {annotation}"
    ).format(**params)

    execute_subprocess(cmd, isShell=True)
def run_snippy(r1, r2, reference, output_dir, sample, threads=16, minqual=20, minfrac=0.1, mincov=1):
    """Run ``snippy`` on a read pair, writing to ``<output_dir>/<sample>``.

    Example:
    snippy --cpus 16 --outdir mysnps --ref Listeria.gbk
        --R1 FDA_R1.fastq.gz --R2 FDA_R2.fastq.gz
    """
    sample_outdir = os.path.join(output_dir, sample)

    options = [
        ("--cpus", str(threads)),
        ("--outdir", sample_outdir),
        ("--minqual", str(minqual)),
        ("--mincov", str(mincov)),
        ("--minfrac", str(minfrac)),
        ("--ref", reference),
        ("--R1", r1),
        ("--R2", r2),
    ]

    cmd = ["snippy"]
    for flag, value in options:
        cmd += [flag, value]

    execute_subprocess(cmd)
def split_vcf_saples(vcf_file, sample_list=False, nocall_fr=0.1):
    """Split a multi-sample VCF into one VCF per sample with GATK SelectVariants.

    https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_variantutils_SelectVariants.php
    https://www.biostars.org/p/224702/

    Each sample is written to ``<sample>.<ext>`` in the directory part of
    ``vcf_file``, where ``<ext>`` is everything after the second dot of the
    input file name. Samples whose output file already exists are skipped.
    When ``sample_list`` is ``False`` the names come from
    ``samples_from_vcf(vcf_file)``.

    TODO: check if argument --exclude-filtered is suitable here. It would
    save the select_pass_variants() step.
    """
    if sample_list == False:
        sample_list = samples_from_vcf(vcf_file)

    vcf_dir_name = os.path.dirname(vcf_file)
    vcf_file_name = os.path.abspath(vcf_file).split("/")[-1]
    # Drop the first two dot-separated components, keep the rest as extension.
    vcf_file_extension = ".".join(vcf_file_name.split(".")[2:])

    for sample_name in sample_list:
        output_vcf_file = os.path.join(
            vcf_dir_name, sample_name + "." + vcf_file_extension)

        if os.path.isfile(output_vcf_file):
            continue

        execute_subprocess([
            "gatk", "SelectVariants",
            "--max-nocall-fraction", str(nocall_fr),
            "--variant", vcf_file,
            "--sample-name", sample_name,
            "--exclude-non-variants",
            "--output", output_vcf_file,
        ])
def ivar_trim(input_bam, primers_file, sample, min_length=30, min_quality=20, sliding_window_width=4):
    """Trim primers with ``ivar trim``, then sort and index the trimmed BAM.

    Usage: ivar trim -i <input.bam> -b <primers.bed> -p <prefix>
                     [-m <min-length>] [-q <min-quality>] [-s <sliding-window-width>]

    ivar options set here:
        -i  (Required) Sorted bam file, with aligned reads, to trim primers and quality
        -b  (Required) BED file with primer sequences and positions
        -m  Minimum length of read to retain after trimming
        -q  Minimum quality threshold for sliding window to pass
        -s  Width of sliding window
        -e  Include reads with no primers
        -p  (Required) Prefix for the output BAM file

    ``<name>`` below is the file name of ``input_bam`` up to its first dot.
    The final file is ``<name>.rg.markdup.trimmed.sorted.bam`` next to the
    input. The input BAM, its ``.bai`` and the unsorted trimmed BAM are
    removed along the way.
    """
    input_bam = os.path.abspath(input_bam)
    input_bai = input_bam + ".bai"
    primers_file = os.path.abspath(primers_file)

    # Split only the file name at its first dot: splitting the whole absolute
    # path would truncate it at a dot inside a directory name.
    bam_dir, bam_name = os.path.split(input_bam)
    base_path = os.path.join(bam_dir, bam_name.split('.')[0])

    prefix = base_path + ".rg.markdup.trimmed"
    output_trimmed_bam = prefix + ".bam"
    output_trimmed_sorted_bam = base_path + ".rg.markdup.trimmed.sorted.bam"

    cmd = [
        "ivar", "trim",
        "-i", input_bam,
        "-b", primers_file,
        "-p", prefix,
        "-m", str(min_length),
        "-q", str(min_quality),
        "-s", str(sliding_window_width),
        "-e",
    ]
    execute_subprocess(cmd)
    check_remove_file(input_bam)

    cmd_sort = [
        "samtools", "sort", output_trimmed_bam, "-o", output_trimmed_sorted_bam
    ]
    execute_subprocess(cmd_sort)
    check_remove_file(output_trimmed_bam)

    cmd_index = ["samtools", "index", output_trimmed_sorted_bam]
    execute_subprocess(cmd_index)
    check_remove_file(input_bai)
def fastqc_quality(r1, r2, output_dir, threads=8):
    """Run ``fastqc`` on both reads, writing the reports to ``output_dir``.

    ``output_dir`` is created first if needed.
    """
    check_create_dir(output_dir)

    cmd = ['fastqc', r1, r2]
    cmd.extend(['-o', output_dir])
    cmd.extend(['--threads', str(threads)])

    execute_subprocess(cmd)
def create_coverage(input_bam, output_dir, sample):
    """Write the ``samtools depth -aa`` output of a BAM to ``<output_dir>/<sample>.cov``."""
    # Local import so the block does not depend on a file-level import.
    import shlex

    output_file = os.path.join(output_dir, sample + ".cov")

    # The redirect needs a shell, so both paths are quoted: a path containing
    # spaces or shell metacharacters stays a single argument.
    cmd = "samtools depth -aa {} > {}".format(
        shlex.quote(str(input_bam)), shlex.quote(str(output_file)))

    execute_subprocess(cmd, isShell=True)
def create_bamstat(input_bam, output_dir, sample, threads=8):
    """Write the ``samtools flagstat`` output of a BAM to ``<output_dir>/<sample>.bamstats``."""
    # Local import so the block does not depend on a file-level import.
    import shlex

    output_file = os.path.join(output_dir, sample + ".bamstats")

    # The redirect needs a shell, so every value is quoted: a path containing
    # spaces or shell metacharacters stays a single argument.
    cmd = "samtools flagstat --threads {} {} > {}".format(
        shlex.quote(str(threads)),
        shlex.quote(str(input_bam)),
        shlex.quote(str(output_file)))

    execute_subprocess(cmd, isShell=True)
def haplotype_caller(args, recalibrate=False, ploidy=2, bamout=False, forceactive=False, intervals=False):
    """Produce a per-sample gVCF with GATK HaplotypeCaller.

    https://software.broadinstitute.org/gatk/documentation/article?id=11081

    The input BAM is looked up in the "Bam" output directory:
    ``<sample>.rg.markdup.sorted.bam`` when ``recalibrate`` is set (output in
    "GVCF_recal"), otherwise ``<sample>.bqsr.bam`` (output in "GVCF"). The
    result is always named ``<sample>.g.vcf``.

    Args:
        args: namespace providing ``reference``, ``sample`` and ``memory``.
        recalibrate: selects the input BAM and output folder as described.
        ploidy: value passed to ``--sample-ploidy``.
        bamout: also write ``<sample>.p<ploidy>.out.bam`` into the "Bamout"
            output directory through ``--bam-output``.
        forceactive: add ``--force-active --disable-optimizations``.
        intervals: not used by this function.
    """
    input_reference = os.path.abspath(args.reference)
    bam_output_dir = obtain_output_dir(args, "Bam")
    file_name = args.sample

    if recalibrate:
        bam_suffix, gvcf_folder = ".rg.markdup.sorted.bam", "GVCF_recal"
    else:
        bam_suffix, gvcf_folder = ".bqsr.bam", "GVCF"

    input_bam_to_call_name = file_name + bam_suffix
    gvcf_output_dir = obtain_output_dir(args, gvcf_folder)
    gvcf_output_file = file_name + ".g.vcf"

    check_create_dir(gvcf_output_dir)

    input_bam_to_call = os.path.join(bam_output_dir, input_bam_to_call_name)
    gvcf_output_full = os.path.join(gvcf_output_dir, gvcf_output_file)

    hc_args = [
        "gatk", "HaplotypeCaller",
        "--java-options", "-Xmx" + str(args.memory) + "g",
        "--reference", input_reference,
        "--input", input_bam_to_call,
        "--output", gvcf_output_full,
        "--emit-ref-confidence", "GVCF",
        "--annotation-group", "AS_StandardAnnotation",
        "--sample-ploidy", str(ploidy),
    ]

    if bamout:
        bamout_output_dir = obtain_output_dir(args, "Bamout")
        bamout_output_full = os.path.join(
            bamout_output_dir, file_name + ".p" + str(ploidy) + ".out.bam")
        check_create_dir(bamout_output_dir)
        hc_args += ["--bam-output", bamout_output_full]

    if forceactive:
        hc_args += ["--force-active", "--disable-optimizations"]

    execute_subprocess(hc_args)
def recalibrate_bam(args, tb=False):
    """Recalibrate base qualities with GATK BaseRecalibrator + ApplyBQSR.

    BaseRecalibrator
    https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_bqsr_BaseRecalibrator.php
    gatk BaseRecalibrator --input my_reads.bam --reference reference.fasta
        --known-sites sites_of_variation.vcf
        --known-sites another/optional/setOfSitesToMask.vcf
        --output recal_data.table

    ApplyBQSR
    https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_bqsr_ApplyBQSR.php
    gatk ApplyBQSR --reference reference.fasta --input input.bam
        --bqsr-recal-file recalibration.table --output output.bam

    Reads ``<sample>.rg.markdup.sorted.bam`` from the "Bam" output directory
    and writes ``<sample>.bqsr.bam`` next to it; the recalibration table
    ``<sample>.recall.table`` goes to the "VCF_recal" output directory. Every
    ``*.hf.pass.vcf`` found under "VCF_recal" is passed as ``--known-sites``.

    When the reference path contains "NC_000962.3", "h37rv" (any case) or
    "ancestor", ``tb`` is forced on and a bundled table from the
    ``reference`` folder next to this script is added as known sites.

    Args:
        args: namespace providing ``reference``, ``sample`` and ``memory``.
        tb: request the bundled known-sites table explicitly.

    Raises:
        ValueError: if ``tb`` is set but the reference matches none of the
            patterns above, so no bundled table can be selected.
    """
    input_reference = os.path.abspath(args.reference)

    # Automate M. tuberculosis reference for additional recalibration positions
    matches_h37rv = ("NC_000962.3" in input_reference) or (
        "h37rv" in input_reference.lower())
    matches_ancestor = "ancestor" in input_reference

    reference_file = None
    if matches_h37rv or matches_ancestor:
        tb = True
        script_dir = os.path.dirname(os.path.realpath(__file__))
        reference_dir = os.path.join(script_dir, "reference")
        if matches_h37rv:
            table_name = "190508_ddtb.NC_000962.3.BQSR.table"
        else:
            table_name = "190508_ddtb.BQSR.table"
        reference_file = os.path.join(reference_dir, table_name)

    # Previously this case crashed later on an undefined variable; report it
    # clearly before any command is run.
    if tb and reference_file is None:
        raise ValueError(
            "tb=True but no bundled BQSR table matches reference "
            + repr(input_reference))

    sample_name = args.sample
    bam_input_dir = obtain_output_dir(args, "Bam")
    vcf_input_dir = obtain_output_dir(args, "VCF_recal")

    bam_input_file = os.path.join(
        bam_input_dir, sample_name + ".rg.markdup.sorted.bam")
    table_output_file = os.path.join(
        vcf_input_dir, sample_name + ".recall.table")

    memory_param = "-Xmx" + str(args.memory) + "g"

    # BaseRecalibrator
    cmd_bqsr = [
        "gatk", "BaseRecalibrator",
        "--java-options", memory_param,
        "--reference", input_reference,
        "--input", bam_input_file,
        "--output", table_output_file,
    ]

    if tb:
        cmd_bqsr.extend(["--known-sites", reference_file])

    for root, _, files in os.walk(vcf_input_dir):
        for name in files:
            filename = os.path.join(root, name)
            if filename.endswith(".hf.pass.vcf"):
                cmd_bqsr.extend(["--known-sites", filename])

    execute_subprocess(cmd_bqsr)

    # ApplyBQSR
    bam_output_file = os.path.join(bam_input_dir, sample_name + ".bqsr.bam")

    cmd_apply = [
        "gatk", "ApplyBQSR",
        "--reference", input_reference,
        "--input", bam_input_file,
        "--bqsr-recal-file", table_output_file,
        "--output", bam_output_file,
    ]
    execute_subprocess(cmd_apply)