def variant_call(self, alignment, reference_file, stand_emit_conf=40, stand_call_conf=100, GATK_dir="",
                 num_of_threads=5, output_mode="EMIT_VARIANTS_ONLY", discovery_mode="BOTH",
                 output_file="GATK_raw.vcf", default_base_qualities=None):
    """Call variants with GATK UnifiedGenotyper by running one shell command.

    Manual:
    http://www.broadinstitute.org/gatk/gatkdocs/org_broadinstitute_gatk_tools_walkers_genotyper_UnifiedGenotyper.html

    Parameters (with the command line option each one fills):
        alignment               input alignment (-I)
        reference_file          reference sequence (-R)
        stand_emit_conf         integer passed to -stand_emit_conf
        stand_call_conf         integer passed to -stand_call_conf
        GATK_dir                directory holding GenomeAnalysisTK.jar; passed through check_path
        num_of_threads          integer passed to -nt
        output_mode             --output_mode, one of:
                                    EMIT_VARIANTS_ONLY
                                    EMIT_ALL_CONFIDENT_SITES
                                    EMIT_ALL_SITES
        discovery_mode          -glm, one of:
                                    SNP
                                    INDEL
                                    GENERALPLOIDYSNP
                                    GENERALPLOIDYINDEL
                                    BOTH
        output_file             resulting VCF (-o)
        default_base_qualities  integer for --defaultBaseQualities; the option is
                                omitted only when this is None

    Returns None; the exit status of the command is not inspected.
    """
    default_qualities = ""
    # Compare against None explicitly: a plain truthiness test would silently drop
    # an explicitly requested quality of 0.
    if default_base_qualities is not None:
        default_qualities = "--defaultBaseQualities %i" % default_base_qualities

    # check_path is defined elsewhere in this file; presumably it returns the directory
    # in a form that can be prepended directly to the jar name - TODO confirm.
    gatk_dir = check_path(GATK_dir)

    os.system(
        " java -jar %sGenomeAnalysisTK.jar -nt %i -l INFO -R %s -T UnifiedGenotyper -I %s -stand_call_conf %i -stand_emit_conf %i -o %s --output_mode %s -glm %s %s"
        % (gatk_dir, num_of_threads, reference_file, alignment, stand_call_conf, stand_emit_conf,
           output_file, output_mode, discovery_mode, default_qualities))
def RepeatModeler_search(query_file, db_name, output_file="run.out", num_of_threads=5, RepeatModeler_dir=""):
    """Build a RepeatModeler database from query_file and run RepeatModeler on it.

    Two shell commands are issued one after the other:
        BuildDatabase -engine ncbi -name <db_name> <query_file>
        RepeatModeler -engine ncbi -pa <num_of_threads> -database <db_name> > <output_file>

    RepeatModeler_dir is passed through check_path and prepended to both executables.
    Returns None; exit statuses are not inspected.
    """
    print("\nRepeatModeler search...\n")
    tool_prefix = check_path(RepeatModeler_dir)

    # step 1: create the database the modeler will work on
    build_database_cmd = tool_prefix + "BuildDatabase -engine ncbi -name %s %s" % (db_name, query_file)
    os.system(build_database_cmd)

    # step 2: run the modeler, redirecting its standard output to output_file
    run_modeler_cmd = tool_prefix + "RepeatModeler -engine ncbi -pa %i -database %s > %s" % (num_of_threads,
                                                                                           db_name,
                                                                                           output_file)
    os.system(run_modeler_cmd)
def TRF_search(query_file, match=2, mismatch=7, delta=7, PM=80, PI=10, minscore=50, max_period=500, flanked=False,
               TRF_dir=""):
    """Run the trf executable on query_file through the shell.

    trf usage: trf File Match Mismatch Delta PM PI Minscore MaxPeriod [options]
    Where (all weights, penalties, and scores are positive):
        File      = sequences input file             (query_file)
        Match     = matching weight                  (match)
        Mismatch  = mismatching penalty              (mismatch)
        Delta     = indel penalty                    (delta)
        PM        = match probability (whole number)
        PI        = indel probability (whole number)
        Minscore  = minimum alignment score to report (minscore)
        MaxPeriod = maximum period size to report     (max_period)
        [options] = one or more of the following:
            -m  masked sequence file
            -f  flanking sequence
            -d  data file
            -h  suppress HTML output
    Recommended options: trf yoursequence.txt 2 7 7 80 10 50 500 -f -d -m

    -d and -m are always passed; -f is added only when flanked is true.
    TRF_dir is passed through check_path and prepended to the executable name.
    Returns None; the exit status is not inspected.
    """
    print("\nTRF search...\n")
    flank_option = "-f" if flanked else ""
    executable_prefix = check_path(TRF_dir)
    trf_arguments = "trf %s %i %i %i %i %i %i %i %s -d -m" % (query_file, match, mismatch, delta, PM, PI,
                                                              minscore, max_period, flank_option)
    os.system(executable_prefix + trf_arguments)
def add_header2bam(input_bam, output_bam, RGID, RGLB, RGPL, RGSM, RGPU, PICARD_dir=""):
    """Run Picard AddOrReplaceReadGroups on input_bam, writing output_bam.

    The RGID, RGLB, RGPL, RGSM and RGPU arguments are forwarded to the Picard options
    of the same names. The command also sets SORT_ORDER=coordinate and CREATE_INDEX=True
    and starts java with -XX:MaxDirectMemorySize=4G.

    PICARD_dir is passed through check_path and prepended to the jar name.
    Returns None; the exit status is not inspected.
    """
    jar_prefix = check_path(PICARD_dir)
    command_template = "java -XX:MaxDirectMemorySize=4G -jar %sAddOrReplaceReadGroups.jar I= %s O= %s SORT_ORDER=coordinate RGID=%s RGLB=%s RGPL=%s RGSM=%s RGPU=%s CREATE_INDEX=True"
    template_values = (jar_prefix, input_bam, output_bam, RGID, RGLB, RGPL, RGSM, RGPU)
    os.system(command_template % template_values)
def RepeatMasker_search(query_file, species, custom_lib_path=None, RepeatMasker_dir="", num_of_threads=5,
                        search_type="-s"):
    """Run RepeatMasker on query_file through the shell.

    Parameters:
        query_file        sequence file handed to RepeatMasker
        species           value for -species; see the list of possible species in
                          repeatmasker.help coming with RepeatMasker
        custom_lib_path   optional path to a custom library; when given it is passed as -lib
        RepeatMasker_dir  directory with the RepeatMasker executable; passed through check_path
        num_of_threads    integer passed to -pa
        search_type       "-s" (sensitive), "" (default), "-q" (fast), "-qq" (very fast)

    Options that are always passed:
        -xm     creates an additional output file in cross_match format (for parsing)
        -ace    creates an additional output file in ACeDB format
        -gff    creates an additional Gene Feature Finding format
        -excln  the percentages displayed in the .tbl file are calculated using a
                total sequence length excluding runs of 25 Ns or more

    Returns None; the exit status is not inspected.
    """
    repmask_dir = check_path(RepeatMasker_dir)

    custom_lib = ""
    if custom_lib_path:
        # The original assigned to a misspelled name here, so -lib never reached the command.
        # NOTE(review): -lib is now passed together with -species; confirm that RepeatMasker
        # accepts this combination.
        custom_lib = "-lib %s" % custom_lib_path

    print("\nRepeatMasker search...\n")
    os.system(repmask_dir + "RepeatMasker -excln -xm -ace -gff %s -pa %i -species %s %s %s"
              % (custom_lib, num_of_threads, species, search_type, query_file))
# Tail of a read-pair lookup helper. Its `def` line and the origin of dir_list and
# sample_name are outside this view.
# NOTE(review): original indentation was lost; the `return` is assumed to follow the loop.
for filename in dir_list:
    # a name containing both "_R1" and the sample name is taken as the left reads file
    if ("_R1" in filename) and (sample_name in filename):
        left_reads = filename
    # a name containing both "_R2" and the sample name is taken as the right reads file
    if ("_R2" in filename) and (sample_name in filename):
        right_reads = filename
return left_reads, right_reads

# Reference locations: the fasta, dict and index paths are all derived from
# reference_dir + reference_name.
reference_name = "LAN210_v0.10m"
reference_dir = "/home/mahajrod/genetics/desaminases/data/LAN210_v0.10m/"
reference = "%s%s.fasta" % (reference_dir, reference_name)
reference_dict = "%s%s.dict" % (reference_dir, reference_name)
reference_index = "%s%s" % (reference_dir, reference_name)

# Machine-specific working directory and tool location (hard-coded absolute paths).
workdir = "/media/mahajrod/d9e6e5ee-1bf7-4dba-934e-3f898d9611c8/Data/LAN2xx/all"
gatk_dir = check_path("/home/mahajrod/Repositories/genetic/NGS_tools/GenomeAnalysisTK-3.2-0/")
platform = "illumina"
read_subdir = "trimmed/spades/corrected/"

# Sample list selection: exactly one assignment is active, the others are kept as alternatives.
#samples_file = "/media/mahajrod/d9e6e5ee-1bf7-4dba-934e-3f898d9611c8/Data/LAN2xx/samples_nonwt.t"
#samples_file = "/media/mahajrod/d9e6e5ee-1bf7-4dba-934e-3f898d9611c8/Data/LAN2xx/samples_wt.t"
#samples_file = "/media/mahajrod/d9e6e5ee-1bf7-4dba-934e-3f898d9611c8/Data/LAN2xx/samples_nonHAP.t"
samples_file = "/media/mahajrod/d9e6e5ee-1bf7-4dba-934e-3f898d9611c8/Data/LAN2xx/samples_HAP.t"
alignment_dir = "alignment_LAN210_v0.10m"

# Prepare the reference inside its own directory: sequence dictionary, bed files
# (chromosomes.bed and mt.bed) and a samtools faidx index.
os.chdir(reference_dir)
make_fasta_dict(reference, reference_dict, PICARD_dir="/home/mahajrod/Repositories/genetic/NGS_tools/picard-tools-1.115/picard-tools-1.115/")
get_chromosomes_bed(reference, reference_index, mitochondrial_region_name="mt", chrom_out_file="chromosomes.bed", mito_out_file="mt.bed", reference_filetype="fasta")
os.system("samtools faidx %s" % reference)
def snp_call_GATK(alignment, sample_name, reference_file, known_sites_vcf, stand_emit_conf=40, stand_call_conf=100, QD=2.0, FS=60.0, MQ=40.0, HaplotypeScore=13.0, MappingQualityRankSum=-12.5, ReadPosRankSum=-8.0, GATK_dir="", num_of_threads=5, skip_base_recalibration=False):
    # Runs a sequence of GATK shell commands for one sample: optional base quality
    # recalibration, UnifiedGenotyper variant calling, splitting of the raw calls into
    # SNPs and indels, and hard filtering of the SNPs.
    #
    # Files written (all prefixed with sample_name unless noted):
    #   <alignment>_recal_reads.bam       recalibrated reads (only when recalibration runs)
    #   _recal_data.table                 first BaseRecalibrator pass
    #   _post_recal_data.table            second BaseRecalibrator pass
    #   _GATK_raw.vcf                     raw UnifiedGenotyper calls
    #   _GATK_raw_no_indel.vcf            SNPs selected from the raw calls
    #   _GATK_raw_only_indel.vcf          indels selected from the raw calls
    #   _GATK_filtered_snps.vcf           SNPs after VariantFiltration
    #
    # QD, FS, MQ, HaplotypeScore, MappingQualityRankSum and ReadPosRankSum are the
    # thresholds substituted into the filter expression; records matching it get the
    # filter name 'ambigious_snp'. Exit statuses of the commands are not inspected.

    #default filter expression
    #"QD < 2.0 || FS > 60.0 || MQ < 40.0 || HaplotypeScore > 13.0 || MappingQualityRankSum < -12.5 || ReadPosRankSum < -8.0"
    gatk_dir = check_path(GATK_dir)
    # without recalibration the variant caller reads the input alignment directly
    intermediate_alignment = alignment
    # NOTE(review): original indentation was lost; the three recalibration commands are
    # assumed to belong to this branch - confirm against the original file.
    if not skip_base_recalibration:
        intermediate_alignment = alignment + "_recal_reads.bam"
        #Analyze patterns of covariation in the sequence dataset
        os.system(
            "java -jar %sGenomeAnalysisTK.jar -nct %i -T BaseRecalibrator -R %s -I %s -knownSites %s -o %s_recal_data.table"
            % (gatk_dir, num_of_threads, reference_file, alignment, known_sites_vcf, sample_name))
        #Do a second pass to analyze covariation remaining after recalibration
        os.system(
            "java -jar %sGenomeAnalysisTK.jar -nct %i -T BaseRecalibrator -R %s -I %s -knownSites %s -BQSR %s_recal_data.table -o %s_post_recal_data.table"
            % (gatk_dir, num_of_threads, reference_file, alignment, known_sites_vcf, sample_name, sample_name))
        #Generate before/after plots
        #os.system("java -jar %sGenomeAnalysisTK.jar -T AnalyzeCovariates -R %s -before %s_recal_data.table -after %s_post_recal_data.table -plots %s_recalibration_plots.pdf"
        #          % (gatk_dir, reference_file, sample_name, sample_name, sample_name))
        #Apply the recalibration to your sequence data
        os.system(
            "java -jar %sGenomeAnalysisTK.jar -nct %i -T PrintReads -R %s -I %s -BQSR %s_recal_data.table -o %s"
            % (gatk_dir, num_of_threads, reference_file, alignment, sample_name, intermediate_alignment))

    print("\nSNP call...\n")
    #SNP call
    os.system(
        " java -jar %sGenomeAnalysisTK.jar -nt %i -l INFO -R %s -T UnifiedGenotyper -I %s -stand_call_conf %i -stand_emit_conf %i -o %s_GATK_raw.vcf --output_mode EMIT_VARIANTS_ONLY"
        % (gatk_dir, num_of_threads, reference_file, intermediate_alignment, stand_call_conf,
           stand_emit_conf, sample_name))
    #extract SNP
    os.system(
        "java -jar %sGenomeAnalysisTK.jar -T SelectVariants -R %s -V %s_GATK_raw.vcf -selectType SNP -o %s_GATK_raw_no_indel.vcf"
        % (gatk_dir, reference_file, sample_name, sample_name))
    #extract indels
    os.system(
        "java -jar %sGenomeAnalysisTK.jar -T SelectVariants -R %s -V %s_GATK_raw.vcf -selectType INDEL -o %s_GATK_raw_only_indel.vcf"
        % (gatk_dir, reference_file, sample_name, sample_name))
    #filtering
    print("\nFiltering SNP...\n")
    os.system(
        "java -jar %sGenomeAnalysisTK.jar -T VariantFiltration -R %s -V %s_GATK_raw_no_indel.vcf --filterExpression 'QD < %f || FS > %f || MQ < %f || HaplotypeScore > %f || MappingQualityRankSum < %f || ReadPosRankSum < %f' --filterName 'ambigious_snp' -o %s_GATK_filtered_snps.vcf "
        % (gatk_dir, reference_file, sample_name, QD, FS, MQ, HaplotypeScore, MappingQualityRankSum, ReadPosRankSum, sample_name))
    #os.system("vcftools --vcf %s_GATK_filtered_snps.vcf --remove-filtered-all --out %s_GATK_best_snps.vcf --recode --recode-INFO-all"
    #          % (sample_name, sample_name ))
# NOTE(review): the triple-quote delimiter below is unmatched within this view; it is
# kept exactly as found because its partner lies outside the visible source.
"""
def windowmasker_search(windowmasker_dir):
    """Placeholder for a windowmasker wrapper; no search is performed yet.

    For now the function only passes windowmasker_dir through check_path and returns None.
    """
    #TODO: write this function
    check_path(windowmasker_dir)
    return None
def rmout2gff3(rmoutfile, outfile, RepeatMaskerUtils_dir=""):
    """Run rmOutToGFF3.pl on rmoutfile and redirect its standard output to outfile.

    RepeatMaskerUtils_dir is passed through check_path and prepended to the script name.
    Returns None; the exit status is not inspected.
    """
    script_prefix = check_path(RepeatMaskerUtils_dir)
    conversion_call = "rmOutToGFF3.pl %s > %s" % (rmoutfile, outfile)
    os.system(script_prefix + conversion_call)
def extract_repbase(species, output_file="RepBase.fasta", RepeatMaskerUtils_dir=""):
    """Run queryRepeatDatabase.pl for species and redirect its standard output to output_file.

    RepeatMaskerUtils_dir is passed through check_path and prepended to the script name.
    Returns None; the exit status is not inspected.
    """
    banner = "\nExtracting RepBase for %s\n" % species
    print(banner)
    script_prefix = check_path(RepeatMaskerUtils_dir)
    query_call = "queryRepeatDatabase.pl -species %s > %s" % (species, output_file)
    os.system(script_prefix + query_call)
def make_fasta_dict(fasta_file, dict_name, PICARD_dir=""):
    """Run Picard CreateSequenceDictionary with R= fasta_file and O= dict_name.

    PICARD_dir is passed through check_path and prepended to the jar name.
    Returns None; the exit status is not inspected.
    """
    jar_prefix = check_path(PICARD_dir)
    command_template = "java -jar %sCreateSequenceDictionary.jar R= %s O= %s"
    os.system(command_template % (jar_prefix, fasta_file, dict_name))