import os import argparse from RouToolPa.Tools.Filter import FaCut #from RouToolPa.Tools.Filter import FastQC from RouToolPa.Routines import FileRoutines parser = argparse.ArgumentParser() parser.add_argument("-d", "--sample_directory", action="store", dest="samples_dir", required=True, type=lambda s: FileRoutines.check_path(os.path.abspath(s)), help="Directory with samples") parser.add_argument( "-s", "--samples", action="store", dest="samples", help="Comma-separated list of subdirectories(one per sample) to handle. " "If not set all subdirectories will be considered as containing samples." "In sample directory should one(in case SE reads) or two(in case PE reads) files." "Filenames should should contain '_1.fq' or '_1.fastq' for forward(left) reads, " " '_2.fq' or '_2.fastq' for reverse(right) reads and '.fq' or '.fastq' for SE reads" ) parser.add_argument( "-o", "--output_dir",
dest="indel_ReadPosRankSum", type=float, default=-20.0, help="Indel ReadPosRankSum threshold. Default - -20.0") #parser.add_argument("--indel_InbreedingCoeff", action="store", dest="indel_InbreedingCoeff", type=float, default=-0.8, # help="Indel InbreedingCoeff threshold. Default - -0.8") parser.add_argument("--indel_FS", action="store", dest="indel_FS", type=float, default=200.0, help="Indel FS threshold. Default - 200.0") args = parser.parse_args() VariantFiltration.jar_path = FileRoutines.check_path(args.gatk_dir) VariantFiltration.filter_bad_variants( args.reference, args.input_vcf, args.output_prefix, snp_filter_name=args.snp_filter_name, snp_QD=args.snp_QD, snp_FS=args.snp_FS, snp_MQ=args.snp_MQ, #snp_HaplotypeScore=args.snp_HaplotypeScore, snp_MappingQualityRankSum=args.snp_MappingQualityRankSum, snp_ReadPosRankSum=args.snp_ReadPosRankSum, indel_filter_name=args.indel_filter_name, indel_QD=args.indel_QD, indel_ReadPosRankSum=args.indel_ReadPosRankSum,
white_list = IdList() if args.black_list_file: black_list.read(args.black_list_file) if args.white_list_file: white_list.read(args.white_list_file) out_fd = open(args.cafe_file, "w") filtered_fd = open("%sfiltered_families.cafe" % args.filtered_family_dir, "w") out_fd.write("FAMILYDESC\tFAMILY\t%s\n" % ("\t".join(species_list))) filtered_fd.write("FAMILYDESC\tFAMILY\t%s\n" % ("\t".join(species_list))) species_filtered_fd_list = OrderedDict() fam_count_dict = TwoLvlDict() species_family_dict = TwoLvlDict() for species in args.species_set: species_family_dict[species] = SynDict() species_family_dict[species].read( "%s%s%s" % (FileRoutines.check_path(args.input), species, args.suffix), split_values=True, values_separator=",", separator="\t") #print species_family_dict[species] fam_count_dict[species] = species_family_dict[species].count_synonyms() #print fam_count_dict[species] species_filtered_fd_list[species] = open( "%s%s.fam" % (args.filtered_family_dir, species), "w") for family in fam_count_dict.sl_keys(): genes_number_list = [] number_of_species = 0 for species in species_list: genes_number_list.append(fam_count_dict[species][family] if family in fam_count_dict[species] else 0)
action="store", dest="max_memory_per_thread", default="1G", help="Maximum memory per thread. Default - 1G") args = parser.parse_args() if args.prepare_bam and ((not args.prepared_bam_prefix) or (not args.temp_dir)): raise ValueError( "Options -e/--prepared_bam_prefix and -m/--temp_dir must be set if -p/--prepare_bam option is used" ) SamtoolsV1.threads = args.threads if args.prepare_bam or args.mix_ends: FileRoutines.safe_mkdir(FileRoutines.check_path(args.temp_dir)) prepared_pe_bam_file = "%s.bam" % args.prepared_bam_prefix prepared_unpaired_bam_file = ( "%s.unpaired.bam" % args.prepared_bam_prefix) if args.mix_ends else None """ SamtoolsV1.prepare_bam_for_read_extraction(args.input, args.prepared_bam, temp_file_prefix=args.temp_dir, max_memory_per_thread=args.max_memory_per_thread) """ SamtoolsV1.prepare_bam_for_read_extraction( args.input, prepared_pe_bam_file, temp_file_prefix=args.temp_dir, max_memory_per_thread=args.max_memory_per_thread, bam_file_to_write_unpaired_reads=prepared_unpaired_bam_file) if args.paired:
def snp_call_GATK(alignment, sample_name, reference_file, known_sites_vcf, stand_emit_conf=40, stand_call_conf=100, QD=2.0, FS=60.0, MQ=40.0, HaplotypeScore=13.0, MappingQualityRankSum=-12.5, ReadPosRankSum=-8.0, GATK_dir="", num_of_threads=5, skip_base_recalibration=False):
    """Run a GATK command-line SNP-calling workflow on one alignment.

    Every step is launched with ``os.system`` and uses ``GenomeAnalysisTK.jar``
    found under ``GATK_dir``. Unless ``skip_base_recalibration`` is true,
    BaseRecalibrator is run twice (the second time with ``-BQSR``) and
    PrintReads writes the recalibrated reads to ``<alignment>_recal_reads.bam``.
    UnifiedGenotyper then calls variants, SelectVariants splits them into a SNP
    file and an INDEL file, and VariantFiltration tags SNPs that match the
    filter expression with the filter name 'ambigious_snp'.

    Files written (all prefixed with ``sample_name``): ``_recal_data.table``,
    ``_post_recal_data.table``, ``_GATK_raw.vcf``, ``_GATK_raw_no_indel.vcf``,
    ``_GATK_raw_only_indel.vcf`` and ``_GATK_filtered_snps.vcf``.

    Parameters:
        alignment: alignment file passed to GATK as ``-I``.
        sample_name: prefix for every output file name.
        reference_file: reference passed to GATK as ``-R``.
        known_sites_vcf: VCF passed to BaseRecalibrator as ``-knownSites``.
        stand_emit_conf, stand_call_conf: values for the UnifiedGenotyper
            options of the same names; formatted with ``%i``.
        QD, FS, MQ, HaplotypeScore, MappingQualityRankSum, ReadPosRankSum:
            thresholds substituted into the SNP filter expression (``%f``).
        GATK_dir: location of the jar; passed through
            ``FileRoutines.check_path`` and prepended directly to
            ``GenomeAnalysisTK.jar``.
        num_of_threads: value for ``-nct`` (BaseRecalibrator, PrintReads) and
            ``-nt`` (UnifiedGenotyper).
        skip_base_recalibration: when true, the recalibration steps are not
            run and UnifiedGenotyper reads ``alignment`` directly.
    """
    # NOTE(review): the exit status of every os.system call is ignored, so a
    # failed step does not stop the steps after it. Arguments are also
    # interpolated into shell strings without quoting - paths containing
    # spaces or shell metacharacters will break the commands.

    #default filter expression
    #"QD < 2.0 || FS > 60.0 || MQ < 40.0 || HaplotypeScore > 13.0 || MappingQualityRankSum < -12.5 || ReadPosRankSum < -8.0"

    # presumably check_path normalises the directory so that the jar name can
    # be appended directly - TODO confirm against FileRoutines
    gatk_dir = FileRoutines.check_path(GATK_dir)

    # Input for the genotyper: the raw alignment, unless recalibration below
    # replaces it with the recalibrated BAM.
    intermediate_alignment = alignment
    if not skip_base_recalibration:
        intermediate_alignment = alignment + "_recal_reads.bam"
        #Analyze patterns of covariation in the sequence dataset
        os.system("java -jar %sGenomeAnalysisTK.jar -nct %i -T BaseRecalibrator -R %s -I %s -knownSites %s -o %s_recal_data.table" % (gatk_dir, num_of_threads, reference_file, alignment, known_sites_vcf, sample_name))
        #Do a second pass to analyze covariation remaining after recalibration
        os.system("java -jar %sGenomeAnalysisTK.jar -nct %i -T BaseRecalibrator -R %s -I %s -knownSites %s -BQSR %s_recal_data.table -o %s_post_recal_data.table" % (gatk_dir, num_of_threads, reference_file, alignment, known_sites_vcf, sample_name, sample_name))
        #Generate before/after plots
        #os.system("java -jar %sGenomeAnalysisTK.jar -T AnalyzeCovariates -R %s -before %s_recal_data.table -after %s_post_recal_data.table -plots %s_recalibration_plots.pdf"
        #          % (gatk_dir, reference_file, sample_name, sample_name, sample_name))
        #Apply the recalibration to your sequence data
        os.system("java -jar %sGenomeAnalysisTK.jar -nct %i -T PrintReads -R %s -I %s -BQSR %s_recal_data.table -o %s" % (gatk_dir, num_of_threads, reference_file, alignment, sample_name, intermediate_alignment))

    print("\nSNP call...\n")
    #SNP call
    # NOTE(review): the confidence thresholds are formatted with %i, so a
    # non-integer value would be truncated in the command line.
    os.system(" java -jar %sGenomeAnalysisTK.jar -nt %i -l INFO -R %s -T UnifiedGenotyper -I %s -stand_call_conf %i -stand_emit_conf %i -o %s_GATK_raw.vcf --output_mode EMIT_VARIANTS_ONLY" % (gatk_dir, num_of_threads, reference_file, intermediate_alignment, stand_call_conf, stand_emit_conf, sample_name))
    #extract SNP
    os.system("java -jar %sGenomeAnalysisTK.jar -T SelectVariants -R %s -V %s_GATK_raw.vcf -selectType SNP -o %s_GATK_raw_no_indel.vcf" % (gatk_dir, reference_file, sample_name, sample_name))
    #extract indels
    os.system("java -jar %sGenomeAnalysisTK.jar -T SelectVariants -R %s -V %s_GATK_raw.vcf -selectType INDEL -o %s_GATK_raw_only_indel.vcf" % (gatk_dir, reference_file, sample_name, sample_name))

    #filtering
    print("\nFiltering SNP...\n")
    # Only the SNP file is filtered here; the expression is single-quoted so
    # that the shell passes it to GATK as one argument.
    os.system("java -jar %sGenomeAnalysisTK.jar -T VariantFiltration -R %s -V %s_GATK_raw_no_indel.vcf --filterExpression 'QD < %f || FS > %f || MQ < %f || HaplotypeScore > %f || MappingQualityRankSum < %f || ReadPosRankSum < %f' --filterName 'ambigious_snp' -o %s_GATK_filtered_snps.vcf " % (gatk_dir, reference_file, sample_name, QD, FS, MQ, HaplotypeScore, MappingQualityRankSum, ReadPosRankSum, sample_name))
    #os.system("vcftools --vcf %s_GATK_filtered_snps.vcf --remove-filtered-all --out %s_GATK_best_snps.vcf --recode --recode-INFO-all"
    #          % (sample_name, sample_name ))
    """
#!/usr/bin/env python
__author__ = 'Sergei F. Kliver'

import argparse

from RouToolPa.Tools.GATK import SelectVariants
from RouToolPa.Routines import FileRoutines

# Command-line interface. The three mandatory file options share the same
# shape, so they are registered from a table; the order here is the order
# in which they are added to the parser.
parser = argparse.ArgumentParser()
for short_flag, long_flag, destination, description in (
        ("-i", "--input_vcf", "input_vcf", "Input vcf file"),
        ("-o", "--output_vcf", "output_vcf", "Output vcf file"),
        ("-r", "--reference", "reference", "Fasta with reference genome"),
):
    parser.add_argument(short_flag, long_flag, action="store",
                        dest=destination, required=True, help=description)

# Optional location of the GATK jar; defaults to an empty string.
parser.add_argument("-g", "--gatk_directory", action="store", dest="gatk_dir",
                    default="", help="Directory with GATK jar")

args = parser.parse_args()

# Point the SelectVariants wrapper at the jar location, then run it on the
# requested reference / input / output files.
SelectVariants.jar_path = FileRoutines.check_path(args.gatk_dir)
SelectVariants.remove_entries_with_filters(args.reference,
                                           args.input_vcf,
                                           args.output_vcf)