# Standard-library imports used below. The pipeline-specific wrapper modules
# (samtools, picardtools, gatk, breseq.command, pipelines.common) are assumed
# to be imported at the top of this module; "p" is assumed to be pickle.
import os
import shutil
import pickle as p


def create_data_dir(args, fasta_path, bam_path):
    print "++Creating data directory for bam2aln processing."
    data_dir = os.path.join(args.output_dir, "data")
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    #Copy and index the reference FASTA if it is not already in place.
    reference_fasta_path = os.path.join(data_dir, "reference.fasta")
    if not os.path.exists(reference_fasta_path):
        shutil.copy2(fasta_path, reference_fasta_path)
        samtools.faidx(reference_fasta_path)

    #Copy and index the reference BAM if it is not already in place.
    reference_bam_path = os.path.join(data_dir, "reference.bam")
    if not os.path.exists(reference_bam_path):
        shutil.copy2(bam_path, reference_bam_path)
        samtools.index(reference_bam_path)
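
# The helper below is NOT part of the original pipeline; it is a minimal sketch
# showing the layout create_data_dir() is expected to leave behind (samtools
# faidx/index create the .fai and .bai companion files). The helper name and
# its use here are assumptions added for illustration.
def _data_dir_is_complete(output_dir):
    """Return True if the bam2aln data directory appears fully populated."""
    data_dir = os.path.join(output_dir, "data")
    expected = ["reference.fasta", "reference.fasta.fai",
                "reference.bam", "reference.bam.bai"]
    return all(os.path.exists(os.path.join(data_dir, name)) for name in expected)
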
def do_gatk(args):
    fasta_path, sorted_bam_path = pipelines.common.ssaha2_alignment(args)

    #Gatk
    #Step 3
    step_3_dir = os.path.join(args.output_dir, "03_gatk")
    step_3_file = os.path.join(step_3_dir, "gatk.done")
    realigned_bam_path = ""
    if not os.path.exists(step_3_file):
        print "++Gatk recalibration and realignment of reference alignment file."
        if not os.path.exists(step_3_dir):
            os.makedirs(step_3_dir)

        #Step: Picard: mark duplicates.
        dedup_bam_path = os.path.join(step_3_dir, "dedup.bam")
        dedup_metrics_path = os.path.join(step_3_dir, "dedup.metrics")
        picardtools.mark_duplicates(sorted_bam_path, dedup_bam_path, dedup_metrics_path)

        #Step: Samtools: index BAM. ***Must run after mark duplicates.
        index_done_file = os.path.join(step_3_dir, "index.done")
        if not os.path.exists(index_done_file):
            samtools.index(dedup_bam_path)
            open(index_done_file, 'w').close()

        #Step: Gatk realignment.
        #Gatk: create target intervals.
        intervals_path = gatk.realigner_target_creator(fasta_path, dedup_bam_path)
        #Gatk: realign around indels.
        realigned_bam_path = gatk.indel_realigner(fasta_path, dedup_bam_path, intervals_path)

        #Step: Gatk recalibration. ***May not be possible because a dbSNP file is needed.
        #CountCovariates.
        #recal_csv_path = os.path.join(step_3_dir, "recal_data.csv")
        #gatk.count_covariates(fasta_path, realigned_bam_path, recal_csv_path)
        ##TableRecalibration.
        #recal_bam_path = os.path.join(step_3_dir, "recal.bam")
        #gatk.table_recalibration(fasta_path, realigned_bam_path, recal_csv_path, recal_bam_path)

        #Record the realigned BAM path so this step can be skipped on reruns.
        with open(step_3_file, 'w') as done_file:
            p.dump(realigned_bam_path, done_file)
    else:
        with open(step_3_file, 'r') as done_file:
            realigned_bam_path = p.load(done_file)

    #Gatk output
    #Step 4
    output_dir = os.path.join(args.output_dir, "output")
    raw_vcf_path = os.path.join(output_dir, "output.vcf")
    gd_path = os.path.join(output_dir, "output.gd")
    print "++Filtering poor values for SNPs and INDELs in output and converting vcf files to gd."
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    if not os.path.exists(raw_vcf_path):
        gatk.unified_genotyper(fasta_path, realigned_bam_path, raw_vcf_path, args.glm_option)
        breseq.command.vcf2gd(raw_vcf_path, gd_path)

        #Gatk recommended filter values for SNPs and INDELs.
        snp_filters = ['"QD < 2.0"',
                       '"MQ < 40.0"',
                       '"FS > 60.0"',
                       '"HaplotypeScore > 13.0"',
                       '"MQRankSum < -12.5"',
                       '"ReadPosRankSum < -8.0"']
        indel_filters = ['"QD < 2.0"',
                         '"ReadPosRankSum < -20.0"',
                         '"InbreedingCoeff < -0.8"',
                         '"FS > 200.0"']

        snp_gd = os.path.join(output_dir, "SNP.gd")
        indels_gd = os.path.join(output_dir, "INDELS.gd")
        breseq.command.genome_diff_filter(snp_gd, gd_path, ["SNP"], snp_filters)
        breseq.command.genome_diff_filter(indels_gd, gd_path, ["INS", "DEL"], indel_filters)
        breseq.command.genome_diff_merge([snp_gd, indels_gd], gd_path)
        breseq.command.genome_diff_filter(gd_path, gd_path, ["ALL"], ['"AF!=1.00"'])

    pipelines.common.create_data_dir(args, fasta_path, realigned_bam_path)
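
# Minimal sketch (not part of the original pipeline) of how do_gatk() might be
# wired to a command line. Only the two attributes referenced above
# (output_dir and glm_option) are declared; the real args namespace presumably
# carries additional read/reference options consumed by
# pipelines.common.ssaha2_alignment(). Flag names and defaults here are
# illustrative assumptions.
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="Run the GATK realignment and variant-calling pipeline.")
    parser.add_argument("-o", "--output-dir", dest="output_dir", required=True,
                        help="Directory that will hold the 03_gatk, output, and data subdirectories.")
    parser.add_argument("--glm", dest="glm_option", default="BOTH",
                        help="Genotype likelihood model for the UnifiedGenotyper (SNP, INDEL, or BOTH).")
    do_gatk(parser.parse_args())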