Example #1
def create_data_dir(args, fasta_path, bam_path):
    # Copy the reference FASTA and BAM into <output_dir>/data and index both
    # (samtools faidx / samtools index) so breseq's bam2aln can read them.
    # Assumes os, shutil, and the pipeline's samtools wrapper are imported at
    # module level.
    print("++Creating data directory for bam2aln processing.")
    data_dir = os.path.join(args.output_dir, "data")
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    reference_fasta_path = os.path.join(data_dir, "reference.fasta")
    if not os.path.exists(reference_fasta_path):
        shutil.copy2(fasta_path, reference_fasta_path)
        samtools.faidx(reference_fasta_path)

    reference_bam_path = os.path.join(data_dir, "reference.bam")
    if not os.path.exists(reference_bam_path):
        shutil.copy2(bam_path, reference_bam_path)
        samtools.index(reference_bam_path)
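A minimal usage sketch; it assumes args is an argparse-style namespace carrying an output_dir attribute, and the input file names are placeholders:

# Hypothetical invocation; the paths and output directory are illustrative only.
import argparse

args = argparse.Namespace(output_dir="run01")
create_data_dir(args, "genome.fasta", "reads.sorted.bam")
# run01/data/ now holds reference.fasta(.fai) and reference.bam(.bai) for bam2aln.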
Example #2
def do_gatk(args):
    # GATK branch of the pipeline: SSAHA2 alignment, Picard duplicate marking,
    # GATK indel realignment, UnifiedGenotyper calling, then filtering and
    # conversion of the VCF output to genome-diff (.gd) files.
    # Assumes module-level imports of os, a pickle module bound to p, and the
    # wrappers pipelines.common, picardtools, samtools, gatk, and breseq.command.
    fasta_path, sorted_bam_path = pipelines.common.ssaha2_alignment(args)


    # Step 3: GATK processing of the reference alignment (duplicate marking and
    # indel realignment).
    step_3_dir = os.path.join(args.output_dir, "03_gatk")
    step_3_file = os.path.join(step_3_dir, "gatk.done")
    realigned_bam_path = ""
    if not os.path.exists(step_3_file):
        print "++Gatk recalibration and realignment of reference alignment file."
        if not os.path.exists(step_3_dir):
            os.makedirs(step_3_dir)
            
        #Step: Picard: Mark Duplicates.
        dedup_bam_path = os.path.join(step_3_dir, "dedup.bam")
        dedup_metrics_path = os.path.join(step_3_dir, "dedup.metrics")
        picardtools.mark_duplicates(sorted_bam_path, dedup_bam_path, dedup_metrics_path)

        #Step: Samtools: Index BAM. ***Do after mark duplicates.
        index_done_file = os.path.join(step_3_dir, "index.done")
        if not os.path.exists(index_done_file):
            samtools.index(dedup_bam_path)
            open(index_done_file, 'w').close()

        #Step: Gatk Realign.
        #Gatk: Intervals.
        intervals_path = gatk.realigner_target_creator(fasta_path, dedup_bam_path)
        #Gatk: Indel Realign.
        realigned_bam_path = gatk.indel_realigner(fasta_path, dedup_bam_path, intervals_path)

        #Step: Gatk Recal. ***May not be able to do due to need for dbSNP file.
        #CountCovariates.
        #recal_csv_path = os.path.join(step_3_dir, "recal_data.csv")
        #gatk.count_covariates(fasta_path, realigned_bam_path, recal_csv_path)
        ##TableRecalibration.
        #recal_bam_path = os.path.join(step_3_dir, "recal.bam")
        #gatk.table_recalibration(fasta_path, realigned_bam_path, recal_csv_path, recal_bam_path)
        
        # Record the realigned BAM path so this step can be skipped on re-runs
        # (p is assumed to be pickle/cPickle imported at module level).
        with open(step_3_file, 'w') as done_file:
            p.dump(realigned_bam_path, done_file)
    else:
        with open(step_3_file, 'r') as done_file:
            realigned_bam_path = p.load(done_file)
        
    # Step 4: GATK output: call variants, filter, and convert the VCF to genome-diff format.
    output_dir = os.path.join(args.output_dir, "output")
    raw_vcf_path = os.path.join(output_dir, "output.vcf")
    gd_path = os.path.join(output_dir, "output.gd")
    print "++Filtering poor values for SNPs and INDELs in output and converting vcf files to gd."

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    if not os.path.exists(raw_vcf_path):
        gatk.unified_genotyper(fasta_path, realigned_bam_path, raw_vcf_path, args.glm_option)

    breseq.command.vcf2gd(raw_vcf_path, gd_path)
    
    #Gatk recommended filter values for SNPs and INDELs.
    snp_filters = ['"QD < 2.0"',
                   '"MQ < 40.0"',
                   '"FS > 60.0"',
                   '"HaplotypeScore > 13.0"',
                   '"MQRankSum < -12.5"',
                   '"ReadPosRankSum < -8.0"']

    indel_filters = ['"QD < 2.0"',
                     '"ReadPosRankSum < -20.0"',
                     '"InbreedingCoeff < -0.8"',
                     '"FS > 200.0"']

    snp_gd = os.path.join(output_dir, "SNP.gd")
    indels_gd = os.path.join(output_dir, "INDELS.gd")

    breseq.command.genome_diff_filter(snp_gd, gd_path, ["SNP"], snp_filters)
    breseq.command.genome_diff_filter(indels_gd, gd_path, ["INS", "DEL"], indel_filters)

    # Merge the filtered SNP and INDEL records back into gd_path, then apply the
    # allele-frequency filter (AF != 1.00) to all records.
    breseq.command.genome_diff_merge([snp_gd, indels_gd], gd_path)
    breseq.command.genome_diff_filter(gd_path, gd_path, ["ALL"], ['"AF!=1.00"'])


    pipelines.common.create_data_dir(args, fasta_path, realigned_bam_path)
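A minimal driver sketch; the flag names are hypothetical, and only output_dir and glm_option are taken from the function body above (the real pipeline likely expects additional read/reference arguments for ssaha2_alignment):

# Hypothetical command-line entry point; everything not read by do_gatk above is assumed.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="GATK branch of the pipeline")
    parser.add_argument("-o", "--output-dir", dest="output_dir", default="gatk_run")
    parser.add_argument("--glm", dest="glm_option", default="BOTH",
                        help="UnifiedGenotyper likelihood model: SNP, INDEL, or BOTH")
    do_gatk(parser.parse_args())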