Beispiel #1
0
def GATKpreprocessing(infile, outfile):
    '''Reorders BAM according to reference fasta and add read groups using
       SAMtools, realigns around indels and recalibrates base quality scores
       using GATK'''

    to_cluster = USECLUSTER
    track = P.snip(os.path.basename(infile), ".bam")
    tmpdir_gatk = P.get_temp_dir()
    job_memory = PARAMS["gatk_memory"]

    genome = "%s/%s.fa" % (PARAMS["bwa_index_dir"], PARAMS["genome"])

    outfile1 = outfile.replace(".bqsr", ".readgroups.bqsr")
    outfile2 = outfile.replace(".bqsr", ".realign.bqsr")

    PipelineExome.GATKReadGroups(infile, outfile1, genome,
                                 PARAMS["readgroup_library"],
                                 PARAMS["readgroup_platform"],
                                 PARAMS["readgroup_platform_unit"])

    PipelineExome.GATKIndelRealign(outfile1, outfile2, genome,
                                   PARAMS["gatk_threads"])

    IOTools.zap_file(outfile1)

    PipelineExome.GATKBaseRecal(outfile2, outfile, genome,
                                PARAMS["gatk_dbsnp"],
                                PARAMS["gatk_solid_options"])
    IOTools.zap_file(outfile2)
Beispiel #2
0
def buildCoverageStats(infile, outfile):
    '''Generate coverage statistics for regions of interest from a
       bed file using Picard'''

    # TS check whether this is always required or specific to current baits
    # file

    # baits file requires modification to make picard accept it
    # this is performed before CalculateHsMetrics
    to_cluster = USECLUSTER
    baits = PARAMS["roi_baits"]
    modified_baits = infile + "_temp_baits_final.bed"
    regions = PARAMS["roi_regions"]
    statement = '''samtools view -H %(infile)s > %(infile)s_temp_header.txt;
                awk 'NR>2' %(baits)s |
                awk -F '\\t' 'BEGIN { OFS="\\t" } {print $1,$2,$3,"+",$4;}'
                > %(infile)s_temp_baits.bed;
                cat  %(infile)s_temp_header.txt %(infile)s_temp_baits.bed
                > %(modified_baits)s;
                rm -rf %(infile)s_temp_baits.bed %(infile)s_temp_header.txt
                '''
    P.run(statement)

    PipelineMappingQC.buildPicardCoverageStats(infile, outfile, modified_baits,
                                               modified_baits)

    IOTools.zap_file(modified_baits)
Beispiel #3
0
def realignMatchedSample(infile, outfile):
    ''' repeat realignments with merged bam of control and tumor
        this should help avoid problems with sample-specific realignments'''

    genome = "%s/%s.fa" % (PARAMS["bwa_index_dir"], PARAMS["genome"])

    PipelineExome.GATKIndelRealign(infile, outfile, genome)

    IOTools.zap_file(infile)
Beispiel #4
0
def mergeSampleBams(infile, outfile):
    '''merge control and tumor bams'''
    # Note: need to change readgroup headers for merge and subsequent
    # splitting of bam files
    to_cluster = USECLUSTER
    job_memory = PARAMS["gatk_memory"]

    tmpdir_gatk = P.get_temp_dir(shared=True)

    outfile_tumor = outfile.replace(PARAMS["sample_control"],
                                    PARAMS["sample_tumour"])
    infile_tumor = infile.replace(PARAMS["sample_control"],
                                  PARAMS["sample_tumour"])

    infile_base = os.path.basename(infile)
    infile_tumor_base = infile_base.replace(PARAMS["sample_control"],
                                            PARAMS["sample_tumour"])

    track = P.snip(os.path.basename(infile), ".bam")
    track_tumor = track.replace(PARAMS["sample_control"],
                                PARAMS["sample_tumour"])

    library = PARAMS["readgroup_library"]
    platform = PARAMS["readgroup_platform"]
    platform_unit = PARAMS["readgroup_platform_unit"]

    control_id = "Control.bam"
    tumor_id = control_id.replace(PARAMS["sample_control"],
                                  PARAMS["sample_tumour"])

    statement = '''picard AddOrReplaceReadGroups
                    INPUT=%(infile)s
                    OUTPUT=%(tmpdir_gatk)s/%(infile_base)s
                    RGLB=%(library)s RGPL=%(platform)s
                    RGPU=%(platform_unit)s RGSM=%(track)s
                    ID=%(track)s
                    VALIDATION_STRINGENCY=SILENT ;'''
    statement += '''picard AddOrReplaceReadGroups
                    INPUT=%(infile_tumor)s
                    OUTPUT=%(tmpdir_gatk)s/%(infile_tumor_base)s
                    RGLB=%(library)s RGPL=%(platform)s
                    RGPU=%(platform_unit)s RGSM=%(track_tumor)s
                    ID=%(track_tumor)s
                    VALIDATION_STRINGENCY=SILENT ;'''
    statement += '''samtools merge -rf
                    %(outfile)s
                    %(tmpdir_gatk)s/%(infile_base)s
                    %(tmpdir_gatk)s/%(infile_tumor_base)s;'''
    statement += "samtools index %(outfile)s; "
    statement += "rm -rf %(tmpdir_gatk)s ;"
    P.run(statement)
    IOTools.zap_file(infile)
    IOTools.zap_file(infile_tumor)
Beispiel #5
0
def splitMergedRealigned(infile, outfile):
    ''' split realignment file and truncate intermediate bams'''

    track = P.snip(os.path.basename(infile), ".realigned.bqsr.bam") + ".bqsr"
    track_tumor = track.replace(PARAMS["sample_control"],
                                PARAMS["sample_tumour"])
    outfile_tumor = outfile.replace(PARAMS["sample_control"],
                                    PARAMS["sample_tumour"])

    statement = '''samtools view -hb %(infile)s
                   -r %(track)s > %(outfile)s;
                   samtools view -hb %(infile)s
                   -r %(track_tumor)s > %(outfile_tumor)s;
                   samtools index %(outfile)s;
                   samtools index %(outfile_tumor)s;'''
    P.run(statement)
    IOTools.zap_file(infile)
Beispiel #6
0
def clean(files, logfile):
    '''clean up files given by glob expressions.

    Files are cleaned up by zapping, i.e. the files are set to size
    0. Links to files are replaced with place-holders.

    Information about the original file is written to `logfile`.

    Arguments
    ---------
    files : list
        List of glob expressions of files to clean up.
    logfile : string
        Filename of logfile.

    '''
    fields = ('st_atime', 'st_blksize', 'st_blocks', 'st_ctime', 'st_dev',
              'st_gid', 'st_ino', 'st_mode', 'st_mtime', 'st_nlink', 'st_rdev',
              'st_size', 'st_uid')

    dry_run = PARAMS.get("dryrun", False)

    if not dry_run:
        if not os.path.exists(logfile):
            outfile = IOTools.open_file(logfile, "w")
            outfile.write("filename\tzapped\tlinkdest\t%s\n" %
                          "\t".join(fields))
        else:
            outfile = IOTools.open_file(logfile, "a")

    c = E.Counter()
    for fn in files:
        c.files += 1
        if not dry_run:
            stat, linkdest = IOTools.zap_file(fn)
            if stat is not None:
                c.zapped += 1
                if linkdest is not None:
                    c.links += 1
                outfile.write(
                    "%s\t%s\t%s\t%s\n" %
                    (fn, time.asctime(time.localtime(time.time())), linkdest,
                     "\t".join([str(getattr(stat, x)) for x in fields])))

    get_logger().info("zapped: %s" % (c))
    outfile.close()

    return c