예제 #1
0
def buildBAMforPeakCalling(infiles, outfile, dedup, mask):
    '''Build a BAM file suitable for peak calling.

    A chain of shell commands is assembled and handed to the pipeline
    runner:

    1. If `infiles` is a non-string collection with more than one
       entry, the files are merged and then sorted with samtools.
    2. If `dedup` is true, duplicate reads are removed with Picard's
       MarkDuplicates. Metrics go to ``<outfile>.picardmetrics`` and
       stdout to ``<outfile>.picardlog``.
    3. If `mask` is given, reads overlapping the mask are dropped
       with bedtools (``intersectBed -v``).
    4. The result is moved to `outfile` and indexed with samtools.

    Note that none of the commands built here filters unmapped reads.

    Arguments
    ---------
    infiles : str or sequence of str
        A single BAM filename or several BAM filenames to merge.
    outfile : str
        Filename of the final BAM file.
    dedup : bool
        Remove duplicate reads if true.
    mask : str or None
        Interpolated into ``intersectBed -b``, so presumably a filename
        of intervals to exclude -- TODO confirm against callers.
    '''

    # NOTE: `statement`, `outfile` and `mask` must stay local names.
    # P.run() is called without arguments while the command still
    # contains %(outfile)s / %(mask)s placeholders, so it presumably
    # resolves them from this function's local variables -- confirm
    # against P.run().
    statement = []

    if not isinstance(infiles, str) and len(infiles) > 1:
        # assume: samtools merge output is sorted
        # assume: sam files are sorted already
        # Join with str.join: lists have no .join() method.
        statement.append('''samtools merge @OUT@ %s''' % " ".join(infiles))
        statement.append('''samtools sort @IN@ @OUT@''')

    if dedup:
        statement.append('''MarkDuplicates
        INPUT=@IN@
        ASSUME_SORTED=true
        REMOVE_DUPLICATES=true
        QUIET=true
        OUTPUT=@OUT@
        METRICS_FILE=%(outfile)s.picardmetrics
        VALIDATION_STRINGENCY=SILENT
        > %(outfile)s.picardlog ''')

    if mask:
        statement.append(
            '''intersectBed -abam @IN@ -b %(mask)s -wa -v > @OUT@''')

    statement.append('''mv @IN@ %(outfile)s''')
    statement.append('''samtools index %(outfile)s''')

    # NOTE(review): in the multi-file case `infiles` is a list and is
    # passed on unchanged as the initial input, although the merge
    # command has no @IN@ placeholder -- confirm P.joinStatements
    # accepts this.
    statement = P.joinStatements(statement, infiles)
    P.run()
예제 #2
0
def buildBAMforPeakCalling(infiles, outfile, dedup, mask):
    '''Build a BAM file suitable for peak calling.

    A chain of shell commands is assembled and handed to the pipeline
    runner:

    1. If `infiles` is a non-string collection with more than one
       entry, the files are merged and then sorted with samtools.
    2. If `dedup` is true, duplicate reads are removed with Picard's
       MarkDuplicates. Metrics go to ``<outfile>.picardmetrics`` and
       stdout to ``<outfile>.picardlog``.
    3. If `mask` is given, reads overlapping the mask are dropped
       with bedtools (``intersectBed -v``).
    4. The result is moved to `outfile` and indexed with samtools.

    Note that none of the commands built here filters unmapped reads.

    Arguments
    ---------
    infiles : str or sequence of str
        A single BAM filename or several BAM filenames to merge.
    outfile : str
        Filename of the final BAM file.
    dedup : bool
        Remove duplicate reads if true.
    mask : str or None
        Interpolated into ``intersectBed -b``, so presumably a filename
        of intervals to exclude -- TODO confirm against callers.
    '''

    # NOTE: `statement`, `outfile` and `mask` must stay local names.
    # P.run() is called without arguments while the command still
    # contains %(outfile)s / %(mask)s placeholders, so it presumably
    # resolves them from this function's local variables -- confirm
    # against P.run().
    statement = []

    if not isinstance(infiles, str) and len(infiles) > 1:
        # assume: samtools merge output is sorted
        # assume: sam files are sorted already
        # Join with str.join: lists have no .join() method.
        statement.append('''samtools merge @OUT@ %s''' % " ".join(infiles))
        statement.append('''samtools sort @IN@ @OUT@''')

    if dedup:
        statement.append('''MarkDuplicates
        INPUT=@IN@
        ASSUME_SORTED=true
        REMOVE_DUPLICATES=true
        QUIET=true
        OUTPUT=@OUT@
        METRICS_FILE=%(outfile)s.picardmetrics
        VALIDATION_STRINGENCY=SILENT
        > %(outfile)s.picardlog ''')

    if mask:
        statement.append(
            '''intersectBed -abam @IN@ -b %(mask)s -wa -v > @OUT@''')

    statement.append('''mv @IN@ %(outfile)s''')
    statement.append('''samtools index %(outfile)s''')

    # NOTE(review): in the multi-file case `infiles` is a list and is
    # passed on unchanged as the initial input, although the merge
    # command has no @IN@ placeholder -- confirm P.joinStatements
    # accepts this.
    statement = P.joinStatements(statement, infiles)
    P.run()
예제 #3
0
def filterBamfiles(infile, sentinel):
    """
    Pre-process a bamfile prior to peak calling.

    The following steps are chained into a single statement:
    i) sort bamfile
    ii) remove unmapped reads with cgat bam2bam
    iii) remove non-uniquely mapping reads with cgat bam2bam (optional)
    iv) remove duplicates with Picard's MarkDuplicates (optional)
    v) remove reads from masked regions with bedtools intersect (optional)
    vi) index

    The optional steps are switched on by the PARAMS entries
    ``filter_remove_non_unique``, ``filter_remove_duplicates`` and
    ``filter_mask_intervals``.

    The output filename is derived from `sentinel` via
    ``P.snip(sentinel, ".sentinel") + ".bam"``. `sentinel` itself is
    touched only after the statement has been run.
    """

    # scratch directory handed to Picard's MarkDuplicates as TMP_DIR
    picard_tmp = P.getTempDir(PARAMS["scratchdir"])

    # The scratch directory is removed even if building or running the
    # statement fails, so failed runs do not leave directories behind.
    try:
        # NOTE: `statement`, `outfile`, `mask`, `picard_tmp` and
        # `job_memory` must stay local names. P.run() is called without
        # arguments while the command still contains %(...)s
        # placeholders, so it presumably reads them from this
        # function's locals -- confirm against P.run().
        outfile = P.snip(sentinel, ".sentinel") + ".bam"

        # ensure bamfile is sorted; output carries an explicit .bam
        # suffix which the next step reads back as @IN@.bam
        statement = [
            "samtools sort @IN@ -o @OUT@.bam",
        ]

        # remove unmapped reads
        statement.append("cgat bam2bam"
                         " --method=filter --filter-method=mapped"
                         " --log=%(outfile)s.log"
                         " < @IN@.bam"
                         " > @OUT@")

        # remove non-uniquely mapping reads, if requested
        if PARAMS["filter_remove_non_unique"]:
            statement.append("cgat bam2bam"
                             " --method=filter --filter-method=unique"
                             " --log=%(outfile)s.log"
                             " < @IN@"
                             " > @OUT@")

        # remove duplicates, if requested
        # NOTE(review): stderr is redirected with '>' to the same
        # %(outfile)s.log used by the bam2bam steps -- confirm that
        # overwriting that log is intended.
        if PARAMS["filter_remove_duplicates"]:
            statement.append("MarkDuplicates"
                             " INPUT=@IN@"
                             " ASSUME_SORTED=true"
                             " REMOVE_DUPLICATES=true"
                             " QUIET=false"
                             " OUTPUT=@OUT@"
                             " METRICS_FILE=/dev/null"
                             " VALIDATION_STRINGENCY=SILENT"
                             " TMP_DIR=%(picard_tmp)s"
                             " 2> %(outfile)s.log")

        # mask regions, if intervals supplied
        if PARAMS["filter_mask_intervals"]:
            mask = PARAMS["filter_mask_intervals"]
            statement.append("bedtools intersect"
                             " -abam @IN@"
                             " -b %(mask)s"
                             " -wa"
                             " -v"
                             " > @OUT@")

        statement.append("mv @IN@ %(outfile)s")
        statement.append("samtools index %(outfile)s")

        # presumably picked up by P.run() as the job's memory request
        job_memory = "5G"
        statement = P.joinStatements(statement, infile)

        P.run()
        P.touch(sentinel)
    finally:
        shutil.rmtree(picard_tmp)
예제 #4
0
def filterBamfiles(infile, sentinel):
    """
    Pre-process a bamfile prior to peak calling.

    The following steps are chained into a single statement:
    i) sort bamfile
    ii) remove unmapped reads with bam2bam.py
    iii) remove non-uniquely mapping reads with bam2bam.py (optional)
    iv) remove duplicates with Picard's MarkDuplicates (optional)
    v) remove reads from masked regions with bedtools intersect (optional)
    vi) index

    The optional steps are switched on by the PARAMS entries
    ``filter_remove_non_unique``, ``filter_remove_duplicates`` and
    ``filter_mask_intervals``.

    The output filename is derived from `sentinel` via
    ``P.snip(sentinel, ".sentinel") + ".bam"``. `sentinel` itself is
    touched only after the statement has been run.
    """

    # scratch directory handed to Picard's MarkDuplicates as TMP_DIR
    picard_tmp = P.getTempDir(PARAMS["scratchdir"])

    # The scratch directory is removed even if building or running the
    # statement fails, so failed runs do not leave directories behind.
    try:
        # NOTE: `statement`, `outfile`, `mask`, `picard_tmp` and
        # `job_options` must stay local names. P.run() is called
        # without arguments while the command still contains %(...)s
        # placeholders, so it presumably reads them from this
        # function's locals -- confirm against P.run().
        outfile = P.snip(sentinel, ".sentinel") + ".bam"

        # ensure bamfile is sorted; the next step reads @IN@.bam, so
        # this form of `samtools sort` is expected to treat @OUT@ as an
        # output prefix and append ".bam" -- confirm for the installed
        # samtools version.
        statement = ["samtools sort @IN@ @OUT@", ]

        # remove unmapped reads
        statement.append("python %(scriptsdir)s/bam2bam.py"
                         " --method=filter --filter-method=mapped"
                         " --log=%(outfile)s.log"
                         " < @IN@.bam"
                         " > @OUT@")

        # remove non-uniquely mapping reads, if requested
        if PARAMS["filter_remove_non_unique"]:
            statement.append("python %(scriptsdir)s/bam2bam.py"
                             " --method=filter --filter-method=unique"
                             " --log=%(outfile)s.log"
                             " < @IN@"
                             " > @OUT@")

        # remove duplicates, if requested
        # NOTE(review): stderr is redirected with '>' to the same
        # %(outfile)s.log used by the bam2bam steps -- confirm that
        # overwriting that log is intended.
        if PARAMS["filter_remove_duplicates"]:
            statement.append("MarkDuplicates"
                             " INPUT=@IN@"
                             " ASSUME_SORTED=true"
                             " REMOVE_DUPLICATES=true"
                             " QUIET=false"
                             " OUTPUT=@OUT@"
                             " METRICS_FILE=/dev/null"
                             " VALIDATION_STRINGENCY=SILENT"
                             " TMP_DIR=%(picard_tmp)s"
                             " 2> %(outfile)s.log")

        # mask regions, if intervals supplied
        if PARAMS["filter_mask_intervals"]:
            mask = PARAMS["filter_mask_intervals"]
            statement.append("bedtools intersect"
                             " -abam @IN@"
                             " -b %(mask)s"
                             " -wa"
                             " -v"
                             " > @OUT@")

        statement.append("mv @IN@ %(outfile)s")
        statement.append("samtools index %(outfile)s")

        # presumably picked up by P.run() as cluster submission options
        job_options = "-l mem_free=10G"
        statement = P.joinStatements(statement, infile)

        P.run()
        P.touch(sentinel)
    finally:
        shutil.rmtree(picard_tmp)