예제 #1
0
파일: pipeline_idr.py 프로젝트: jmadzo/cgat
def filterBamfiles(infile, sentinal):
    """
    Pre-process bamfiles prior to peak calling.
    i) remove unmapped readswith bam2bam.py
    i) remove non-uniquely mapping reads with bam2bam.py (optional)
    ii) remove duplicates with Picards MarkDuplicates (optional)
    iii) remove reads from masked regions with bedtools intersect (optional)
    iv) index
    Currently assumes bamfiles are sorted prior to running the pipeline
    """

    outfile = P.snip(sentinal, ".sentinal") + ".bam"

    # ensure bamfile is sorted,
    # prepending samtools sort causes crash.
    # statement = [ "samtools sort @IN@ @OUT@", ]

    statement = []

    # remove unmapped reads
    statement.append("python %(scriptsdir)s/bam2bam.py"
                     " --filter=mapped"
                     " --log=%(outfile)s.log"
                     " < @IN@"
                     " > @OUT@")

    # remove non-uniquely mapping reads, if requested
    if PARAMS["filter_remove_non_unique"]:
        statement.append("python %(scriptsdir)s/bam2bam.py"
                         " --filter=unique"
                         " --log=%(outfile)s.log"
                         " < @IN@"
                         " > @OUT@")

    # remove duplicates, if requested
    if PARAMS["filter_remove_duplicates"]:
        statement.append("MarkDuplicates"
                         " INPUT=@IN@"
                         " ASSUME_SORTED=true"
                         " REMOVE_DUPLICATES=true"
                         " QUIET=true"
                         " OUTPUT=@OUT@"
                         " METRICS_FILE=/dev/null"
                         " VALIDATION_STRINGENCY=SILENT")

    # mask regions, if intervals supplied
    if PARAMS["filter_mask_intervals"]:
        mask = PARAMS["filter_mask_intervals"]
        statement.append("bedtools intersect"
                         " -abam @IN@"
                         " -b %(mask)s"
                         " -wa"
                         " -v"
                         " > @OUT@")

    statement.append("mv @IN@ %(outfile)s")
    statement.append("samtools index %(outfile)s")

    job_options = "-l mem_free=10G"
    statement = P.joinStatements(statement, infile)

    P.run()
    P.touch(sentinal)
예제 #2
0
def filterBamfiles(infile, sentinel):
    """
    Pre-process bamfiles prior to peak calling.
    i) sort bamfiles
    ii) remove unmapped readswith bam2bam.py
    iii) remove non-uniquely mapping reads with bam2bam.py (optional)
    iv) remove duplicates with Picards MarkDuplicates (optional)
    v) remove reads from masked regions with bedtools intersect (optional)
    vi) index
    """

    # create tempfile for Picard's MarkDuplicates
    picard_tmp = picard_tmp = P.getTempDir(PARAMS["scratchdir"])

    outfile = P.snip(sentinel, ".sentinel") + ".bam"

    # ensure bamfile is sorted,
    statement = [
        "samtools sort @IN@ @OUT@",
    ]

    # remove unmapped reads
    statement.append("python %(scriptsdir)s/bam2bam.py"
                     " --filter=mapped"
                     " --log=%(outfile)s.log"
                     " < @[email protected]"
                     " > @OUT@")

    # remove non-uniquely mapping reads, if requested
    if PARAMS["filter_remove_non_unique"]:
        statement.append("python %(scriptsdir)s/bam2bam.py"
                         " --filter=unique"
                         " --log=%(outfile)s.log"
                         " < @IN@"
                         " > @OUT@")

    # remove duplicates, if requested
    if PARAMS["filter_remove_duplicates"]:
        statement.append("MarkDuplicates"
                         " INPUT=@IN@"
                         " ASSUME_SORTED=true"
                         " REMOVE_DUPLICATES=true"
                         " QUIET=false"
                         " OUTPUT=@OUT@"
                         " METRICS_FILE=/dev/null"
                         " VALIDATION_STRINGENCY=SILENT"
                         " TMP_DIR=%(picard_tmp)s"
                         " 2> %(outfile)s.log")

    # mask regions, if intervals supplied
    if PARAMS["filter_mask_intervals"]:
        mask = PARAMS["filter_mask_intervals"]
        statement.append("bedtools intersect"
                         " -abam @IN@"
                         " -b %(mask)s"
                         " -wa"
                         " -v"
                         " > @OUT@")

    statement.append("mv @IN@ %(outfile)s")
    statement.append("samtools index %(outfile)s")

    job_options = "-l mem_free=10G"
    statement = P.joinStatements(statement, infile)

    P.run()
    P.touch(sentinel)
    shutil.rmtree(picard_tmp)
예제 #3
0
def filterBamfiles(infile, sentinal):
    """
    Pre-process bamfiles prior to peak calling.
    i) remove unmapped readswith bam2bam.py
    i) remove non-uniquely mapping reads with bam2bam.py (optional)
    ii) remove duplicates with Picards MarkDuplicates (optional)
    iii) remove reads from masked regions with bedtools intersect (optional)
    iv) index
    Currently assumes bamfiles are sorted prior to running the pipeline
    """

    outfile = P.snip(sentinal, ".sentinal") + ".bam"

    # ensure bamfile is sorted,
    # prepending samtools sort causes crash.
    # statement = [ "samtools sort @IN@ @OUT@", ]

    statement = []

    # remove unmapped reads
    statement.append("python %(scriptsdir)s/bam2bam.py"
                     " --filter=mapped"
                     " --log=%(outfile)s.log"
                     " < @IN@"
                     " > @OUT@")

    # remove non-uniquely mapping reads, if requested
    if PARAMS["filter_remove_non_unique"]:
        statement.append("python %(scriptsdir)s/bam2bam.py"
                         " --filter=unique"
                         " --log=%(outfile)s.log"
                         " < @IN@"
                         " > @OUT@")

    # remove duplicates, if requested
    if PARAMS["filter_remove_duplicates"]:
        statement.append("MarkDuplicates"
                         " INPUT=@IN@"
                         " ASSUME_SORTED=true"
                         " REMOVE_DUPLICATES=true"
                         " QUIET=true"
                         " OUTPUT=@OUT@"
                         " METRICS_FILE=/dev/null"
                         " VALIDATION_STRINGENCY=SILENT")

    # mask regions, if intervals supplied
    if PARAMS["filter_mask_intervals"]:
        mask = PARAMS["filter_mask_intervals"]
        statement.append("bedtools intersect"
                         " -abam @IN@"
                         " -b %(mask)s"
                         " -wa"
                         " -v"
                         " > @OUT@")

    statement.append("mv @IN@ %(outfile)s")
    statement.append("samtools index %(outfile)s")

    job_options = "-l mem_free=10G"
    statement = P.joinStatements(statement, infile)

    P.run()
    P.touch(sentinal)
예제 #4
0
def filterBamfiles(infile, sentinel):
    """
    Pre-process bamfiles prior to peak calling.
    i) sort bamfiles
    ii) remove unmapped readswith bam2bam.py
    iii) remove non-uniquely mapping reads with bam2bam.py (optional)
    iv) remove duplicates with Picards MarkDuplicates (optional)
    v) remove reads from masked regions with bedtools intersect (optional)
    vi) index
    """

    # create tempfile for Picard's MarkDuplicates
    picard_tmp = picard_tmp = P.getTempDir(PARAMS["scratchdir"])

    outfile = P.snip(sentinel, ".sentinel") + ".bam"

    # ensure bamfile is sorted,
    statement = ["samtools sort @IN@ @OUT@", ]

    # remove unmapped reads
    statement.append("python %(scriptsdir)s/bam2bam.py"
                     " --filter=mapped"
                     " --log=%(outfile)s.log"
                     " < @[email protected]"
                     " > @OUT@")

    # remove non-uniquely mapping reads, if requested
    if PARAMS["filter_remove_non_unique"]:
        statement.append("python %(scriptsdir)s/bam2bam.py"
                         " --filter=unique"
                         " --log=%(outfile)s.log"
                         " < @IN@"
                         " > @OUT@")

    # remove duplicates, if requested
    if PARAMS["filter_remove_duplicates"]:
        statement.append("MarkDuplicates"
                         " INPUT=@IN@"
                         " ASSUME_SORTED=true"
                         " REMOVE_DUPLICATES=true"
                         " QUIET=false"
                         " OUTPUT=@OUT@"
                         " METRICS_FILE=/dev/null"
                         " VALIDATION_STRINGENCY=SILENT"
                         " TMP_DIR=%(picard_tmp)s"
                         " 2> %(outfile)s.log")

    # mask regions, if intervals supplied
    if PARAMS["filter_mask_intervals"]:
        mask = PARAMS["filter_mask_intervals"]
        statement.append("bedtools intersect"
                         " -abam @IN@"
                         " -b %(mask)s"
                         " -wa"
                         " -v"
                         " > @OUT@")

    statement.append("mv @IN@ %(outfile)s")
    statement.append("samtools index %(outfile)s")

    job_options = "-l mem_free=10G"
    statement = P.joinStatements(statement, infile)

    P.run()
    P.touch(sentinel)
    shutil.rmtree(picard_tmp)