Example #1
def removeBamfiles(infiles, outfile):
    '''
    Remove BAM files and their .bai indexes, then touch the sentinel
    *outfile*.
    '''
    for bamfile in infiles:
        bam_index = bamfile + ".bai"
        os.unlink(bamfile)
        if os.path.exists(bam_index):
            os.unlink(bam_index)
    P.touch(outfile)
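
Nearly every task in these examples finishes by touching an empty sentinel or flag file rather than writing real output. A minimal stand-in for P.touch, assuming it only needs to create the file and refresh its modification time (the real cgatcore helper may do more), could look like this:

import os

def touch(filename, times=None):
    # Create the sentinel if it is missing and refresh its mtime so the
    # pipeline scheduler treats the downstream task as up to date.
    with open(filename, "a"):
        os.utime(filename, times)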
Example #2
        def splitFiles(infile, outfile):
            '''
            Arbitrarily split files into chunks for parallelisation
            '''

            Timeseries.splitFiles(infile=infile,
                                  nchunks=PARAMS['resampling_chunks'],
                                  out_dir="parallel_files.dir")
            P.touch(outfile)
Example #3
def reMergeBamfiles(infiles, sentinel):
    infiles = [P.snip(x, ".sentinel") + ".bam" for x in infiles]
    outfile = P.snip(sentinel, ".sentinel") + ".bam"
    bad_samples = PARAMS["options_to_remove"].split(",")

    to_merge = IDR.filterBadLibraries(infiles, bad_samples)

    IDR.mergeBams(to_merge, outfile)
    P.touch(sentinel)
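
The sentinel naming convention above leans on P.snip to strip a known suffix from a filename. A minimal stand-in, assuming it raises ValueError when the suffix is absent (which is what the .py/.pyc fallback in the splitBam examples below relies on), might be:

def snip(filename, suffix):
    # Drop a known suffix, failing loudly when the file does not end with it.
    if not filename.endswith(suffix):
        raise ValueError("filename %s does not end in %s" % (filename, suffix))
    return filename[:-len(suffix)]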
Example #4
def calculateM3DSpikeClustersPvalue(infiles, outfile):
    job_options = "-l mem_free=4G -pe dedicated 1"
    design = infiles[-1]
    infiles = infiles[:-1]
    RRBS.calculateM3DSpikepvalue(infiles,
                                 outfile,
                                 design,
                                 submit=True,
                                 job_options=job_options)
    P.touch(outfile)
Example #5
def poolSampleBamfiles(infiles, sentinel):
    """
    Merge filtered sample files for each tissue
    """
    infiles = [P.snip(x, ".sentinel") + ".bam" for x in infiles]
    outfile = P.snip(sentinel, ".sentinel") + ".bam"

    IDR.mergeBams(infiles, outfile)

    P.touch(sentinel)
Example #6
def callPeaksOnPooledReplicates(infile, outfile):
    # fetch peak calling parameters
    PARAMS_PEAKCALLER = get_peak_caller_parameters(
        PARAMS["options_peak_caller"])

    # call peaks on pooled replicates
    IDR.callIDRPeaks(infile,
                     outfile,
                     PARAMS["options_peak_caller"],
                     PARAMS["options_control_type"],
                     PARAMS_PEAKCALLER,
                     pseudoreplicate=False)

    P.touch(outfile)
Example #7
def callPeaksOnIndividualReplicates(infile, outfile):
    infile = P.snip(infile, ".sentinel") + ".bam"
    # fetch peak calling parameters
    PARAMS_PEAKCALLER = get_peak_caller_parameters(
        PARAMS["options_peak_caller"])

    # call peaks
    IDR.callIDRPeaks(infile,
                     outfile,
                     PARAMS["options_peak_caller"],
                     PARAMS["options_control_type"],
                     PARAMS_PEAKCALLER)

    P.touch(outfile)
Example #8
    def genReplicateData(infile, outfile):
        '''
        Split each replicate into a separate file for clustering
        within each replicate.  Relies on each replicate being the
        same across the whole time series.
        '''

        outdir = outfile.split("/")[0]
        Timeseries.splitReplicates(infile=infile,
                                   axis="column",
                                   group_var="replicates",
                                   outdir=outdir)

        P.touch(outfile)
Example #9
def splitPooledBamfiles(infile, sentinel):
    """
    Split the pooled, filtered bamfile in two using pysam
    """
    infile = P.snip(infile, ".sentinel") + ".bam"
    outfile = P.snip(sentinel, ".sentinel")
    params = '2'
    try:
        module = P.snip(IDR.__file__, ".py")
    except ValueError:
        module = P.snip(IDR.__file__, ".pyc")

    P.submit(module,
             "splitBam",
             params,
             infile,
             outfile)

    P.touch(sentinel)
Example #10
def poolInputBamfiles(infiles, sentinel):
    """
    Merge filtered input files for each tissue, with the option of excluding
    undesirable libraries.
    """
    infiles = [P.snip(x, ".sentinel") + ".bam" for x in infiles]
    outfile = P.snip(sentinel, ".sentinel") + ".bam"
    bad_samples = PARAMS["filter_remove_inputs"].split(",")

    if len(infiles) > 1:
        to_merge = IDR.filterBadLibraries(infiles, bad_samples)
        IDR.mergeBams(to_merge, outfile)
    else:
        os.symlink(os.path.abspath(infiles[0]), outfile)
        os.symlink(os.path.abspath(infiles[0]) + ".bai", outfile + ".bai")

    P.touch(sentinel)
Example #11
def timePointDiffExpression(infile, outfile):
    '''
    Within each condition, test for differentially expressed
    genes against the baseline time point.  Uses DESeq2.
    '''

    statement = '''
    cgat timeseries2diffgenes
    --log=%(outfile)s.log
    --method=timepoint
    --alpha=%(deseq_alpha)s
    --results-directory=diff_timepoints.dir
    %(infile)s
    '''

    P.run()

    P.touch(outfile)
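
The examples build a statement string and then call P.run() with no arguments. That works because the pipeline helper is assumed to read statement (and options such as job_options) from the caller's local namespace and to interpolate PARAMS into the %(...)s placeholders. A deliberately simplified sketch of that convention, not the real implementation, is:

import inspect
import subprocess

PARAMS = {}  # in the real pipelines this is loaded from the configuration file

def run():
    # Read ``statement`` from the caller's locals, substitute PARAMS plus
    # those locals into the %(...)s placeholders, and run it in a shell.
    caller = inspect.currentframe().f_back.f_locals
    subs = dict(PARAMS)
    subs.update(caller)
    statement = caller["statement"] % subs
    subprocess.check_call(statement, shell=True)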
Example #12
def summariseReadStart(infile, outfile):
    # this only works for fastq files. Fails with .sra files
    # this function and the next section should be replaced with a call to
    # fastq-dump if the file ends with .sra and then use the functions of
    # the fastq module to count the first bases in the fastq records.
    # for now, create empty outfile
    if infile.endswith(".sra"):
        P.touch(outfile)
    else:
        statement = '''zcat %(infile)s |
        paste - - - - | cut -f2 | cut -c1-3 | sort | uniq -c |
        sort -nk1 | awk -F' ' 'BEGIN{total=0; sum=0}
        {total+=$1; OFS="\\t";
        if($2=="CGG"||$2=="TGG"||$2=="CGA"||$2=="TGA")
        {sum+=$1; print $1, $2}}
        END {print total-sum,"others"}' > %(outfile)s ''' % locals()

        P.run()
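
The shell pipeline above tallies the first three bases of every read and lumps everything outside the four expected bisulfite starts (CGG, TGG, CGA, TGA) into an "others" count. A pure-Python equivalent, shown only as a hypothetical helper to make the logic explicit, would be:

import gzip
from collections import Counter

def count_read_starts(fastq_gz, keep=("CGG", "TGG", "CGA", "TGA")):
    # Count the first three bases of each read in a gzipped FASTQ,
    # mirroring the zcat | paste | cut | sort | uniq -c pipeline.
    counts = Counter()
    with gzip.open(fastq_gz, "rt") as handle:
        for i, line in enumerate(handle):
            if i % 4 == 1:  # the sequence line of each 4-line record
                counts[line[:3]] += 1
    kept = {start: n for start, n in counts.items() if start in keep}
    others = sum(counts.values()) - sum(kept.values())
    return kept, others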
Example #13
def loadGeneSummary(infile, outfile):
    '''summarize binding information per gene.'''

    dbh = connect()

    table = P.toTable(outfile)

    cc = dbh.cursor()
    cc.execute("""DROP TABLE IF EXISTS %(table)s """ % locals())
    cc.execute("""CREATE TABLE %(table)s AS
            SELECT gene_id, SUM( tata ) AS tata, SUM( cpg ) AS cpg 
                   FROM promotorinfo_transcripts AS p,
                        annotations.transcript_info as i
                   WHERE i.transcript_id = p.transcript_id
                   GROUP BY gene_id""" % locals())
    cc.close()

    P.touch(outfile)
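
P.toTable above derives the SQL table name from the ".load" target, which example #19 later does by hand with outfile[:-len(".load")]. A rough stand-in, with the dot/dash normalisation being an assumption about the real helper, is:

import os

def to_table(outfile):
    # Turn "path/to/promotorinfo_transcripts.load" into a usable table name.
    name = os.path.basename(outfile)[:-len(".load")]
    return name.replace(".", "_").replace("-", "_")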
Example #14
def splitBamfiles(infile, sentinel):
    """
    For all tracks, split the filtered bamfile in two using pysam
    """
    infile = P.snip(infile, ".sentinel") + ".bam"
    outfile = P.snip(sentinel, ".sentinel")
    params = '2'
    try:
        module = P.snip(IDR.__file__, ".py")
    except ValueError:
        module = P.snip(IDR.__file__, ".pyc")

    P.submit(module,
             "splitBam",
             params,
             infile,
             outfile)

    P.touch(sentinel)
Example #15
def subtractBedFiles(infile, subtractfile, outfile):
    '''subtract intervals in *subtractfile* from *infile*
    and store in *outfile*.
    '''

    if iotools.isEmpty(subtractfile):
        shutil.copyfile(infile, outfile)
        return
    elif iotools.isEmpty(infile):
        P.touch(outfile)
        return

    statement = '''
        intersectBed -v -a %(infile)s -b %(subtractfile)s 
        | cut -f 1,2,3,4,5 
        | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'
        | bgzip > %(outfile)s ; tabix -p bed %(outfile)s
        '''

    P.run()
Example #16
def conditionDiffExpression(infile, outfile):
    '''
    Call DEGs showing statistically significant differences in
    expression based on interaction terms between condition and
    time point.  Uses DESeq2.
    '''

    job_options = "-l mem_free=4G"

    statement = '''
    zcat %(infile)s |
    cgat timeseries2diffgenes
    --log=%(outfile)s.log
    --method=condition
    --alpha=%(deseq_alpha)s
    --results-directory=diff_condition.dir
    '''

    P.run()

    P.touch(outfile)
Example #17
def intersectBedFiles(infiles, outfile):
    '''merge :term:`bed` formatted *infiles* by intersection
    and write to *outfile*.

    Only intervals that overlap in all files are retained.
    Interval coordinates are given by the first file in *infiles*.

    Bed files are normalized (overlapping intervals within 
    a file are merged) before intersection. 

    Intervals are renumbered starting from 1.
    '''

    if len(infiles) == 1:

        shutil.copyfile(infiles[0], outfile)

    elif len(infiles) == 2:

        if iotools.isEmpty(infiles[0]) or iotools.isEmpty(infiles[1]):
            P.touch(outfile)
        else:
            statement = '''
        intersectBed -u -a %s -b %s 
        | cut -f 1,2,3,4,5 
        | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'
        | bgzip > %%(outfile)s 
        ''' % (infiles[0], infiles[1])
            P.run()

    else:

        tmpfile = P.getTempFilename(".")

        # need to merge incrementally
        fn = infiles[0]
        if iotools.isEmpty(infiles[0]):
            P.touch(outfile)
            return

        statement = '''mergeBed -i %(fn)s > %(tmpfile)s'''
        P.run()

        for fn in infiles[1:]:
            if iotools.isEmpty(fn):
                P.touch(outfile)
                os.unlink(tmpfile)
                return

            statement = '''mergeBed -i %(fn)s | intersectBed -u -a %(tmpfile)s -b stdin > %(tmpfile)s.tmp; mv %(tmpfile)s.tmp %(tmpfile)s'''
            P.run()

        statement = '''cat %(tmpfile)s
        | cut -f 1,2,3,4,5 
        | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'
        | bgzip
        > %(outfile)s '''
        P.run()

        os.unlink(tmpfile)
Example #18
    def genResampleData(infile, outfile):
        '''
        Resample the data n times with replacement, generating
        n flat files that are propagated to later stages.
        The files are generally small.
        '''

        time_agg = list(TIME.__dict__['track2groups'].keys())
        time_points = [int(str(x).split("-")[1]) for x in time_agg]
        time_points.sort()
        time_points = list(set(time_points))
        rep_agg = list(REPLICATE.__dict__['track2groups'].keys())
        replicates = [str(x).split("-")[2] for x in rep_agg]
        time_rep_comb = [x for x in itertools.product(time_points, replicates)]
        time_cond = ro.StrVector([x[0] for x in time_rep_comb])
        rep_cond = ro.StrVector([x[1] for x in time_rep_comb])
        ref_gtf = str(infile).split("-")[1]
        condition = (str(infile).split("-")[0]).strip("deseq.dir/")

        time_points = ",".join([str(i) for i in time_points])
        replicates = ",".join(replicates)

        statement = '''
        cgat data2resamples
        --log=%(outfile)s.log
        --time=%(time_points)s
        --replicates=%(replicates)s
        --condition=%(condition)s
        --resamples=%(resampling_resample)s
        --input-gtf=%(ref_gtf)s
        --output-file-directory=clustering.dir
        --seed=%(resampling_seed)s
        %(infile)s
        '''
        P.run()

        P.touch(outfile)
Example #19
def summarizeEffectsPerGene(infile, outfile):
    '''summarize effects on a per-gene level.'''

    tablename = outfile[:-len(".load")]
    track = infile[:-len("_effects.load")]

    dbhandle = connect()

    statement = '''
    CREATE TABLE %(tablename)s AS
    SELECT DISTINCT
           gene_id,
           COUNT(*) AS ntranscripts,
           MIN(e.nalleles) AS min_nalleles,
           MAX(e.nalleles) AS max_nalleles,
           MIN(e.stop_min) AS min_stop_min,
           MAX(e.stop_min) AS max_stop_min,
           MIN(e.stop_max) AS min_stop_max,
           MAX(e.stop_max) AS max_stop_max,
           SUM( CASE WHEN stop_min > 0 AND cds_len - stop_min * 3 < last_exon_start THEN 1
                     ELSE 0 END) AS nmd_knockout,
           SUM( CASE WHEN stop_max > 0 AND cds_len - stop_max * 3 < last_exon_start THEN 1
                     ELSE 0 END) AS nmd_affected
    FROM annotations.transcript_info as i,
         %(track)s_effects AS e
    WHERE i.transcript_id = e.transcript_id
    GROUP BY i.gene_id
    ''' % locals()

    Database.executewait(
        dbhandle, "DROP TABLE IF EXISTS %(tablename)s" % locals())
    Database.executewait(dbhandle, statement)
    Database.executewait(
        dbhandle, "CREATE INDEX %(tablename)s_gene_id ON %(tablename)s (gene_id)" % locals())
    dbhandle.commit()

    P.touch(outfile)
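
The two SUM(CASE ...) columns encode the NMD test in SQL: a transcript is counted when it carries a premature stop and the truncated coding sequence ends upstream of the last exon's start. The same predicate, written as a hypothetical Python helper, reads:

def is_nmd_candidate(stop_codon, cds_len, last_exon_start):
    # Mirrors the SQL CASE expression above: stop_codon is the number of
    # codons to the premature stop, cds_len the CDS length in bases.
    return stop_codon > 0 and cds_len - stop_codon * 3 < last_exon_start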
Example #20
def makeSummaryPlots(infile, outfile):
    '''
    Generate summary plots via RRBS.summaryPlots, submitted as a
    cluster job.
    '''

    job_options = "-l mem_free=48G"

    RRBS.summaryPlots(infile, outfile, submit=True, job_options=job_options)
    P.touch(outfile)
Example #21
def loadSummary(infile, outfile):
    '''load several rates into a single convenience table.
    '''

    stmt_select = []
    stmt_from = []
    stmt_where = ["1"]

    track = infile[:-len(".gtf.gz")]

    tablename = "%s_evol" % track

    if os.path.exists("%s_rates.load" % track):
        stmt_select.append("a.distance AS ks, a.aligned AS aligned")
        stmt_from.append('''LEFT JOIN %(track)s_rates AS a
        ON r.gene_id = a.gene_id AND
        a.aligned >= %(rates_min_aligned)i AND
        a.distance <= %(rates_max_rate)f''')

    if os.path.exists("%s_coverage.load" % track):
        stmt_select.append("cov.nmatches AS nreads, cov.mean AS meancoverage")
        stmt_from.append(
            "LEFT JOIN %(track)s_coverage AS cov ON r.gene_id = cov.gene_id")

    if os.path.exists("%s_repeats_gc.load" % track):
        stmt_select.append("ar_gc.exons_mean AS repeats_gc")
        stmt_from.append(
            "LEFT JOIN %(track)s_repeats_gc AS ar_gc ON r.gene_id = ar_gc.gene_id"
        )

    if os.path.exists("%s_repeats_rates.load" % track):
        stmt_select.append(
            "ar.exons_length AS ar_aligned, ar.exons_median AS ka, a.distance/ar.exons_median AS kska"
        )
        stmt_from.append('''LEFT JOIN %(track)s_repeats_rates AS ar
                     ON r.gene_id = ar.gene_id AND
                     ar.exons_nval >= %(rates_min_repeats)i''')

    if os.path.exists("%s_introns_rates.load" % track):
        stmt_select.append(
            "ir.aligned AS ir_aligned, ir.distance AS ki, a.distance/ir.distance AS kski"
        )
        stmt_from.append('''LEFT JOIN %(track)s_introns_rates AS ir
                            ON r.gene_id = ir.gene_id AND
                            ir.aligned >= %(rates_min_aligned)i''')

    x = locals()
    x.update(PARAMS)
    stmt_select = ", ".join(stmt_select) % x
    stmt_from = " ".join(stmt_from) % x
    stmt_where = " AND ".join(stmt_where) % x

    dbhandle = sqlite3.connect(PARAMS["database_name"])

    Database.executewait(dbhandle,
                         "DROP TABLE IF EXISTS %(tablename)s " % locals())

    statement = '''
    CREATE TABLE %(tablename)s AS
    SELECT
    CAST(r.gene_id AS TEXT) AS gene_id,
    r.exons_sum as length,
    r.exons_pGC as pgc,
    %(stmt_select)s
    FROM
    %(track)s_annotation AS r
    %(stmt_from)s
        WHERE %(stmt_where)s
    ''' % locals()

    Database.executewait(dbhandle, statement)
    dbhandle.commit()
    P.touch(outfile)
Example #22
def loadMACS(infile, outfile, bamfile, tablename=None):
    '''load MACS results into *tablename*

    This method loads only positive peaks. It filters peaks by p-value,
    q-value and fold change, loads the diagnostic data, and
    re-calculates peakcenter, peakval, ... using the supplied bamfile.

    If *tablename* is not given, it will be :file:`<track>_intervals`
    where track is derived from ``infile`` and assumed to end
    in :file:`.macs`.

    This method creates two optional additional files:

    * if the file :file:`<track>_diag.xls` is present, load MACS 
      diagnostic data into the table :file:`<track>_macsdiag`.

    * if the file :file:`<track>_model.r` is present, call R to
      create a MACS peak-shift plot and save it as :file:`<track>_model.pdf`
      in the :file:`export/MACS` directory.

    This method creates :file:`<outfile>.tsv.gz` with the results
    of the filtering.
    '''

    track = P.snip(os.path.basename(infile), ".macs")
    folder = os.path.dirname(infile)
    if len(folder) > 0:
        infilename = folder + "/" + track + "_peaks.xls"
        filename_diag = folder + "/" + track + "_diag.xls"
        filename_r = folder + "/" + track + "_model.r"
        filename_rlog = folder + "/" + track + ".r.log"
        filename_pdf = track + "_model.pdf"
    else:
        infilename = track + "_peaks.xls"
        filename_diag = track + "_diag.xls"
        filename_r = track + "_model.r"
        filename_rlog = track + ".r.log"
        filename_pdf = track + "_model.pdf"

    if not os.path.exists(infilename):
        E.warn("could not find %s" % infilename)
        P.touch(outfile)
        return

    # create plot by calling R
    if os.path.exists(filename_r):
        if len(folder) > 0:
            statement = '''R --vanilla < %(filename_r)s > %(filename_rlog)s; mv %(filename_pdf)s %(folder)s/%(filename_pdf)s; '''
        else:
            statement = '''R --vanilla < %(filename_r)s > %(filename_rlog)s; '''
        P.run()

    # filter peaks
    shift = getPeakShiftFromMacs(infile)
    assert shift is not None, "could not determine peak shift from MACS file %s" % infile

    E.info("%s: found peak shift of %i" % (track, shift))

    samfiles = [pysam.Samfile(bamfile, "rb")]
    offsets = [shift / 2]

    outtemp = P.getTempFile(".")
    tmpfilename = outtemp.name

    outtemp.write("\t".join((
        "interval_id",
        "contig", "start", "end",
        "npeaks", "peakcenter",
        "length",
        "avgval",
        "peakval",
        "nprobes",
        "pvalue", "fold", "qvalue",
        "macs_summit", "macs_nprobes",
    )) + "\n")
    id = 0

    # get thresholds
    max_qvalue = float(PARAMS["macs_max_qvalue"])
    # minimum threshold, since the reported value is -10*log10(p-value)
    min_pvalue = float(PARAMS["macs_min_pvalue"])

    counter = E.Counter()
    with iotools.openFile(infilename, "r") as ins:
        for peak in WrapperMACS.iteratePeaks(ins):

            if peak.fdr > max_qvalue:
                counter.removed_qvalue += 1
                continue
            elif peak.pvalue < min_pvalue:
                counter.removed_pvalue += 1
                continue

            assert peak.start < peak.end

            npeaks, peakcenter, length, avgval, peakval, nreads = countPeaks(
                peak.contig, peak.start, peak.end, samfiles, offsets)

            outtemp.write("\t".join(map(str, (
                id, peak.contig, peak.start, peak.end,
                npeaks, peakcenter, length, avgval, peakval, nreads,
                peak.pvalue, peak.fold, peak.fdr,
                peak.start + peak.summit - 1,
                peak.tags))) + "\n")
            id += 1
            counter.output += 1

    outtemp.close()

    # output filtering summary
    outf = iotools.openFile("%s.tsv.gz" % outfile, "w")
    outf.write("category\tcounts\n")
    outf.write("%s\n" % counter.asTable())
    outf.close()

    E.info("%s filtering: %s" % (track, str(counter)))
    if counter.output == 0:
        E.warn("%s: no peaks found" % track)

    # load data into table
    if tablename is None:
        tablename = "%s_macs_intervals" % track
    statement = '''cgat csv2db %(csv2db_options)s 
                       --allow-empty-file
                       --add-index=interval_id 
                       --add-index=contig,start
                       --table=%(tablename)s 
                   < %(tmpfilename)s > %(outfile)s '''
    P.run()
    os.unlink(tmpfilename)

    # load diagnostic data
    if os.path.exists(filename_diag):

        tablename = "%s_macsdiag" % track
        statement = '''
        cat %(filename_diag)s 
        | sed "s/FC range.*/fc\\tnpeaks\\tp90\\tp80\\tp70\\tp60\\tp50\\tp40\\tp30\\tp20/" 
        | cgat csv2db %(csv2db_options)s 
                  --map=fc:str 
                  --table=%(tablename)s 
        >> %(outfile)s
        '''
        P.run()
Example #23
def plotHeatmap(results, norm_matrix, threshold_stat, p_threshold,
                fc_threshold, outfile):
    '''
    plot heatmap of differentially abundant genes
    '''
    if threshold_stat == "p":
        p = "P.Value"
    elif threshold_stat == "padj":
        p = "adj.P.Val"
    else:
        p = "adj.P.Val"

    temp = P.getTempFilename(".")
    R('''library(gplots)''')
    R('''library(gtools)''')
    E.info("reading data")
    R('''mat <- read.csv("%s",
                         header = T,
                         stringsAsFactors = F,
                         sep = "\t")''' % norm_matrix)
    R('''rownames(mat) <- mat$taxa
         mat <- as.matrix(mat[,1:ncol(mat)-1])''')
    R('''dat <- read.csv("%s",
                         header = T,
                         stringsAsFactors = F,
                         sep = "\t")''' % results)
    E.info("data loaded")

    R('''t <- dat$taxa[dat$%s < %f & abs(dat$logFC) > %f]''' %
      (p, p_threshold, fc_threshold))
    R('''diff.genes <- unique(t)''')

    ##############################
    # this is a hack
    # to avoid errors when
    # a single differential
    # abundant feature is found
    ##############################
    R('''write.table(diff.genes,
                     file = "%s",
                     row.names = F,
                     sep = "\t")''' % temp)

    tmp = open(temp)
    tmp.readline()
    if len(tmp.readlines()) == 1:
        P.touch(outfile)
    else:
        R('''mat <- mat[as.character(diff.genes), ]
             samples <- colnames(mat)
             mat <- as.data.frame(t(apply(mat, 1, scale)))
             colnames(mat) <- samples
         mat <- mat[, mixedsort(colnames(mat))]
         colours = colorRampPalette(c("blue", "white", "red"))(75)
         pdf("%s", height = 12, width = 12)
         heatmap.2(as.matrix(mat),
                   trace = "none",
                   scale = "none",
                   col = colours,
                   Colv = F,
                   dendrogram = "row",
                   margins = c(18, 18))
             dev.off()''' % outfile)

    os.unlink(temp)
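
The bare R(...) calls in plotHeatmap assume an rpy2 handle imported elsewhere in the module; a one-line sketch of the assumed binding (the actual pipeline may alias it differently) is:

from rpy2.robjects import r as R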
Example #24
def filterBamfiles(infile, sentinel):
    """
    Pre-process bamfiles prior to peak calling.
    i) sort bamfiles
    ii) remove unmapped readswith bam2bam.py
    iii) remove non-uniquely mapping reads with bam2bam.py (optional)
    iv) remove duplicates with Picards MarkDuplicates (optional)
    v) remove reads from masked regions with bedtools intersect (optional)
    vi) index
    """

    # create tempfile for Picard's MarkDuplicates
    picard_tmp = P.getTempDir(PARAMS["scratchdir"])

    outfile = P.snip(sentinel, ".sentinel") + ".bam"

    # ensure bamfile is sorted
    statement = ["samtools sort @IN@ -o @[email protected]", ]

    # remove unmapped reads
    statement.append("cgat bam2bam"
                     " --method=filter --filter-method=mapped"
                     " --log=%(outfile)s.log"
                     " < @[email protected]"
                     " > @OUT@")

    # remove non-uniquely mapping reads, if requested
    if PARAMS["filter_remove_non_unique"]:
        statement.append("cgat bam2bam"
                         " --method=filter --filter-method=unique"
                         " --log=%(outfile)s.log"
                         " < @IN@"
                         " > @OUT@")

    # remove duplicates, if requested
    if PARAMS["filter_remove_duplicates"]:
        statement.append("MarkDuplicates"
                         " INPUT=@IN@"
                         " ASSUME_SORTED=true"
                         " REMOVE_DUPLICATES=true"
                         " QUIET=false"
                         " OUTPUT=@OUT@"
                         " METRICS_FILE=/dev/null"
                         " VALIDATION_STRINGENCY=SILENT"
                         " TMP_DIR=%(picard_tmp)s"
                         " 2> %(outfile)s.log")

    # mask regions, if intervals supplied
    if PARAMS["filter_mask_intervals"]:
        mask = PARAMS["filter_mask_intervals"]
        statement.append("bedtools intersect"
                         " -abam @IN@"
                         " -b %(mask)s"
                         " -wa"
                         " -v"
                         " > @OUT@")

    statement.append("mv @IN@ %(outfile)s")
    statement.append("samtools index %(outfile)s")
    
    job_memory = "5G"
    statement = P.joinStatements(statement, infile)

    P.run()
    P.touch(sentinel)
    shutil.rmtree(picard_tmp)
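
filterBamfiles assembles its commands as a list with @IN@/@OUT@ placeholders and hands them to P.joinStatements, which is assumed to chain each step's output into the next step's input. A much-simplified sketch of that convention (it ignores the @[email protected] suffix trick used above and the real helper's temp-file handling) might look like:

def join_statements(statements, infile):
    # Substitute @IN@ with the previous step's output and @OUT@ with a
    # fresh intermediate name, then join everything with ";".
    chained = []
    current = infile
    for i, cmd in enumerate(statements):
        out = "%s.step%i" % (infile, i)
        chained.append(cmd.replace("@IN@", current).replace("@OUT@", out))
        current = out
    return "; ".join(chained)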