Code example #1
File: pipeline_mrnaseq.py Project: Tariq-K/RNA
def bamCoverageRNA(infile, outfile):
    '''Make normalised bigwig tracks with deeptools'''

    norm_method = PARAMS["deeptools_norm_method"]

    # STAR MAPQ of 255 indicates uniquely mapped read

    if len(infile) > 0:
        if BamTools.is_paired(infile):
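            # count only first mates (--samFlagInclude 64) so each
            # fragment contributes a single read to the coverage track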
            statement = f'''bamCoverage -b {infile} -o {outfile}
                              --binSize 5
                              --normalizeUsing {norm_method}
                              --samFlagInclude 64
                              --centerReads
                              --minMappingQuality 255
                              --smoothLength 10
                              --skipNAs
                              -p "max" '''

        else:
            statement = f'''bamCoverage -b {infile} -o {outfile}
                              --binSize 5
                              --normalizeUsing {norm_method}
                              --minMappingQuality 255
                              --smoothLength 10
                              --samFlagExclude 4
                              --centerReads
                              -p "max" '''

        P.run(statement, job_memory="2G", job_threads=10)
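
Every example on this page branches on BamTools.is_paired from the cgat
toolkit. For reference, a minimal sketch of an equivalent check using pysam
(the helper name and the choice to sample only the first alignments are
illustrative assumptions, not part of the original code):

import pysam

def is_paired_sketch(bamfile, nreads=100):
    '''Return True if any of the first nreads alignments is flagged as paired.'''
    # pysam exposes the SAM "paired" flag directly on each aligned segment
    with pysam.AlignmentFile(bamfile, "rb") as bam:
        for read in bam.head(nreads):
            if read.is_paired:
                return True
    return False
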
Code example #2
File: splicing.py Project: tw7649116/cgat-flow
def runRMATS(gtffile, designfile, pvalue, strand, outdir, permute=0):
    '''Module to generate the rMATS statement

    The module offers the option to permute group labels and
    calculates the read length, which must be identical across all reads.

    Arguments
    ---------
    gtffile: string
        path to :term:`gtf` file
    designfile: string
        path to design file
    pvalue: string
        threshold for FDR testing
    strand: string
        strandedness option: can be 'fr-unstranded', 'fr-firststrand',
        or 'fr-secondstrand'
    outdir: string
        directory path for rMATS results
    permute : 1 or 0
        option to activate random shuffling of sample groups
    '''

    design = Expression.ExperimentalDesign(designfile)
    if permute == 1:
        permutelist = design.table.group.tolist()
        random.shuffle(permutelist)
        design.table.group = permutelist
    group1 = ",".join(
        ["%s.bam" % x for x in design.getSamplesInGroup(design.groups[0])])
    with open(outdir + "/b1.txt", "w") as f:
        f.write(group1)
    group2 = ",".join(
        ["%s.bam" % x for x in design.getSamplesInGroup(design.groups[1])])
    with open(outdir + "/b2.txt", "w") as f:
        f.write(group2)
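    # rMATS requires a single read length; estimate it from the first sample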
    readlength = BamTools.estimateTagSize(design.samples[0]+".bam")

    statement = '''rMATS
    --b1 %(outdir)s/b1.txt
    --b2 %(outdir)s/b2.txt
    --gtf <(gunzip -c %(gtffile)s)
    --od %(outdir)s
    --readLength %(readlength)s
    --cstat %(pvalue)s
    --libType %(strand)s
    ''' % locals()

    # paired-end data needs the -t paired flag
    if BamTools.is_paired(design.samples[0]+".bam"):
        statement += '''-t paired'''

    statement += '''
    > %(outdir)s/%(designfile)s.log
    '''

    P.run(statement, job_condaenv="splicing")
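
A hypothetical invocation with illustrative file names (the design file is
expected to define exactly two groups, and the gtf may be gzipped since it
is unpacked via gunzip -c):

runRMATS("geneset.gtf.gz", "design.tsv", pvalue="0.05",
         strand="fr-unstranded", outdir="rmats.dir", permute=0)
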
Code example #3
def BAMtotalcounts(infile, outfile):
    '''Count total reads in BAM for normalisation'''

    if bamtools.is_paired(infile):
        # count fragments: reads mapped in proper pairs, divided by two
        statement = f'''samtools view -f 2 {infile} | wc -l | awk '{{print $0/2}}' > {outfile}'''
    else:
        # exclude unmapped reads
        statement = f'''samtools view -F 4 {infile} | wc -l > {outfile}'''

    P.run(statement)
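
samtools can also count matching records directly with view -c, avoiding a
pass through wc; a sketch of the paired branch under that assumption:

statement = f'''samtools view -c -f 2 {infile} | awk '{{print $1/2}}' > {outfile}'''
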
Code example #4
def countDEXSeq(infiles, outfile):
    '''create counts for DEXSeq

    Counts bam reads against exon features in a flattened gtf.
    The required python script is provided by DEXSeq
    and uses HTSeq.

    Parameters
    ----------

    infiles[0]: string
        :term:`bam` file input

    infiles[1]: string
        :term:`gff` output from buildGff function

    outfile : string
        A :term:`txt` file containing results

    DEXSeq_strandedness : string
       :term:`PARAMS`. Specifies strandedness, options
       are 'yes', 'no' and 'reverse'

    '''

    infile, gfffile = infiles
    ps = PYTHONSCRIPTSDIR
    if BamTools.is_paired(infile):
        paired = "yes"
    else:
        paired = "no"
    strandedness = PARAMS["DEXSeq_strandedness"]

    statement = '''python %(ps)s/dexseq_count.py
    -p %(paired)s
    -s %(strandedness)s
    -r pos
    -f bam  %(gfffile)s %(infile)s %(outfile)s'''
    P.run(statement, job_condaenv="splicing")
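
The flattened gff consumed above comes from an upstream buildGff task. With
DEXSeq itself this flattening is typically done by the bundled
dexseq_prepare_annotation.py script; a minimal sketch in the same style,
with illustrative variables:

statement = '''python %(ps)s/dexseq_prepare_annotation.py
%(gtffile)s %(gfffile)s'''
P.run(statement, job_condaenv="splicing")
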
Code example #5
def scoreIntervalsBAM(infiles, outfile):
    '''Count reads in bed intervals'''

    interval, bam = infiles

    if bamtools.is_paired(bam):
        # the -p flag counts only reads mapped in proper pairs
        options = "-p"

    else:
        options = " "

    statement = f'''bedtools multicov 
                       {options} 
                       -q 10 
                       -bams {bam} 
                       -bed <(cut -f1-7 {interval} ) 
                       > {outfile} &&
                     sed -i '1i \contig\\tstart\\tend\\tpeak_id\\tpeak_score\\twidth\\tfeature\\ttotal' 
                       {outfile}'''

    P.run(statement)
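
A hypothetical call with illustrative file names; the bed file must carry at
least the seven columns named in the inserted header, and the count column
appended by bedtools multicov becomes "total":

scoreIntervalsBAM(("peaks.bed", "sample1.merge.bam"), "sample1.counts.tsv")
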
Code example #6
def convertReadsToIntervals(bamfile,
                            bedfile,
                            filtering_quality=None,
                            filtering_dedup=None,
                            filtering_dedup_method='picard',
                            filtering_nonunique=False):
    '''convert reads in *bamfile* to *intervals*.

    This method converts read data into intervals for
    counting based methods.

    This method is not appropriate for RNA-Seq.

    Optional steps include filtering by mapping quality, deduplication
    and removal of non-uniquely mapping reads.

    For paired-end data, pairs are merged into fragments and optionally
    filtered by insert size.

    Arguments
    ---------
    bamfile : string
        Filename of input file in :term:`bam` format.
    bedfile : string
        Filename of output file in :term:`bed` format.
    filtering_quality : int
        If set, remove reads with a mapping quality below the given threshold.
    filtering_dedup : bool
        If True, deduplicate data.
    filtering_dedup_method : string
        Deduplication method. Possible options are ``picard`` and
        ``samtools``.
    filtering_nonunique : bool
        If True, remove non-uniquely matching reads.

    '''
    track = P.snip(bedfile, ".bed.gz")

    is_paired = BamTools.is_paired(bamfile)
    current_file = bamfile
    tmpdir = P.get_temp_filename()
    statement = ["mkdir %(tmpdir)s"]
    nfiles = 0

    if filtering_quality is not None and filtering_quality > 0:
        next_file = "%(tmpdir)s/bam_%(nfiles)i.bam" % locals()
        statement.append('''samtools view
        -q %(filtering_quality)i -b
        %(current_file)s
        2>> %%(bedfile)s.quality.log
        > %(next_file)s ''' % locals())

        nfiles += 1
        current_file = next_file

    if filtering_nonunique:

        next_file = "%(tmpdir)s/bam_%(nfiles)i.bam" % locals()
        statement.append('''cat %(current_file)s
        | cgat bam2bam
        --method=filter
        --filter-method=unique,mapped
        --log=%%(bedfile)s.nonunique.log
        2> %%(bedfile)s.nonunique.err
        > %(next_file)s ''' % locals())

        nfiles += 1
        current_file = next_file

    if filtering_dedup:
        # Picard's MarkDuplicates requires an explicit bam file.
        next_file = "%(tmpdir)s/bam_%(nfiles)i.bam" % locals()

        if filtering_dedup_method == 'samtools':
            # rmdup reads and writes named files; the original piped
            # stdin/stdout ('samtools rmdup - -'), which hangs because the
            # statements are chained with ';' rather than '|'
            statement.append('''samtools rmdup
            %(current_file)s
            %(next_file)s
            2>> %%(bedfile)s.rmdup.log ''' % locals())

        elif filtering_dedup_method == 'picard':
            statement.append('''picard MarkDuplicates
            INPUT=%(current_file)s
            OUTPUT=%(next_file)s
            ASSUME_SORTED=TRUE
            METRICS_FILE=%(bedfile)s.duplicate_metrics
            REMOVE_DUPLICATES=TRUE
            VALIDATION_STRINGENCY=SILENT
            >& %%(bedfile)s.markdup.log ''' % locals())

        nfiles += 1
        current_file = next_file

    if is_paired:
        statement.append('''cat %(current_file)s
            | cgat bam2bed
              --merge-pairs
              --min-insert-size=%(filtering_min_insert_size)i
              --max-insert-size=%(filtering_max_insert_size)i
              --log=%(bedfile)s.bam2bed.log
              -
            2> %(bedfile)s.bam2bed.err
            | cgat bed2bed
              --method=sanitize-genome
              --genome-file=%(genome_dir)s/%(genome)s
              --log=%(bedfile)s.sanitize.log
            2> %(bedfile)s.sanitize.err
            | cut -f 1,2,3,4
            | sort -k1,1 -k2,2n
            | bgzip > %(bedfile)s''')
    else:
        statement.append('''cat %(current_file)s
            | cgat bam2bed
              --log=%(bedfile)s.bam2bed.log
              -
            2> %(bedfile)s.bam2bed.err
            | cgat bed2bed
              --method=sanitize-genome
              --genome-file=%(genome_dir)s/%(genome)s
              --log=%(bedfile)s.sanitize.log
            2> %(bedfile)s.sanitize.err
            | cut -f 1,2,3,4
            | sort -k1,1 -k2,2n
            | bgzip > %(bedfile)s''')

    statement.append("tabix -p bed %(bedfile)s >& %(bedfile)s.tabix.log")
    statement.append("rm -rf %(tmpdir)s")
    statement = " ; ".join(statement)
    P.run(statement, job_memory="8G")
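
A hypothetical call; P.run is expected to fill the remaining
%(filtering_min_insert_size)i, %(filtering_max_insert_size)i, %(genome_dir)s
and %(genome)s placeholders from the pipeline's PARAMS, so those must be
configured (the argument values here are illustrative):

convertReadsToIntervals("sample1.bam", "sample1.bed.gz",
                        filtering_quality=10,
                        filtering_dedup=True,
                        filtering_dedup_method="picard",
                        filtering_nonunique=True)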