Beispiel #1
0
def GATKpreprocessing(infile, outfile):
    '''Reorders BAM according to reference fasta and add read groups using
       SAMtools, realigns around indels and recalibrates base quality scores
       using GATK'''

    to_cluster = USECLUSTER
    track = P.snip(os.path.basename(infile), ".bam")
    tmpdir_gatk = P.get_temp_dir()
    job_memory = PARAMS["gatk_memory"]

    genome = "%s/%s.fa" % (PARAMS["bwa_index_dir"], PARAMS["genome"])

    outfile1 = outfile.replace(".bqsr", ".readgroups.bqsr")
    outfile2 = outfile.replace(".bqsr", ".realign.bqsr")

    PipelineExome.GATKReadGroups(infile, outfile1, genome,
                                 PARAMS["readgroup_library"],
                                 PARAMS["readgroup_platform"],
                                 PARAMS["readgroup_platform_unit"])

    PipelineExome.GATKIndelRealign(outfile1, outfile2, genome,
                                   PARAMS["gatk_threads"])

    IOTools.zap_file(outfile1)

    PipelineExome.GATKBaseRecal(outfile2, outfile, genome,
                                PARAMS["gatk_dbsnp"],
                                PARAMS["gatk_solid_options"])
    IOTools.zap_file(outfile2)
Beispiel #2
0
def runFastqScreen(infiles, outfile):
    '''run FastqScreen on input files.'''

    # variables required for statement built by FastqScreen()
    tempdir = P.get_temp_dir(".")
    outdir = os.path.join(PARAMS["exportdir"], "fastq_screen")

    # configure job_threads with fastq_screen_options from PARAMS
    job_threads = re.findall(r'--threads \d+', PARAMS['fastq_screen_options'])
    if len(job_threads) != 1:
        raise ValueError("Wrong number of threads for fastq_screen")

    job_threads = int(re.sub(r'--threads ', '', job_threads[0]))

    # Create fastq_screen config file in temp directory
    # using parameters from Pipeline.yml
    with IOTools.open_file(os.path.join(tempdir, "fastq_screen.conf"),
                           "w") as f:
        for i, k in list(PARAMS.items()):
            if i.startswith("fastq_screen_database"):
                f.write("DATABASE\t%s\t%s\n" % (i[22:], k))

    m = PipelineMapping.FastqScreen()
    statement = m.build((infiles, ), outfile)
    P.run(statement, job_memory="8G")
    shutil.rmtree(tempdir)
    IOTools.touch_file(outfile)
def runTomTom(infile, outfile):
    '''compare ab-initio motifs against tomtom.'''

    tmpdir = P.get_temp_dir(".")
    databases = " ".join(P.as_list(PARAMS["tomtom_databases"]))

    target_path = os.path.join(os.path.abspath(PARAMS["exportdir"]), "tomtom",
                               outfile)

    if IOTools.is_empty(infile):
        E.warn("input is empty - no computation performed")
        P.touch(outfile)
        return

    statement = '''
    tomtom %(tomtom_options)s -oc %(tmpdir)s %(infile)s %(databases)s > %(outfile)s.log
    '''

    P.run(statement)

    # copy over results
    try:
        os.makedirs(os.path.dirname(target_path))
    except OSError:
        # ignore "file exists" exception
        pass

    if os.path.exists(target_path):
        shutil.rmtree(target_path)
    shutil.move(tmpdir, target_path)

    shutil.copyfile(os.path.join(target_path, "tomtom.txt"), outfile)
Beispiel #4
0
def GATKBaseRecal(infile,
                  outfile,
                  genome,
                  intervals,
                  padding,
                  dbsnp,
                  solid_options=""):
    '''Recalibrates base quality scores using GATK'''

    track = P.snip(os.path.basename(infile), ".bam")
    tmpdir_gatk = P.get_temp_dir('.')
    job_options = getGATKOptions()
    job_threads = 3

    statement = '''GenomeAnalysisTK
                    -T BaseRecalibrator
                    --out %(tmpdir_gatk)s/%(track)s.recal.grp
                    -R %(genome)s
                    -L %(intervals)s
                    -ip %(padding)s
                    -I %(infile)s
                    --knownSites %(dbsnp)s %(solid_options)s ;
                    ''' % locals()

    statement += '''GenomeAnalysisTK
                    -T PrintReads -o %(outfile)s
                    -BQSR %(tmpdir_gatk)s/%(track)s.recal.grp
                    -R %(genome)s
                    -I %(infile)s ;
                    ''' % locals()

    statement += '''rm -rf %(tmpdir_gatk)s ;''' % locals()
    P.run(statement)
Beispiel #5
0
def mergeSampleBams(infile, outfile):
    '''merge control and tumor bams'''
    # Note: need to change readgroup headers for merge and subsequent
    # splitting of bam files
    to_cluster = USECLUSTER
    job_memory = PARAMS["gatk_memory"]

    tmpdir_gatk = P.get_temp_dir(shared=True)

    outfile_tumor = outfile.replace(PARAMS["sample_control"],
                                    PARAMS["sample_tumour"])
    infile_tumor = infile.replace(PARAMS["sample_control"],
                                  PARAMS["sample_tumour"])

    infile_base = os.path.basename(infile)
    infile_tumor_base = infile_base.replace(PARAMS["sample_control"],
                                            PARAMS["sample_tumour"])

    track = P.snip(os.path.basename(infile), ".bam")
    track_tumor = track.replace(PARAMS["sample_control"],
                                PARAMS["sample_tumour"])

    library = PARAMS["readgroup_library"]
    platform = PARAMS["readgroup_platform"]
    platform_unit = PARAMS["readgroup_platform_unit"]

    control_id = "Control.bam"
    tumor_id = control_id.replace(PARAMS["sample_control"],
                                  PARAMS["sample_tumour"])

    statement = '''picard AddOrReplaceReadGroups
                    INPUT=%(infile)s
                    OUTPUT=%(tmpdir_gatk)s/%(infile_base)s
                    RGLB=%(library)s RGPL=%(platform)s
                    RGPU=%(platform_unit)s RGSM=%(track)s
                    ID=%(track)s
                    VALIDATION_STRINGENCY=SILENT ;'''
    statement += '''picard AddOrReplaceReadGroups
                    INPUT=%(infile_tumor)s
                    OUTPUT=%(tmpdir_gatk)s/%(infile_tumor_base)s
                    RGLB=%(library)s RGPL=%(platform)s
                    RGPU=%(platform_unit)s RGSM=%(track_tumor)s
                    ID=%(track_tumor)s
                    VALIDATION_STRINGENCY=SILENT ;'''
    statement += '''samtools merge -rf
                    %(outfile)s
                    %(tmpdir_gatk)s/%(infile_base)s
                    %(tmpdir_gatk)s/%(infile_tumor_base)s;'''
    statement += "samtools index %(outfile)s; "
    statement += "rm -rf %(tmpdir_gatk)s ;"
    P.run(statement)
    IOTools.zap_file(infile)
    IOTools.zap_file(infile_tumor)
Beispiel #6
0
def runMEME(track, outfile, dbhandle):
    '''run MEME to find motifs.

    In order to increase the signal/noise ratio,
    MEME is not run on all intervals but only the
    top 10% of intervals (peakval) are used.
    Also, only the segment of 200 bp around the peak
    is used and not the complete interval.

    * Softmasked sequence is converted to hardmasked
      sequence to avoid the detection of spurious motifs.

    * Sequence is run through dustmasker

    This method is deprecated - use runMEMEOnSequences instead.
    '''
    # job_options = "-l mem_free=8000M"

    target_path = os.path.join(os.path.abspath(P.get_params()["exportdir"]),
                               "meme", outfile)

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(P.get_params()["genome_dir"],
                     P.get_params()["genome"]))

    tmpdir = P.get_temp_dir(".")
    tmpfasta = os.path.join(tmpdir, "in.fa")

    nseq = writeSequencesForIntervals(
        track,
        tmpfasta,
        dbhandle,
        full=False,
        masker=P.as_list(P.get_params()['motifs_masker']),
        halfwidth=int(P.get_params()["meme_halfwidth"]),
        maxsize=int(P.get_params()["meme_max_size"]),
        proportion=P.get_params()["meme_proportion"],
        min_sequences=P.get_params()["meme_min_sequences"])

    if nseq == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        IOTools.touch_file(outfile)
    else:
        statement = '''
        meme %(tmpfasta)s -dna -revcomp -mod %(meme_model)s -nmotifs %(meme_nmotifs)s -oc %(tmpdir)s -maxsize %(meme_max_size)s %(meme_options)s > %(outfile)s.log
        '''
        P.run(statement)

        collectMEMEResults(tmpdir, target_path, outfile)
Beispiel #7
0
    def __init__(self,
                 save=True,
                 summarize=False,
                 threads=1,
                 qual_format='phred64',
                 *args, **kwargs):
        self.save = save
        self.summarize = summarize
        self.threads = threads
        if self.save:
            self.outdir = "processed.dir"
        else:
            self.outdir = P.get_temp_dir(shared=True)

        self.processors = []
        self.qual_format = qual_format
Beispiel #8
0
def runPicardOnRealigned(infile, outfile):
    to_cluster = USECLUSTER
    job_memory = PARAMS["gatk_memory"]

    tmpdir_gatk = P.get_temp_dir()

    outfile_tumor = outfile.replace(PARAMS["sample_control"],
                                    PARAMS["sample_tumour"])
    infile_tumor = infile.replace(PARAMS["sample_control"],
                                  PARAMS["sample_tumour"])

    track = P.snip(os.path.basename(infile), ".bam")
    track_tumor = track.replace(PARAMS["sample_control"],
                                PARAMS["sample_tumour"])

    genome = "%s/%s.fa" % (PARAMS["bwa_index_dir"], PARAMS["genome"])

    PipelineMappingQC.buildPicardAlignmentStats(infile, outfile, genome)
    PipelineMappingQC.buildPicardAlignmentStats(infile_tumor, outfile_tumor,
                                                genome)
Beispiel #9
0
def runFastqScreen(infiles, outfile):
    '''run FastqScreen on input files.'''

    # configure job_threads with fastq_screen_options from PARAMS
    job_threads = re.findall(r'--threads \d+', PARAMS['fastq_screen_options'])
    if len(job_threads) != 1:
        raise ValueError("Wrong number of threads for fastq_screen")

    job_threads = int(re.sub(r'--threads ', '', job_threads[0]))

    tempdir = P.get_temp_dir(".")
    conf_fn = os.path.join(tempdir, "fastq_screen.conf")
    with IOTools.open_file(conf_fn, "w") as f:
        for i, k in PARAMS.items():
            if i.startswith("fastq_screen_database"):
                f.write("DATABASE\t%s\t%s\n" % (i[22:], k))

    m = PipelineMapping.FastqScreen(config_filename=conf_fn)
    statement = m.build((infiles,), outfile)
    P.run(statement, job_memory="8G")
    shutil.rmtree(tempdir)
    IOTools.touch_file(outfile)
def runMemeCHIP(infile, outfile, motifs=None):
    '''Run the MEME-CHiP pipeline on the input files.
    optional motifs files can be supplied as a list'''

    if motifs:
        motifs = " ".join("-db %s" % motif for motif in motifs)
    else:
        motifs = " "

    nseqs = int(FastaIterator.count(infile))
    if nseqs == 0:
        E.warn("%s: no sequences - meme-chip skipped")
        P.touch(outfile)
        return

    target_path = os.path.join(os.path.abspath(PARAMS["exportdir"]), outfile)
    tmpdir = P.get_temp_dir(".")

    statement = '''
    meme-chip %(infile)s
             -p %(meme_threads)s 
             -oc %(tmpdir)s
             -nmeme %(memechip_nmeme)s
             %(memechip_options)s     
             %(motifs)s > %(outfile)s.log '''

    # If running with more than one thread
    # http://git.net/ml/clustering.gridengine.users/2007-04/msg00058.html
    # specify "excl=false -w n -pe openmpi-ib num_threads" in cluster_options
    # through job_options
    if int(PARAMS["memechip_threads"]) != 1:
        job_options = str(PARAMS["memechip_job_options"])
        job_threads = int(PARAMS["memechip_threads"])
        cluster_parallel_environment = str(
            PARAMS["memechip_cluster_parallel_environment"])

    P.run(statement)

    collectMEMEResults(tmpdir, target_path, outfile, method="memechip")
Beispiel #11
0
def GATKReadGroups(infile,
                   outfile,
                   genome,
                   library="unknown",
                   platform="Illumina",
                   platform_unit="1",
                   track="unknown"):
    '''Reorders BAM according to reference fasta and adds read groups'''

    if track == 'unknown':
        track = P.snip(os.path.basename(infile), ".bam")
    tmpdir_gatk = P.get_temp_dir('.')
    job_options = getGATKOptions()
    job_threads = 3

    statement = '''picard ReorderSam
                    INPUT=%(infile)s
                    OUTPUT=%(tmpdir_gatk)s/%(track)s.reordered.bam
                    REFERENCE=%(genome)s
                    ALLOW_INCOMPLETE_DICT_CONCORDANCE=true
                    VALIDATION_STRINGENCY=SILENT ;''' % locals()

    statement += '''samtools index %(tmpdir_gatk)s/%(track)s.reordered.bam ;
                 ''' % locals()

    statement += '''picard AddOrReplaceReadGroups
                    INPUT=%(tmpdir_gatk)s/%(track)s.reordered.bam
                    OUTPUT=%(outfile)s
                    RGLB=%(library)s
                    RGPL=%(platform)s
                    RGPU=%(platform_unit)s
                    RGSM=%(track)s
                    VALIDATION_STRINGENCY=SILENT ;''' % locals()

    statement += '''samtools index %(outfile)s ;
                 ''' % locals()
    statement += '''rm -rf %(tmpdir_gatk)s ;''' % locals()

    P.run(statement)
Beispiel #12
0
def runMEMEOnSequences(infile, outfile):
    '''run MEME to find motifs.

    In order to increase the signal/noise ratio, MEME is not run on
    all intervals but only the top 10% of intervals (peakval) are
    used.  Also, only the segment of 200 bp around the peak is used
    and not the complete interval.

    * Softmasked sequence is converted to hardmasked
      sequence to avoid the detection of spurious motifs.

    * Sequence is run through dustmasker

    '''
    # job_options = "-l mem_free=8000M"

    nseqs = int(FastaIterator.count(infile))
    if nseqs == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        IOTools.touch_file(outfile)
        return

    target_path = os.path.join(os.path.abspath(P.get_params()["exportdir"]),
                               "meme", outfile)
    tmpdir = P.get_temp_dir(".")

    statement = '''
    meme %(infile)s -dna -revcomp
    -mod %(meme_model)s
    -nmotifs %(meme_nmotifs)s
    -oc %(tmpdir)s
    -maxsize %(motifs_max_size)s
    %(meme_options)s
       > %(outfile)s.log
    '''

    P.run(statement)

    collectMEMEResults(tmpdir, target_path, outfile)
def runDREME(infile, outfile, neg_file="", options=""):
    ''' Run DREME on fasta file. If a neg_file is passed
    then DREME will use this as the negative set, otherwise
    the default is to shuffle the input '''

    nseqs_pos = int(FastaIterator.count(infile))
    if nseqs_pos < 2:
        E.warn("%s: less than 2 sequences - dreme skipped" % outfile)
        P.touch(outfile)
        return

    if neg_file:
        nseqs_neg = int(FastaIterator.count(neg_file))
        if nseqs_neg < 2:
            E.warn(
                "%s: less than 2 sequences in negatives file - dreme skipped" %
                outfile)
            P.touch(outfile)
            return
        else:
            neg_file = "-n %s" % neg_file

    logfile = outfile + ".log"
    target_path = os.path.join(os.path.abspath(PARAMS["exportdir"]), outfile)
    tmpdir = P.get_temp_dir(".")

    statement = '''
    dreme -p %(infile)s %(neg_file)s -png
        -oc %(tmpdir)s
            %(dreme_options)s
            %(options)s
       > %(logfile)s
    '''

    P.run(statement)

    collectMEMEResults(tmpdir, target_path, outfile, method="dreme")
def runMEMEOnSequences(infile, outfile, background=None, psp=None):
    '''run MEME on fasta sequences to find motifs
   
    By defualt MEME calculates a zero-th order background
    model from the nucleotide frequencies in the input set.

    To use a different background set, a background
    file created by fasta-get-markov must be supplied.

    To perform descrimantive analysis a position specific
    prior (psp) file must be provided. This can be generated
    used generatePSP.

    '''
    # job_options = "-l mem_free=8000M"

    nseqs = int(FastaIterator.count(infile))
    if nseqs < 2:
        E.warn("%s: less than 2 sequences - meme skipped" % outfile)
        P.touch(outfile)
        return

    # Get the total length of the sequences to decide the memory
    total_seqs_length = 0

    with IOTools.open_file(infile, "r") as fasta_reader:

        iterator_fasta = FastaIterator.iterate(fasta_reader)

        for fasta_seq in iterator_fasta:
            total_seqs_length += len(fasta_seq.sequence)

    fasta_reader.close()

    # If the length of all sequences is higher than 160,000bp
    # Up the memory
    job_memory = "2G"

    if (total_seqs_length > 160000):
        job_memory = "4G"

    if PARAMS.get("meme_revcomp", True):
        revcomp = "-revcomp"
    else:
        revcomp = ""

    target_path = os.path.join(os.path.abspath(PARAMS["exportdir"]), outfile)
    tmpdir = P.get_temp_dir(".")
    if background:
        background_model = "-bfile %s" % background
    else:
        background_model = ""

    if psp:
        E.info("Running MEME in descriminative mode")
        psp_file = "-psp %s" % psp
    else:
        psp_file = ""

    statement = '''
    meme %(infile)s -dna %(revcomp)s
    -p %(meme_threads)s
    -mod %(meme_model)s
    -nmotifs %(meme_nmotifs)s
    -oc %(tmpdir)s
    -maxsize %(meme_max_size)s
    %(background_model)s
    %(psp_file)s
    %(meme_options)s
       2> %(outfile)s.log
    '''

    # If running with more than one thread
    # http://git.net/ml/clustering.gridengine.users/2007-04/msg00058.html
    # specify "excl=false -w n -pe openmpi-ib num_threads" in cluster_options
    # through job_options
    if int(PARAMS["meme_threads"]) != 1:
        job_options = str(PARAMS["meme_job_options"])
        job_threads = int(PARAMS["meme_threads"])
        cluster_parallel_environment = str(
            PARAMS["meme_cluster_parallel_environment"])

    P.run(statement)

    collectMEMEResults(tmpdir, target_path, outfile)
def runMAST(infiles, outfile):
    '''run mast on all intervals and motifs.

    Collect all results for an E-value up to 10000 so that all
    sequences are output and MAST curves can be computed.

    10000 is a heuristic.

    '''

    # job_options = "-l mem_free=8000M"

    controlfile, dbfile, motiffiles = infiles

    if IOTools.is_empty(dbfile):
        P.touch(outfile)
        return

    if not os.path.exists(controlfile):
        raise ValueError("control file %s for %s does not exist" %
                         (controlfile, dbfile))

    # remove previous results
    if os.path.exists(outfile):
        os.remove(outfile)

    tmpdir = P.get_temp_dir(".")
    tmpfile = P.get_temp_filename(".")

    for motiffile in motiffiles:
        if IOTools.is_empty(motiffile):
            L.info("skipping empty motif file %s" % motiffile)
            continue

        of = IOTools.open_file(tmpfile, "a")
        motif, x = os.path.splitext(motiffile)
        of.write(":: motif = %s - foreground ::\n" % motif)
        of.close()

        # mast bails if the number of nucleotides gets larger than
        # 2186800982?
        # To avoid this, run db and control file separately.
        statement = '''
        cat %(dbfile)s
        | mast %(motiffile)s - -nohtml -oc %(tmpdir)s -ev %(mast_evalue)f %(mast_options)s >> %(outfile)s.log 2>&1;
        cat %(tmpdir)s/mast.txt >> %(tmpfile)s 2>&1
        '''
        P.run(statement)

        of = IOTools.open_file(tmpfile, "a")
        motif, x = os.path.splitext(motiffile)
        of.write(":: motif = %s - background ::\n" % motif)
        of.close()

        statement = '''
        cat %(controlfile)s
        | mast %(motiffile)s - -nohtml -oc %(tmpdir)s -ev %(mast_evalue)f %(mast_options)s >> %(outfile)s.log 2>&1;
        cat %(tmpdir)s/mast.txt >> %(tmpfile)s 2>&1
        '''
        P.run(statement)

    statement = "gzip < %(tmpfile)s > %(outfile)s"
    P.run(statement)

    shutil.rmtree(tmpdir)
    os.unlink(tmpfile)
Beispiel #16
0
 def setUp(self):
     self.work_dir = P.get_temp_dir()
 def setUp(self):
     self.work_dir = P.get_temp_dir(shared=True)