Example #1
def runFeatureCounts(annotations_file,
                     bamfile,
                     outfile,
                     nthreads=4,
                     strand=2,
                     options=""):
    '''run featureCounts on *annotations_file* with
    *bamfile*.

    If the bam-file is paired, paired-end counting
    is enabled and the bam file automatically sorted.
    '''

    # featureCounts cannot handle gzipped in or out files
    outfile = P.snip(outfile, ".gz")
    tmpdir = P.getTempDir()
    annotations_tmp = os.path.join(tmpdir, 'geneset.gtf')
    bam_tmp = os.path.join(tmpdir, os.path.basename(bamfile))

    # -p -B specifies count fragments rather than reads, and both
    # reads must map to the feature
    # for legacy reasons look at feature_counts_paired
    if BamTools.isPaired(bamfile):
        # select paired end mode, additional options
        paired_options = "-p -B"
        # remove .bam extension
        bam_prefix = P.snip(bam_tmp, ".bam")
        # sort by read name
        paired_processing = \
            """samtools sort -@ %(nthreads)i -n %(bamfile)s %(bam_prefix)s;
            checkpoint; """ % locals()
        bamfile = bam_tmp
    else:
        paired_options = ""
        paired_processing = ""

    job_threads = nthreads

    # AH: what is the -b option doing?
    statement = '''mkdir %(tmpdir)s;
                   zcat %(annotations_file)s > %(annotations_tmp)s;
                   checkpoint;
                   %(paired_processing)s
                   featureCounts %(options)s
                                 -T %(nthreads)i
                                 -s %(strand)s
                                 -b
                                 -a %(annotations_tmp)s
                                 %(paired_options)s
                                 -o %(outfile)s
                                 %(bamfile)s
                    >& %(outfile)s.log;
                    checkpoint;
                    gzip -f %(outfile)s;
                    checkpoint;
                    rm -rf %(tmpdir)s
    '''

    P.run()
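Throughout these examples P.run() takes no arguments yet executes the statement string defined just before it: in the CGAT pipeline framework it inspects the caller's local variables (statement, to_cluster, job_options, ...) and interpolates any remaining %(name)s placeholders from those locals plus the global PARAMS dictionary, which is why %%(name)s escapes survive an explicit % locals() pass and are filled in later. A minimal sketch of that mechanism, assuming a plain dict for PARAMS (this is not the real CGAT implementation):

import inspect

PARAMS = {"scriptsdir": "/path/to/scripts"}  # hypothetical pipeline configuration


def run():
    # collect the caller's locals, as the CGAT P.run() does via frame inspection
    caller_locals = inspect.currentframe().f_back.f_locals
    namespace = dict(PARAMS)
    namespace.update(caller_locals)
    # interpolate the remaining %(name)s placeholders in the caller's statement
    statement = namespace["statement"] % namespace
    print("would execute: %s" % statement)


def task():
    infile, outfile = "in.gtf.gz", "out.tsv"
    statement = "zcat %(infile)s | python %(scriptsdir)s/tool.py > %(outfile)s"
    run()  # picks up statement, infile and outfile from this frame


task()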
Example #2
def runTomTom(infile, outfile):
    '''compare ab-initio motifs against tomtom.'''

    tmpdir = P.getTempDir(".")

    to_cluster = True
    databases = " ".join(P.asList(PARAMS["tomtom_databases"]))

    target_path = os.path.join(os.path.abspath(PARAMS["exportdir"]), "tomtom",
                               outfile)

    if IOTools.isEmpty(infile):
        E.warn("input is empty - no computation performed")
        P.touch(outfile)
        return

    statement = '''
           tomtom %(tomtom_options)s -oc %(tmpdir)s %(infile)s %(databases)s > %(outfile)s.log
    '''

    P.run()

    # copy over results
    try:
        os.makedirs(os.path.dirname(target_path))
    except OSError:
        # ignore "file exists" exception
        pass

    if os.path.exists(target_path): shutil.rmtree(target_path)
    shutil.move(tmpdir, target_path)

    shutil.copyfile(os.path.join(target_path, "tomtom.txt"), outfile)
def runPicardOnRealigned(infile, outfile):
    '''run Picard CollectMultipleMetrics on the realigned control
    and tumour BAM files.'''
    to_cluster = USECLUSTER
    job_options = getGATKOptions()
    tmpdir_gatk = P.getTempDir('/ifs/scratch')
    threads = PARAMS["gatk_threads"]

    outfile_tumor = outfile.replace("Control", PARAMS["mutect_tumour"])
    infile_tumor = infile.replace("Control", PARAMS["mutect_tumour"])

    track = P.snip(os.path.basename(infile), ".bam")
    track_tumor = track.replace("Control", PARAMS["mutect_tumour"])

    statement = '''
    cat %(infile)s
    | python %%(scriptsdir)s/bam2bam.py -v 0 --set-sequence --bam
    | CollectMultipleMetrics
    INPUT=/dev/stdin
    REFERENCE_SEQUENCE=%%(bwa_index_dir)s/%%(genome)s.fa
    ASSUME_SORTED=true
    OUTPUT=%(outfile)s
    VALIDATION_STRINGENCY=SILENT
    >& %(outfile)s;
    cat %(infile_tumor)s
    | python %%(scriptsdir)s/bam2bam.py -v 0 --set-sequence --sam
    | CollectMultipleMetrics
    INPUT=/dev/stdin
    REFERENCE_SEQUENCE=%%(bwa_index_dir)s/%%(genome)s.fa
    ASSUME_SORTED=true
    OUTPUT=%(outfile_tumor)s
    VALIDATION_STRINGENCY=SILENT
    >& %(outfile_tumor)s;''' % locals()

    P.run()
Example #5
def runTomTom(infile, outfile):
    '''compare ab-initio motifs against tomtom.'''

    tmpdir = P.getTempDir(".")

    to_cluster = True
    databases = " ".join(P.asList(PARAMS["tomtom_databases"]))

    target_path = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), "tomtom", outfile)

    if IOTools.isEmpty(infile):
        E.warn("input is empty - no computation performed")
        P.touch(outfile)
        return

    statement = '''
           tomtom %(tomtom_options)s -oc %(tmpdir)s %(infile)s %(databases)s > %(outfile)s.log
    '''

    P.run()

    # copy over results
    try:
        os.makedirs(os.path.dirname(target_path))
    except OSError:
        # ignore "file exists" exception
        pass

    if os.path.exists(target_path):
        shutil.rmtree(target_path)
    shutil.move(tmpdir, target_path)

    shutil.copyfile(os.path.join(target_path, "tomtom.txt"), outfile)
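The try/except OSError around os.makedirs in both runTomTom variants exists only to tolerate a directory that already exists. A minimal sketch of a safer equivalent (re-raising anything other than EEXIST, which the original silently swallows along with, e.g., permission errors):

import errno
import os


def makedirs_exist_ok(path):
    # same intent as the try/except pattern above; on Python >= 3.2 this is
    # simply os.makedirs(path, exist_ok=True)
    try:
        os.makedirs(path)
    except OSError as exc:
        if exc.errno != errno.EEXIST:
            raise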
Example #6
def GATKpreprocessing(infile, outfile):
    '''Reorders BAM according to the reference fasta and adds read
    groups using Picard, realigns around indels and recalibrates
    base quality scores using GATK'''
    to_cluster = USECLUSTER
    track = P.snip(os.path.basename(infile), ".bam")
    tmpdir_gatk = P.getTempDir('.')
    job_options = getGATKOptions()
    library = PARAMS["readgroup_library"]
    platform = PARAMS["readgroup_platform"]
    platform_unit = PARAMS["readgroup_platform_unit"]
    threads = PARAMS["gatk_threads"]
    dbsnp = PARAMS["gatk_dbsnp"]
    solid_options = PARAMS["gatk_solid_options"]
    statement = '''ReorderSam INPUT=%(infile)s OUTPUT=%(tmpdir_gatk)s/%(track)s.reordered.bam REFERENCE=%%(bwa_index_dir)s/%%(genome)s.fa ALLOW_INCOMPLETE_DICT_CONCORDANCE=true VALIDATION_STRINGENCY=SILENT ; checkpoint ;''' % locals()
    statement += '''samtools index %(tmpdir_gatk)s/%(track)s.reordered.bam ; checkpoint ;''' % locals()
    statement += '''AddOrReplaceReadGroups INPUT=%(tmpdir_gatk)s/%(track)s.reordered.bam OUTPUT=%(tmpdir_gatk)s/%(track)s.readgroups.bam RGLB=%(library)s RGPL=%(platform)s RGPU=%(platform_unit)s RGSM=%(track)s VALIDATION_STRINGENCY=SILENT ; checkpoint ;''' % locals()
    statement += '''samtools index %(tmpdir_gatk)s/%(track)s.readgroups.bam ; checkpoint ;''' % locals()
    statement += '''GenomeAnalysisTK -T RealignerTargetCreator -o %(tmpdir_gatk)s/%(track)s.indelrealignment.intervals --num_threads %(threads)s -R %%(bwa_index_dir)s/%%(genome)s.fa -I %(tmpdir_gatk)s/%(track)s.readgroups.bam ; checkpoint ;''' % locals()
    statement += '''GenomeAnalysisTK -T IndelRealigner -o %(tmpdir_gatk)s/%(track)s.indelrealigned.bam -R %%(bwa_index_dir)s/%%(genome)s.fa -I %(tmpdir_gatk)s/%(track)s.readgroups.bam -targetIntervals %(tmpdir_gatk)s/%(track)s.indelrealignment.intervals ; checkpoint ;''' % locals()
    statement += '''GenomeAnalysisTK -T BaseRecalibrator --out %(tmpdir_gatk)s/%(track)s.recal.grp -R %%(bwa_index_dir)s/%%(genome)s.fa -I %(tmpdir_gatk)s/%(track)s.indelrealigned.bam --knownSites %(dbsnp)s %(solid_options)s ; checkpoint ;''' % locals()
    statement += '''GenomeAnalysisTK -T PrintReads -o %(outfile)s -BQSR %(tmpdir_gatk)s/%(track)s.recal.grp -R %%(bwa_index_dir)s/%%(genome)s.fa -I %(tmpdir_gatk)s/%(track)s.indelrealigned.bam ; checkpoint ;''' % locals()
    statement += '''rm -rf %(tmpdir_gatk)s ;'''
    P.run()
Example #8
    def build(self, config):
        '''
        return build statement to be run
        '''
        # output directory
        outdir = "soapdenovo.dir"

        # get track from config file
        for line in open(config):
            if line.startswith("q2"):
                continue
            elif line.startswith("q"):
                track = self.getTrack(line.rstrip().split("=")[1])

        options = "%(soapdenovo_options)s"
        tempdir = P.getTempDir(".")
        statement = '''%%(soapdenovo_executable)s all 
                       -s %%(infile)s 
                       -o %(tempdir)s/%(track)s
                       -K %%(kmer)s
                       %(options)s; checkpoint;
                       mv %(tempdir)s/%(track)s* %(outdir)s;
                       mv %(outdir)s/%(track)s.contig %(outdir)s/%(track)s.contigs.fa;
                       cat %(outdir)s/%(track)s.contigs.fa 
                       | python %%(scriptsdir)s/rename_contigs.py -a 
                       --log=%(outdir)s/%(track)s.contigs.log  
                       rm -rf %(tempdir)s''' % locals()
        return statement
    def build(self, infile):

        track = self.getTrack(infile)

        format = self.getFormat(infile)
        if format.endswith(".gz"):
            format = P.snip(format, ".gz")
        format = format.upper()

        # cortex_var only uses paired end information to
        # remove pcr duplicates
        if not self.checkPairs(infile):
            paired = "--se_list"
            reads = os.path.join(os.getcwd(), infile)

        # test for interleaved data before the len() check: checkPairs
        # returns a plain string here, which the len() test would also match
        elif self.checkPairs(infile) == "interleaved":
            raise ValueError(
                "pipeline does not support file of type 'interleaved'")

        elif len(self.checkPairs(infile)) > 1:
            paired = "--pe_list"
            read1 = infile
            format = P.snip(format, ".1")
            read2 = self.checkPairs(infile)[1]

        # NB: the temporary file and read-list handling below assumes
        # paired-end data in separate files; read1/read2 are only defined
        # in the --pe_list branch above
        temp = P.getTempDir()
        read1_new = os.path.join(temp, P.snip(read1, ".1.gz"))
        read2_new = os.path.join(temp, P.snip(read2, ".2.gz"))

        # paired end list
        list1 = open("cortex_var.dir/read1.txt", "w")
        list2 = open("cortex_var.dir/read2.txt", "w")
        list1.write(read1_new + "\n")
        list2.write(read2_new + "\n")
        list1.close()
        list2.close()

        list1 = os.path.abspath("cortex_var.dir/read1.txt")
        list2 = os.path.abspath("cortex_var.dir/read2.txt")

        reads = ",".join([os.path.join(os.getcwd(), x) for x in [read1_new, read2_new]])
        statement = (
            """  gunzip -c %(read1)s > %(read1_new)s
                       ; gunzip -c %(read2)s > %(read2_new)s  
                       ; cd cortex_var.dir
                       ; %%(cortex_var_executable)s %(paired)s %(list1)s,%(list2)s 
                       --format %(format)s
                       --mem_height 15
                       --quality_score_threshold %%(cortex_var_qual_threshold)i 
                       --remove_pcr_duplicates 
                       --remove_low_coverage_supernodes %%(cortex_var_rm_low_coverage_supernodes)i
                       --sample_id %(track)s
                       --kmer_size %%(kmer)s
                       --dump_binary dump_binary.ctx
                       ; rm -rf %(temp)s
                    """
            % locals()
        )

        return statement
    def preprocess(self, infile):
        '''
        fastq files need to be converted to fasta
        and pairs need to be merged
        '''

        mtype = None

        # check for paired end data either in the same file or in a separate file
        # for each read - will need to be gunzipped
        # check compression status
        if infile.endswith(".gz"):
            if len(self.checkPairs(
                    infile)) > 1:  # check for paired data in separate files
                read1 = infile
                read2 = self.checkPairs(infile)[1]
                temp = P.getTempDir()
            elif self.checkPairs == "interleaved":
                infile_new = os.path.join(temp, P.snip(infile, ".gz"))
                zippy = """gunzip -c %(infile)s > %(infile_new)s; """ % locals(
                )
        else:
            zippy = ""

        # only need to convert if the data are in fastq format
        # reads are fastq and in the same file (interleaved)
        if self.getFormat(infile).find("fastq") != -1 and \
                self.checkPairs(infile) == "interleaved":
            mtype = "--paired"  # argument for conversion tool
        # reads are fastq and paired in separate files
        elif self.getFormat(infile).find("fastq") != -1 and \
                len(self.checkPairs(infile)) > 1:
            mtype = "--merge"  # argument for conversion tool

        # requires a merge of the fastq files in to fasta format
        if mtype:  # the reads are paired end
            if mtype == "--merge":
                outf = P.snip(os.path.basename(read1), ".fastq.1.gz") + ".fa"

                # check if file exists - metaphlan also performs this preprocessing step
                if not os.path.exists(outf):
                    statement = '''python %%(scriptsdir)s/fastqs2fasta.py -a %(read1)s -b %(read2)s --log=%(read1)s.log > %(outf)s
                                ''' % locals()
                    P.run()
                else:
                    E.info("no need to create file %s - exists" % outf)

            elif mtype == "--paired":
                outf = P.snip(os.path.basename(infile_new), ".fastq") + ".fa"
                statement = '''%(zippy)s'''
                P.run()
                statement = '''fq2fa %(mtype)s %(infile_new)s %(outf)s; checkpoint;
                             rm -rf %(temp)s''' % locals()
                P.run()
        else:
            statement = None
        return statement
    def build(self, infile):

        track = self.getTrack(infile)

        format = self.getFormat(infile)
        if format.endswith(".gz"):
            format = P.snip(format, ".gz")
        format = format.upper()

        # cortex_var only uses paired end information to
        # remove pcr duplicates
        if not self.checkPairs(infile):
            paired = "--se_list"
            reads = os.path.join(os.getcwd(), infile)

        # test for interleaved data before the len() check: checkPairs
        # returns a plain string here, which the len() test would also match
        elif self.checkPairs(infile) == "interleaved":
            raise ValueError(
                "pipeline does not support file of type 'interleaved'")

        elif len(self.checkPairs(infile)) > 1:
            paired = "--pe_list"
            read1 = infile
            format = P.snip(format, ".1")
            read2 = self.checkPairs(infile)[1]

        # NB: the temporary file and read-list handling below assumes
        # paired-end data in separate files; read1/read2 are only defined
        # in the --pe_list branch above
        temp = P.getTempDir()
        read1_new = os.path.join(temp, P.snip(read1, ".1.gz"))
        read2_new = os.path.join(temp, P.snip(read2, ".2.gz"))

        # paired end list
        list1 = open("cortex_var.dir/read1.txt", "w")
        list2 = open("cortex_var.dir/read2.txt", "w")
        list1.write(read1_new + "\n")
        list2.write(read2_new + "\n")
        list1.close()
        list2.close()

        list1 = os.path.abspath("cortex_var.dir/read1.txt")
        list2 = os.path.abspath("cortex_var.dir/read2.txt")

        reads = ",".join(
            [os.path.join(os.getcwd(), x) for x in [read1_new, read2_new]])
        statement = '''  gunzip -c %(read1)s > %(read1_new)s
                       ; gunzip -c %(read2)s > %(read2_new)s  
                       ; cd cortex_var.dir
                       ; %%(cortex_var_executable)s %(paired)s %(list1)s,%(list2)s 
                       --format %(format)s
                       --mem_height 15
                       --quality_score_threshold %%(cortex_var_qual_threshold)i 
                       --remove_pcr_duplicates 
                       --remove_low_coverage_supernodes %%(cortex_var_rm_low_coverage_supernodes)i
                       --sample_id %(track)s
                       --kmer_size %%(kmer)s
                       --dump_binary dump_binary.ctx
                       ; rm -rf %(temp)s
                    ''' % locals()

        return statement
    def preprocess(self, infile):
        '''
        fastq files need to be converted to fasta
        and pairs need to be merged
        '''

        mtype = None

        # check for paired end data either in the same file or in a separate file
        # for each read - will need to be gunzipped
        # check compression status
        if infile.endswith(".gz"):
            # check for paired data in separate files
            if len(self.checkPairs(infile)) > 1:
                read1 = infile
                read2 = self.checkPairs(infile)[1]
                temp = P.getTempDir()
            elif self.checkPairs == "interleaved":
                infile_new = os.path.join(temp, P.snip(infile, ".gz"))
                zippy = """gunzip -c %(infile)s > %(infile_new)s; """ % locals()
        else:
            zippy = ""

        # only need to convert if the data are in fastq format
        # reads are fastq and in the same file (interleaved)
        if self.getFormat(infile).find("fastq") != -1 and \
                self.checkPairs(infile) == "interleaved":
            mtype = "--paired"  # argument for conversion tool
        # reads are fastq and paired in separate files
        elif self.getFormat(infile).find("fastq") != -1 and \
                len(self.checkPairs(infile)) > 1:
            mtype = "--merge"  # argument for conversion tool

        # requires a merge of the fastq files in to fasta format
        if mtype:  # the reads are paired end
            if mtype == "--merge":
                outf = P.snip(os.path.basename(read1), ".fastq.1.gz") + ".fa"

                # check if file exists - metaphlan also performs this
                # preprocessing step
                if not os.path.exists(outf):
                    statement = '''python %%(scriptsdir)s/fastqs2fasta.py -a %(read1)s -b %(read2)s --log=%(read1)s.log > %(outf)s
                                ''' % locals()
                    P.run()
                else:
                    E.info("no need to create file %s - exists" % outf)

            elif mtype == "--paired":
                outf = P.snip(os.path.basename(infile_new), ".fastq") + ".fa"
                statement = '''%(zippy)s'''
                P.run()
                statement = '''fq2fa %(mtype)s %(infile_new)s %(outf)s; checkpoint;
                             rm -rf %(temp)s''' % locals()
                P.run()
        else:
            statement = None
        return statement
    def preprocess(self, infile):
        '''
        fastq files need to be converted to fasta
        and pairs need to be merged
        '''

        mtype = None

        # check for paired end data either in the same file or in a separate file
        # for each read - will need to be gunzipped
        # check compression status
        if infile.endswith(".gz"):
            if len(self.checkPairs(
                    infile)) > 1:  # check for paired data in separate files
                read1 = infile
                read2 = self.checkPairs(infile)[1]
                temp = P.getTempDir()
                read1_new = os.path.join(temp, P.snip(infile, ".gz"))
                read2_new = os.path.join(
                    temp, P.snip(self.checkPairs(infile)[1], ".gz"))
                zippy = """gunzip -c %(read1)s > %(read1_new)s
                       ; gunzip -c %(read2)s > %(read2_new)s; """ % locals()
            elif self.checkPairs == "interleaved":
                infile_new = os.path.join(temp, P.snip(infile, ".gz"))
                zippy = """gunzip -c %(infile)s > %(infile_new)s; """ % locals(
                )
        else:
            zippy = ""

        # only need to convert if the data are in fastq format
        # reads are fastq and in the same file (interleaved)
        if self.getFormat(infile).find("fastq") != -1 and \
                self.checkPairs(infile) == "interleaved":
            mtype = "--paired"  # argument for conversion tool
        # reads are fastq and paired in separate files
        elif self.getFormat(infile).find("fastq") != -1 and \
                len(self.checkPairs(infile)) > 1:
            mtype = "--merge"  # argument for conversion tool

        # build statement
        if mtype:  # the reads are paired end
            if mtype == "--merge":
                outf = P.snip(os.path.basename(read1_new), ".fastq.1") + ".fa"
                statement = '''%(zippy)s
                             fq2fa %(mtype)s %(read1_new)s %(read2_new)s %(outf)s
                             ''' % locals()
            elif mtype == "--paired":
                outf = P.snip(os.path.basename(infile_new), ".fastq") + ".fa"
                statement = '''%(zippy)s
                             fq2fa %(mtype)s %(infile_new)s %(outf)s; checkpoint;
                             rm -rf %(temp)s''' % locals()
        else:
            statement = None
        return statement
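All of the preprocess and build methods above branch on self.checkPairs(infile), which, from the way it is used, must return False for single-end data, the string "interleaved" for interleaved files, or a sequence such as ("separate", mate_filename) for pairs split across two files. The method itself is not part of these examples; a hypothetical sketch based on the .fastq.1.gz/.fastq.2.gz naming convention used throughout (the real CGAT method may differ):

import os


def check_pairs(infile):
    # hypothetical reimplementation - the real CGAT method is not shown here
    if infile.endswith(".fastq.1.gz"):
        mate = infile.replace(".fastq.1.gz", ".fastq.2.gz")
        if os.path.exists(mate):
            # paired data in separate files: second element is the mate file
            return ("separate", mate)
    if infile.endswith(".fastq.gz") and ".interleaved." in infile:
        # one file holding both mates, flagged by a naming convention
        return "interleaved"
    # single-end data
    return False

Note that because "interleaved" is a string of length 11, any len() test must run after the equality test; this is why the branches in the snippets above were reordered.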
Example #14
    def build(self, infile, PARAMS):
        '''
        run velveth and velvetg
        followed by meta-velvetg
        '''
        outdir = P.getTempDir(".")
        format = self.getFormat(infile)
        paired = self.checkPairs(infile)

        if not paired:
            pair = ""
            files = infile
            read_type = "short"
        else:
            pair = "-%s" % paired[0]
            files = " ".join([infile, paired[1]])
            read_type = "shortPaired"

        if format == "fastq.1.gz":
            format = "fastq.gz"
        metavelvet_dir = os.path.join(os.getcwd(), "metavelvet.dir")
        track = self.getTrack(os.path.basename(infile))

        self.stats_file = track + ".stats.txt"

        if paired:
            insert_length = "-ins_length %i" % PARAMS["velvetg_insert_length"]
        else:
            insert_length = ""

        # velveth and velvetg have to be run to build hash tables and initial
        # de bruijn graphs
        statement = '''%%(velveth_executable)s %(outdir)s %%(kmer)i -%(format)s -%(read_type)s %(pair)s %(files)s >> %(metavelvet_dir)s/%(track)s_velveth.log
                      ; checkpoint
                      ; mv %(outdir)s/Log %(metavelvet_dir)s/%(track)s.velveth.log
                      ; %%(velvetg_executable)s %(outdir)s -exp_cov auto %(insert_length)s
                      ; checkpoint
                      ; %%(metavelvet_executable)s %(outdir)s %(insert_length)s
                      ; mv %(outdir)s/Roadmaps %(metavelvet_dir)s/%(track)s.roadmaps
                      ; gzip %(metavelvet_dir)s/%(track)s.roadmaps
                      ; mv %(outdir)s/Sequences %(metavelvet_dir)s/%(track)s.sequences
                      ; gzip %(metavelvet_dir)s/%(track)s.sequences
                      ; mv %(outdir)s/Graph2 %(metavelvet_dir)s/%(track)s.graph2
                      ; gzip %(metavelvet_dir)s/%(track)s.graph2
                      ; cat %(outdir)s/meta-velvetg.contigs.fa | python %%(scriptsdir)s/rename_contigs.py -a metavelvet 
                                                                 --log=%(metavelvet_dir)s/%(track)s.contigs.log
                        >  %(metavelvet_dir)s/%(track)s.contigs.fa
                      ; sed -i 's/in/_in/g' %(outdir)s/meta-velvetg.Graph2-stats.txt
                      ; mv  %(outdir)s/meta-velvetg.Graph2-stats.txt %(metavelvet_dir)s/%(track)s.stats.txt
                      ; rm -rf %(outdir)s
                      ''' % locals()
        return statement
Example #17
def trimReads(infile, outfile):
    '''trim reads with FastX'''
    to_cluster = True

    tmpdir_fastq = P.getTempDir()
    track = P.snip(os.path.basename(infile), ".gz")
    statement = """gunzip < %(infile)s | python %%(scriptsdir)s/fastq2fastq.py 
                       --change-format=sanger 
                       --guess-format=phred64 
                       --log=%(outfile)s.log
                   > %(tmpdir_fastq)s/%(track)s;""" % locals()
    statement += """zcat %(infile)s | fastx_trimmer -f %(first_base)s -l %(last_base)s -z -o %(outfile)s """

    P.run()
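trimReads first re-encodes qualities with --change-format=sanger --guess-format=phred64, i.e. from the Illumina 1.3+ offset (ASCII 64) to the Sanger offset (ASCII 33), before trimming. The conversion is plain offset arithmetic; a minimal sketch (not the CGAT fastq2fastq.py script):

def phred64_to_sanger(quals):
    # phred64 stores quality Q as chr(Q + 64), sanger as chr(Q + 33),
    # so re-encoding is a uniform shift of -31 per character
    return "".join(chr(ord(c) - 31) for c in quals)


assert phred64_to_sanger("h") == "I"  # Q40 in both encodings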
Example #18
def runMEME(track, outfile, dbhandle):
    '''run MEME to find motifs.

    In order to increase the signal/noise ratio,
    MEME is not run on all intervals but only the 
    top 10% of intervals (peakval) are used. 
    Also, only the segment of 200 bp around the peak
    is used and not the complete interval.

    * Softmasked sequence is converted to hardmasked
      sequence to avoid the detection of spurious motifs.

    * Sequence is run through dustmasker

    This method is deprecated - use runMEMEOnSequences instead.
    '''
    to_cluster = True
    # job_options = "-l mem_free=8000M"

    target_path = os.path.join(os.path.abspath(PARAMS["exportdir"]), "meme",
                               outfile)

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], PARAMS["genome"]))

    tmpdir = P.getTempDir(".")
    tmpfasta = os.path.join(tmpdir, "in.fa")

    nseq = writeSequencesForIntervals(
        track,
        tmpfasta,
        dbhandle,
        full=False,
        masker=P.asList(PARAMS['motifs_masker']),
        halfwidth=int(PARAMS["meme_halfwidth"]),
        maxsize=int(PARAMS["meme_max_size"]),
        proportion=PARAMS["meme_proportion"],
        min_sequences=PARAMS["meme_min_sequences"])

    if nseq == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        P.touch(outfile)
    else:
        statement = '''
        meme %(tmpfasta)s -dna -revcomp -mod %(meme_model)s -nmotifs %(meme_nmotifs)s -oc %(tmpdir)s -maxsize %(meme_max_size)s %(meme_options)s > %(outfile)s.log
        '''
        P.run()

        collectMEMEResults(tmpdir, target_path, outfile)
Example #19
def assignEssentialGenesToContigs(infile, outfile):
    '''
    assign essential genes to contigs
    '''
    dirname = os.path.dirname(infile)
    essential = PARAMS["hmmer_hmm"]
    tempdir = P.getTempDir(".")

    statement = '''zcat %(infile)s > %(tempdir)s/orfs.fa;
    hmmsearch --tblout %(tempdir)s/hmm.out --cut_tc
    --notextw  %(essential)s %(tempdir)s/orfs.fa;
    tail -n+4 %(tempdir)s/hmm.out | sed 's/ * / /g' | cut -f 1,4 -d " "
    | gzip > %(outfile)s'''
    P.run()
    statement = '''rm -rf %(tempdir)s'''
    P.run()
Example #20
def runMEME(track, outfile, dbhandle):
    '''run MEME to find motifs.

    In order to increase the signal/noise ratio,
    MEME is not run on all intervals but only the 
    top 10% of intervals (peakval) are used. 
    Also, only the segment of 200 bp around the peak
    is used and not the complete interval.

    * Softmasked sequence is converted to hardmasked
      sequence to avoid the detection of spurious motifs.

    * Sequence is run through dustmasker

    This method is deprecated - use runMEMEOnSequences instead.
    '''
    to_cluster = True
    # job_options = "-l mem_free=8000M"

    target_path = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), "meme", outfile)

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], PARAMS["genome"]))

    tmpdir = P.getTempDir(".")
    tmpfasta = os.path.join(tmpdir, "in.fa")

    nseq = writeSequencesForIntervals(track, tmpfasta,
                                      dbhandle,
                                      full=False,
                                      masker=P.asList(PARAMS['motifs_masker']),
                                      halfwidth=int(PARAMS["meme_halfwidth"]),
                                      maxsize=int(PARAMS["meme_max_size"]),
                                      proportion=PARAMS["meme_proportion"],
                                      min_sequences=PARAMS["meme_min_sequences"])

    if nseq == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        P.touch(outfile)
    else:
        statement = '''
        meme %(tmpfasta)s -dna -revcomp -mod %(meme_model)s -nmotifs %(meme_nmotifs)s -oc %(tmpdir)s -maxsize %(meme_max_size)s %(meme_options)s > %(outfile)s.log
        '''
        P.run()

        collectMEMEResults(tmpdir, target_path, outfile)
    def preprocess(self, infile):
        '''
        fastq files need to be converted to fasta
        and pairs need to be merged
        '''

        mtype = None
        
        # check for paired end data either in the same file or in a separate file
        # for each read - will need to be gunzipped
        # check compression status
        if infile.endswith(".gz"):
            if len(self.checkPairs(infile)) > 1: # check for paired data in separate files
               read1 = infile 
               read2 = self.checkPairs(infile)[1]
               temp = P.getTempDir()
               read1_new = os.path.join(temp, P.snip(infile, ".gz"))
               read2_new = os.path.join(temp, P.snip(self.checkPairs(infile)[1], ".gz"))
               zippy = """gunzip -c %(read1)s > %(read1_new)s
                       ; gunzip -c %(read2)s > %(read2_new)s; """ % locals()
            elif self.checkPairs == "interleaved":
                infile_new = os.path.join(temp, P.snip(infile, ".gz")) 
                zippy = """gunzip -c %(infile)s > %(infile_new)s; """ % locals()
        else:
            zippy = ""
        
        # only need to convert if the data are in fastq format
        # reads are fastq and in the same file (interleaved)
        if self.getFormat(infile).find("fastq") != -1 and \
                self.checkPairs(infile) == "interleaved":
            mtype = "--paired"  # argument for conversion tool
        # reads are fastq and paired in separate files
        elif self.getFormat(infile).find("fastq") != -1 and \
                len(self.checkPairs(infile)) > 1:
            mtype = "--merge"  # argument for conversion tool

        # build statement
        if mtype: # the reads are paired end
            if mtype == "--merge":
                outf = P.snip(os.path.basename(read1_new), ".fastq.1") + ".fa" 
                statement = '''%(zippy)s
                             fq2fa %(mtype)s %(read1_new)s %(read2_new)s %(outf)s
                             ''' % locals()
            elif mtype == "--paired":
                outf = P.snip(os.path.basename(infile_new), ".fastq") + ".fa"
                statement = '''%(zippy)s
                             fq2fa %(mtype)s %(infile_new)s %(outf)s; checkpoint;
                             rm -rf %(temp)s''' % locals()
        else:
            statement = None
        return statement
    def build(self, infile):
        '''
        build statement for running idba
        input is merged fasta file and 
        output is contigs fasta file
        '''
        track = self.getTrack(infile)
        outdir = "idba.dir"

        # get temporary file for running idba
        tempdir = P.getTempDir()

        # NB at the moment we assume the default maxkmer of 100
        statement = '''%%(idba_executable)s -r %(infile)s -o %(tempdir)s %%(idba_options)s
                       ; mv %(tempdir)s/scaffold.fa idba.dir/%(track)s.scaffolds.fa
                       ; mv %(tempdir)s/contig-%%(idba_maxkmer)s.fa idba.dir/%(track)s.contigs.fa''' % locals()

        # the temporary directory is removed up front; idba is assumed to
        # recreate its -o output directory when the statement later runs
        shutil.rmtree(tempdir)
        return statement
Example #23
def buildCoverageOverContigs(infiles, outfile):
    '''
    build histograms of the coverage over each of the contigs
    '''
    bam = infiles[0]
    # genomecoveragebed does not like some of the 
    # output from bwa. bwa outputs some reads
    # that map off the end of contigs
    # as having a leftmost position of 0. This is
    # not ideal. Need to use temporary bam
    # files with only mapped reads - this is 
    # nasty and needs changing
    tempdir = P.getTempDir(".")
    tempname = P.getTempFilename(tempdir) + ".bam"
    P.submit("CGATPipelines.PipelineMetagenomeAssembly", 
             "filterBamOnPos", 
             infiles = bam, 
             outfiles = tempname)

    # tablename where alignment stats live
    tablename = os.path.dirname(
        bam)[:-len(".dir")] + "_" + P.snip(os.path.basename(bam), ".bam") + "_alignment_stats"

    # hack to convert to table - add .load
    tablename = P.toTable(tablename + ".load")
    
    # connect to database
    dbh = connect()
    cc = dbh.cursor()

    # get number of reads aligned from bam2stats
    if PARAMS.get("coverage_scale"):
        scale_factor = cc.execute("""SELECT counts FROM %s
                                     WHERE category == 'reads_mapped'""" % tablename).fetchone()[0]
        scale_factor = 1 / (float(scale_factor) / 1000000)
        scale_options = "-scale %(scale_factor)f"
    else:
        scale_options = ""

    statement = '''genomeCoverageBed -ibam %(tempname)s %(scale_options)s -d | gzip > %(outfile)s;
                   rm -rf %(tempdir)s'''
    P.run()
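The scale factor above is a standard reads-per-million normalisation: with N mapped reads, 1 / (N / 1e6) rescales the per-base depths that genomeCoverageBed reports (via -scale) so libraries of different depth are comparable. A worked example with an assumed read count:

reads_mapped = 25000000  # assumed value from the *_alignment_stats table
scale_factor = 1 / (float(reads_mapped) / 1000000)
print(scale_factor)  # 0.04 - per-base depth becomes depth per million mapped reads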
Example #24
    def build(self, infile):
        '''
        build statement for running spades
        '''
        track = self.getTrack(os.path.basename(infile))

        format = self.getFormat(infile)
        paired = self.checkPairs(infile)

        tempdir = P.getTempDir(".")
        outdir = "spades.dir"

        # input files
        if not paired:
            files = infile
            files_statement = "-s %s" % files
            # no temporary fastqs are needed for single-end input
            unzip_statement = ""
        else:
            # spades doesn't like the fastq.1.gz type format
            temp1 = os.path.join(tempdir, track + ".1.fastq")
            temp2 = os.path.join(tempdir, track + ".2.fastq")
            infile2 = paired[1]
            unzip_statement = "zcat %(infile)s > %(temp1)s; zcat %(infile2)s > %(temp2)s; checkpoint;" % locals()
            files_statement = "-1 %s -2 %s" % (temp1, temp2)

        # kmer to use
        k = "-k %(kmer)s"
        
        # spades options
        spades_options = "%(spades_options)s"

        # deal with spades output
        move_statement = """mv %(tempdir)s/corrected/%(track)s*cor.* %(outdir)s; \
                          mv %(tempdir)s/contigs.fasta %(outdir)s/%(track)s.contigs.fa; \
                          mv %(tempdir)s/scaffolds.fasta %(outdir)s/%(track)s.scaffolds.fa; \
                          mv %(tempdir)s/spades.log %(outdir)s/%(track)s.contigs.log""" % locals()
        
        # statement - simple and default; the temporary fastqs live inside
        # %(tempdir)s, so a single rm -rf cleans everything up
        statement = '''%(unzip_statement)s
                       spades.py %(files_statement)s -o %(tempdir)s %(k)s %(spades_options)s; checkpoint;
                       %(move_statement)s; checkpoint;
                       rm -rf %(tempdir)s''' % locals()
        return statement
Example #25
    def build(self, infile):
        '''
        build statement for running idba
        input is merged fasta file and 
        output is contigs fasta file
        '''
        track = self.getTrack(infile)
        outdir = "idba.dir"

        # get temporary file for running idba
        tempdir = P.getTempDir()

        # NB at the moment we assume the default maxkmer of 100
        statement = '''%%(idba_executable)s -r %(infile)s -o %(tempdir)s %%(idba_options)s
                       ; mv %(tempdir)s/scaffold.fa idba.dir/%(track)s.scaffolds.fa
                       ; cat %(tempdir)s/contig-%%(idba_maxkmer)s.fa | python %%(scriptsdir)s/rename_contigs.py -a idba --log=%(outdir)s/%(track)s.contigs.log
                        > idba.dir/%(track)s.contigs.fa''' % locals()

        # the temporary directory is removed up front; idba is assumed to
        # recreate its -o output directory when the statement later runs
        shutil.rmtree(tempdir)
        return statement
Example #26
def runMEMEOnSequences(infile, outfile):
    '''run MEME to find motifs.

    In order to increase the signal/noise ratio,
    MEME is not run on all intervals but only the 
    top 10% of intervals (peakval) are used. 
    Also, only the segment of 200 bp around the peak
    is used and not the complete interval.

    * Softmasked sequence is converted to hardmasked
      sequence to avoid the detection of spurious motifs.

    * Sequence is run through dustmasker
    '''
    to_cluster = True
    # job_options = "-l mem_free=8000M"

    nseqs = int(FastaIterator.count(infile))
    if nseqs == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        P.touch(outfile)
        return

    target_path = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), "meme", outfile)
    tmpdir = P.getTempDir(".")

    statement = '''
        meme %(infile)s -dna -revcomp 
                        -mod %(meme_model)s 
                        -nmotifs %(meme_nmotifs)s 
                        -oc %(tmpdir)s 
                        -maxsize %(motifs_max_size)s 
                        %(meme_options)s 
       > %(outfile)s.log
    '''

    P.run()

    collectMEMEResults(tmpdir, target_path, outfile)
def buildClusters( infiles, outfiles ):
    '''run c-means clustering on expression level data.'''

    to_cluster = USECLUSTER
    job_options = "-l mem_free=10G"

    # ignore the background file (why is it included in infiles?)
    infile, _ = infiles
    instructions_filename, centroid_filename, membership_filename = outfiles

    instructions_filename = os.path.abspath( instructions_filename )
    cdt_filename = os.path.abspath( infile )
    kmeans_clusters = PARAMS["kmeans_clusters"]

    # run aerie in a temporary directory
    tmpdir = P.getTempDir(".")

    with open( instructions_filename, "w" ) as outf:
       outf.write( '''load %(cdt_filename)s
fuzzy %(kmeans_clusters)i
%(tmpdir)s/all
exit
''' % locals())
       
    statement = '''
    aerie < %(instructions_filename)s >& %(instructions_filename)s.log
    '''
    P.run()

    try:
        shutil.move( os.path.join( tmpdir, "all.fct"), centroid_filename )
        shutil.move( os.path.join( tmpdir, "all.mb"), membership_filename )
    except IOError as msg:
        E.warn("no results for %s,%s: %s" % (centroid_filename,
                                             membership_filename,
                                             msg))
        P.touch( centroid_filename )
        P.touch( membership_filename )
    def build(self, infile):
        """
        build statement for running idba
        input is merged fasta file and 
        output is contigs fasta file
        """
        track = self.getTrack(infile)
        outdir = "idba.dir"

        # get temporary file for running idba
        tempdir = P.getTempDir()

        # NB at the moment we assume the default maxkmer of 100
        statement = (
            """%%(idba_executable)s -r %(infile)s -o %(tempdir)s %%(idba_options)s
                       ; mv %(tempdir)s/scaffold.fa idba.dir/%(track)s.scaffolds.fa
                       ; cat %(tempdir)s/contig-%%(idba_maxkmer)s.fa | python %%(scriptsdir)s/rename_contigs.py -a idba --log=%(outdir)s/%(track)s.contigs.log
                        > idba.dir/%(track)s.contigs.fa"""
            % locals()
        )

        # the temporary directory is removed up front; idba is assumed to
        # recreate its -o output directory when the statement later runs
        shutil.rmtree(tempdir)
        return statement
Example #30
def buildClusters(infiles, outfiles):
    '''run c-means clustering on expression level data.'''

    to_cluster = USECLUSTER
    job_options = "-l mem_free=10G"

    # ignore the background file (why is it included in infiles?)
    infile, _ = infiles
    instructions_filename, centroid_filename, membership_filename = outfiles

    instructions_filename = os.path.abspath(instructions_filename)
    cdt_filename = os.path.abspath(infile)
    kmeans_clusters = PARAMS["kmeans_clusters"]

    # run aerie in a temporary directory
    tmpdir = P.getTempDir(".")

    with open(instructions_filename, "w") as outf:
        outf.write('''load %(cdt_filename)s
fuzzy %(kmeans_clusters)i
%(tmpdir)s/all
exit
''' % locals())

    statement = '''
    aerie < %(instructions_filename)s >& %(instructions_filename)s.log
    '''
    P.run()

    try:
        shutil.move(os.path.join(tmpdir, "all.fct"), centroid_filename)
        shutil.move(os.path.join(tmpdir, "all.mb"), membership_filename)
    except IOError as msg:
        E.warn("no results for %s,%s: %s" %
               (centroid_filename, membership_filename, msg))
        P.touch(centroid_filename)
        P.touch(membership_filename)
    def build(self, infile):
        '''
        run velveth and velvetg
        followed by meta-velvetg
        '''
        outdir = P.getTempDir()
        format = self.getFormat(infile)
        paired = self.checkPairs(infile)
        # NB: assumes paired-end data in separate files; checkPairs may also
        # return False (single-end) or "interleaved", which the -shortPaired
        # statement below does not handle
        if len(paired) > 1:
            pair = paired[0]
            files = " ".join([infile, paired[1]])
        else:
            pair = paired
            files = infile
        if format == "fastq.1.gz":
            format = "fastq.gz"
        metavelvet_dir = os.path.join(os.getcwd(), "metavelvet.dir")
        track = self.getTrack(infile)
        
        self.stats_file = track + ".stats.txt"

        # velveth and velvetg have to be run to build hash tables and initial de bruijn graphs
        statement = '''%%(velveth_executable)s %(outdir)s %%(kmer)i -%(format)s -shortPaired -%(pair)s %(files)s
                      ; cd %(outdir)s; %%(velvetg_executable)s %(outdir)s -exp_cov auto -ins_length %%(velvetg_insert_length)i
                      ; %%(metavelvet_executable)s %(outdir)s -ins_length %%(velvetg_insert_length)i
                      ; mv %(outdir)s/Roadmaps %(metavelvet_dir)s/%(track)s.roadmaps
                      ; gzip %(metavelvet_dir)s/%(track)s.roadmaps
                      ; mv %(outdir)s/Sequences %(metavelvet_dir)s/%(track)s.sequences
                      ; gzip %(metavelvet_dir)s/%(track)s.sequences
                      ; mv %(outdir)s/Graph2 %(metavelvet_dir)s/%(track)s.graph2
                      ; gzip %(metavelvet_dir)s/%(track)s.graph2
                      ; mv %(outdir)s/meta-velvetg.contigs.fa %(metavelvet_dir)s/%(track)s.contigs.fa
                      ; sed -i 's/in/_in/g' %(outdir)s/meta-velvetg.Graph2-stats.txt
                      ; mv  %(outdir)s/meta-velvetg.Graph2-stats.txt %(metavelvet_dir)s/%(track)s.stats.txt
                      ; rm -rf %(outdir)s''' % locals()
        return statement
def realignMatchedSample(infile, outfile):
    '''repeat realignments with a merged bam of control and tumour;
    this should help avoid problems with sample-specific realignments'''
    # Note: need to change readgroup headers for merge and subsequent
    # splitting of bam files

    to_cluster = USECLUSTER
    job_options = getGATKOptions()
    tmpdir_gatk = P.getTempDir('/ifs/scratch')
    threads = PARAMS["gatk_threads"]

    outfile_tumor = outfile.replace("Control", PARAMS["mutect_tumour"])
    infile_tumor = infile.replace("Control", PARAMS["mutect_tumour"])

    track = P.snip(os.path.basename(infile), ".bam")
    track_tumor = track.replace("Control", PARAMS["mutect_tumour"])
    library = PARAMS["readgroup_library"]
    platform = PARAMS["readgroup_platform"]
    platform_unit = PARAMS["readgroup_platform_unit"]

    statement = '''module unload apps/java/jre1.6.0_26; checkpoint;'''
    statement += '''AddOrReplaceReadGroups
                    INPUT=%(infile)s
                    OUTPUT=%(tmpdir_gatk)s/control.bam
                    RGLB=%(library)s RGPL=%(platform)s
                    RGPU=%(platform_unit)s RGSM=%(track)s
                    RGID=Control
                    VALIDATION_STRINGENCY=SILENT ;
                    checkpoint ;''' % locals()
    statement += '''samtools view -H %(tmpdir_gatk)s/control.bam
                    > %(tmpdir_gatk)s/header.sam;
                    samtools view -H %(infile_tumor)s | grep "^@RG"
                    >> %(tmpdir_gatk)s/header.sam;
                    samtools merge
                    -h %(tmpdir_gatk)s/header.sam
                    %(tmpdir_gatk)s/merged.bam
                    %(tmpdir_gatk)s/control.bam
                    %(infile_tumor)s
                    ; checkpoint ;''' % locals()
    statement += '''samtools index %(tmpdir_gatk)s/merged.bam;
                    checkpoint ;''' % locals()
    statement += '''java -Xmx4g -jar
                    /ifs/apps/bio/GATK-2.7-2/GenomeAnalysisTK.jar
                    -T RealignerTargetCreator
                    -o %(tmpdir_gatk)s/merged.indelrealignment.intervals
                    -R %%(bwa_index_dir)s/%%(genome)s.fa
                    -I %(tmpdir_gatk)s/merged.bam ;
                    checkpoint ;''' % locals()
    statement += '''java -Xmx4g -jar
                    /ifs/apps/bio/GATK-2.7-2/GenomeAnalysisTK.jar
                    -T IndelRealigner
                    -o %(tmpdir_gatk)s/merged.indelrealigned.bam
                    -R %%(bwa_index_dir)s/%%(genome)s.fa
                    -I %(tmpdir_gatk)s/merged.bam
                    -targetIntervals
                    %(tmpdir_gatk)s/merged.indelrealignment.intervals;
                    checkpoint ;''' % locals()
    statement += '''samtools view -hb
                    %(tmpdir_gatk)s/merged.indelrealigned.bam
                    -r Control > %(outfile)s;
                    samtools view -hb
                    %(tmpdir_gatk)s/merged.indelrealigned.bam
                    -r 1 > %(outfile_tumor)s;
                    samtools index %(outfile)s;
                    samtools index %(outfile_tumor)s;
                    checkpoint;''' % locals()
    statement += '''rm -rf %(tmpdir_gatk)s ;'''
    print(statement)
    P.run()
Example #34
def runMAST(infiles, outfile):
    '''run mast on all intervals and motifs.

    Collect all results for an E-value up to 10000 so that
    all sequences are output and MAST curves can be computed. 

    10000 is a heuristic.
    '''
    to_cluster = True

    # job_options = "-l mem_free=8000M"

    controlfile, dbfile, motiffiles = infiles

    if IOTools.isEmpty(dbfile):
        P.touch(outfile)
        return

    if not os.path.exists(controlfile):
        raise P.PipelineError(
            "control file %s for %s does not exist" % (controlfile, dbfile))

    # remove previous results
    if os.path.exists(outfile):
        os.remove(outfile)

    tmpdir = P.getTempDir(".")
    tmpfile = P.getTempFilename(".")

    for motiffile in motiffiles:
        if IOTools.isEmpty(motiffile):
            L.info("skipping empty motif file %s" % motiffile)
            continue

        of = IOTools.openFile(tmpfile, "a")
        motif, x = os.path.splitext(motiffile)
        of.write(":: motif = %s - foreground ::\n" % motif)
        of.close()

        # mast bails if the number of nucleotides gets larger than
        # 2186800982?
        # To avoid this, run db and control file separately.
        statement = '''
        cat %(dbfile)s 
        | mast %(motiffile)s - -nohtml -oc %(tmpdir)s -ev %(mast_evalue)f %(mast_options)s >> %(outfile)s.log 2>&1;
        cat %(tmpdir)s/mast.txt >> %(tmpfile)s 2>&1
        '''
        P.run()

        of = IOTools.openFile(tmpfile, "a")
        motif, x = os.path.splitext(motiffile)
        of.write(":: motif = %s - background ::\n" % motif)
        of.close()

        statement = '''
        cat %(controlfile)s 
        | mast %(motiffile)s - -nohtml -oc %(tmpdir)s -ev %(mast_evalue)f %(mast_options)s >> %(outfile)s.log 2>&1;
        cat %(tmpdir)s/mast.txt >> %(tmpfile)s 2>&1
        '''
        P.run()

    statement = "gzip < %(tmpfile)s > %(outfile)s"
    P.run()

    shutil.rmtree(tmpdir)
    os.unlink(tmpfile)
Example #35
def runFeatureCounts(annotations_file,
                     bamfile,
                     outfile,
                     nthreads=4,
                     strand=2,
                     options=""):
    '''run featureCounts on *annotations_file* with
    *bamfile*.
    
    If the bam-file is paired, paired-end counting
    is enabled and the bam file automatically sorted.
    '''

    # featureCounts cannot handle gzipped in or out files
    outfile = P.snip(outfile, ".gz")
    tmpdir = P.getTempDir()
    annotations_tmp = os.path.join(tmpdir,
                                   'geneset.gtf')
    bam_tmp = os.path.join(tmpdir,
                           os.path.basename(bamfile))

    # -p -B specifies count fragments rather than reads, and both
    # reads must map to the feature
    # for legacy reasons look at feature_counts_paired
    if BamTools.isPaired(bamfile):
        # select paired end mode, additional options
        paired_options = "-p -B"
        # remove .bam extension
        bam_prefix = P.snip(bam_tmp, ".bam")
        # sort by read name
        paired_processing = \
            """samtools 
                sort -@ %(nthreads)i -n %(bamfile)s %(bam_prefix)s; 
            checkpoint; """ % locals()
        bamfile = bam_tmp 
    else:
        paired_options = ""
        paired_processing = ""

    job_options = "-pe dedicated %i" % nthreads

    # AH: what is the -b option doing?
    statement = '''mkdir %(tmpdir)s;
                   zcat %(annotations_file)s > %(annotations_tmp)s;
                   checkpoint;
                   %(paired_processing)s
                   featureCounts %(options)s
                                 -T %(nthreads)i
                                 -s %(strand)s
                                 -b
                                 -a %(annotations_tmp)s
                                 %(paired_options)s
                                 -o %(outfile)s
                                 %(bamfile)s
                    >& %(outfile)s.log;
                    checkpoint;
                    gzip -f %(outfile)s;
                    checkpoint;
                    rm -rf %(tmpdir)s
    '''

    P.run()
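BamTools.isPaired above decides whether featureCounts runs in fragment mode. A minimal sketch of that kind of check, assuming pysam is available; the real BamTools implementation may differ.

import pysam

def isPaired(bamfile, nreads=1000):
    '''return True if any of the first *nreads* alignments carries
    the paired-end flag.'''
    samfile = pysam.AlignmentFile(bamfile, "rb")
    try:
        # until_eof=True avoids requiring a .bai index
        for i, read in enumerate(samfile.fetch(until_eof=True)):
            if i >= nreads:
                break
            if read.is_paired:
                return True
        return False
    finally:
        samfile.close()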
Beispiel #36
0
    def build(self, infile):
        '''
        build statement for running Ray
        '''
        track = self.getTrack(infile)
     
        format = self.getFormat(infile)
        paired = self.checkPairs(infile)

        tempdir = P.getTempDir()
        # check whether the data are paired-end
        if not paired:
            pair = paired
            files = os.path.join(tempdir, P.snip(infile, ".gz"))
            gunzy = "gunzip -c %(infile)s > %(files)s" % locals()
        else:
            pair = paired[0]
            # Ray doesn't like .fastq.1.gz etc
            read1 = infile
            read2 = paired[1]
            read1_new = os.path.join(tempdir,read1.replace(".fastq.1.gz", ".1.fastq"))
            read2_new = os.path.join(tempdir,read2.replace(".fastq.2.gz", ".2.fastq"))
            files = " ".join([read1_new, read2_new])
            gunzy = """gunzip -c %(read1)s > %(read1_new)s
                       ; gunzip -c %(read2)s > %(read2_new)s""" % locals()
             
        # Ray requires an output directory that does not
        # exist beforehand
        raydir = os.path.join(os.getcwd(), "ray.dir/export_%s" % track)
        raydir_orig = os.path.join(os.getcwd(), "ray.dir")
        
        # Ray detects the file type itself, so we only have to
        # say whether the input is paired or not
        # build statement
        common_options = "-k %(kmer)s"
        if pair == "interleaved":
            filetype = "-i"
        elif not pair:
            filetype = "-s"
        elif pair == "separate":
            filetype = "-p"
        else:
            raise IOError("do not support file of this type: %s" % infile)

        # note restrict use to 5 cores
        
        statement = ''' %(gunzy)s
                       ; mpiexec -n 5 %%(ray_executable)s %(common_options)s %(filetype)s %(files)s -o %(raydir)s
                       ; checkpoint; mv %(raydir)s/Scaffolds.fasta %(raydir_orig)s/%(track)s.scaffolds.fa
                       ; mv %(raydir)s/ScaffoldComponents.txt %(raydir_orig)s/%(track)s.scaffold_components.txt
                       ; mv %(raydir)s/ScaffoldLengths.txt %(raydir_orig)s/%(track)s.scaffold_lengths.txt
                       ; mv %(raydir)s/ScaffoldLinks.txt %(raydir_orig)s/%(track)s.scaffold_links.txt
                       ; mv %(raydir)s/Contigs.fasta %(raydir_orig)s/%(track)s.contigs.fa
                       ; mv %(raydir)s/OutputNumbers.txt %(raydir_orig)s/%(track)s.numbers.txt
                       ; mkdir -p %(raydir_orig)s/graph
                       ; mv %(raydir)s/CoverageDistribution.txt %(raydir_orig)s/graph/%(track)s.coverage_distribution.txt
                       ; mv %(raydir)s/CoverageDistributionAnalysis.txt %(raydir_orig)s/graph/%(track)s.coverage_distribution_analysis.txt
                       ; mv %(raydir)s/degreeDistribution.txt %(raydir_orig)s/graph/%(track)s.degree_distribution.txt
                       ; mv %(raydir)s/Kmers.txt %(raydir_orig)s/graph/%(track)s.kmers.txt
                       ; mkdir -p %(raydir_orig)s/assembly
                       ; mv %(raydir)s/SeedLengthDistribution.txt %(raydir_orig)s/assembly/%(track)s.seed_length_distribution.txt
                       ; mv %(raydir)s/LibraryStatistics.txt %(raydir_orig)s/%(track)s.library_statistics.txt
                       ; mv %(raydir)s/LibraryData.xml %(raydir_orig)s/%(track)s.library_data.xml 
                       ; rm -rf %(tempdir)s''' % locals()
        return statement
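The builders above rely on two-stage string substitution: %(var)s placeholders are filled immediately from locals(), while escaped %%(param)s placeholders survive the first pass and are resolved later, when P.run() interpolates the pipeline configuration. A self-contained sketch of the mechanism, with a made-up PARAMS dict standing in for that configuration:

PARAMS = {"ray_executable": "Ray"}  # stand-in for the pipeline configuration

def buildStatement(files, raydir, kmer=31):
    common_options = "-k %(kmer)s" % locals()
    # first pass fills locals; "%%" collapses to "%", deferring
    # %(ray_executable)s to the second pass
    statement = ("%%(ray_executable)s %(common_options)s "
                 "-p %(files)s -o %(raydir)s") % locals()
    # second pass is what P.run() would do with the pipeline parameters
    return statement % PARAMS

print(buildStatement("a.1.fastq a.2.fastq", "ray.dir/export_track"))
# Ray -k 31 -p a.1.fastq a.2.fastq -o ray.dir/export_track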
Beispiel #37
0
def buildCodingPotential(infile, outfile):
    '''run CPC analysis as in the cpc script.

    This module runs framefinder and blastx on both strands.
    It seems to work, but I have not thoroughly tested it.
    I expect that the false positive rate increases (i.e.,
    predicting non-coding as coding) in cases where the best 
    framefinder match and the best blast match are on opposite
    strands. In the original CPC, these would be separated.
    '''

    try:
        cpc_dir = os.environ["CPC_HOME"]
    except KeyError:
        raise ValueError("CPC_HOME environment variable is not set. ")

    tmpdir = P.getTempDir(".")
    track = P.snip(outfile, ".coding.gz")

    # extract features for frame finder
    # replaces extract_framefinder_feats.pl to parse both strands
    with open(os.path.join(tmpdir, "ff.feat"), "w") as outf:
        outf.write("\t".join(("QueryID", "CDSLength", "Score", "Used",
                              "Strict")) + "\n")
        for line in IOTools.openFile("%s.frame.gz" % track):
            if line.startswith(">"):
                try:
                    (id, start, end, score, used, mode, tpe) = \
                        re.match(
                            ">(\S+).*framefinder \((\d+),(\d+)\) score=(\S+) used=(\S+)% \{(\S+),(\w+)\}", line).groups()
                except AttributeError:
                    raise ValueError("parsing error in line %s" % line)
                length = int(end) - int(start) + 1
                strict = int(tpe == "strict")
                outf.write("\t".join((id, str(length), score, used,
                                      str(strict))) + "\n")

    to_cluster = USECLUSTER

    # extract features and prepare svm data
    s = []

    s.append('''
    zcat %(infile)s
    | perl %(cpc_dir)s/libs/blast2table.pl 
    | tee %(tmpdir)s/blastx.table
    | perl %(cpc_dir)s/bin/extract_blastx_features.pl
    > %(tmpdir)s/blastx.feat1;
    ''')

    s.append('''
    cat %(track)s_norepeats.fasta 
    | perl %(cpc_dir)s/bin/add_missing_entries.pl
       %(tmpdir)s/blastx.feat1 
    > %(tmpdir)s/blastx.feat;
    ''')

    # step 2 - prepare data
    s.append('''
    perl %(cpc_dir)s/bin/feat2libsvm.pl -c 2,4,6 NA NA %(tmpdir)s/blastx.feat 
    > %(tmpdir)s/blastx.lsv;
    ''')

    s.append('''
    perl %(cpc_dir)s/bin/feat2libsvm.pl -c 2,3,4,5 NA NA %(tmpdir)s/ff.feat 
    > %(tmpdir)s/ff.lsv;
    ''')

    s.append('''
    perl -w %(cpc_dir)s/bin/lsv_cbind.pl %(tmpdir)s/blastx.lsv %(tmpdir)s/ff.lsv 
    > %(tmpdir)s/test.lsv;
    ''')

    s.append('''
    %(cpc_dir)s/libs/libsvm/libsvm-2.81/svm-scale 
               -r %(cpc_dir)s/data/libsvm.range  
               %(tmpdir)s/test.lsv 
    > %(tmpdir)s/test.lsv.scaled;
    ''')

    # step 3: prediction
    m_libsvm_model0 = os.path.join(cpc_dir, "data/libsvm.model0")  # standard
    m_libsvm_model = os.path.join(cpc_dir, "data/libsvm.model")  # Prob
    m_libsvm_model2 = os.path.join(
        cpc_dir, "data/libsvm.model2")  # Prob + weighted version
    m_libsvm_range = os.path.join(cpc_dir, "data/libsvm.range")

    s.append('''
               %(cpc_dir)s/libs/libsvm/libsvm-2.81/svm-predict2
               %(tmpdir)s/test.lsv.scaled 
               %(m_libsvm_model0)s 
               %(tmpdir)s/test.svm0.predict 
    > %(tmpdir)s/test.svm0.stdout 2> %(tmpdir)s/test.svm0.stderr;
    ''')

    s.append('''
    printf "gene_id\\tlength\\tresult\\tvalue\\n" 
    | gzip > %(outfile)s;
    cat %(tmpdir)s/test.svm0.predict  
    | perl -w %(cpc_dir)s/bin/predict.pl %(track)s_norepeats.fasta 
    | gzip >> %(outfile)s;
    ''')

    # generate reports
    s.append('''cat %(tmpdir)s/blastx.feat
    | perl -w %(cpc_dir)s/bin/generate_plot_features.pl %(tmpdir)s/blastx.table <( zcat %(track)s.frame.gz)
    | perl -w %(cpc_dir)s/bin/split_plot_features_by_type.pl %(outfile)s.homology %(outfile)s.orf;
    gzip %(outfile)s.orf %(outfile)s.homology;
    ''')

    # now run it all
    statement = " checkpoint; ".join(s)
    P.run()

    # clean up
    shutil.rmtree(tmpdir)
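The framefinder header regex above is dense, so here is a worked example on an invented header line showing what each group captures:

import re

header = (">chr1_1234 test framefinder (1,255) score=12.34 "
          "used=87.5% {forward,strict}")
match = re.match(
    r">(\S+).*framefinder \((\d+),(\d+)\) score=(\S+) used=(\S+)% \{(\S+),(\w+)\}",
    header)
id, start, end, score, used, mode, tpe = match.groups()
print(id, int(end) - int(start) + 1, score, used, tpe == "strict")
# chr1_1234 255 12.34 87.5 True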
Beispiel #38
0
def filterBamfiles(infile, sentinel):
    """
    Pre-process bamfiles prior to peak calling.
    i) sort bamfiles
    ii) remove unmapped reads with bam2bam.py
    iii) remove non-uniquely mapping reads with bam2bam.py (optional)
    iv) remove duplicates with Picard's MarkDuplicates (optional)
    v) remove reads from masked regions with bedtools intersect (optional)
    vi) index
    """

    # create temporary directory for Picard's MarkDuplicates
    picard_tmp = P.getTempDir(PARAMS["scratchdir"])

    outfile = P.snip(sentinel, ".sentinel") + ".bam"

    # ensure bamfile is sorted,
    statement = ["samtools sort @IN@ @OUT@", ]

    # remove unmapped reads
    statement.append("python %(scriptsdir)s/bam2bam.py"
                     " --filter=mapped"
                     " --log=%(outfile)s.log"
                     " < @[email protected]"
                     " > @OUT@")

    # remove non-uniquely mapping reads, if requested
    if PARAMS["filter_remove_non_unique"]:
        statement.append("python %(scriptsdir)s/bam2bam.py"
                         " --filter=unique"
                         " --log=%(outfile)s.log"
                         " < @IN@"
                         " > @OUT@")

    # remove duplicates, if requested
    if PARAMS["filter_remove_duplicates"]:
        statement.append("MarkDuplicates"
                         " INPUT=@IN@"
                         " ASSUME_SORTED=true"
                         " REMOVE_DUPLICATES=true"
                         " QUIET=false"
                         " OUTPUT=@OUT@"
                         " METRICS_FILE=/dev/null"
                         " VALIDATION_STRINGENCY=SILENT"
                         " TMP_DIR=%(picard_tmp)s"
                         " 2> %(outfile)s.log")

    # mask regions, if intervals supplied
    if PARAMS["filter_mask_intervals"]:
        mask = PARAMS["filter_mask_intervals"]
        statement.append("bedtools intersect"
                         " -abam @IN@"
                         " -b %(mask)s"
                         " -wa"
                         " -v"
                         " > @OUT@")

    statement.append("mv @IN@ %(outfile)s")
    statement.append("samtools index %(outfile)s")

    job_options = "-l mem_free=10G"
    statement = P.joinStatements(statement, infile)

    P.run()
    P.touch(sentinel)
    shutil.rmtree(picard_tmp)
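P.joinStatements chains the @IN@/@OUT@ placeholders above through intermediate files, so each command consumes the previous command's output. The following is only a sketch of the idea under invented temp-file naming, not the actual implementation:

def joinStatementsSketch(statements, infile):
    '''chain *statements* through intermediate files: @IN@ of the first
    statement is *infile*, @OUT@ of statement i feeds @IN@ of i+1.'''
    prefix = "tmp_chain"  # invented naming; the real helper differs
    parts = []
    for i, stmt in enumerate(statements):
        inname = infile if i == 0 else "%s_%i" % (prefix, i - 1)
        outname = "%s_%i" % (prefix, i)
        # plain substring replacement also handles suffixed forms
        # such as "@[email protected]" -> "tmp_chain_0.bam"
        parts.append(stmt.replace("@IN@", inname).replace("@OUT@", outname))
    parts.append("rm -f %s_*" % prefix)
    return "; checkpoint; ".join(parts)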
Beispiel #39
0
    def build(self, infile):
        '''
        build statement for running Ray
        '''
        track = self.getTrack(os.path.basename(infile))

        format = self.getFormat(infile)
        paired = self.checkPairs(infile)

        tempdir = P.getTempDir(dir=".")

        # check whether the data are paired-end
        if not paired:
            pair = paired
            files = os.path.join(
                tempdir, P.snip(os.path.basename(infile), ".gz"))
            gunzy = "gunzip -c %(infile)s > %(files)s" % locals()
        else:
            pair = paired[0]
            # Ray doesn't like .fastq.1.gz etc
            read1 = infile
            read2 = paired[1]
            read1_new = os.path.join(
                tempdir, read1.replace(".fastq.1.gz", ".1.fastq"))
            read2_new = os.path.join(
                tempdir, read2.replace(".fastq.2.gz", ".2.fastq"))
            files = " ".join([read1_new, read2_new])
            gunzy = """gunzip -c %(read1)s > %(read1_new)s
                       ; gunzip -c %(read2)s > %(read2_new)s""" % locals()

        # Ray requires an output directory that does not
        # exist beforehand
        raydir = os.path.join(os.getcwd(), "ray.dir/export_%s" % track)
        raydir_orig = os.path.join(os.getcwd(), "ray.dir")

        # Ray detects the file type itself, so we only have to
        # say whether the input is paired or not
        # build statement
        common_options = "-k %(kmer)s"
        if pair == "interleaved":
            filetype = "-i"
        elif not pair:
            filetype = "-s"
        elif pair == "separate":
            filetype = "-p"
        else:
            raise IOError("do not support file of this type: %s" % infile)

        # note restrict use to 10 cores

        statement = ''' %(gunzy)s
                       ; mpiexec %%(ray_executable)s %(common_options)s %(filetype)s %(files)s -o %(raydir)s >> %(raydir_orig)s/%(track)s.log
                       ; checkpoint; mv %(raydir)s/Scaffolds.fasta %(raydir_orig)s/%(track)s.scaffolds.fa
                       ; mv %(raydir)s/ScaffoldComponents.txt %(raydir_orig)s/%(track)s.scaffold_components.txt
                       ; mv %(raydir)s/ScaffoldLengths.txt %(raydir_orig)s/%(track)s.scaffold_lengths.txt
                       ; mv %(raydir)s/ScaffoldLinks.txt %(raydir_orig)s/%(track)s.scaffold_links.txt
                       ; cat %(raydir)s/Contigs.fasta | python %%(scriptsdir)s/rename_contigs.py -a ray --log=%(raydir_orig)s/%(track)s.contigs.log
                         > %(raydir_orig)s/%(track)s.contigs.fa
                       ; mv %(raydir)s/OutputNumbers.txt %(raydir_orig)s/%(track)s.numbers.txt
                       ; mkdir -p %(raydir_orig)s/graph
                       ; mv %(raydir)s/CoverageDistribution.txt %(raydir_orig)s/graph/%(track)s.coverage_distribution.txt
                       ; mv %(raydir)s/CoverageDistributionAnalysis.txt %(raydir_orig)s/graph/%(track)s.coverage_distribution_analysis.txt
                       ; mv %(raydir)s/degreeDistribution.txt %(raydir_orig)s/graph/%(track)s.degree_distribution.txt
                       ; mv %(raydir)s/Kmers.txt %(raydir_orig)s/graph/%(track)s.kmers.txt
                       ; mkdir -p %(raydir_orig)s/assembly
                       ; mv %(raydir)s/SeedLengthDistribution.txt %(raydir_orig)s/assembly/%(track)s.seed_length_distribution.txt
                       ; mv %(raydir)s/LibraryStatistics.txt %(raydir_orig)s/%(track)s.library_statistics.txt
                       ; mv %(raydir)s/LibraryData.xml %(raydir_orig)s/%(track)s.library_data.xml 
                       ; rm -rf %(tempdir)s''' % locals()

        return statement
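checkPairs, used by the Ray and SGA builders, maps the .fastq.1.gz naming convention onto the mate file. A minimal sketch under that assumption; the real helper also recognises interleaved input, which is omitted here:

import os

def checkPairsSketch(infile):
    '''return False for single-end input, or ("separate", matefile)
    for a .fastq.1.gz / .fastq.2.gz pair.'''
    if infile.endswith(".fastq.1.gz"):
        mate = infile[:-len(".fastq.1.gz")] + ".fastq.2.gz"
        if not os.path.exists(mate):
            raise IOError("mate file %s missing for %s" % (mate, infile))
        return ("separate", mate)
    return False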
Beispiel #40
0
def buildCodingPotential( infile, outfile ):
    '''run CPC analysis as in the cpc script.

    This module runs framefinder and blastx on both strands.
    It seems to work, but I have not thoroughly tested it.
    I expect that the false positive rate increases (i.e.,
    predicting non-coding as coding) in cases where the best 
    framefinder match and the best blast match are on opposite
    strands. In the original CPC, these would be separated.
    '''

    try:
        cpc_dir = os.environ["CPC_HOME"]
    except KeyError:
        raise ValueError("CPC_HOME environment variable is not set. ")

    tmpdir = P.getTempDir( ".")
    track = P.snip( outfile, ".coding.gz" )

    # extract features for frame finder
    # replaces extract_framefinder_feats.pl to parse both strands
    with open( os.path.join(tmpdir, "ff.feat"), "w") as outf:
        outf.write( "\t".join(("QueryID", "CDSLength", "Score", "Used", "Strict")) + "\n")
        for line in IOTools.openFile( "%s.frame.gz" % track ):
            if line.startswith(">"):
                try:
                    ( id, start, end, score, used, mode, tpe) = \
                        re.match(
                        ">(\S+).*framefinder \((\d+),(\d+)\) score=(\S+) used=(\S+)% \{(\S+),(\w+)\}", line ).groups()
                except AttributeError:
                    raise ValueError( "parsing error in line %s" % line )
                length = int(end) - int(start) + 1
                strict = int(tpe == "strict")
                outf.write( "\t".join( (id, str(length), score, used, str(strict)) ) + "\n")

    to_cluster = USECLUSTER

    # extract features and prepare svm data
    s = []
            
    s.append( '''
    zcat %(infile)s
    | perl %(cpc_dir)s/libs/blast2table.pl 
    | tee %(tmpdir)s/blastx.table
    | perl %(cpc_dir)s/bin/extract_blastx_features.pl
    > %(tmpdir)s/blastx.feat1;
    ''' )

    s.append( '''
    cat %(track)s_norepeats.fasta 
    | perl %(cpc_dir)s/bin/add_missing_entries.pl
       %(tmpdir)s/blastx.feat1 
    > %(tmpdir)s/blastx.feat;
    ''')

    # step 2 - prepare data
    s.append( '''
    perl %(cpc_dir)s/bin/feat2libsvm.pl -c 2,4,6 NA NA %(tmpdir)s/blastx.feat 
    > %(tmpdir)s/blastx.lsv;
    ''' )
    
    s.append( '''
    perl %(cpc_dir)s/bin/feat2libsvm.pl -c 2,3,4,5 NA NA %(tmpdir)s/ff.feat 
    > %(tmpdir)s/ff.lsv;
    ''' )

    s.append( '''
    perl -w %(cpc_dir)s/bin/lsv_cbind.pl %(tmpdir)s/blastx.lsv %(tmpdir)s/ff.lsv 
    > %(tmpdir)s/test.lsv;
    ''' )

    s.append( '''
    %(cpc_dir)s/libs/libsvm/libsvm-2.81/svm-scale 
               -r %(cpc_dir)s/data/libsvm.range  
               %(tmpdir)s/test.lsv 
    > %(tmpdir)s/test.lsv.scaled;
    ''' )
    
    # step 3: prediction
    m_libsvm_model0 = os.path.join( cpc_dir, "data/libsvm.model0" )  # standard
    m_libsvm_model = os.path.join( cpc_dir, "data/libsvm.model" )  # Prob
    m_libsvm_model2 = os.path.join( cpc_dir, "data/libsvm.model2" )  # Prob + weighted version
    m_libsvm_range = os.path.join( cpc_dir, "data/libsvm.range" )

    s.append( '''
               %(cpc_dir)s/libs/libsvm/libsvm-2.81/svm-predict2
               %(tmpdir)s/test.lsv.scaled 
               %(m_libsvm_model0)s 
               %(tmpdir)s/test.svm0.predict 
    > %(tmpdir)s/test.svm0.stdout 2> %(tmpdir)s/test.svm0.stderr;
    ''' )

    s.append( '''
    printf "gene_id\\tlength\\tresult\\tvalue\\n" 
    | gzip > %(outfile)s;
    cat %(tmpdir)s/test.svm0.predict  
    | perl -w %(cpc_dir)s/bin/predict.pl %(track)s_norepeats.fasta 
    | gzip >> %(outfile)s;
    ''' )

    # generate reports
    s.append( '''cat %(tmpdir)s/blastx.feat
    | perl -w %(cpc_dir)s/bin/generate_plot_features.pl %(tmpdir)s/blastx.table <( zcat %(track)s.frame.gz) 
    | perl -w %(cpc_dir)s/bin/split_plot_features_by_type.pl %(outfile)s.homology %(outfile)s.orf;
    gzip %(outfile)s.orf %(outfile)s.homology;
    ''' )

    # now run it all
    statement = " checkpoint; ".join( s )
    P.run()

    # clean up
    shutil.rmtree( tmpdir )
Beispiel #41
0
    def preprocess( self, infiles, outfile ):
        '''build preprocessing statement

        Build a command line statement that extracts/converts 
        various input formats to fastq formatted files.

        Quality scores are converted to sanger format where necessary.

        returns the statement and the fastq files to map.
        '''

        assert len(infiles) > 0, "no input files for mapping"

        tmpdir_fastq = P.getTempDir()

        # create temporary directory again for nodes
        statement = [ "mkdir -p %s" % tmpdir_fastq ]
        fastqfiles = []

        # get track by extension of outfile
        track = os.path.splitext( os.path.basename( outfile ) )[0]

        if self.compress:
            compress_cmd = "| gzip"
            extension = ".gz"
        else:
            compress_cmd = ""
            extension = ""

        for infile in infiles:

            if infile.endswith( ".export.txt.gz"):
                # single end illumina export
                statement.append( """gunzip < %(infile)s 
                     | awk '$11 != "QC" || $10 ~ /(\d+):(\d+):(\d+)/ \
                        { if ($1 != "") 
                             { readname=sprintf( "%%%%s_%%%%s:%%%%s:%%%%s:%%%%s:%%%%s", $1,$2,$3,$4,$5,$6);}
                        else { readname=sprintf( "%%%%s:%%%%s:%%%%s:%%%%s:%%%%s", $1,$3,$4,$5,$6); }
                       printf("@%%%%s\\n%%%%s\\n+\\n%%%%s\\n",readname,$9,$10);}'
                     %(compress_cmd)s
                     > %(tmpdir_fastq)s/%(track)s.fastq%(extension)s""" % locals() )
                fastqfiles.append( ("%s/%s.fastq%s" % (tmpdir_fastq, track, extension ),) )
            elif infile.endswith( ".fa.gz" ):
                statement.append( '''gunzip < %(infile)s > %(tmpdir_fastq)s/%(track)s.fa''' % locals() )
                fastqfiles.append( ("%s/%s.fa" % (tmpdir_fastq, track ),) )
                self.datatype = "fasta"
                
            elif infile.endswith( ".sra"):
                # sneak preview to determine if paired end or single end
                outdir = P.getTempDir()
                # --split-files is present in fastq-dump 2.1.7
                P.execute( "fastq-dump --split-files --gzip -X 1000 --outdir %(outdir)s %(infile)s" % locals() )
                # --split-files will create files called prefix_#.fastq.gz
                # where # is the read number. 
                # The following cases are:

                # * file contains paired end data: output = prefix_1.fastq.gz, prefix_2.fastq.gz
                #    * special case: unpaired reads in a paired end run end up in prefix.fastq.gz
                #    * special case: if paired reads are stored in a single read, fastq-dump will split.
                #       There might be a joining sequence. The output would thus be:
                #       prefix_1.fastq.gz, prefix_2.fastq.gz and prefix_3.fastq.gz
                #      You want files 1 and 3.
                f = sorted(glob.glob( os.path.join( outdir, "*.fastq.gz" ) ))
                ff = [ os.path.basename(x) for x in f ]
                if len(f) == 1: 
                    # sra file contains one read: output = prefix.fastq.gz
                    pass
                elif len(f) == 2:
                    # sra file contains read pairs: output = prefix_1.fastq.gz, prefix_2.fastq.gz
                    assert ff[0].endswith( "_1.fastq.gz") and ff[1].endswith( "_2.fastq.gz" )
                elif len(f) == 3:
                    if ff[2].endswith( "_3.fastq.gz"):
                        f = glob.glob( os.path.join( outdir, "*_[13].fastq.gz" ) )
                    else:
                        f = glob.glob( os.path.join( outdir, "*_[12].fastq.gz" ) )
                E.info("sra file contains the following files: %s" % f )
                shutil.rmtree( outdir )
                fastqfiles.append( [ "%s/%s" % (tmpdir_fastq, os.path.basename( x )) for x in sorted(f) ] )
                statement.append( "fastq-dump --split-files --gzip --outdir %(tmpdir_fastq)s %(infile)s" % locals() )
                
            elif infile.endswith( ".fastq.gz" ):
                format = Fastq.guessFormat( IOTools.openFile( infile, "r"), raises = False)
                if 'sanger' not in format and self.convert:
                    statement.append(  """gunzip < %(infile)s 
                                      | python %%(scriptsdir)s/fastq2fastq.py --change-format=sanger --guess-format=phred64 --log=%(outfile)s.log
                                      %(compress_cmd)s
                                      > %(tmpdir_fastq)s/%(track)s.fastq%(extension)s""" % locals() )
                    fastqfiles.append( ("%s/%s.fastq%s" % (tmpdir_fastq, track, extension),) )
                else:
                    E.debug( "%s: assuming quality score format %s" % (infile, format ) ) 
                    fastqfiles.append( (infile, ) )

            elif infile.endswith( ".csfasta.gz" ):
                # single end SOLiD data
                if self.preserve_colourspace:
                    quality = P.snip( infile, ".csfasta.gz" ) + ".qual.gz"
                    if not os.path.exists( quality ):
                        raise ValueError( "no quality file for %s" % infile )
                    statement.append(  """gunzip < %(infile)s 
                                          > %(tmpdir_fastq)s/%(track)s.csfasta%(extension)s""" % locals() )
                    statement.append(  """gunzip < %(quality)s 
                                          > %(tmpdir_fastq)s/%(track)s.qual%(extension)s""" % locals() )
                    fastqfiles.append( ("%s/%s.csfasta%s" % (tmpdir_fastq, track, extension ),
                                        "%s/%s.qual%s" % (tmpdir_fastq, track, extension) ) )
                    self.datatype = "solid"
                else:
                    quality = P.snip( infile, ".csfasta.gz" ) + ".qual.gz"

                    statement.append( """solid2fastq <(gunzip < %(infile)s) <(gunzip < %(quality)s)
                                      %(compress_cmd)s
                                      > %(tmpdir_fastq)s/%(track)s.fastq%(extension)s""" % locals() )
                    fastqfiles.append( ("%s/%s.fastq%s" % (tmpdir_fastq, track, extension),) )

            elif infile.endswith( ".csfasta.F3.gz" ):
                # paired end SOLiD data
                if self.preserve_colourspace:
                    bn = P.snip( infile, ".csfasta.F3.gz" )
                    # order is important - mirrors tophat reads followed by quals
                    f = []
                    for suffix in ("csfasta.F3", "csfasta.F5", "qual.F3", "qual.F5" ):
                        fn = "%(bn)s.%(suffix)s" % locals()
                        if not os.path.exists( fn + ".gz"): raise ValueError( "expected file %s.gz missing" % fn )
                        statement.append( """gunzip < %(fn)s.gz
                                          %(compress_cmd)s
                                          > %(tmpdir_fastq)s/%(track)s.%(suffix)s%(extension)s""" % locals() )
                        f.append( "%(tmpdir_fastq)s/%(track)s.%(suffix)s%(extension)s" % locals() )
                    fastqfiles.append( f )
                    self.datatype = "solid"
                else:
                    quality = P.snip( infile, ".csfasta.F3.gz" ) + ".qual.F3.gz"

                    statement.append( """solid2fastq <(gunzip < %(infile)s) <(gunzip < %(quality)s)
                                      %(compress_cmd)s
                                      > %(tmpdir_fastq)s/%(track)s.fastq%(extension)s""" % locals() )
                    fastqfiles.append( ("%s/%s.fastq%s" % (tmpdir_fastq, track, extension),) )
                

            elif infile.endswith( ".fastq.1.gz" ):

                bn = P.snip( infile, ".fastq.1.gz" )
                infile2 = "%s.fastq.2.gz" % bn
                if not os.path.exists( infile2 ):
                    raise ValueError("can not find paired ended file '%s' for '%s'" % (infile2, infile))
                
                format = Fastq.guessFormat( IOTools.openFile( infile ), raises = False )
                if 'sanger' not in format:
                    statement.append( """gunzip < %(infile)s 
                                     | python %%(scriptsdir)s/fastq2fastq.py --change-format=sanger --guess-format=phred64 --log=%(outfile)s.log
                                     %(compress_cmd)s
                                     > %(tmpdir_fastq)s/%(track)s.1.fastq%(extension)s;
                                     gunzip < %(infile2)s 
                                     | python %%(scriptsdir)s/fastq2fastq.py --change-format=sanger --guess-format=phred64 --log=%(outfile)s.log
                                     %(compress_cmd)s
                                     > %(tmpdir_fastq)s/%(track)s.2.fastq%(extension)s
                                 """ % locals() )
                    fastqfiles.append( ("%s/%s.1.fastq%s" % (tmpdir_fastq, track, extension),
                                        "%s/%s.2.fastq%s" % (tmpdir_fastq, track, extension) ) )

                else:
                    E.debug( "%s: assuming quality score format %s" % (infile, format ) ) 
                    fastqfiles.append( (infile, 
                                        infile2, ) )
                    
            else:
                raise NotImplementedError( "unknown file format %s" % infile )

        
        self.tmpdir_fastq = tmpdir_fastq

        assert len(fastqfiles) > 0, "no fastq files for mapping"

        return "; ".join( statement) + ";", fastqfiles
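The fastq-dump case analysis in the comments above is easiest to follow on concrete names. A small, self-contained rendering of the same selection logic, with invented filenames:

import os
import re

def selectSraReads(files):
    '''pick the fastq files to map from fastq-dump --split-files output.'''
    ff = sorted(os.path.basename(x) for x in files)
    if len(ff) <= 2:
        # single-end, or a plain _1/_2 pair: map everything
        return ff
    if len(ff) == 3 and ff[2].endswith("_3.fastq.gz"):
        # pairs stored in a single read with a joining sequence:
        # keep _1 and _3, drop the joiner in _2
        return [x for x in ff if re.search(r"_[13]\.fastq\.gz$", x)]
    raise ValueError("unexpected fastq-dump output: %s" % ff)

print(selectSraReads(["SRR1_1.fastq.gz", "SRR1_2.fastq.gz", "SRR1_3.fastq.gz"]))
# ['SRR1_1.fastq.gz', 'SRR1_3.fastq.gz']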
Beispiel #42
0
def GATKpreprocessing(infile, outfile):
    '''Reorders BAM according to reference fasta and adds read groups using
       Picard, realigns around indels and recalibrates base quality scores
       using GATK'''
    to_cluster = USECLUSTER
    track = P.snip(os.path.basename(infile), ".bam")
    tmpdir_gatk = P.getTempDir('/ifs/scratch')
    job_options = getGATKOptions()
    library = PARAMS["readgroup_library"]
    platform = PARAMS["readgroup_platform"]
    platform_unit = PARAMS["readgroup_platform_unit"]
    threads = PARAMS["gatk_threads"]
    dbsnp = PARAMS["gatk_dbsnp"]
    solid_options = PARAMS["gatk_solid_options"]

    # need to unload java before running GATK as it now runs on java version 7
    # full path to .jar file being specified as using module "GenomeAnalysisTK"
    # resulted in error: "Could not find the main class:
    # org.broadinstitute.sting.gatk.CommandLineGATK. Program will exit."
    # This error is seen when java version 6 is used
    # Find out why this error occurs when not specifying full path

    statement = '''module unload apps/java/jre1.6.0_26; checkpoint;'''
    statement += '''ReorderSam INPUT=%(infile)s
                    OUTPUT=%(tmpdir_gatk)s/%(track)s.reordered.bam
                    REFERENCE=%%(bwa_index_dir)s/%%(genome)s.fa
                    ALLOW_INCOMPLETE_DICT_CONCORDANCE=true
                    VALIDATION_STRINGENCY=SILENT ;
                    checkpoint ;''' % locals()
    statement += '''samtools index %(tmpdir_gatk)s/%(track)s.reordered.bam ;
                    checkpoint ;''' % locals()
    statement += '''AddOrReplaceReadGroups
                    INPUT=%(tmpdir_gatk)s/%(track)s.reordered.bam
                    OUTPUT=%(tmpdir_gatk)s/%(track)s.readgroups.bam
                    RGLB=%(library)s RGPL=%(platform)s
                    RGPU=%(platform_unit)s RGSM=%(track)s
                    VALIDATION_STRINGENCY=SILENT ;
                    checkpoint ;''' % locals()
    statement += '''samtools index %(tmpdir_gatk)s/%(track)s.readgroups.bam ;
                    checkpoint ;''' % locals()
    statement += '''java -Xmx4g -jar
                    /ifs/apps/bio/GATK-2.7-2/GenomeAnalysisTK.jar
                    -T RealignerTargetCreator
                    -o %(tmpdir_gatk)s/%(track)s.indelrealignment.intervals
                    -R %%(bwa_index_dir)s/%%(genome)s.fa
                    -I %(tmpdir_gatk)s/%(track)s.readgroups.bam ;
                    checkpoint ;''' % locals()
    statement += '''java -Xmx4g -jar
                    /ifs/apps/bio/GATK-2.7-2/GenomeAnalysisTK.jar
                    -T IndelRealigner
                    -o %(tmpdir_gatk)s/%(track)s.indelrealigned.bam
                    -R %%(bwa_index_dir)s/%%(genome)s.fa
                    -I %(tmpdir_gatk)s/%(track)s.readgroups.bam
                    -targetIntervals %(tmpdir_gatk)s/%(track)s.indelrealignment.intervals ;
                    checkpoint ;''' % locals()
    statement += '''java -Xmx4g -jar
                    /ifs/apps/bio/GATK-2.7-2/GenomeAnalysisTK.jar
                    -T BaseRecalibrator
                    --out %(tmpdir_gatk)s/%(track)s.recal.grp
                    -R %%(bwa_index_dir)s/%%(genome)s.fa
                    -I %(tmpdir_gatk)s/%(track)s.indelrealigned.bam
                    --knownSites %(dbsnp)s %(solid_options)s ;
                    checkpoint ;''' % locals()
    statement += '''java -Xmx4g -jar
                    /ifs/apps/bio/GATK-2.7-2/GenomeAnalysisTK.jar
                    -T PrintReads -o %(outfile)s
                    -BQSR %(tmpdir_gatk)s/%(track)s.recal.grp
                    -R %%(bwa_index_dir)s/%%(genome)s.fa
                    -I %(tmpdir_gatk)s/%(track)s.indelrealigned.bam ;
                    checkpoint ;''' % locals()
    statement += '''rm -rf %(tmpdir_gatk)s ;'''
    P.run()
Beispiel #43
0
    def build(self, infile):
        '''
        build statement for running SGA
        '''
        track = self.getTrack(os.path.basename(infile))

        # decide which algorithm to use based on
        # read length
        if PARAMS["sga_long"]:
            index_algorithm = "sais"
        else:
            index_algorithm = "ropebwt"

        format = self.getFormat(infile)
        paired = self.checkPairs(infile)

        # directory in which to do the assembly
        tempdir = P.getTempDir(dir=".")

        # check whether the data are paired-end
        if not paired:
            pe_mode = "--pe-mode=0"
            files = os.path.abspath(infile)
        else:
            # DOESN'T DEAL WITH INTERLEAVED FILES YET
            pe_mode = "--pe-mode=1"
            files = " ".join([os.path.abspath(infile), os.path.abspath(paired[1])])

        executable = "%(sga_executable)s"

        outdir = os.path.abspath("sga.dir")
        ###############################################
        # preprocessing step converts missing bases to
        # random bases or removes sequences with
        # missing bases
        ###############################################
        preprocess_options = "%(sga_preprocess_options)s"
        # outputs a merged fastq file
        outf_preprocessed = track + ".fastq"
        preprocess_statement = "cd %(tempdir)s; %(executable)s preprocess %(pe_mode)s %(preprocess_options)s %(files)s \
                                 -o %(outf_preprocessed)s 2> %(outdir)s/%(track)s_preprocess.log"

        ###############################################
        # indexing reads with FM index
        ###############################################
        index_options = "%(sga_index_options)s"
        index_statement = "%(executable)s index --algorithm=%(index_algorithm)s \
                           %(outf_preprocessed)s 2> %(outdir)s/%(track)s_index.log"

        ###############################################
        # correct sequencing errors in reads
        ###############################################
        correction_method = "%(sga_correction_method)s"

        # if correction_method == "kmer":
        # ADD WARNING HERE
        correction_options = "%(sga_kmer_correction_options)s"
        # elif correction_method == "hybrid":
        #     correction_options = "%(sga_hybrid_correction_options)s"
        # elif correction_method == "overlap":
        #     correction_options = "%(sga_overlap_correction_options)s"
        # else:
        #     raise ValueError("method %s does not exist: choose one of kmer, hybrid, overlap" % correction_method)
        outf_corrected = track + "_corrected.fa"
        metrics = "--metrics=%(track)s.metrics" % locals()
        correction_prefix = os.path.join(
            tempdir, P.snip(outf_corrected, ".fa"))
        correction_statement = "%(executable)s correct %(metrics)s  \
                                --algorithm=%(correction_method)s \
                                %(correction_options)s \
                                %(outf_preprocessed)s \
                                -o %(outf_corrected)s 2> %(outdir)s/%(track)s_corrected.log"

        ###############################################
        # filter low quality reads and low abundance
        # kmers
        ###############################################
        filter_options = "%(sga_filter_options)s"
        outf_filtered = track + "_filtered.fa"
        filter_statement = "sga index %(outf_corrected)s; \
                            %(executable)s filter %(filter_options)s  \
                            -o %(outf_filtered)s \
                            %(outf_corrected)s 2> %(outdir)s/%(track)s_filtered.log"

        ###############################################
        # overlap reads
        ###############################################
        # Note "asqg" is the default output from sga
        outf_overlap = track + "_filtered.asqg.gz"
        threads = "%(sga_threads)s"
        overlap_options = "%(sga_overlap_options)s"
        overlap_statement = "%(executable)s overlap %(overlap_options)s \
                             %(outf_filtered)s 2> %(outdir)s/%(track)s_overlap.log"

        ###############################################
        # assemble reads and perform error removal
        ###############################################
        assembly_options = "%(sga_assembly_options)s"
        error_removal_options = "%(sga_error_removal_options)s"
        out_prefix = track
        assembly_statement = "%(executable)s assemble %(assembly_options)s \
                              %(outf_overlap)s \
                              --out-prefix=%(out_prefix)s 2> %(outdir)s/%(track)s_contigs.log"

        ###############################################
        # build statement
        ###############################################
        metrics_file = os.path.basename(metrics.replace("--metrics=", ""))
        contigs_file = os.path.basename(out_prefix + "-contigs.fa")
        move_statement = "mv %(metrics_file)s %(outdir)s/%(metrics_file)s; \
                          cat %(contigs_file)s \
                          | python %%(scriptsdir)s/rename_contigs.py \
                          --log=%(outdir)s/%(track)s.contigs.log \
                          -a sga > %(outdir)s/%(track)s.contigs.fa"

        statement = "; ".join([preprocess_statement,
                               index_statement,
                               correction_statement,
                               filter_statement,
                               overlap_statement,
                               assembly_statement,
                               move_statement,
                               "rm -rf %(tempdir)s"]) % locals()

        return statement
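The dispatch over sga_correction_method is still commented out above, warning included. A sketch of how it could look once reinstated; the warning text is an assumption, E.warn is the logging helper used elsewhere in these examples, and the returned strings keep the class's deferred %(param)s placeholder convention:

def getCorrectionOptionsSketch(correction_method):
    '''return sga correct options for *correction_method*.'''
    if correction_method == "kmer":
        # assumed warning text: kmer correction can over-correct
        # low-coverage data
        E.warn("using kmer-based correction; results depend on coverage")
        return "%(sga_kmer_correction_options)s"
    elif correction_method == "hybrid":
        return "%(sga_hybrid_correction_options)s"
    elif correction_method == "overlap":
        return "%(sga_overlap_correction_options)s"
    else:
        raise ValueError(
            "method %s does not exist: choose one of kmer, hybrid, overlap"
            % correction_method)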
Beispiel #44
0
def filterBamfiles(infile, sentinel):
    """
    Pre-process bamfiles prior to peak calling.
    i) sort bamfiles
    ii) remove unmapped reads with bam2bam.py
    iii) remove non-uniquely mapping reads with bam2bam.py (optional)
    iv) remove duplicates with Picard's MarkDuplicates (optional)
    v) remove reads from masked regions with bedtools intersect (optional)
    vi) index
    """

    # create temporary directory for Picard's MarkDuplicates
    picard_tmp = P.getTempDir(PARAMS["scratchdir"])

    outfile = P.snip(sentinel, ".sentinel") + ".bam"

    # ensure bamfile is sorted,
    statement = [
        "samtools sort @IN@ @OUT@",
    ]

    # remove unmapped reads
    statement.append("python %(scriptsdir)s/bam2bam.py"
                     " --filter=mapped"
                     " --log=%(outfile)s.log"
                     " < @[email protected]"
                     " > @OUT@")

    # remove non-uniquely mapping reads, if requested
    if PARAMS["filter_remove_non_unique"]:
        statement.append("python %(scriptsdir)s/bam2bam.py"
                         " --filter=unique"
                         " --log=%(outfile)s.log"
                         " < @IN@"
                         " > @OUT@")

    # remove duplicates, if requested
    if PARAMS["filter_remove_duplicates"]:
        statement.append("MarkDuplicates"
                         " INPUT=@IN@"
                         " ASSUME_SORTED=true"
                         " REMOVE_DUPLICATES=true"
                         " QUIET=false"
                         " OUTPUT=@OUT@"
                         " METRICS_FILE=/dev/null"
                         " VALIDATION_STRINGENCY=SILENT"
                         " TMP_DIR=%(picard_tmp)s"
                         " 2> %(outfile)s.log")

    # mask regions, if intervals supplied
    if PARAMS["filter_mask_intervals"]:
        mask = PARAMS["filter_mask_intervals"]
        statement.append("bedtools intersect"
                         " -abam @IN@"
                         " -b %(mask)s"
                         " -wa"
                         " -v"
                         " > @OUT@")

    statement.append("mv @IN@ %(outfile)s")
    statement.append("samtools index %(outfile)s")

    job_options = "-l mem_free=10G"
    statement = P.joinStatements(statement, infile)

    P.run()
    P.touch(sentinel)
    shutil.rmtree(picard_tmp)
Beispiel #45
0
def runMAST(infiles, outfile):
    '''run mast on all intervals and motifs.

    Collect all results for an E-value up to 10000 so that
    all sequences are output and MAST curves can be computed. 

    10000 is a heuristic.
    '''
    to_cluster = True

    # job_options = "-l mem_free=8000M"

    controlfile, dbfile, motiffiles = infiles

    if IOTools.isEmpty(dbfile):
        P.touch(outfile)
        return

    if not os.path.exists(controlfile):
        raise P.PipelineError("control file %s for %s does not exist" %
                              (controlfile, dbfile))

    # remove previous results
    if os.path.exists(outfile): os.remove(outfile)

    tmpdir = P.getTempDir(".")
    tmpfile = P.getTempFilename(".")

    for motiffile in motiffiles:
        if IOTools.isEmpty(motiffile):
            L.info("skipping empty motif file %s" % motiffile)
            continue

        of = IOTools.openFile(tmpfile, "a")
        motif, x = os.path.splitext(motiffile)
        of.write(":: motif = %s - foreground ::\n" % motif)
        of.close()

        # mast bails if the number of nucleotides gets larger than
        # 2186800982?
        # To avoid this, run db and control file separately.
        statement = '''
        cat %(dbfile)s 
        | mast %(motiffile)s - -nohtml -oc %(tmpdir)s -ev %(mast_evalue)f %(mast_options)s >> %(outfile)s.log 2>&1;
        cat %(tmpdir)s/mast.txt >> %(tmpfile)s 2>&1
        '''
        P.run()

        of = IOTools.openFile(tmpfile, "a")
        motif, x = os.path.splitext(motiffile)
        of.write(":: motif = %s - background ::\n" % motif)
        of.close()

        statement = '''
        cat %(controlfile)s 
        | mast %(motiffile)s - -nohtml -oc %(tmpdir)s -ev %(mast_evalue)f %(mast_options)s >> %(outfile)s.log 2>&1;
        cat %(tmpdir)s/mast.txt >> %(tmpfile)s 2>&1
        '''
        P.run()

    statement = "gzip < %(tmpfile)s > %(outfile)s"
    P.run()

    shutil.rmtree(tmpdir)
    os.unlink(tmpfile)
Beispiel #46
0
    def build(self, infile):
        '''
        build statement for running SGA
        '''
        track = self.getTrack(os.path.basename(infile))

        # decide which algorithm to use based on
        # read length
        if PARAMS["sga_long"]:
            index_algorithm = "sais"
        else:
            index_algorithm = "ropebwt"

        format = self.getFormat(infile)
        paired = self.checkPairs(infile)

        # directory in which to do the assembly
        tempdir = P.getTempDir(dir=".")

        # check whether the data are paired-end
        if not paired:
            pe_mode = "--pe-mode=0"
            files = os.path.abspath(infile)
        else:
            # DOESN'T DEAL WITH INTERLEAVED FILES YET
            pe_mode = "--pe-mode=1"
            files = " ".join([os.path.abspath(infile), os.path.abspath(paired[1])])

        executable = "%(sga_executable)s"

        outdir = os.path.abspath("sga.dir")
        ###############################################
        # preprocessing step converts missing bases to
        # random bases or removes sequences with
        # missing bases
        ###############################################
        preprocess_options = "%(sga_preprocess_options)s"
        # outputs a merged fastq file
        outf_preprocessed = track + ".fastq"
        preprocess_statement = "cd %(tempdir)s; %(executable)s preprocess %(pe_mode)s %(preprocess_options)s %(files)s \
                                 -o %(outf_preprocessed)s 2> %(outdir)s/%(track)s_preprocess.log"

        ###############################################
        # indexing reads with FM index
        ###############################################
        index_options = "%(sga_index_options)s"
        index_statement = "%(executable)s index --algorithm=%(index_algorithm)s \
                           %(outf_preprocessed)s 2> %(outdir)s/%(track)s_index.log"

        ###############################################
        # correct sequencing errors in reads
        ###############################################
        correction_method = "%(sga_correction_method)s"

        # if correction_method == "kmer":
        # ADD WARNING HERE
        correction_options = "%(sga_kmer_correction_options)s"
        # elif correction_method == "hybrid":
        #     correction_options = "%(sga_hybrid_correction_options)s"
        # elif correction_method == "overlap":
        #     correction_options = "%(sga_overlap_correction_options)s"
        # else:
        #     raise ValueError("method %s does not exist: choose one of kmer, hybrid, overlap" % correction_method)
        outf_corrected = track + "_corrected.fa"
        metrics = "--metrics=%(track)s.metrics" % locals()
        correction_prefix = os.path.join(
            tempdir, P.snip(outf_corrected, ".fa"))
        correction_statement = "%(executable)s correct %(metrics)s  \
                                --algorithm=%(correction_method)s \
                                %(correction_options)s \
                                %(outf_preprocessed)s \
                                -o %(outf_corrected)s 2> %(outdir)s/%(track)s_corrected.log"

        ###############################################
        # filter low quality reads and low abundance
        # kmers
        ###############################################
        filter_options = "%(sga_filter_options)s"
        outf_filtered = track + "_filtered.fa"
        filter_statement = "sga index %(outf_corrected)s; \
                            %(executable)s filter %(filter_options)s  \
                            -o %(outf_filtered)s \
                            %(outf_corrected)s 2> %(outdir)s/%(track)s_filtered.log"

        ###############################################
        # overlap reads
        ###############################################
        # Note "asqg" is the default output from sga
        outf_overlap = track + "_filtered.asqg.gz"
        threads = "%(sga_threads)s"
        overlap_options = "%(sga_overlap_options)s"
        overlap_statement = "%(executable)s overlap %(overlap_options)s \
                             %(outf_filtered)s 2> %(outdir)s/%(track)s_overlap.log"

        ###############################################
        # assemble reads and perform error removal
        ###############################################
        assembly_options = "%(sga_assembly_options)s"
        error_removal_options = "%(sga_error_removal_options)s"
        out_prefix = track
        assembly_statement = "%(executable)s assemble %(assembly_options)s \
                              %(outf_overlap)s \
                              --out-prefix=%(out_prefix)s 2> %(outdir)s/%(track)s_contigs.log"

        ###############################################
        # build statement
        ###############################################
        metrics_file = os.path.basename(metrics.replace("--metrics=", ""))
        contigs_file = os.path.basename(out_prefix + "-contigs.fa")
        move_statement = "mv %(metrics_file)s %(outdir)s/%(metrics_file)s; \
                          cat %(contigs_file)s \
                          | python %%(scriptsdir)s/rename_contigs.py \
                          --log=%(outdir)s/%(track)s.contigs.log \
                          -a sga > %(outdir)s/%(track)s.contigs.fa"

        statement = "; ".join([preprocess_statement,
                               index_statement,
                               correction_statement,
                               filter_statement,
                               overlap_statement,
                               assembly_statement,
                               move_statement,
                               "rm -rf %(tempdir)s"]) % locals()

        return statement
Beispiel #47
0
    def build(self, infile):
        '''
        build statement for running Ray
        '''
        track = self.getTrack(infile)

        format = self.getFormat(infile)
        paired = self.checkPairs(infile)

        tempdir = P.getTempDir()
        # check whether the data are paired-end;
        # checkPairs returns False for single-end input, so test for
        # truth rather than calling len() on it
        if paired:
            pair = paired[0]
            # Ray doesn't like .fastq.1.gz etc
            read1 = infile
            read2 = paired[1]
            read1_new = os.path.join(tempdir,
                                     read1.replace(".fastq.1.gz", ".1.fastq"))
            read2_new = os.path.join(tempdir,
                                     read2.replace(".fastq.2.gz", ".2.fastq"))
            files = " ".join([read1_new, read2_new])
            gunzy = """gunzip -c %(read1)s > %(read1_new)s
                       ; gunzip -c %(read2)s > %(read2_new)s""" % locals()
        else:
            pair = paired
            # decompress the single-end input into the working directory
            files = os.path.join(
                tempdir, P.snip(os.path.basename(infile), ".gz"))
            gunzy = "gunzip -c %(infile)s > %(files)s" % locals()

        raydir = os.path.join(os.getcwd(), "ray.dir")

        # Ray detects the file type itself, so we only have to
        # say whether the input is paired or not

        print(files)

        # build statement
        common_options = "-k %(kmer)s"
        if pair == "interleaved":
            filetype = "-i"
        elif not pair:
            filetype = "-s"
        elif pair == "separate":
            filetype = "-p"
        else:
            raise IOError("do not support file of this type: %s" % infile)

        statement = '''%(gunzy)s
                       ; %%(ray_executable)s %(common_options)s %(filetype)s %(files)s -o %(raydir)s
                       ; checkpoint; mv %(raydir)s/Scaffolds.fasta %(raydir)s/%(track)s.scaffolds.fa
                       ; mv %(raydir)s/ScaffoldComponents.txt %(raydir)s/%(track)s.scaffold_components.txt
                       ; mv %(raydir)s/ScaffoldLengths.txt %(raydir)s/%(track)s.scaffold_lengths.txt
                       ; mv %(raydir)s/ScaffoldLinks.txt %(raydir)s/%(track)s.scaffold_links.txt
                       ; mv %(raydir)s/Contigs.fasta %(raydir)s/%(track)s.contigs.fa
                       ; mv %(raydir)s/OutputNumbers.txt %(raydir)s/%(track)s.numbers.txt
                       ; mkdir %(raydir)s/graph
                       ; mv %(raydir)s/CoverageDistribution.txt %(raydir)s/graph/%(track)s.coverage_distribution.txt
                       ; mv %(raydir)s/CoverageDistributionAnalysis.txt %(raydir)s/graph/%(track)s.coverage_distribution_analysis.txt
                       ; mv %(raydir)s/degreeDistribution.txt %(raydir)s/graph/%(track)s.degree_distribution.txt
                       ; mv %(raydir)s/Kmers.txt %(raydir)s/graph/%(track)s.kmers.txt
                       ; mkdir %(raydir)s/assembly
                       ; mv %(raydir)s/SeedLengthDistribution.txt %(raydir)s/assembly/%(track)s.seed_length_distribution.txt
                       ; mv %(raydir)s/LibraryStatistics.txt %(raydir)s/%(track)s.library_statistics.txt
                       ; mv %(raydir)s/LibraryData.xml %(raydir)s/%(track)s.library_data.xml
                       ; rm -rf %(tempdir)s''' % locals()
        return statement