Ejemplo n.º 1
0
    def build(self, infiles, outfiles, output_prefix):
        '''Assemble the shell statement that merges a read pair with pandaseq.

        Arguments
        ---------
        infiles : sequence
            Exactly two files; passed to pandaseq as ``-f`` and ``-r``.
        outfiles : sequence
            Only the first element is used.  The ``-w`` output is gzipped
            into it and the ``-U`` output into ``<outfile>.unpaired.gz``.
        output_prefix : string
            Prefix for ``<output_prefix>-pandaseq.log.bgz`` (``-G``),
            ``<output_prefix>-pandaseq.log`` (stdout/stderr) and
            ``<output_prefix>.dir``.

        Returns
        -------
        string
            The statement; nothing is executed here.

        Raises
        ------
        ValueError
            If `infiles` does not unpack into exactly two items.
        '''
        processing_options = self.processing_options
        threads = self.threads
        outdir = output_prefix + ".dir"

        infile1, infile2 = infiles
        outfile = outfiles[0]

        # The '>&' redirect has to belong to the pandaseq command itself:
        # with a ';' in front of it the shell would apply the redirect to
        # an empty command and pandaseq's messages would never reach the
        # log file.
        # NOTE(review): nothing in this statement creates <output_prefix>.dir;
        # confirm the trailing gzip step is still wanted.
        cmd = '''pandaseq -f %(infile1)s -r %(infile2)s
        %(processing_options)s
        -T %(threads)i
        -U >(gzip > %(outfile)s.unpaired.gz)
        -w >(gzip > %(outfile)s)
        -F
        -G %(output_prefix)s-pandaseq.log.bgz
        >& %(output_prefix)s-pandaseq.log;
        checkpoint;
        gzip %(outdir)s/*;
        checkpoint;
        ''' % locals()

        return cmd
Ejemplo n.º 2
0
    def build(self, infiles, outfiles, output_prefix):
        '''Assemble the sickle statement for single-end or paired-end input.

        Arguments
        ---------
        infiles : sequence
            One (``sickle se``) or two (``sickle pe``) input files.
        outfiles : sequence
            One output file per input file, in the same order.
        output_prefix : string
            stderr of sickle is appended to ``<output_prefix>.log``.

        Returns
        -------
        string
            The statement; nothing is executed here.

        Raises
        ------
        AssertionError
            If the numbers of input and output files differ, or are
            neither 1 nor 2.
        KeyError
            If the offset returned by ``Fastq.getOffset`` is not one of
            33, 64 or 59.
        '''
        assert len(infiles) == len(outfiles)
        assert len(infiles) in (1, 2)

        offset = Fastq.getOffset("sanger", raises=False)
        processing_options = self.processing_options

        # --qual-type is given by name, so translate the numeric offset.
        quality = {33: 'sanger', 64: 'illumina', 59: 'solexa'}[offset]

        if len(infiles) == 1:
            infile = infiles[0]
            outfile = outfiles[0]
            cmd = '''sickle se
            -g %(processing_options)s
            --qual-type %(quality)s
            --output-file %(outfile)s
            --fastq-file %(infile)s
            2>>%(output_prefix)s.log
            ;''' % locals()

        elif len(infiles) == 2:
            infile1, infile2 = infiles
            outfile1, outfile2 = outfiles
            cmd = '''sickle pe
            -g -s %(processing_options)s
            --qual-type %(quality)s
            -f %(infile1)s -r %(infile2)s
            -o %(outfile1)s -p %(outfile2)s
            2>>%(output_prefix)s.log
            ;''' % locals()

        return cmd
Ejemplo n.º 3
0
    def build(self, infiles, outfiles, output_prefix):
        '''Assemble the shell statement that merges a read pair with flash.

        Arguments
        ---------
        infiles : sequence
            Exactly two files, passed to flash in this order.
        outfiles : sequence
            Only the first element is used; the gzipped
            ``<track>.extendedFrags.fastq.gz`` is moved onto it.
        output_prefix : string
            ``<output_prefix>.dir`` is given to flash as ``-d``, the
            basename of `output_prefix` as ``-o``, and stdout/stderr go to
            ``<output_prefix>-flash.log``.

        Returns
        -------
        string
            The statement; nothing is executed here.

        Raises
        ------
        ValueError
            If `infiles` does not unpack into exactly two items.
        '''
        offset = Fastq.getOffset("sanger", raises=False)
        processing_options = self.processing_options

        outdir = output_prefix + ".dir"
        track = os.path.basename(output_prefix)

        infile1, infile2 = infiles
        outfile = outfiles[0]

        # Everything in the -d directory is gzipped after flash finishes,
        # which is why the file moved at the end carries a .gz suffix.
        cmd = '''flash %(infile1)s %(infile2)s
        -p %(offset)s
        %(processing_options)s
        -o %(track)s
        -d %(outdir)s
        >& %(output_prefix)s-flash.log;
        checkpoint;
        gzip %(outdir)s/*;
        checkpoint;
        mv %(outdir)s/%(track)s.extendedFrags.fastq.gz %(outfile)s;
        ''' % locals()

        return cmd
Ejemplo n.º 4
0
    def build(self, infiles, outfiles, output_prefix):
        '''Assemble the shell statement that merges a read pair with flash.

        Arguments
        ---------
        infiles : sequence
            Exactly two files, passed to flash in this order.
        outfiles : sequence
            Only the first element is used; the gzipped
            ``<track>.extendedFrags.fastq.gz`` is moved onto it.
        output_prefix : string
            ``<output_prefix>.dir`` is given to flash as ``-d``, the
            basename of `output_prefix` as ``-o``, and stdout/stderr go to
            ``<output_prefix>-flash.log``.

        Returns
        -------
        string
            The statement; nothing is executed here.

        Raises
        ------
        ValueError
            If `infiles` does not unpack into exactly two items.
        '''
        offset = Fastq.getOffset("sanger", raises=False)
        processing_options = self.processing_options

        outdir = output_prefix + ".dir"
        track = os.path.basename(output_prefix)

        infile1, infile2 = infiles
        outfile = outfiles[0]

        # Everything in the -d directory is gzipped after flash finishes,
        # which is why the file moved at the end carries a .gz suffix.
        cmd = '''flash %(infile1)s %(infile2)s
        -p %(offset)s
        %(processing_options)s
        -o %(track)s
        -d %(outdir)s
        >& %(output_prefix)s-flash.log;
        checkpoint;
        gzip %(outdir)s/*;
        checkpoint;
        mv %(outdir)s/%(track)s.extendedFrags.fastq.gz %(outfile)s;
        ''' % locals()

        return cmd
Ejemplo n.º 5
0
    def build(self, infiles, outfiles, output_prefix):
        '''Assemble the shell statement that merges a read pair with pandaseq.

        Arguments
        ---------
        infiles : sequence
            Exactly two files; passed to pandaseq as ``-f`` and ``-r``.
        outfiles : sequence
            Only the first element is used.  The ``-w`` output is gzipped
            into it and the ``-U`` output into ``<outfile>.unpaired.gz``.
        output_prefix : string
            Prefix for ``<output_prefix>-pandaseq.log.bgz`` (``-G``),
            ``<output_prefix>-pandaseq.log`` (stdout/stderr) and
            ``<output_prefix>.dir``.

        Returns
        -------
        string
            The statement; nothing is executed here.

        Raises
        ------
        ValueError
            If `infiles` does not unpack into exactly two items.
        '''
        processing_options = self.processing_options
        threads = self.threads
        outdir = output_prefix + ".dir"

        infile1, infile2 = infiles
        outfile = outfiles[0]

        # The '>&' redirect has to belong to the pandaseq command itself:
        # with a ';' in front of it the shell would apply the redirect to
        # an empty command and pandaseq's messages would never reach the
        # log file.
        # NOTE(review): nothing in this statement creates <output_prefix>.dir;
        # confirm the trailing gzip step is still wanted.
        cmd = '''pandaseq -f %(infile1)s -r %(infile2)s
        %(processing_options)s
        -T %(threads)i
        -U >(gzip > %(outfile)s.unpaired.gz)
        -w >(gzip > %(outfile)s)
        -F
        -G %(output_prefix)s-pandaseq.log.bgz
        >& %(output_prefix)s-pandaseq.log;
        checkpoint;
        gzip %(outdir)s/*;
        checkpoint;
        ''' % locals()

        return cmd
Ejemplo n.º 6
0
    def build(self, infiles, outfiles, output_prefix):
        '''Assemble the sickle statement for single-end or paired-end input.

        Arguments
        ---------
        infiles : sequence
            One (``sickle se``) or two (``sickle pe``) input files.
        outfiles : sequence
            One output file per input file, in the same order.
        output_prefix : string
            stderr of sickle is appended to ``<output_prefix>.log``.

        Returns
        -------
        string
            The statement; nothing is executed here.

        Raises
        ------
        AssertionError
            If the numbers of input and output files differ, or are
            neither 1 nor 2.
        KeyError
            If the offset returned by ``Fastq.getOffset`` is not one of
            33, 64 or 59.
        '''
        assert len(infiles) == len(outfiles)
        assert len(infiles) in (1, 2)

        offset = Fastq.getOffset("sanger", raises=False)
        processing_options = self.processing_options

        # --qual-type is given by name, so translate the numeric offset.
        quality = {33: 'sanger', 64: 'illumina', 59: 'solexa'}[offset]

        if len(infiles) == 1:
            infile = infiles[0]
            outfile = outfiles[0]
            cmd = '''sickle se
            -g %(processing_options)s
            --qual-type %(quality)s
            --output-file %(outfile)s
            --fastq-file %(infile)s
            2>>%(output_prefix)s.log
            ;''' % locals()

        elif len(infiles) == 2:
            infile1, infile2 = infiles
            outfile1, outfile2 = outfiles
            cmd = '''sickle pe
            -g -s %(processing_options)s
            --qual-type %(quality)s
            -f %(infile1)s -r %(infile2)s
            -o %(outfile1)s -p %(outfile2)s
            2>>%(output_prefix)s.log
            ;''' % locals()

        return cmd
Ejemplo n.º 7
0
    def build(self, infiles, outfiles, output_prefix):
        '''Assemble one fastx_trimmer pipeline per input file.

        Each input is decompressed with zcat, piped through fastx_trimmer
        and re-compressed with gzip into the matching output file.

        Arguments
        ---------
        infiles : sequence
            One or two input files.
        outfiles : sequence
            One output file per input file, in the same order.
        output_prefix : string
            stderr of fastx_trimmer is appended to ``<output_prefix>.log``.

        Returns
        -------
        string
            The per-file statements joined by `` checkpoint; ``; nothing
            is executed here.

        Raises
        ------
        AssertionError
            If the numbers of input and output files differ, or are
            neither 1 nor 2.
        '''
        assert len(infiles) == len(outfiles)
        assert len(infiles) in (1, 2)

        offset = Fastq.getOffset("sanger", raises=False)
        processing_options = self.processing_options

        template = '''zcat %(infile)s
            | fastx_trimmer
            -Q%(offset)s
            %(processing_options)s
            2>> %(output_prefix)s.log
            | gzip > %(outfile)s
            ;'''

        # An explicit mapping is used instead of locals(): inside a
        # comprehension locals() would not reliably expose the enclosing
        # function's variables.
        cmds = [
            template % {
                "infile": infile,
                "outfile": outfile,
                "offset": offset,
                "processing_options": processing_options,
                "output_prefix": output_prefix,
            }
            for infile, outfile in zip(infiles, outfiles)
        ]

        return " checkpoint; ".join(cmds)
Ejemplo n.º 8
0
    def build(self, infiles, outfiles, output_prefix):
        '''Assemble one fastx_trimmer pipeline per input file.

        Each input is decompressed with zcat, piped through fastx_trimmer
        and re-compressed with gzip into the matching output file.

        Arguments
        ---------
        infiles : sequence
            One or two input files.
        outfiles : sequence
            One output file per input file, in the same order.
        output_prefix : string
            stderr of fastx_trimmer is appended to ``<output_prefix>.log``.

        Returns
        -------
        string
            The per-file statements joined by `` checkpoint; ``; nothing
            is executed here.

        Raises
        ------
        AssertionError
            If the numbers of input and output files differ, or are
            neither 1 nor 2.
        '''
        assert len(infiles) == len(outfiles)
        assert len(infiles) in (1, 2)

        offset = Fastq.getOffset("sanger", raises=False)
        processing_options = self.processing_options

        template = '''zcat %(infile)s
            | fastx_trimmer
            -Q%(offset)s
            %(processing_options)s
            2>> %(output_prefix)s.log
            | gzip > %(outfile)s
            ;'''

        # An explicit mapping is used instead of locals(): inside a
        # comprehension locals() would not reliably expose the enclosing
        # function's variables.
        cmds = [
            template % {
                "infile": infile,
                "outfile": outfile,
                "offset": offset,
                "processing_options": processing_options,
                "output_prefix": output_prefix,
            }
            for infile, outfile in zip(infiles, outfiles)
        ]

        return " checkpoint; ".join(cmds)
Ejemplo n.º 9
0
    def build(self, infiles, outfiles, output_prefix):
        '''Return the trimmomatic statement for one file (SE) or a pair (PE).

        `infiles` and `outfiles` must have the same length, 1 or 2.  In
        both modes stderr is appended to ``<output_prefix>.log``.  In PE
        mode the unpaired reads are written to
        ``<output_prefix>.1.unpaired`` / ``<output_prefix>.2.unpaired`` and
        gzipped by a follow-up command.  Nothing is executed here.
        '''
        n_files = len(infiles)
        assert n_files == len(outfiles)
        assert n_files in (1, 2)

        # Values shared by both templates; the per-mode file names are
        # added below.
        fields = {
            "offset": Fastq.getOffset("sanger", raises=False),
            "threads": self.threads,
            "processing_options": self.processing_options,
            "output_prefix": output_prefix,
        }

        if n_files == 1:
            fields.update(infile=infiles[0], outfile=outfiles[0])

            cmd = '''trimmomatic SE
            -threads %(threads)i
            -phred%(offset)s
            %(infile)s %(outfile)s
            %(processing_options)s
            2>> %(output_prefix)s.log
            ;''' % fields

        elif n_files == 2:
            fields["infile1"], fields["infile2"] = infiles
            fields["outfile1"], fields["outfile2"] = outfiles

            cmd = '''trimmomatic PE
            -threads %(threads)i
            -phred%(offset)s
            %(infile1)s %(infile2)s
            %(outfile1)s %(output_prefix)s.1.unpaired
            %(outfile2)s %(output_prefix)s.2.unpaired
            %(processing_options)s
            2>> %(output_prefix)s.log;
            checkpoint;
            gzip %(output_prefix)s.*.unpaired;
            ''' % fields

        return cmd
Ejemplo n.º 10
0
    def build(self, infiles, outfiles, output_prefix):
        '''Return the trimmomatic statement for one file (SE) or a pair (PE).

        `infiles` and `outfiles` must have the same length, 1 or 2.  In
        both modes stderr is appended to ``<output_prefix>.log``.  In PE
        mode the unpaired reads are written to
        ``<output_prefix>.1.unpaired`` / ``<output_prefix>.2.unpaired`` and
        gzipped by a follow-up command.  Nothing is executed here.
        '''
        n_files = len(infiles)
        assert n_files == len(outfiles)
        assert n_files in (1, 2)

        # Values shared by both templates; the per-mode file names are
        # added below.
        fields = {
            "offset": Fastq.getOffset("sanger", raises=False),
            "threads": self.threads,
            "processing_options": self.processing_options,
            "output_prefix": output_prefix,
        }

        if n_files == 1:
            fields.update(infile=infiles[0], outfile=outfiles[0])

            cmd = '''trimmomatic SE
            -threads %(threads)i
            -phred%(offset)s
            %(infile)s %(outfile)s
            %(processing_options)s
            2>> %(output_prefix)s.log
            ;''' % fields

        elif n_files == 2:
            fields["infile1"], fields["infile2"] = infiles
            fields["outfile1"], fields["outfile2"] = outfiles

            cmd = '''trimmomatic PE
            -threads %(threads)i
            -phred%(offset)s
            %(infile1)s %(infile2)s
            %(outfile1)s %(output_prefix)s.1.unpaired
            %(outfile2)s %(output_prefix)s.2.unpaired
            %(processing_options)s
            2>> %(output_prefix)s.log;
            checkpoint;
            gzip %(output_prefix)s.*.unpaired;
            ''' % fields

        return cmd
Ejemplo n.º 11
0
    def build(self, infiles, outfiles, output_prefix):
        '''Assemble the trim_galore statement for one file or a read pair.

        trim_galore is told to write into the directory of the (first)
        output file; its result files are then moved onto the requested
        output names.

        Arguments
        ---------
        infiles : sequence
            One or two input files.
        outfiles : sequence
            One output file per input file, in the same order.
        output_prefix : string
            stderr of trim_galore is appended to ``<output_prefix>.log``.

        Returns
        -------
        string
            The statement; nothing is executed here.

        Raises
        ------
        AssertionError
            If the numbers of input and output files differ, or are
            neither 1 nor 2.
        '''
        assert len(infiles) == len(outfiles)
        assert len(infiles) in (1, 2)

        offset = Fastq.getOffset("sanger", raises=False)
        processing_options = self.processing_options

        if len(infiles) == 1:
            infile = infiles[0]
            outfile = outfiles[0]
            # dirname() is '' for a bare file name.  Fall back to '.' so
            # that --output_dir always receives a value and the mv source
            # below does not turn into a path under '/'.
            outdir = os.path.dirname(outfile) or "."
            # Only the file name of the input is combined with outdir, so
            # an input given with a directory part still resolves inside
            # outdir (presumably where trim_galore writes it; see the
            # Trim Galore documentation for --output_dir).
            trim_out = "%s/%s_trimmed.fq.gz" % (
                outdir,
                os.path.basename(infile).replace(".fastq.gz", ""))
            cmd = '''trim_galore %(processing_options)s
            --phred%(offset)s
            --output_dir %(outdir)s
            %(infile)s
            2>>%(output_prefix)s.log;
            mv %(trim_out)s %(outfile)s;
            ''' % locals()

        elif len(infiles) == 2:
            infile1, infile2 = infiles
            outfile1, outfile2 = outfiles
            outdir = os.path.dirname(outfile1) or "."
            # NOTE(review): unlike the single-end branch no suffix is
            # stripped from the input names here; confirm this matches the
            # names trim_galore produces for the suffixes used upstream.
            val1 = "%s/%s_val_1.fq.gz" % (outdir, os.path.basename(infile1))
            val2 = "%s/%s_val_2.fq.gz" % (outdir, os.path.basename(infile2))
            cmd = '''trim_galore %(processing_options)s
            --paired
            --phred%(offset)s
            --output_dir %(outdir)s
            %(infile1)s %(infile2)s
            2>>%(output_prefix)s.log;
            mv %(val1)s %(outfile1)s;
            mv %(val2)s %(outfile2)s;
            ''' % locals()

        return cmd
Ejemplo n.º 12
0
    def build(self, infiles, outfiles, output_prefix):
        '''Assemble the trim_galore statement for one file or a read pair.

        Arguments
        ---------
        infiles : sequence
            One or two input files.
        outfiles : sequence
            One output file per input file, in the same order.  The
            directory of the first output file is passed to trim_galore as
            ``--output_dir``.
        output_prefix : string
            stderr of trim_galore is appended to ``<output_prefix>.log``;
            in single-end mode ``<output_prefix>_trimmed.fq.gz`` is the
            file moved onto the output.

        Returns
        -------
        string
            The statement; nothing is executed here.

        Raises
        ------
        AssertionError
            If the numbers of input and output files differ, or are
            neither 1 nor 2.
        '''
        assert len(infiles) == len(outfiles)
        assert len(infiles) in (1, 2)

        offset = Fastq.getOffset("sanger", raises=False)
        processing_options = self.processing_options

        if len(infiles) == 1:
            infile = infiles[0]
            outfile = outfiles[0]
            # Both templates reference %(outdir)s, so it has to exist as a
            # local before the interpolation; '.' stands in for an empty
            # dirname so that --output_dir always receives a value.
            outdir = os.path.dirname(outfile) or "."
            trim_out = "%s_trimmed.fq.gz" % (output_prefix)
            cmd = '''trim_galore %(processing_options)s
            --phred%(offset)s
            --output_dir %(outdir)s
            %(infile)s
            2>>%(output_prefix)s.log;
            mv %(trim_out)s %(outfile)s;
            ''' % locals()

        # Branch on the same quantity the asserts and the first branch
        # use, so that one of the two branches always defines `cmd`.
        elif len(infiles) == 2:
            infile1, infile2 = infiles
            outfile1, outfile2 = outfiles
            outdir = os.path.dirname(outfile1) or "."

            # NOTE(review): the mv sources are not prefixed with outdir
            # although trim_galore is given --output_dir; confirm where
            # the *_val_N.fq.gz files are expected to appear.
            cmd = '''trim_galore %(processing_options)s
            --paired
            --phred%(offset)s
            --output_dir %(outdir)s
            %(infile1)s %(infile2)s
            2>>%(output_prefix)s.log;
            mv %(infile1)s_val_1.fq.gz %(outfile1)s;
            mv %(infile2)s_val_2.fq.gz %(outfile2)s;
            ''' % locals()

        return cmd
Ejemplo n.º 13
0
def processReads(infiles, outfile):
    '''Filter, trim, sample or combine the reads of one fastq file.

    ``infiles`` is unpacked into ``(infile, contaminant_file)``.  When
    ``checkPairs(infile)`` returns a second file name the data are treated
    as a read pair and ``outfile2`` is derived from ``outfile`` by swapping
    the ``.fastq.1.gz`` suffix for ``.fastq.2.gz``.

    Depending on ``PARAMS`` one of three paths is taken:

    * ``process_combine_reads``: the pair is merged with ``flash`` and the
      function returns.
    * ``process_sample`` with paired data: the files are sub-sampled with
      ``fastq2fastq.py`` and the function returns.
    * otherwise a shell pipeline is assembled from the enabled steps
      (contaminant removal, artifact filter, trimming, quality trimming,
      quality filter, sampling) and run once for single-end data, or once
      per mate followed by a reconciliation step for paired data.

    Raises ``IOError`` when combining is requested but no second file was
    found.

    NOTE(review): every ``P.run()`` call below is made without arguments.
    Presumably it picks up the local ``statement`` and fills the
    ``%(name)s`` placeholders from this function's locals and from
    ``PARAMS`` - confirm before renaming or removing any local variable
    (including the otherwise unused ``to_cluster``).
    '''

    infile, contaminant_file = infiles

    # do_sth records whether at least one processing step was enabled.
    do_sth = False
    to_cluster = True

    infile2 = checkPairs(infile)

    # track is the output name without its fastq suffix.
    if infile2:
        track = P.snip(outfile, ".fastq.1.gz")
        outfile2 = P.snip(outfile, ".fastq.1.gz") + ".fastq.2.gz"
    else:
        track = P.snip(outfile, ".fastq.gz")

    # --- path 1: merge the read pair with flash, then return ---
    if PARAMS["process_combine_reads"]:
        E.warn(
            "combining reads cannot be can not be combined with other processing for paired ended reads"
        )
        if not infile2: raise IOError("must have paired data to combine reads")

        read_len, frag_len, frag_stdev = PARAMS["combine_reads_read_length"], \
            PARAMS["combine_reads_fragment_length"], \
            PARAMS["combine_reads_fragment_length_stdev"]

        # Used below only to count how many of the three values are set.
        # NOTE(review): str() of a non-empty placeholder such as None still
        # yields a token, and an empty value in the middle leaves an empty
        # token after split(" "), so the "< 3" test may not detect every
        # unset value - confirm the value types PARAMS can hold.
        fragment_options = " ".join(map(str, [read_len, frag_len, frag_stdev]))

        if PARAMS["combine_reads_max_overlap"]:
            E.warn(
                "if specifying --max-overlap read and fragment length options will be ignored"
            )
            max_overlap = "--max-overlap=%i" % PARAMS[
                "combine_reads_max_overlap"]
            fragment_options = ""

        elif not PARAMS["combine_reads_max_overlap"] and len(
                fragment_options.strip().split(" ")) < 3:
            E.warn(
                "have not specified --read-len, --frag-len, --frag-len-stddev: default --max-overlap used"
            )
            max_overlap = ""
            fragment_options = ""

        elif PARAMS["combine_reads_read_length"] and PARAMS[
                "combine_reads_fragment_length"] and PARAMS[
                    "combine_reads_fragment_length_stdev"]:
            # NOTE(review): this inner test looks unreachable - a truthy
            # combine_reads_max_overlap is already handled by the first
            # branch of this chain.
            if PARAMS["combine_reads_max_overlap"]:
                E.warn(
                    "--max-overlap will override the specified read and fragment length options"
                )
            max_overlap = ""
            fragment_options = """--read-len=%(read_len)i
                                  --fragment-len=%(frag_len)i
                                  --fragment-len-stddev=%(frag_stdev)i""" % locals(
            )
        else:
            max_overlap = ""
            fragment_options = ""

        # Each optional flash argument is emitted only when its parameter
        # is set; otherwise an empty string goes into the statement.
        if not PARAMS["combine_reads_min_overlap"]:
            min_overlap = ""
        else:
            min_overlap = "--min-overlap=%i" % PARAMS[
                "combine_reads_min_overlap"]
        if not PARAMS["combine_reads_threads"]:
            threads = ""
        else:
            threads = "--threads=%i" % PARAMS["combine_reads_threads"]
        if not PARAMS["combine_reads_phred_offset"]:
            phred_offset = ""
        else:
            phred_offset = "--phred-offset=%i" % PARAMS[
                "combine_reads_phred_offset"]
        if not PARAMS["combine_reads_max_mismatch_density"]:
            max_mismatch_density = ""
        else:
            max_mismatch_density = "--max-mismatch-density=%f" % PARAMS[
                "combine_reads_max_mismatch_density"]

        statement = '''flash 
                     %(min_overlap)s
                     %(max_overlap)s
                     %(max_mismatch_density)s
                     %(phred_offset)s
                     %(fragment_options)s
                     --output-prefix=%(track)s
                     %(threads)s
                     --compress
                     %(infile)s %(infile2)s >> %(outfile)s.log
                     '''
        P.run()
        # Either concatenate the combined and the two uncombined files
        # into outfile (removing the three inputs), or keep only the
        # combined reads.  Note that `infiles` is rebound to a string here.
        if PARAMS["combine_reads_concatenate"]:
            infiles = " ".join([
                track + x for x in [
                    ".notCombined_1.fastq.gz", ".notCombined_2.fastq.gz",
                    ".extendedFrags.fastq.gz"
                ]
            ])
            statement = '''zcat %(infiles)s | gzip > %(outfile)s; rm -rf %(infiles)s'''
        else:
            statement = '''mv %(track)s.extendedFrags.fastq.gz %(outfile)s'''
        P.run()
        return

    # --- path 2: sub-sample a read pair, then return ---
    if PARAMS["process_sample"] and infile2:
        E.warn(
            "sampling can not be combined with other processing for paired ended reads"
        )
        statement = '''zcat %(infile)s
        | python %(scriptsdir)s/fastq2fastq.py 
                                   --sample=%(sample_proportion)f 
                                   --pair=%(infile2)s 
                                   --outfile-pair=%(outfile2)s 
                                   --log=%(outfile)s_sample.log
        | gzip 
        > %(outfile)s
        '''

        P.run()
        return

    # --- path 3: assemble a filtering pipeline ---
    # fastx does not like quality scores below 64 (Illumina 1.3 format)
    # need to detect the scores and convert
    # NOTE(review): `format` shadows the builtin, and the handle returned
    # by IOTools.openFile is not closed in this function.
    format = Fastq.guessFormat(IOTools.openFile(infile), raises=False)
    E.info("%s: format guess: %s" % (infile, format))
    offset = Fastq.getOffset(format, raises=False)

    # s collects the pipeline stages; they are joined with " | " below.
    # The first stage either removes contaminants with cutadapt or just
    # decompresses the input.
    if PARAMS["process_remove_contaminants"]:
        adaptors = listAdaptors(contaminant_file)
        #              %(contamination_trim_type)s
        s = [
            '''
        cutadapt 
              %(adaptors)s
              --overlap=%(contamination_min_overlap_length)i
              --format=fastq
              %(contamination_options)s
              <( zcat < %(infile)s )
              2>> %(outfile)s_contaminants.log
        '''
        ]
        do_sth = True
    else:
        s = ['zcat %(infile)s']

    if PARAMS["process_artifacts"]:
        s.append(
            'fastx_artifacts_filter -Q %(offset)i -v %(artifacts_options)s 2>> %(outfile)s_artifacts.log'
        )
        do_sth = True

    if PARAMS["process_trim"]:
        s.append(
            'fastx_trimmer -Q %(offset)i -v %(trim_options)s 2>> %(outfile)s_trim.log'
        )
        do_sth = True

    # NICK - may replace fastx trimmer
    if PARAMS["process_trim_quality"]:
        s.append(
            'fastq_quality_trimmer -Q %(offset)i  -v %(trim_quality_options)s 2>> %(outfile)s_trim.log'
        )
        do_sth = True

    if PARAMS["process_filter"]:
        s.append(
            'fastq_quality_filter -Q %(offset)i -v %(filter_options)s 2>> %(outfile)s_filter.log'
        )
        do_sth = True

    # NOTE(review): sampling does not set do_sth, so with sampling as the
    # only enabled step the function warns and returns below without
    # running anything - confirm this is intended.
    if PARAMS["process_sample"]:
        s.append(
            'python %(scriptsdir)s/fastq2fastq.py --sample=%(sample_proportion)f --log=%(outfile)s_sample.log'
        )

    if not do_sth:
        E.warn("no filtering specified for %s - nothing done" % infile)
        return

    s.append("gzip")
    if not infile2:
        statement = " | ".join(s) + " > %(outfile)s"
        P.run()
    else:
        # Paired data: run the same pipeline on each mate into temporary
        # files, then reconcile the two results.
        tmpfile = P.getTempFilename(".")
        tmpfile1 = tmpfile + ".fastq.1.gz"
        tmpfile2 = tmpfile + ".fastq.2.gz"

        E.warn("processing first of pair")
        # first read pair
        statement = " | ".join(s) + " > %(tmpfile1)s"
        P.run()

        # second read pair
        E.warn("processing second of pair")
        # Rebinding `infile` redirects the %(infile)s placeholder of the
        # unchanged pipeline to the second mate (see the P.run() note in
        # the docstring).
        infile = infile2
        statement = " | ".join(s) + " > %(tmpfile2)s"
        P.run()

        # reconcile
        E.info("starting reconciliation")
        statement = """python %(scriptsdir)s/fastqs2fastqs.py
                           --method=reconcile
                           --output-pattern=%(track)s.fastq.%%s.gz
                           %(tmpfile1)s %(tmpfile2)s
                     > %(outfile)s_reconcile.log"""

        P.run()

        # Remove the two intermediate files and the bare temporary name
        # (presumably created on disk by P.getTempFilename - confirm).
        os.unlink(tmpfile1)
        os.unlink(tmpfile2)
        os.unlink(tmpfile)
Ejemplo n.º 14
0
 def setfastqAttr(self, infiles):
     '''Store the quality offset for ``self.f_format`` in ``self.offset``.

     The offset is looked up with ``Fastq.getOffset(..., raises=False)``.
     `infiles` is accepted but not used.
     '''
     fastq_offset = Fastq.getOffset(self.f_format, raises=False)
     self.offset = fastq_offset
Ejemplo n.º 15
0
def processReads( infiles, outfile ):
    '''Filter, trim, sample or combine the reads of one fastq file.

    ``infiles`` is unpacked into ``(infile, contaminant_file)``.  When
    ``checkPairs(infile)`` returns a second file name the data are treated
    as a read pair and ``outfile2`` is derived from ``outfile`` by swapping
    the ``.fastq.1.gz`` suffix for ``.fastq.2.gz``.

    Depending on ``PARAMS`` one of three paths is taken:

    * ``process_combine_reads``: the pair is merged with ``flash`` and the
      function returns.
    * ``process_sample`` with paired data: the files are sub-sampled with
      ``fastq2fastq.py`` and the function returns.
    * otherwise a shell pipeline is assembled from the enabled steps
      (contaminant removal, artifact filter, trimming, quality trimming,
      quality filter, sampling) and run once for single-end data, or once
      per mate followed by a reconciliation step for paired data.

    Raises ``IOError`` when combining is requested but no second file was
    found.

    NOTE(review): every ``P.run()`` call below is made without arguments.
    Presumably it picks up the local ``statement`` and fills the
    ``%(name)s`` placeholders from this function's locals and from
    ``PARAMS`` - confirm before renaming or removing any local variable
    (including the otherwise unused ``to_cluster``).
    '''

    infile, contaminant_file = infiles

    # do_sth records whether at least one processing step was enabled.
    do_sth = False
    to_cluster = True

    infile2 = checkPairs( infile )

    # track is the output name without its fastq suffix.
    if infile2:
        track = P.snip( outfile, ".fastq.1.gz" )        
        outfile2 = P.snip( outfile, ".fastq.1.gz" ) + ".fastq.2.gz"
    else:
        track = P.snip( outfile, ".fastq.gz" )


    # --- path 1: merge the read pair with flash, then return ---
    if PARAMS["process_combine_reads"]:
        E.warn("combining reads cannot be can not be combined with other processing for paired ended reads")
        if not infile2: raise IOError("must have paired data to combine reads")

        read_len, frag_len, frag_stdev = PARAMS["combine_reads_read_length"], \
            PARAMS["combine_reads_fragment_length"], \
            PARAMS["combine_reads_fragment_length_stdev"]

        # Used below only to count how many of the three values are set.
        # NOTE(review): str() of a non-empty placeholder such as None still
        # yields a token, and an empty value in the middle leaves an empty
        # token after split(" "), so the "< 3" test may not detect every
        # unset value - confirm the value types PARAMS can hold.
        fragment_options = " ".join(map(str,[read_len, frag_len, frag_stdev]))

        if PARAMS["combine_reads_max_overlap"]:
            E.warn("if specifying --max-overlap read and fragment length options will be ignored")
            max_overlap="--max-overlap=%i" % PARAMS["combine_reads_max_overlap"]
            fragment_options = ""

        elif not PARAMS["combine_reads_max_overlap"] and len(fragment_options.strip().split(" ")) < 3:
            E.warn("have not specified --read-len, --frag-len, --frag-len-stddev: default --max-overlap used")
            max_overlap = ""
            fragment_options = ""

        elif PARAMS["combine_reads_read_length"] and PARAMS["combine_reads_fragment_length"] and PARAMS["combine_reads_fragment_length_stdev"]:
            # NOTE(review): this inner test looks unreachable - a truthy
            # combine_reads_max_overlap is already handled by the first
            # branch of this chain.
            if PARAMS["combine_reads_max_overlap"]:
                E.warn("--max-overlap will override the specified read and fragment length options")
            max_overlap = ""
            fragment_options = """--read-len=%(read_len)i
                                  --fragment-len=%(frag_len)i
                                  --fragment-len-stddev=%(frag_stdev)i""" % locals() 
        else:
            max_overlap = ""
            fragment_options = ""

        # Each optional flash argument is emitted only when its parameter
        # is set; otherwise an empty string goes into the statement.
        if not PARAMS["combine_reads_min_overlap"]:
            min_overlap = ""
        else:
            min_overlap = "--min-overlap=%i" % PARAMS["combine_reads_min_overlap"]
        if not PARAMS["combine_reads_threads"]:
            threads = ""
        else:
            threads = "--threads=%i" % PARAMS["combine_reads_threads"]
        if not PARAMS["combine_reads_phred_offset"]:
            phred_offset = ""
        else:
            phred_offset = "--phred-offset=%i" % PARAMS["combine_reads_phred_offset"]
        if not PARAMS["combine_reads_max_mismatch_density"]:
            max_mismatch_density = ""
        else:
            max_mismatch_density = "--max-mismatch-density=%f" % PARAMS["combine_reads_max_mismatch_density"]

        statement = '''flash 
                     %(min_overlap)s
                     %(max_overlap)s
                     %(max_mismatch_density)s
                     %(phred_offset)s
                     %(fragment_options)s
                     --output-prefix=%(track)s
                     %(threads)s
                     --compress
                     %(infile)s %(infile2)s >> %(outfile)s.log
                     '''
        P.run()
        # Either concatenate the combined and the two uncombined files
        # into outfile (removing the three inputs), or keep only the
        # combined reads.  Note that `infiles` is rebound to a string here.
        if PARAMS["combine_reads_concatenate"]:
            infiles = " ".join([track + x for x in  [".notCombined_1.fastq.gz", ".notCombined_2.fastq.gz", ".extendedFrags.fastq.gz"]])
            statement = '''zcat %(infiles)s | gzip > %(outfile)s; rm -rf %(infiles)s'''
        else:
            statement = '''mv %(track)s.extendedFrags.fastq.gz %(outfile)s'''
        P.run()
        return


    # --- path 2: sub-sample a read pair, then return ---
    if PARAMS["process_sample"] and infile2:
        E.warn( "sampling can not be combined with other processing for paired ended reads")
        statement = '''zcat %(infile)s
        | python %(scriptsdir)s/fastq2fastq.py 
                                   --sample=%(sample_proportion)f 
                                   --pair=%(infile2)s 
                                   --outfile-pair=%(outfile2)s 
                                   --log=%(outfile)s_sample.log
        | gzip 
        > %(outfile)s
        '''

        P.run()
        return

    # --- path 3: assemble a filtering pipeline ---
    # fastx does not like quality scores below 64 (Illumina 1.3 format)
    # need to detect the scores and convert
    # NOTE(review): `format` shadows the builtin, and the handle returned
    # by IOTools.openFile is not closed in this function.
    format = Fastq.guessFormat( IOTools.openFile(infile ) , raises = False)
    E.info( "%s: format guess: %s" % (infile, format))
    offset = Fastq.getOffset( format, raises = False )

    # s collects the pipeline stages; they are joined with " | " below.
    # The first stage either removes contaminants with cutadapt or just
    # decompresses the input.
    if PARAMS["process_remove_contaminants"]:
        adaptors = listAdaptors(contaminant_file)
#              %(contamination_trim_type)s
        s = [ '''
        cutadapt 
              %(adaptors)s
              --overlap=%(contamination_min_overlap_length)i
              --format=fastq
              %(contamination_options)s
              <( zcat < %(infile)s )
              2>> %(outfile)s_contaminants.log
        ''' ]
        do_sth = True
    else:
        s = ['zcat %(infile)s' ]

    if PARAMS["process_artifacts"]:
        s.append( 'fastx_artifacts_filter -Q %(offset)i -v %(artifacts_options)s 2>> %(outfile)s_artifacts.log' )
        do_sth = True

    if PARAMS["process_trim"]:
        s.append( 'fastx_trimmer -Q %(offset)i -v %(trim_options)s 2>> %(outfile)s_trim.log' )
        do_sth = True

    # NICK - may replace fastx trimmer
    if PARAMS["process_trim_quality"]:
        s.append( 'fastq_quality_trimmer -Q %(offset)i  -v %(trim_quality_options)s 2>> %(outfile)s_trim.log' )
        do_sth = True

    if PARAMS["process_filter"]:
        s.append( 'fastq_quality_filter -Q %(offset)i -v %(filter_options)s 2>> %(outfile)s_filter.log')
        do_sth = True


    # NOTE(review): sampling does not set do_sth, so with sampling as the
    # only enabled step the function warns and returns below without
    # running anything - confirm this is intended.
    if PARAMS["process_sample"]:
        s.append( 'python %(scriptsdir)s/fastq2fastq.py --sample=%(sample_proportion)f --log=%(outfile)s_sample.log' )

    if not do_sth:
        E.warn( "no filtering specified for %s - nothing done" % infile )
        return

    s.append( "gzip" )
    if not infile2:
        statement = " | ".join( s ) + " > %(outfile)s" 
        P.run()
    else:
        # Paired data: run the same pipeline on each mate into temporary
        # files, then reconcile the two results.
        tmpfile = P.getTempFilename(".")
        tmpfile1 = tmpfile + ".fastq.1.gz"
        tmpfile2 = tmpfile + ".fastq.2.gz"

        E.warn( "processing first of pair")
        # first read pair
        statement = " | ".join( s ) + " > %(tmpfile1)s" 
        P.run()

        # second read pair        
        E.warn( "processing second of pair")
        # Rebinding `infile` redirects the %(infile)s placeholder of the
        # unchanged pipeline to the second mate (see the P.run() note in
        # the docstring).
        infile = infile2
        statement = " | ".join( s ) + " > %(tmpfile2)s" 
        P.run()

        # reconcile
        E.info("starting reconciliation" )
        statement = """python %(scriptsdir)s/fastqs2fastqs.py
                           --method=reconcile
                           --output-pattern=%(track)s.fastq.%%s.gz
                           %(tmpfile1)s %(tmpfile2)s
                     > %(outfile)s_reconcile.log"""
        
        P.run()

        # Remove the two intermediate files and the bare temporary name
        # (presumably created on disk by P.getTempFilename - confirm).
        os.unlink( tmpfile1 )
        os.unlink( tmpfile2 )
        os.unlink( tmpfile )
Ejemplo n.º 16
0
def processReads( infiles, outfile ):
    '''Filter, trim or sample the reads of one fastq file.

    ``infiles`` is unpacked into ``(infile, contaminant_file)``.  When
    ``checkPairs(infile)`` returns a second file name the data are treated
    as a read pair and ``outfile2`` is derived from ``outfile`` by swapping
    the ``.fastq.1.gz`` suffix for ``.fastq.2.gz``.

    Depending on ``PARAMS`` one of two paths is taken:

    * ``process_sample`` with paired data: the files are sub-sampled with
      ``fastq2fastq.py`` and the function returns.
    * otherwise a shell pipeline is assembled from the enabled steps
      (contaminant removal, artifact filter, trimming, quality trimming,
      quality filter, sampling) and run once for single-end data, or once
      per mate followed by a reconciliation step for paired data.

    NOTE(review): every ``P.run()`` call below is made without arguments.
    Presumably it picks up the local ``statement`` and fills the
    ``%(name)s`` placeholders from this function's locals and from
    ``PARAMS`` - confirm before renaming or removing any local variable
    (including the otherwise unused ``to_cluster``).
    '''

    infile, contaminant_file = infiles

    # do_sth records whether at least one processing step was enabled.
    do_sth = False
    to_cluster = True

    infile2 = checkPairs( infile )

    # track is the output name without its fastq suffix.
    if infile2:
        track = P.snip( outfile, ".fastq.1.gz" )        
        outfile2 = P.snip( outfile, ".fastq.1.gz" ) + ".fastq.2.gz"
    else:
        track = P.snip( outfile, ".fastq.gz" )


    # --- path 1: sub-sample a read pair, then return ---
    if PARAMS["process_sample"] and infile2:
        E.warn( "sampling can not be combined with other processing for paired ended reads")
        statement = '''zcat %(infile)s
        | python %(scriptsdir)s/fastq2fastq.py 
                                   --sample=%(sample_proportion)f 
                                   --pair=%(infile2)s 
                                   --outfile-pair=%(outfile2)s 
                                   --log=%(outfile)s_sample.log
        | gzip 
        > %(outfile)s
        '''

        P.run()
        return

    # --- path 2: assemble a filtering pipeline ---
    # fastx does not like quality scores below 64 (Illumina 1.3 format)
    # need to detect the scores and convert
    # NOTE(review): `format` shadows the builtin, and the handle returned
    # by IOTools.openFile is not closed in this function.
    format = Fastq.guessFormat( IOTools.openFile(infile ) , raises = False)
    E.info( "%s: format guess: %s" % (infile, format))
    offset = Fastq.getOffset( format, raises = False )

    # s collects the pipeline stages; they are joined with " | " below.
    # The first stage either removes contaminants with cutadapt or just
    # decompresses the input.
    if PARAMS["process_remove_contaminants"]:
        adaptors = listAdaptors(contaminant_file)
#              %(contamination_trim_type)s
        s = [ '''
        cutadapt 
              %(adaptors)s
              --overlap=%(contamination_min_overlap_length)i
              --format=fastq
              %(contamination_options)s
              <( zcat < %(infile)s )
              2>> %(outfile)s_contaminants.log
        ''' ]
        do_sth = True
    else:
        s = ['zcat %(infile)s' ]

    if PARAMS["process_artifacts"]:
        s.append( 'fastx_artifacts_filter -Q %(offset)i -v %(artifacts_options)s 2>> %(outfile)s_artifacts.log' )
        do_sth = True

    if PARAMS["process_trim"]:
        s.append( 'fastx_trimmer -Q %(offset)i -v %(trim_options)s 2>> %(outfile)s_trim.log' )
        do_sth = True

    # NICK - may replace fastx trimmer
    # NOTE(review): this stage is given %(trim_options)s, the same
    # placeholder as fastx_trimmer above, and also appends to the same
    # _trim.log - confirm that sharing the option string is intended.
    if PARAMS["process_trim_quality"]:
        s.append( 'fastq_quality_trimmer -Q %(offset)i  -v %(trim_options)s 2>> %(outfile)s_trim.log' )
        do_sth = True

    if PARAMS["process_filter"]:
        s.append( 'fastq_quality_filter -Q %(offset)i -v %(filter_options)s 2>> %(outfile)s_filter.log')
        do_sth = True

    # NOTE(review): sampling does not set do_sth, so with sampling as the
    # only enabled step the function warns and returns below without
    # running anything - confirm this is intended.
    if PARAMS["process_sample"]:
        s.append( 'python %(scriptsdir)s/fastq2fastq.py --sample=%(sample_proportion)f --log=%(outfile)s_sample.log' )

    if not do_sth:
        E.warn( "no filtering specified for %s - nothing done" % infile )
        return

    s.append( "gzip" )
    if not infile2:
        statement = " | ".join( s ) + " > %(outfile)s" 
        P.run()
    else:
        # Paired data: run the same pipeline on each mate into temporary
        # files, then reconcile the two results.
        tmpfile = P.getTempFilename(".")
        tmpfile1 = tmpfile + ".fastq.1.gz"
        tmpfile2 = tmpfile + ".fastq.2.gz"

        E.warn( "processing first of pair")
        # first read pair
        statement = " | ".join( s ) + " > %(tmpfile1)s" 
        P.run()

        # second read pair        
        E.warn( "processing second of pair")
        # Rebinding `infile` redirects the %(infile)s placeholder of the
        # unchanged pipeline to the second mate (see the P.run() note in
        # the docstring).
        infile = infile2
        statement = " | ".join( s ) + " > %(tmpfile2)s" 
        P.run()

        # reconcile
        # NOTE(review): the output pattern uses an integer placeholder
        # (%%i); confirm this is what fastqs2fastqs.py expects.
        E.info("starting reconciliation" )
        statement = """python %(scriptsdir)s/fastqs2fastqs.py
                           --method=reconcile
                           --output-pattern=%(track)s.fastq.%%i.gz
                           %(tmpfile1)s %(tmpfile2)s
                     > %(outfile)s_reconcile.log"""
        
        P.run()

        # Remove the two intermediate files and the bare temporary name
        # (presumably created on disk by P.getTempFilename - confirm).
        os.unlink( tmpfile1 )
        os.unlink( tmpfile2 )
        os.unlink( tmpfile )