Example 1
def peek(sra, outdir=None):
    """return the full file names for all files which will be extracted

    Parameters:

    outdir : path
        perform extraction in outdir. If outdir is None, the extraction
        will take place in a temporary directory, which will be deleted
        afterwards.
    """
    
    if outdir is None:
        workdir = tempfile.mkdtemp()
    else:
        workdir = outdir

    # --split-files creates files called prefix_#.fastq.gz,
    # where # is the read number.
    # If the file contains paired end data:
    # output = prefix_1.fastq.gz, prefix_2.fastq.gz
    #    *special case: unpaired reads in a paired end --> prefix.fastq.gz
    #    *special case: if paired reads are stored in a single read,
    #                   fastq-dump will split. There might be a joining
    #                   sequence. The output would thus be:
    #                   prefix_1.fastq.gz, prefix_2.fastq.gz, prefix_3.fastq.gz
    #                   You want files 1 and 3.

    E.run("""fastq-dump --split-files --gzip -X 1000
                 --outdir %(workdir)s %(sra)s""" % locals())
    f = sorted(glob.glob(os.path.join(workdir, "*.fastq.gz")))
    ff = [os.path.basename(x) for x in f]

    if len(f) == 1:
        # sra file contains one read: output = prefix.fastq.gz
        pass

    elif len(f) == 2:
        # sra file contains read pairs:
        # output = prefix_1.fastq.gz, prefix_2.fastq.gz
        assert ff[0].endswith(
            "_1.fastq.gz") and ff[1].endswith("_2.fastq.gz")

    elif len(f) == 3:
        # keep files 1 and 3; file 2 may hold a joining sequence
        # (see the special cases documented above)
        f = glob.glob(os.path.join(workdir, "*_[13].fastq.gz"))

    # check format of fastqs in .sra
    fastq_format = Fastq.guessFormat(IOTools.openFile(f[0], "r"), raises=False)

    if outdir is None:
        shutil.rmtree(workdir)

    return f, fastq_format
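
A minimal usage sketch (the SRA file name below is hypothetical): peek only dumps the first 1000 reads (-X 1000), so it is a cheap way to discover the layout and quality encoding before committing to a full extraction.

files, fastq_format = peek("SRR000001.sra")
# with paired end data, files would look like
# ["/tmp/tmpXXXX/SRR000001_1.fastq.gz", "/tmp/tmpXXXX/SRR000001_2.fastq.gz"]
# and fastq_format is a quality score guess such as "sanger" or "phred64"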
Example 2
    def build(self, infile, outfile, processer_list):
        '''run mapper.'''
        f_format = Fastq.guessFormat(
            IOTools.openFile(infile[0], "r"), raises=False)

        cmd_process, cmd_post, processed_files = self.process(
            infile[0], processer_list, outfile, f_format, save=self.save)
        cmd_clean = self.cleanup(outfile)

        assert cmd_process.strip().endswith(";")
        assert cmd_post.strip().endswith(";")
        assert cmd_clean.strip().endswith(";")

        statement = " checkpoint; ".join((cmd_process,
                                          cmd_post,
                                          cmd_clean))
        return statement
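
A small illustration of the join above, with hypothetical command strings: each piece must already end in ";" (the asserts enforce this), and "checkpoint" is presumably a shell helper defined elsewhere in the pipeline that aborts if the preceding command failed.

pieces = ("gunzip < in.fastq.gz > in.fastq;",
          "fastx_trimmer -Q 33 < in.fastq > trimmed.fastq;",
          "rm -f in.fastq;")
statement = " checkpoint; ".join(pieces)
# -> "gunzip < in.fastq.gz > in.fastq; checkpoint; fastx_trimmer -Q 33 < in.fastq > trimmed.fastq; checkpoint; rm -f in.fastq;"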
Example 3
File: Sra.py Project: BioXiao/cgat
def peek(sra, outdir):
    ''' returns the full file names for all files which will be extracted'''
    # --split-files creates files called prefix_#.fastq.gz,
    # where # is the read number.
    # If the file contains paired end data:
    # output = prefix_1.fastq.gz, prefix_2.fastq.gz
    #    *special case: unpaired reads in a paired end --> prefix.fastq.gz
    #    *special case: if paired reads are stored in a single read,
    #                   fastq-dump will split. There might be a joining
    #                   sequence. The output would thus be:
    #                   prefix_1.fastq.gz, prefix_2.fastq.gz, prefix_3.fastq.gz
    #                   You want files 1 and 3.

    E.run("""fastq-dump --split-files --gzip -X 1000
                 --outdir %(outdir)s %(sra)s""" % locals())
    f = sorted(glob.glob(os.path.join(outdir, "*.fastq.gz")))
    ff = [os.path.basename(x) for x in f]

    if len(f) == 1:
        # sra file contains one read: output = prefix.fastq.gz
        pass

    elif len(f) == 2:
        # sra file contains read pairs:
        # output = prefix_1.fastq.gz, prefix_2.fastq.gz
        assert ff[0].endswith(
            "_1.fastq.gz") and ff[1].endswith("_2.fastq.gz")

    elif len(f) == 3:
        # keep files 1 and 3; file 2 may hold a joining sequence
        # (see the special cases documented above)
        f = glob.glob(os.path.join(outdir, "*_[13].fastq.gz"))

    # check format of fastqs in .sra
    fastq_format = Fastq.guessFormat(IOTools.openFile(f[0], "r"), raises=False)

    return f, fastq_format
Example 4
File: Sra.py Project: ptdtan/cgat
def peek(sra, outdir=None):
    """return the full file names for all files which will be extracted

    Parameters
    ----------

    outdir : path
        perform extraction in outdir. If outdir is None, the extraction
        will take place in a temporary directory, which will be deleted
        afterwards.

    Returns
    -------
    files : list
        A list of fastq formatted files that are contained in the archive.
    format : string
        The quality score format in the :term:`fastq` formatted files.

    """
    
    if outdir is None:
        workdir = tempfile.mkdtemp()
    else:
        workdir = outdir

    # --split-files creates files called prefix_#.fastq.gz,
    # where # is the read number.
    # If the file contains paired end data:
    # output = prefix_1.fastq.gz, prefix_2.fastq.gz
    #    *special case: unpaired reads in a paired end --> prefix.fastq.gz
    #    *special case: if paired reads are stored in a single read,
    #                   fastq-dump will split. There might be a joining
    #                   sequence. The output would thus be:
    #                   prefix_1.fastq.gz, prefix_2.fastq.gz, prefix_3.fastq.gz
    #                   You want files 1 and 3.

    E.run("""fastq-dump --split-files --gzip -X 1000
                 --outdir %(workdir)s %(sra)s""" % locals())
    f = sorted(glob.glob(os.path.join(workdir, "*.fastq.gz")))
    ff = [os.path.basename(x) for x in f]

    if len(f) == 1:
        # sra file contains one read: output = prefix.fastq.gz
        pass

    elif len(f) == 2:
        # sra file contains read pairs:
        # output = prefix_1.fastq.gz, prefix_2.fastq.gz
        assert ff[0].endswith(
            "_1.fastq.gz") and ff[1].endswith("_2.fastq.gz")

    elif len(f) == 3:
        # keep files 1 and 3; file 2 may hold a joining sequence
        # (see the special cases documented above)
        f = glob.glob(os.path.join(workdir, "*_[13].fastq.gz"))

    # check format of fastqs in .sra
    fastq_format = Fastq.guessFormat(IOTools.openFile(f[0], "r"), raises=False)

    if outdir is None:
        shutil.rmtree(workdir)

    return f, fastq_format
Example 5
def processReads(infiles, outfile):
    '''process reads.'''

    infile, contaminant_file = infiles

    do_sth = False
    to_cluster = True

    infile2 = checkPairs(infile)

    if infile2:
        track = P.snip(outfile, ".fastq.1.gz")
        outfile2 = P.snip(outfile, ".fastq.1.gz") + ".fastq.2.gz"
    else:
        track = P.snip(outfile, ".fastq.gz")

    if PARAMS["process_combine_reads"]:
        E.warn(
            "combining reads cannot be can not be combined with other processing for paired ended reads"
        )
        if not infile2: raise IOError("must have paired data to combine reads")

        read_len, frag_len, frag_stdev = PARAMS["combine_reads_read_length"], \
            PARAMS["combine_reads_fragment_length"], \
            PARAMS["combine_reads_fragment_length_stdev"]

        fragment_options = " ".join(map(str, [read_len, frag_len, frag_stdev]))

        if PARAMS["combine_reads_max_overlap"]:
            E.warn(
                "if specifying --max-overlap read and fragment length options will be ignored"
            )
            max_overlap = "--max-overlap=%i" % PARAMS[
                "combine_reads_max_overlap"]
            fragment_options = ""

        elif not PARAMS["combine_reads_max_overlap"] and len(
                fragment_options.strip().split(" ")) < 3:
            E.warn(
                "have not specified --read-len, --frag-len, --frag-len-stddev: default --max-overlap used"
            )
            max_overlap = ""
            fragment_options = ""

        elif PARAMS["combine_reads_read_length"] and PARAMS[
                "combine_reads_fragment_length"] and PARAMS[
                    "combine_reads_fragment_length_stdev"]:
            if PARAMS["combine_reads_max_overlap"]:
                E.warn(
                    "--max-overlap will override the specified read and fragment length options"
                )
            max_overlap = ""
            fragment_options = """--read-len=%(read_len)i
                                  --fragment-len=%(frag_len)i
                                  --fragment-len-stddev=%(frag_stdev)i""" % locals(
            )
        else:
            max_overlap = ""
            fragment_options = ""

        if not PARAMS["combine_reads_min_overlap"]:
            min_overlap = ""
        else:
            min_overlap = "--min-overlap=%i" % PARAMS[
                "combine_reads_min_overlap"]
        if not PARAMS["combine_reads_threads"]:
            threads = ""
        else:
            threads = "--threads=%i" % PARAMS["combine_reads_threads"]
        if not PARAMS["combine_reads_phred_offset"]:
            phred_offset = ""
        else:
            phred_offset = "--phred-offset=%i" % PARAMS[
                "combine_reads_phred_offset"]
        if not PARAMS["combine_reads_max_mismatch_density"]:
            max_mismatch_density = ""
        else:
            max_mismatch_density = "--max-mismatch-density=%f" % PARAMS[
                "combine_reads_max_mismatch_density"]

        statement = '''flash 
                     %(min_overlap)s
                     %(max_overlap)s
                     %(max_mismatch_density)s
                     %(phred_offset)s
                     %(fragment_options)s
                     --output-prefix=%(track)s
                     %(threads)s
                     --compress
                     %(infile)s %(infile2)s >> %(outfile)s.log
                     '''
        P.run()
        if PARAMS["combine_reads_concatenate"]:
            infiles = " ".join([
                track + x for x in [
                    ".notCombined_1.fastq.gz", ".notCombined_2.fastq.gz",
                    ".extendedFrags.fastq.gz"
                ]
            ])
            statement = '''zcat %(infiles)s | gzip > %(outfile)s; rm -rf %(infiles)s'''
        else:
            statement = '''mv %(track)s.extendedFrags.fastq.gz %(outfile)s'''
        P.run()
        return

    if PARAMS["process_sample"] and infile2:
        E.warn(
            "sampling can not be combined with other processing for paired ended reads"
        )
        statement = '''zcat %(infile)s
        | python %(scriptsdir)s/fastq2fastq.py 
                                   --sample=%(sample_proportion)f 
                                   --pair=%(infile2)s 
                                   --outfile-pair=%(outfile2)s 
                                   --log=%(outfile)s_sample.log
        | gzip 
        > %(outfile)s
        '''

        P.run()
        return

    # fastx does not like quality scores below 64 (Illumina 1.3 format)
    # need to detect the scores and convert
    format = Fastq.guessFormat(IOTools.openFile(infile), raises=False)
    E.info("%s: format guess: %s" % (infile, format))
    offset = Fastq.getOffset(format, raises=False)

    if PARAMS["process_remove_contaminants"]:
        adaptors = listAdaptors(contaminant_file)
        #              %(contamination_trim_type)s
        s = [
            '''
        cutadapt 
              %(adaptors)s
              --overlap=%(contamination_min_overlap_length)i
              --format=fastq
              %(contamination_options)s
              <( zcat < %(infile)s )
              2>> %(outfile)s_contaminants.log
        '''
        ]
        do_sth = True
    else:
        s = ['zcat %(infile)s']

    if PARAMS["process_artifacts"]:
        s.append(
            'fastx_artifacts_filter -Q %(offset)i -v %(artifacts_options)s 2>> %(outfile)s_artifacts.log'
        )
        do_sth = True

    if PARAMS["process_trim"]:
        s.append(
            'fastx_trimmer -Q %(offset)i -v %(trim_options)s 2>> %(outfile)s_trim.log'
        )
        do_sth = True

    # NICK - may replace fastx trimmer
    if PARAMS["process_trim_quality"]:
        s.append(
            'fastq_quality_trimmer -Q %(offset)i  -v %(trim_quality_options)s 2>> %(outfile)s_trim.log'
        )
        do_sth = True

    if PARAMS["process_filter"]:
        s.append(
            'fastq_quality_filter -Q %(offset)i -v %(filter_options)s 2>> %(outfile)s_filter.log'
        )
        do_sth = True

    if PARAMS["process_sample"]:
        s.append(
            'python %(scriptsdir)s/fastq2fastq.py --sample=%(sample_proportion)f --log=%(outfile)s_sample.log'
        )

    if not do_sth:
        E.warn("no filtering specified for %s - nothing done" % infile)
        return

    s.append("gzip")
    if not infile2:
        statement = " | ".join(s) + " > %(outfile)s"
        P.run()
    else:
        tmpfile = P.getTempFilename(".")
        tmpfile1 = tmpfile + ".fastq.1.gz"
        tmpfile2 = tmpfile + ".fastq.2.gz"

        E.warn("processing first of pair")
        # first read pair
        statement = " | ".join(s) + " > %(tmpfile1)s"
        P.run()

        # second read pair
        E.warn("processing second of pair")
        infile = infile2
        statement = " | ".join(s) + " > %(tmpfile2)s"
        P.run()

        # reconcile
        E.info("starting reconciliation")
        statement = """python %(scriptsdir)s/fastqs2fastqs.py
                           --method=reconcile
                           --output-pattern=%(track)s.fastq.%%s.gz
                           %(tmpfile1)s %(tmpfile2)s
                     > %(outfile)s_reconcile.log"""

        P.run()

        os.unlink(tmpfile1)
        os.unlink(tmpfile2)
        os.unlink(tmpfile)
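
A reduced sketch of how the filter pipeline above is assembled: each enabled step appends one command to s, the pieces are joined into a single shell pipe, and P.run() later fills in the %(...)s placeholders from the pipeline's parameters and local variables.

s = ["zcat %(infile)s",
     "fastx_trimmer -Q %(offset)i -v %(trim_options)s 2>> %(outfile)s_trim.log",
     "gzip"]
statement = " | ".join(s) + " > %(outfile)s"
# -> "zcat %(infile)s | fastx_trimmer ... | gzip > %(outfile)s"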
Example 6
    def preprocess( self, infiles, outfile ):
        '''build preprocessing statement

        Build a command line statement that extracts/converts 
        various input formats to fastq formatted files.

        Mapping qualities are changed to sanger format.

        returns the statement and the fastq files to map.
        '''

        assert len(infiles) > 0, "no input files for mapping"

        tmpdir_fastq = P.getTempDir()

        # create temporary directory again for nodes
        statement = [ "mkdir -p %s" % tmpdir_fastq ]
        fastqfiles = []

        # get track by extension of outfile
        track = os.path.splitext( os.path.basename( outfile ) )[0]

        if self.compress:
            compress_cmd = "| gzip"
            extension = ".gz"
        else:
            compress_cmd = ""
            extension = ""

        for infile in infiles:

            if infile.endswith( ".export.txt.gz"):
                # single end illumina export
                statement.append( """gunzip < %(infile)s 
                     | awk '$11 != "QC" || $10 ~ /(\d+):(\d+):(\d+)/ \
                        { if ($1 != "") 
                             { readname=sprintf( "%%%%s_%%%%s:%%%%s:%%%%s:%%%%s:%%%%s", $1,$2,$3,$4,$5,$6);}
                        else { readname=sprintf( "%%%%s:%%%%s:%%%%s:%%%%s:%%%%s", $1,$3,$4,$5,$6); }
                       printf("@%%%%s\\n%%%%s\\n+\\n%%%%s\\n",readname,$9,$10);}'
                     %(compress_cmd)s
                     > %(tmpdir_fastq)s/%(track)s.fastq%(extension)s""" % locals() )
                fastqfiles.append( ("%s/%s.fastq%s" % (tmpdir_fastq, track, extension ),) )
            elif infile.endswith( ".fa.gz" ):
                statement.append( '''gunzip < %(infile)s > %(tmpdir_fastq)s/%(track)s.fa''' % locals() )
                fastqfiles.append( ("%s/%s.fa" % (tmpdir_fastq, track ),) )
                self.datatype = "fasta"
                
            elif infile.endswith( ".sra"):
                # sneak preview to determine if paired end or single end
                outdir = P.getTempDir()
                # --split-files is present in fastq-dump 2.1.7
                P.execute( "fastq-dump --split-files --gzip -X 1000 --outdir %(outdir)s %(infile)s" % locals() )
                # --split-files will create files called prefix_#.fastq.gz
                # where # is the read number. 
                # The following cases are:

                # * file contains paired end data: output = prefix_1.fastq.gz, prefix_2.fastq.gz
                #    * special case: unpaired reads in a paired end run end up in prefix.fastq.gz
                #    * special case: if paired reads are stored in a single read, fastq-dump will split.
                #       There might be a joining sequence. The output would thus be:
                #       prefix_1.fastq.gz, prefix_2.fastq.gz and prefix_3.fastq.gz
                #      You want files 1 and 3.
                f = sorted(glob.glob( os.path.join( outdir, "*.fastq.gz" ) ))
                ff = [ os.path.basename(x) for x in f ]
                if len(f) == 1: 
                    # sra file contains one read: output = prefix.fastq.gz
                    pass
                elif len(f) == 2:
                    # sra file contains read pairs: output = prefix_1.fastq.gz, prefix_2.fastq.gz
                    assert ff[0].endswith( "_1.fastq.gz") and ff[1].endswith( "_2.fastq.gz" )
                elif len(f) == 3:
                    # keep files 1 and 3; file 2 may hold a joining sequence
                    f = glob.glob( os.path.join( outdir, "*_[13].fastq.gz" ) )
                E.info("sra file contains the following files: %s" % f )
                shutil.rmtree( outdir )
                fastqfiles.append( [ "%s/%s" % (tmpdir_fastq, os.path.basename( x )) for x in sorted(f) ] )
                statement.append( "fastq-dump --split-files --gzip --outdir %(tmpdir_fastq)s %(infile)s" % locals() )
                
            elif infile.endswith( ".fastq.gz" ):
                format = Fastq.guessFormat( IOTools.openFile( infile, "r"), raises = False)
                if 'sanger' not in format and self.convert:
                    statement.append(  """gunzip < %(infile)s 
                                      | python %%(scriptsdir)s/fastq2fastq.py --change-format=sanger --guess-format=phred64 --log=%(outfile)s.log
                                      %(compress_cmd)s
                                      > %(tmpdir_fastq)s/%(track)s.fastq%(extension)s""" % locals() )
                    fastqfiles.append( ("%s/%s.fastq%s" % (tmpdir_fastq, track, extension),) )
                else:
                    E.debug( "%s: assuming quality score format %s" % (infile, format ) ) 
                    fastqfiles.append( (infile, ) )

            elif infile.endswith( ".csfasta.gz" ):
                # single end SOLiD data
                if self.preserve_colourspace:
                    quality = P.snip( infile, ".csfasta.gz" ) + ".qual.gz"
                    if not os.path.exists( quality ):
                        raise ValueError( "no quality file for %s" % infile )
                    statement.append(  """gunzip < %(infile)s 
                                          > %(tmpdir_fastq)s/%(track)s.csfasta%(extension)s""" % locals() )
                    statement.append(  """gunzip < %(quality)s 
                                          > %(tmpdir_fastq)s/%(track)s.qual%(extension)s""" % locals() )
                    fastqfiles.append( ("%s/%s.csfasta%s" % (tmpdir_fastq, track, extension ),
                                        "%s/%s.qual%s" % (tmpdir_fastq, track, extension) ) )
                    self.datatype = "solid"
                else:
                    quality = P.snip( infile, ".csfasta.gz" ) + ".qual.gz"

                    statement.append( """solid2fastq <(gunzip < %(infile)s) <(gunzip < %(quality)s)
                                      %(compress_cmd)s
                                      > %(tmpdir_fastq)s/%(track)s.fastq%(extension)s""" % locals() )
                    fastqfiles.append( ("%s/%s.fastq%s" % (tmpdir_fastq, track, extension),) )

            elif infile.endswith( ".csfasta.F3.gz" ):
                # paired end SOLiD data
                if self.preserve_colourspace:
                    bn = P.snip( infile, ".csfasta.F3.gz" )
                    # order is important - mirrors tophat reads followed by quals
                    f = []
                    for suffix in ("csfasta.F3", "csfasta.F5", "qual.F3", "qual.F5" ):
                        fn = "%(bn)s.%(suffix)s" % locals()
                        if not os.path.exists( fn + ".gz"): raise ValueError( "expected file %s.gz missing" % fn )
                        statement.append( """gunzip < %(fn)s.gz
                                          %(compress_cmd)s
                                          > %(tmpdir_fastq)s/%(track)s.%(suffix)s%(extension)s""" % locals() )
                        f.append( "%(tmpdir_fastq)s/%(track)s.%(suffix)s%(extension)s" % locals() )
                    fastqfiles.append( f )
                    self.datatype = "solid"
                else:
                    quality = P.snip( infile, ".csfasta.F3.gz" ) + ".qual.F3.gz"

                    statement.append( """solid2fastq <(gunzip < %(infile)s) <(gunzip < %(quality)s)
                                      %(compress_cmd)s
                                      > %(tmpdir_fastq)s/%(track)s.fastq%(extension)s""" % locals() )
                    fastqfiles.append( ("%s/%s.fastq%s" % (tmpdir_fastq, track, extension),) )
                

            elif infile.endswith( ".fastq.1.gz" ):

                bn = P.snip( infile, ".fastq.1.gz" )
                infile2 = "%s.fastq.2.gz" % bn
                if not os.path.exists( infile2 ):
                    raise ValueError("can not find paired ended file '%s' for '%s'" % (infile2, infile))
                
                format = Fastq.guessFormat( IOTools.openFile( infile ), raises = False )
                if 'sanger' not in format:
                    statement.append( """gunzip < %(infile)s 
                                     | python %%(scriptsdir)s/fastq2fastq.py --change-format=sanger --guess-format=phred64 --log=%(outfile)s.log
                                     %(compress_cmd)s
                                     > %(tmpdir_fastq)s/%(track)s.1.fastq%(extension)s;
                                     gunzip < %(infile2)s 
                                     | python %%(scriptsdir)s/fastq2fastq.py --change-format=sanger --guess-format=phred64 --log=%(outfile)s.log
                                     %(compress_cmd)s
                                     > %(tmpdir_fastq)s/%(track)s.2.fastq%(extension)s
                                 """ % locals() )
                    fastqfiles.append( ("%s/%s.1.fastq%s" % (tmpdir_fastq, track, extension),
                                        "%s/%s.2.fastq%s" % (tmpdir_fastq, track, extension) ) )

                else:
                    E.debug( "%s: assuming quality score format %s" % (infile, format ) ) 
                    fastqfiles.append( (infile, 
                                        infile2, ) )
                    
            else:
                raise NotImplementedError( "unknown file format %s" % infile )

        
        self.tmpdir_fastq = tmpdir_fastq

        assert len(fastqfiles) > 0, "no fastq files for mapping"

        return "; ".join( statement) + ";", fastqfiles
Example 7
def processReads( infiles, outfile ):
    '''process reads.'''

    infile, contaminant_file = infiles

    do_sth = False
    to_cluster = True

    infile2 = checkPairs( infile )

    if infile2:
        track = P.snip( outfile, ".fastq.1.gz" )        
        outfile2 = P.snip( outfile, ".fastq.1.gz" ) + ".fastq.2.gz"
    else:
        track = P.snip( outfile, ".fastq.gz" )


    if PARAMS["process_combine_reads"]:
        E.warn("combining reads cannot be can not be combined with other processing for paired ended reads")
        if not infile2: raise IOError("must have paired data to combine reads")

        read_len, frag_len, frag_stdev = PARAMS["combine_reads_read_length"], \
            PARAMS["combine_reads_fragment_length"], \
            PARAMS["combine_reads_fragment_length_stdev"]

        fragment_options = " ".join(map(str,[read_len, frag_len, frag_stdev]))

        if PARAMS["combine_reads_max_overlap"]:
            E.warn("if specifying --max-overlap read and fragment length options will be ignored")
            max_overlap="--max-overlap=%i" % PARAMS["combine_reads_max_overlap"]
            fragment_options = ""

        elif not PARAMS["combine_reads_max_overlap"] and len(fragment_options.strip().split(" ")) < 3:
            E.warn("have not specified --read-len, --frag-len, --frag-len-stddev: default --max-overlap used")
            max_overlap = ""
            fragment_options = ""

        elif PARAMS["combine_reads_read_length"] and PARAMS["combine_reads_fragment_length"] and PARAMS["combine_reads_fragment_length_stdev"]:
            if PARAMS["combine_reads_max_overlap"]:
                E.warn("--max-overlap will override the specified read and fragment length options")
            max_overlap = ""
            fragment_options = """--read-len=%(read_len)i
                                  --fragment-len=%(frag_len)i
                                  --fragment-len-stddev=%(frag_stdev)i""" % locals() 
        else:
            max_overlap = ""
            fragment_options = ""

        if not PARAMS["combine_reads_min_overlap"]:
            min_overlap = ""
        else:
            min_overlap = "--min-overlap=%i" % PARAMS["combine_reads_min_overlap"]
        if not PARAMS["combine_reads_threads"]:
            threads = ""
        else:
            threads = "--threads=%i" % PARAMS["combine_reads_threads"]
        if not PARAMS["combine_reads_phred_offset"]:
            phred_offset = ""
        else:
            phred_offset = "--phred-offset=%i" % PARAMS["combine_reads_phred_offset"]
        if not PARAMS["combine_reads_max_mismatch_density"]:
            max_mismatch_density = ""
        else:
            max_mismatch_density = "--max-mismatch-density=%f" % PARAMS["combine_reads_max_mismatch_density"]

        statement = '''flash 
                     %(min_overlap)s
                     %(max_overlap)s
                     %(max_mismatch_density)s
                     %(phred_offset)s
                     %(fragment_options)s
                     --output-prefix=%(track)s
                     %(threads)s
                     --compress
                     %(infile)s %(infile2)s >> %(outfile)s.log
                     '''
        P.run()
        if PARAMS["combine_reads_concatenate"]:
            infiles = " ".join([track + x for x in  [".notCombined_1.fastq.gz", ".notCombined_2.fastq.gz", ".extendedFrags.fastq.gz"]])
            statement = '''zcat %(infiles)s | gzip > %(outfile)s; rm -rf %(infiles)s'''
        else:
            statement = '''mv %(track)s.extendedFrags.fastq.gz %(outfile)s'''
        P.run()
        return


    if PARAMS["process_sample"] and infile2:
        E.warn( "sampling can not be combined with other processing for paired ended reads")
        statement = '''zcat %(infile)s
        | python %(scriptsdir)s/fastq2fastq.py 
                                   --sample=%(sample_proportion)f 
                                   --pair=%(infile2)s 
                                   --outfile-pair=%(outfile2)s 
                                   --log=%(outfile)s_sample.log
        | gzip 
        > %(outfile)s
        '''

        P.run()
        return

    # fastx does not like quality scores below 64 (Illumina 1.3 format)
    # need to detect the scores and convert
    format = Fastq.guessFormat( IOTools.openFile(infile ) , raises = False)
    E.info( "%s: format guess: %s" % (infile, format))
    offset = Fastq.getOffset( format, raises = False )

    if PARAMS["process_remove_contaminants"]:
        adaptors = listAdaptors(contaminant_file)
#              %(contamination_trim_type)s
        s = [ '''
        cutadapt 
              %(adaptors)s
              --overlap=%(contamination_min_overlap_length)i
              --format=fastq
              %(contamination_options)s
              <( zcat < %(infile)s )
              2>> %(outfile)s_contaminants.log
        ''' ]
        do_sth = True
    else:
        s = ['zcat %(infile)s' ]

    if PARAMS["process_artifacts"]:
        s.append( 'fastx_artifacts_filter -Q %(offset)i -v %(artifacts_options)s 2>> %(outfile)s_artifacts.log' )
        do_sth = True

    if PARAMS["process_trim"]:
        s.append( 'fastx_trimmer -Q %(offset)i -v %(trim_options)s 2>> %(outfile)s_trim.log' )
        do_sth = True

    # NICK - may replace fastx trimmer
    if PARAMS["process_trim_quality"]:
        s.append( 'fastq_quality_trimmer -Q %(offset)i  -v %(trim_quality_options)s 2>> %(outfile)s_trim.log' )
        do_sth = True

    if PARAMS["process_filter"]:
        s.append( 'fastq_quality_filter -Q %(offset)i -v %(filter_options)s 2>> %(outfile)s_filter.log')
        do_sth = True


    if PARAMS["process_sample"]:
        s.append( 'python %(scriptsdir)s/fastq2fastq.py --sample=%(sample_proportion)f --log=%(outfile)s_sample.log' )

    if not do_sth:
        E.warn( "no filtering specified for %s - nothing done" % infile )
        return

    s.append( "gzip" )
    if not infile2:
        statement = " | ".join( s ) + " > %(outfile)s" 
        P.run()
    else:
        tmpfile = P.getTempFilename(".")
        tmpfile1 = tmpfile + ".fastq.1.gz"
        tmpfile2 = tmpfile + ".fastq.2.gz"

        E.warn( "processing first of pair")
        # first read pair
        statement = " | ".join( s ) + " > %(tmpfile1)s" 
        P.run()

        # second read pair        
        E.warn( "processing second of pair")
        infile = infile2
        statement = " | ".join( s ) + " > %(tmpfile2)s" 
        P.run()

        # reconcile
        E.info("starting reconciliation" )
        statement = """python %(scriptsdir)s/fastqs2fastqs.py
                           --method=reconcile
                           --output-pattern=%(track)s.fastq.%%s.gz
                           %(tmpfile1)s %(tmpfile2)s
                     > %(outfile)s_reconcile.log"""
        
        P.run()

        os.unlink( tmpfile1 )
        os.unlink( tmpfile2 )
        os.unlink( tmpfile )
Example 8
def processReads( infiles, outfile ):
    '''process reads.'''

    infile, contaminant_file = infiles

    do_sth = False
    to_cluster = True

    infile2 = checkPairs( infile )

    if infile2:
        track = P.snip( outfile, ".fastq.1.gz" )        
        outfile2 = P.snip( outfile, ".fastq.1.gz" ) + ".fastq.2.gz"
    else:
        track = P.snip( outfile, ".fastq.gz" )


    if PARAMS["process_sample"] and infile2:
        E.warn( "sampling can not be combined with other processing for paired ended reads")
        statement = '''zcat %(infile)s
        | python %(scriptsdir)s/fastq2fastq.py 
                                   --sample=%(sample_proportion)f 
                                   --pair=%(infile2)s 
                                   --outfile-pair=%(outfile2)s 
                                   --log=%(outfile)s_sample.log
        | gzip 
        > %(outfile)s
        '''

        P.run()
        return

    # fastx does not like quality scores below 64 (Illumina 1.3 format)
    # need to detect the scores and convert
    format = Fastq.guessFormat( IOTools.openFile(infile ) , raises = False)
    E.info( "%s: format guess: %s" % (infile, format))
    offset = Fastq.getOffset( format, raises = False )

    if PARAMS["process_remove_contaminants"]:
        adaptors = listAdaptors(contaminant_file)
#              %(contamination_trim_type)s
        s = [ '''
        cutadapt 
              %(adaptors)s
              --overlap=%(contamination_min_overlap_length)i
              --format=fastq
              %(contamination_options)s
              <( zcat < %(infile)s )
              2>> %(outfile)s_contaminants.log
        ''' ]
        do_sth = True
    else:
        s = ['zcat %(infile)s' ]

    if PARAMS["process_artifacts"]:
        s.append( 'fastx_artifacts_filter -Q %(offset)i -v %(artifacts_options)s 2>> %(outfile)s_artifacts.log' )
        do_sth = True

    if PARAMS["process_trim"]:
        s.append( 'fastx_trimmer -Q %(offset)i -v %(trim_options)s 2>> %(outfile)s_trim.log' )
        do_sth = True

    # NICK - may replace fastx trimmer
    if PARAMS["process_trim_quality"]:
        s.append( 'fastq_quality_trimmer -Q %(offset)i  -v %(trim_options)s 2>> %(outfile)s_trim.log' )
        do_sth = True

    if PARAMS["process_filter"]:
        s.append( 'fastq_quality_filter -Q %(offset)i -v %(filter_options)s 2>> %(outfile)s_filter.log')
        do_sth = True

    if PARAMS["process_sample"]:
        s.append( 'python %(scriptsdir)s/fastq2fastq.py --sample=%(sample_proportion)f --log=%(outfile)s_sample.log' )

    if not do_sth:
        E.warn( "no filtering specified for %s - nothing done" % infile )
        return

    s.append( "gzip" )
    if not infile2:
        statement = " | ".join( s ) + " > %(outfile)s" 
        P.run()
    else:
        tmpfile = P.getTempFilename(".")
        tmpfile1 = tmpfile + ".fastq.1.gz"
        tmpfile2 = tmpfile + ".fastq.2.gz"

        E.warn( "processing first of pair")
        # first read pair
        statement = " | ".join( s ) + " > %(tmpfile1)s" 
        P.run()

        # second read pair        
        E.warn( "processing second of pair")
        infile = infile2
        statement = " | ".join( s ) + " > %(tmpfile2)s" 
        P.run()

        # reconcile
        E.info("starting reconciliation" )
        statement = """python %(scriptsdir)s/fastqs2fastqs.py
                           --method=reconcile
                           --output-pattern=%(track)s.fastq.%%i.gz
                           %(tmpfile1)s %(tmpfile2)s
                     > %(outfile)s_reconcile.log"""
        
        P.run()

        os.unlink( tmpfile1 )
        os.unlink( tmpfile2 )
        os.unlink( tmpfile )