def peek(sra, outdir=None):
    """Return the fastq files an SRA archive extracts to, and their format.

    A preview of the first 1000 spots is extracted with
    ``fastq-dump --split-files`` and the resulting file names are
    inspected.

    Parameters
    ----------
    sra : path
        SRA archive passed to ``fastq-dump``.
    outdir : path
        Perform extraction in outdir. If outdir is None, the extraction
        will take place in a temporary directory, which will be deleted
        afterwards (also when an error occurs). In that case the returned
        paths point into the removed directory, so only their base names
        remain meaningful.

    Returns
    -------
    files : list
        Sorted full file names of the fastq files that were extracted.
    format :
        Whatever ``Fastq.guessFormat(..., raises=False)`` reports for the
        first file.
    """
    if outdir is None:
        workdir = tempfile.mkdtemp()
    else:
        workdir = outdir

    try:
        # --split-files creates files called prefix_#.fastq.gz,
        # where # is the read number.
        # If the file contains paired end data:
        #    output = prefix_1.fastq.gz, prefix_2.fastq.gz
        #    * special case: unpaired reads in a paired end run
        #      --> prefix.fastq.gz
        #    * special case: if paired reads are stored in a single read,
        #      fastq-dump will split. There might be a joining sequence.
        #      The output would thus be:
        #      prefix_1.fastq.gz, prefix_2.fastq.gz, prefix_3.fastq.gz
        #      You want files 1 and 3.
        E.run("""fastq-dump --split-files --gzip -X 1000 --outdir %(workdir)s %(sra)s""" % locals())

        f = sorted(glob.glob(os.path.join(workdir, "*.fastq.gz")))
        ff = [os.path.basename(x) for x in f]

        if len(f) == 2:
            # sra file contains read pairs:
            # output = prefix_1.fastq.gz, prefix_2.fastq.gz
            assert ff[0].endswith(
                "_1.fastq.gz") and ff[1].endswith("_2.fastq.gz")
        elif len(f) == 3:
            # Keep reads 1 and 3 only. Both arms of the former if/else did
            # exactly this; sorting makes f[0] deterministic (glob order is
            # arbitrary).
            f = sorted(glob.glob(os.path.join(workdir, "*_[13].fastq.gz")))
        # len(f) == 1: sra file contains one read, output = prefix.fastq.gz

        # check format of fastqs in .sra
        inf = IOTools.openFile(f[0], "r")
        try:
            fastq_format = Fastq.guessFormat(inf, raises=False)
        finally:
            inf.close()
    finally:
        # only remove the directory if it was created here
        if outdir is None:
            shutil.rmtree(workdir)

    return f, fastq_format
def build(self, infile, outfile, processer_list):
    '''Build the complete command line statement for one run.

    The quality format of ``infile[0]`` is guessed, then the statement is
    assembled from three parts: ``self.process(...)`` (processing and
    post-processing commands) and ``self.cleanup(outfile)``.

    Each part must end with ``;`` (checked by assertion). The parts are
    joined with `` checkpoint; `` and the resulting string is returned.
    '''
    # close the handle explicitly instead of leaving it to the
    # garbage collector
    inf = IOTools.openFile(infile[0], "r")
    try:
        f_format = Fastq.guessFormat(inf, raises=False)
    finally:
        inf.close()

    # the third return value (processed files) is not needed here
    cmd_process, cmd_post, _ = self.process(
        infile[0], processer_list, outfile, f_format, save=self.save)
    cmd_clean = self.cleanup(outfile)

    assert cmd_process.strip().endswith(";")
    assert cmd_post.strip().endswith(";")
    assert cmd_clean.strip().endswith(";")

    statement = " checkpoint; ".join((cmd_process, cmd_post, cmd_clean))
    return statement
def peek(sra, outdir):
    '''Return the fastq files an SRA archive extracts to, and their format.

    A preview of the first 1000 spots of `sra` is extracted into `outdir`
    with ``fastq-dump --split-files``. The preview files are left in
    `outdir`.

    Returns a tuple ``(files, format)``: the sorted full file names of the
    extracted fastq files and whatever
    ``Fastq.guessFormat(..., raises=False)`` reports for the first one.
    '''
    # --split-files creates files called prefix_#.fastq.gz,
    # where # is the read number.
    # If the file contains paired end data:
    #    output = prefix_1.fastq.gz, prefix_2.fastq.gz
    #    * special case: unpaired reads in a paired end run
    #      --> prefix.fastq.gz
    #    * special case: if paired reads are stored in a single read,
    #      fastq-dump will split. There might be a joining sequence.
    #      The output would thus be:
    #      prefix_1.fastq.gz, prefix_2.fastq.gz, prefix_3.fastq.gz
    #      You want files 1 and 3.
    E.run("""fastq-dump --split-files --gzip -X 1000 --outdir %(outdir)s %(sra)s""" % locals())

    f = sorted(glob.glob(os.path.join(outdir, "*.fastq.gz")))
    ff = [os.path.basename(x) for x in f]

    if len(f) == 2:
        # sra file contains read pairs:
        # output = prefix_1.fastq.gz, prefix_2.fastq.gz
        assert ff[0].endswith(
            "_1.fastq.gz") and ff[1].endswith("_2.fastq.gz")
    elif len(f) == 3:
        # Keep reads 1 and 3 only. Both arms of the former if/else did
        # exactly this; sorting makes f[0] deterministic (glob order is
        # arbitrary).
        f = sorted(glob.glob(os.path.join(outdir, "*_[13].fastq.gz")))
    # len(f) == 1: sra file contains one read, output = prefix.fastq.gz

    # check format of fastqs in .sra
    inf = IOTools.openFile(f[0], "r")
    try:
        fastq_format = Fastq.guessFormat(inf, raises=False)
    finally:
        inf.close()

    return f, fastq_format
def peek(sra, outdir=None):
    """return the full file names for all files which will be extracted

    A preview of the first 1000 spots is extracted with
    ``fastq-dump --split-files`` and the resulting file names are
    inspected.

    Parameters
    ----------
    sra : path
        SRA archive passed to ``fastq-dump``.
    outdir : path
        perform extraction in outdir. If outdir is None, the extraction
        will take place in a temporary directory, which will be deleted
        afterwards (also when an error occurs). In that case the returned
        paths point into the removed directory, so only their base names
        remain meaningful.

    Returns
    -------
    files : list
        A sorted list of fastq formatted files that are contained in the
        archive.
    format : string
        The quality score format in the :term:`fastq` formatted files,
        as reported by ``Fastq.guessFormat(..., raises=False)`` for the
        first file.
    """
    if outdir is None:
        workdir = tempfile.mkdtemp()
    else:
        workdir = outdir

    try:
        # --split-files creates files called prefix_#.fastq.gz,
        # where # is the read number.
        # If the file contains paired end data:
        #    output = prefix_1.fastq.gz, prefix_2.fastq.gz
        #    * special case: unpaired reads in a paired end run
        #      --> prefix.fastq.gz
        #    * special case: if paired reads are stored in a single read,
        #      fastq-dump will split. There might be a joining sequence.
        #      The output would thus be:
        #      prefix_1.fastq.gz, prefix_2.fastq.gz, prefix_3.fastq.gz
        #      You want files 1 and 3.
        E.run("""fastq-dump --split-files --gzip -X 1000 --outdir %(workdir)s %(sra)s""" % locals())

        f = sorted(glob.glob(os.path.join(workdir, "*.fastq.gz")))
        ff = [os.path.basename(x) for x in f]

        if len(f) == 2:
            # sra file contains read pairs:
            # output = prefix_1.fastq.gz, prefix_2.fastq.gz
            assert ff[0].endswith(
                "_1.fastq.gz") and ff[1].endswith("_2.fastq.gz")
        elif len(f) == 3:
            # Keep reads 1 and 3 only. Both arms of the former if/else did
            # exactly this; sorting makes f[0] deterministic (glob order is
            # arbitrary).
            f = sorted(glob.glob(os.path.join(workdir, "*_[13].fastq.gz")))
        # len(f) == 1: sra file contains one read, output = prefix.fastq.gz

        # check format of fastqs in .sra
        inf = IOTools.openFile(f[0], "r")
        try:
            fastq_format = Fastq.guessFormat(inf, raises=False)
        finally:
            inf.close()
    finally:
        # only remove the directory if it was created here
        if outdir is None:
            shutil.rmtree(workdir)

    return f, fastq_format
def processReads(infiles, outfile):
    '''Filter, trim, sample or combine the reads of one fastq file (pair).

    `infiles` is unpacked into ``(infile, contaminant_file)``. The mate
    file, if any, is whatever ``checkPairs(infile)`` returns.

    Depending on PARAMS one of three things happens:

    * ``process_combine_reads``: a ``flash`` command is run on both mates
      and its output is concatenated into, or moved to, `outfile`.
      Raises IOError if there is no mate file.
    * ``process_sample`` with paired data: a single
      ``fastq2fastq.py --sample`` command is run.
    * otherwise: the enabled tools are chained with ``|``, terminated by
      ``gzip``, and run once (single end) or once per mate followed by
      ``fastqs2fastqs.py --method=reconcile`` (paired end).

    Every command is assigned to the local ``statement`` and launched with
    a bare ``P.run()``.

    NOTE(review): P.run() presumably picks up ``statement``,
    ``to_cluster`` and the ``%(name)s`` values from this frame's locals
    and PARAMS - confirm in P before renaming or removing any local.

    Returns None.
    '''
    infile, contaminant_file = infiles

    # becomes True as soon as one filtering/trimming step is enabled
    do_sth = False
    # not referenced again in this function; presumably read by P.run()
    # - TODO confirm
    to_cluster = True

    infile2 = checkPairs(infile)

    if infile2:
        track = P.snip(outfile, ".fastq.1.gz")
        outfile2 = P.snip(outfile, ".fastq.1.gz") + ".fastq.2.gz"
    else:
        track = P.snip(outfile, ".fastq.gz")

    if PARAMS["process_combine_reads"]:
        E.warn(
            "combining reads cannot be can not be combined with other processing for paired ended reads"
        )
        if not infile2:
            raise IOError("must have paired data to combine reads")

        read_len, frag_len, frag_stdev = PARAMS["combine_reads_read_length"], \
            PARAMS["combine_reads_fragment_length"], \
            PARAMS["combine_reads_fragment_length_stdev"]

        fragment_options = " ".join(map(str, [read_len, frag_len, frag_stdev]))

        # choose between --max-overlap and the read/fragment length options
        if PARAMS["combine_reads_max_overlap"]:
            E.warn(
                "if specifying --max-overlap read and fragment length options will be ignored"
            )
            max_overlap = "--max-overlap=%i" % PARAMS[
                "combine_reads_max_overlap"]
            fragment_options = ""
        elif not PARAMS["combine_reads_max_overlap"] and len(
                fragment_options.strip().split(" ")) < 3:
            E.warn(
                "have not specified --read-len, --frag-len, --frag-len-stddev: default --max-overlap used"
            )
            max_overlap = ""
            fragment_options = ""
        elif PARAMS["combine_reads_read_length"] and PARAMS[
                "combine_reads_fragment_length"] and PARAMS[
                    "combine_reads_fragment_length_stdev"]:
            # NOTE(review): this inner test cannot be true here, the first
            # branch above already handles a set max_overlap.
            if PARAMS["combine_reads_max_overlap"]:
                E.warn(
                    "--max-overlap will override the specified read and fragment length options"
                )
            max_overlap = ""
            fragment_options = """--read-len=%(read_len)i --fragment-len=%(frag_len)i --fragment-len-stddev=%(frag_stdev)i""" % locals(
            )
        else:
            max_overlap = ""
            fragment_options = ""

        # each optional flash argument is left empty when its PARAMS
        # value is false
        if not PARAMS["combine_reads_min_overlap"]:
            min_overlap = ""
        else:
            min_overlap = "--min-overlap=%i" % PARAMS[
                "combine_reads_min_overlap"]
        if not PARAMS["combine_reads_threads"]:
            threads = ""
        else:
            threads = "--threads=%i" % PARAMS["combine_reads_threads"]
        if not PARAMS["combine_reads_phred_offset"]:
            phred_offset = ""
        else:
            phred_offset = "--phred-offset=%i" % PARAMS[
                "combine_reads_phred_offset"]
        if not PARAMS["combine_reads_max_mismatch_density"]:
            max_mismatch_density = ""
        else:
            max_mismatch_density = "--max-mismatch-density=%f" % PARAMS[
                "combine_reads_max_mismatch_density"]

        statement = '''flash %(min_overlap)s %(max_overlap)s %(max_mismatch_density)s %(phred_offset)s %(fragment_options)s --output-prefix=%(track)s %(threads)s --compress %(infile)s %(infile2)s >> %(outfile)s.log '''
        P.run()

        if PARAMS["combine_reads_concatenate"]:
            # merge combined and uncombined reads into outfile, then
            # remove the flash output files
            infiles = " ".join([
                track + x for x in [
                    ".notCombined_1.fastq.gz", ".notCombined_2.fastq.gz",
                    ".extendedFrags.fastq.gz"
                ]
            ])
            statement = '''zcat %(infiles)s | gzip > %(outfile)s; rm -rf %(infiles)s'''
        else:
            # keep only the combined reads
            statement = '''mv %(track)s.extendedFrags.fastq.gz %(outfile)s'''
        P.run()
        return

    if PARAMS["process_sample"] and infile2:
        E.warn(
            "sampling can not be combined with other processing for paired ended reads"
        )
        statement = '''zcat %(infile)s | python %(scriptsdir)s/fastq2fastq.py --sample=%(sample_proportion)f --pair=%(infile2)s --outfile-pair=%(outfile2)s --log=%(outfile)s_sample.log | gzip > %(outfile)s '''
        P.run()
        return

    # fastx does not like quality scores below 64 (Illumina 1.3 format)
    # need to detect the scores and convert
    format = Fastq.guessFormat(IOTools.openFile(infile), raises=False)
    E.info("%s: format guess: %s" % (infile, format))
    offset = Fastq.getOffset(format, raises=False)

    # s collects the stages of the shell pipe; the first stage produces
    # the uncompressed reads
    if PARAMS["process_remove_contaminants"]:
        adaptors = listAdaptors(contaminant_file)
        # %(contamination_trim_type)s
        s = [
            ''' cutadapt %(adaptors)s --overlap=%(contamination_min_overlap_length)i --format=fastq %(contamination_options)s <( zcat < %(infile)s ) 2>> %(outfile)s_contaminants.log '''
        ]
        do_sth = True
    else:
        s = ['zcat %(infile)s']

    if PARAMS["process_artifacts"]:
        s.append(
            'fastx_artifacts_filter -Q %(offset)i -v %(artifacts_options)s 2>> %(outfile)s_artifacts.log'
        )
        do_sth = True

    if PARAMS["process_trim"]:
        s.append(
            'fastx_trimmer -Q %(offset)i -v %(trim_options)s 2>> %(outfile)s_trim.log'
        )
        do_sth = True

    # NICK - may replace fastx trimmer
    if PARAMS["process_trim_quality"]:
        s.append(
            'fastq_quality_trimmer -Q %(offset)i -v %(trim_quality_options)s 2>> %(outfile)s_trim.log'
        )
        do_sth = True

    if PARAMS["process_filter"]:
        s.append(
            'fastq_quality_filter -Q %(offset)i -v %(filter_options)s 2>> %(outfile)s_filter.log'
        )
        do_sth = True

    # NOTE(review): sampling does not set do_sth, so enabling only
    # process_sample ends in the "nothing done" branch below - confirm
    # this is intended.
    if PARAMS["process_sample"]:
        s.append(
            'python %(scriptsdir)s/fastq2fastq.py --sample=%(sample_proportion)f --log=%(outfile)s_sample.log'
        )

    if not do_sth:
        E.warn("no filtering specified for %s - nothing done" % infile)
        return

    s.append("gzip")
    if not infile2:
        statement = " | ".join(s) + " > %(outfile)s"
        P.run()
    else:
        tmpfile = P.getTempFilename(".")
        tmpfile1 = tmpfile + ".fastq.1.gz"
        tmpfile2 = tmpfile + ".fastq.2.gz"

        E.warn("processing first of pair")
        # first read pair
        statement = " | ".join(s) + " > %(tmpfile1)s"
        P.run()

        # second read pair
        E.warn("processing second of pair")
        # the pipe stages refer to %(infile)s, so rebinding infile makes
        # the same pipe read the mate file
        infile = infile2
        statement = " | ".join(s) + " > %(tmpfile2)s"
        P.run()

        # reconcile
        E.info("starting reconciliation")
        statement = """python %(scriptsdir)s/fastqs2fastqs.py --method=reconcile --output-pattern=%(track)s.fastq.%%s.gz %(tmpfile1)s %(tmpfile2)s > %(outfile)s_reconcile.log"""
        P.run()

        # remove the intermediate files (not reached if a step above fails)
        os.unlink(tmpfile1)
        os.unlink(tmpfile2)
        os.unlink(tmpfile)
def preprocess(self, infiles, outfile):
    '''build preprocessing statement

    Build a command line statement that extracts/converts
    various input formats to fastq formatted files.

    The input type is recognised by file name suffix:
    ``.export.txt.gz``, ``.fa.gz``, ``.sra``, ``.fastq.gz``,
    ``.csfasta.gz``, ``.csfasta.F3.gz`` and ``.fastq.1.gz``.
    Fastq files whose guessed quality format does not include
    ``sanger`` are converted with ``fastq2fastq.py --change-format=sanger``
    (single end files only if ``self.convert`` is set).

    Side effects: sets ``self.tmpdir_fastq`` and, for fasta and
    colourspace input, ``self.datatype``. For ``.sra`` input a preview
    extraction is executed immediately via ``P.execute``.

    returns the statement (commands joined with ``; `` and terminated by
    ``;``) and the list of fastq files to map; each list entry holds the
    file(s) belonging to one input file.

    Raises ValueError if a required companion file is missing and
    NotImplementedError for an unrecognised suffix.
    '''

    assert len(infiles) > 0, "no input files for mapping"

    tmpdir_fastq = P.getTempDir()

    # create temporary directory again for nodes
    statement = ["mkdir -p %s" % tmpdir_fastq]
    fastqfiles = []

    # get track by extension of outfile
    track = os.path.splitext(os.path.basename(outfile))[0]

    if self.compress:
        compress_cmd = "| gzip"
        extension = ".gz"
    else:
        compress_cmd = ""
        extension = ""

    for infile in infiles:

        if infile.endswith(".export.txt.gz"):
            # single end illumina export
            # (backslashes in the awk regex are written as \\d to avoid
            # invalid-escape warnings; the resulting string is unchanged)
            statement.append(
                """gunzip < %(infile)s | awk '$11 != "QC" || $10 ~ /(\\d+):(\\d+):(\\d+)/ \
{ if ($1 != "") { readname=sprintf( "%%%%s_%%%%s:%%%%s:%%%%s:%%%%s:%%%%s", $1,$2,$3,$4,$5,$6);} else { readname=sprintf( "%%%%s:%%%%s:%%%%s:%%%%s:%%%%s", $1,$3,$4,$5,$6); } printf("@%%%%s\\n%%%%s\\n+\\n%%%%s\\n",readname,$9,$10);}' %(compress_cmd)s > %(tmpdir_fastq)s/%(track)s.fastq%(extension)s""" % locals())
            fastqfiles.append(
                ("%s/%s.fastq%s" % (tmpdir_fastq, track, extension),))

        elif infile.endswith(".fa.gz"):
            statement.append(
                '''gunzip < %(infile)s > %(tmpdir_fastq)s/%(track)s.fa''' % locals())
            fastqfiles.append(("%s/%s.fa" % (tmpdir_fastq, track),))
            self.datatype = "fasta"

        elif infile.endswith(".sra"):
            # sneak preview to determine if paired end or single end
            outdir = P.getTempDir()
            try:
                # --split-files is present in fastq-dump 2.1.7
                P.execute(
                    "fastq-dump --split-files --gzip -X 1000 --outdir %(outdir)s %(infile)s" % locals())

                # --split-files will create files called
                # prefix_#.fastq.gz where # is the read number.
                # The following cases are:
                # * file contains paired end data:
                #   output = prefix_1.fastq.gz, prefix_2.fastq.gz
                # * special case: unpaired reads in a paired end run
                #   end up in prefix.fastq.gz
                # * special case: if paired reads are stored in a single
                #   read, fastq-dump will split. There might be a joining
                #   sequence. The output would thus be:
                #   prefix_1.fastq.gz, prefix_2.fastq.gz and
                #   prefix_3.fastq.gz. You want files 1 and 3.
                f = sorted(glob.glob(os.path.join(outdir, "*.fastq.gz")))
                ff = [os.path.basename(x) for x in f]
                if len(f) == 2:
                    # sra file contains read pairs:
                    # output = prefix_1.fastq.gz, prefix_2.fastq.gz
                    assert ff[0].endswith(
                        "_1.fastq.gz") and ff[1].endswith("_2.fastq.gz")
                elif len(f) == 3:
                    # both arms of the former if/else selected reads
                    # 1 and 3
                    f = sorted(
                        glob.glob(os.path.join(outdir, "*_[13].fastq.gz")))
                # len(f) == 1: one read, output = prefix.fastq.gz
                E.info("sra file contains the following files: %s" % f)
            finally:
                # the preview is only needed for its file names
                shutil.rmtree(outdir)

            fastqfiles.append(
                ["%s/%s" % (tmpdir_fastq, os.path.basename(x))
                 for x in sorted(f)])
            statement.append(
                "fastq-dump --split-files --gzip --outdir %(tmpdir_fastq)s %(infile)s" % locals())

        elif infile.endswith(".fastq.gz"):
            format = Fastq.guessFormat(
                IOTools.openFile(infile, "r"), raises=False)
            if 'sanger' not in format and self.convert:
                # %%(scriptsdir)s survives this interpolation as
                # %(scriptsdir)s for whoever runs the statement
                statement.append(
                    """gunzip < %(infile)s | python %%(scriptsdir)s/fastq2fastq.py --change-format=sanger --guess-format=phred64 --log=%(outfile)s.log %(compress_cmd)s > %(tmpdir_fastq)s/%(track)s.fastq%(extension)s""" % locals())
                fastqfiles.append(
                    ("%s/%s.fastq%s" % (tmpdir_fastq, track, extension),))
            else:
                E.debug("%s: assuming quality score format %s" %
                        (infile, format))
                fastqfiles.append((infile, ))

        elif infile.endswith(".csfasta.gz"):
            # single end SOLiD data
            if self.preserve_colourspace:
                quality = P.snip(infile, ".csfasta.gz") + ".qual.gz"
                if not os.path.exists(quality):
                    raise ValueError("no quality file for %s" % infile)
                statement.append(
                    """gunzip < %(infile)s > %(tmpdir_fastq)s/%(track)s.csfasta%(extension)s""" % locals())
                statement.append(
                    """gunzip < %(quality)s > %(tmpdir_fastq)s/%(track)s.qual%(extension)s""" % locals())
                fastqfiles.append(
                    ("%s/%s.csfasta%s" % (tmpdir_fastq, track, extension),
                     "%s/%s.qual%s" % (tmpdir_fastq, track, extension)))
                self.datatype = "solid"
            else:
                quality = P.snip(infile, ".csfasta.gz") + ".qual.gz"
                # fixed: the literal used to end in "%(extension)" without
                # a conversion type, which makes "% locals()" raise
                statement.append(
                    """solid2fastq <(gunzip < %(infile)s) <(gunzip < %(quality)s) %(compress_cmd)s > %(tmpdir_fastq)s/%(track)s.fastq%(extension)s""" % locals())
                fastqfiles.append(
                    ("%s/%s.fastq%s" % (tmpdir_fastq, track, extension),))

        elif infile.endswith(".csfasta.F3.gz"):
            # paired end SOLiD data
            if self.preserve_colourspace:
                bn = P.snip(infile, ".csfasta.F3.gz")
                # order is important - mirrors tophat reads followed by quals
                f = []
                for suffix in ("csfasta.F3", "csfasta.F5", "qual.F3", "qual.F5"):
                    fn = "%(bn)s.%(suffix)s" % locals()
                    if not os.path.exists(fn + ".gz"):
                        raise ValueError("expected file %s.gz missing" % fn)
                    statement.append(
                        """gunzip < %(fn)s.gz %(compress_cmd)s > %(tmpdir_fastq)s/%(track)s.%(suffix)s%(extension)s""" % locals())
                    f.append(
                        "%(tmpdir_fastq)s/%(track)s.%(suffix)s%(extension)s" % locals())
                fastqfiles.append(f)
                self.datatype = "solid"
            else:
                # NOTE(review): infile ends in ".csfasta.F3.gz" here but
                # the suffix snipped is ".csfasta.gz" - confirm how P.snip
                # treats a non-matching suffix and which quality file is
                # meant.
                quality = P.snip(infile, ".csfasta.gz") + ".qual.gz"
                statement.append(
                    """solid2fastq <(gunzip < %(infile)s) <(gunzip < %(quality)s) %(compress_cmd)s > %(tmpdir_fastq)s/%(track)s.fastq%(extension)s""" % locals())
                fastqfiles.append(
                    ("%s/%s.fastq%s" % (tmpdir_fastq, track, extension),))

        elif infile.endswith(".fastq.1.gz"):
            # paired end fastq: the mate is expected next to the first read
            bn = P.snip(infile, ".fastq.1.gz")
            infile2 = "%s.fastq.2.gz" % bn
            if not os.path.exists(infile2):
                raise ValueError(
                    "can not find paired ended file '%s' for '%s'" % (infile2, infile))

            format = Fastq.guessFormat(IOTools.openFile(infile), raises=False)
            # NOTE(review): unlike the single end case this does not
            # consult self.convert - confirm intended.
            if 'sanger' not in format:
                statement.append(
                    """gunzip < %(infile)s | python %%(scriptsdir)s/fastq2fastq.py --change-format=sanger --guess-format=phred64 --log=%(outfile)s.log %(compress_cmd)s > %(tmpdir_fastq)s/%(track)s.1.fastq%(extension)s; gunzip < %(infile2)s | python %%(scriptsdir)s/fastq2fastq.py --change-format=sanger --guess-format=phred64 --log=%(outfile)s.log %(compress_cmd)s > %(tmpdir_fastq)s/%(track)s.2.fastq%(extension)s """ % locals())
                fastqfiles.append(
                    ("%s/%s.1.fastq%s" % (tmpdir_fastq, track, extension),
                     "%s/%s.2.fastq%s" % (tmpdir_fastq, track, extension)))
            else:
                E.debug("%s: assuming quality score format %s" %
                        (infile, format))
                fastqfiles.append((infile, infile2, ))

        else:
            raise NotImplementedError("unknown file format %s" % infile)

    self.tmpdir_fastq = tmpdir_fastq

    assert len(fastqfiles) > 0, "no fastq files for mapping"

    return "; ".join(statement) + ";", fastqfiles
def processReads( infiles, outfile ):
    '''Filter, trim, sample or combine the reads of one fastq file (pair).

    `infiles` is unpacked into ``(infile, contaminant_file)``. The mate
    file, if any, is whatever ``checkPairs(infile)`` returns.

    Depending on PARAMS one of three things happens:

    * ``process_combine_reads``: a ``flash`` command is run on both mates
      and its output is concatenated into, or moved to, `outfile`.
      Raises IOError if there is no mate file.
    * ``process_sample`` with paired data: a single
      ``fastq2fastq.py --sample`` command is run.
    * otherwise: the enabled tools are chained with ``|``, terminated by
      ``gzip``, and run once (single end) or once per mate followed by
      ``fastqs2fastqs.py --method=reconcile`` (paired end).

    Every command is assigned to the local ``statement`` and launched with
    a bare ``P.run()``.

    NOTE(review): P.run() presumably picks up ``statement``,
    ``to_cluster`` and the ``%(name)s`` values from this frame's locals
    and PARAMS - confirm in P before renaming or removing any local.

    Returns None.
    '''
    infile, contaminant_file = infiles

    # becomes True as soon as one filtering/trimming step is enabled
    do_sth = False
    # not referenced again in this function; presumably read by P.run()
    # - TODO confirm
    to_cluster = True

    infile2 = checkPairs( infile )

    if infile2:
        track = P.snip( outfile, ".fastq.1.gz" )
        outfile2 = P.snip( outfile, ".fastq.1.gz" ) + ".fastq.2.gz"
    else:
        track = P.snip( outfile, ".fastq.gz" )

    if PARAMS["process_combine_reads"]:
        E.warn("combining reads cannot be can not be combined with other processing for paired ended reads")
        if not infile2:
            raise IOError("must have paired data to combine reads")

        read_len, frag_len, frag_stdev = PARAMS["combine_reads_read_length"], \
            PARAMS["combine_reads_fragment_length"], \
            PARAMS["combine_reads_fragment_length_stdev"]

        fragment_options = " ".join(map(str,[read_len, frag_len, frag_stdev]))

        # choose between --max-overlap and the read/fragment length options
        if PARAMS["combine_reads_max_overlap"]:
            E.warn("if specifying --max-overlap read and fragment length options will be ignored")
            max_overlap="--max-overlap=%i" % PARAMS["combine_reads_max_overlap"]
            fragment_options = ""
        elif not PARAMS["combine_reads_max_overlap"] and len(fragment_options.strip().split(" ")) < 3:
            E.warn("have not specified --read-len, --frag-len, --frag-len-stddev: default --max-overlap used")
            max_overlap = ""
            fragment_options = ""
        elif PARAMS["combine_reads_read_length"] and PARAMS["combine_reads_fragment_length"] and PARAMS["combine_reads_fragment_length_stdev"]:
            # NOTE(review): this inner test cannot be true here, the first
            # branch above already handles a set max_overlap.
            if PARAMS["combine_reads_max_overlap"]:
                E.warn("--max-overlap will override the specified read and fragment length options")
            max_overlap = ""
            fragment_options = """--read-len=%(read_len)i --fragment-len=%(frag_len)i --fragment-len-stddev=%(frag_stdev)i""" % locals()
        else:
            max_overlap = ""
            fragment_options = ""

        # each optional flash argument is left empty when its PARAMS
        # value is false
        if not PARAMS["combine_reads_min_overlap"]:
            min_overlap = ""
        else:
            min_overlap = "--min-overlap=%i" % PARAMS["combine_reads_min_overlap"]
        if not PARAMS["combine_reads_threads"]:
            threads = ""
        else:
            threads = "--threads=%i" % PARAMS["combine_reads_threads"]
        if not PARAMS["combine_reads_phred_offset"]:
            phred_offset = ""
        else:
            phred_offset = "--phred-offset=%i" % PARAMS["combine_reads_phred_offset"]
        if not PARAMS["combine_reads_max_mismatch_density"]:
            max_mismatch_density = ""
        else:
            max_mismatch_density = "--max-mismatch-density=%f" % PARAMS["combine_reads_max_mismatch_density"]

        statement = '''flash %(min_overlap)s %(max_overlap)s %(max_mismatch_density)s %(phred_offset)s %(fragment_options)s --output-prefix=%(track)s %(threads)s --compress %(infile)s %(infile2)s >> %(outfile)s.log '''
        P.run()

        if PARAMS["combine_reads_concatenate"]:
            # merge combined and uncombined reads into outfile, then
            # remove the flash output files
            infiles = " ".join([track + x for x in [".notCombined_1.fastq.gz", ".notCombined_2.fastq.gz", ".extendedFrags.fastq.gz"]])
            statement = '''zcat %(infiles)s | gzip > %(outfile)s; rm -rf %(infiles)s'''
        else:
            # keep only the combined reads
            statement = '''mv %(track)s.extendedFrags.fastq.gz %(outfile)s'''
        P.run()
        return

    if PARAMS["process_sample"] and infile2:
        E.warn( "sampling can not be combined with other processing for paired ended reads")
        statement = '''zcat %(infile)s | python %(scriptsdir)s/fastq2fastq.py --sample=%(sample_proportion)f --pair=%(infile2)s --outfile-pair=%(outfile2)s --log=%(outfile)s_sample.log | gzip > %(outfile)s '''
        P.run()
        return

    # fastx does not like quality scores below 64 (Illumina 1.3 format)
    # need to detect the scores and convert
    format = Fastq.guessFormat( IOTools.openFile(infile ) , raises = False)
    E.info( "%s: format guess: %s" % (infile, format))
    offset = Fastq.getOffset( format, raises = False )

    # s collects the stages of the shell pipe; the first stage produces
    # the uncompressed reads
    if PARAMS["process_remove_contaminants"]:
        adaptors = listAdaptors(contaminant_file)
        # %(contamination_trim_type)s
        s = [ ''' cutadapt %(adaptors)s --overlap=%(contamination_min_overlap_length)i --format=fastq %(contamination_options)s <( zcat < %(infile)s ) 2>> %(outfile)s_contaminants.log ''' ]
        do_sth = True
    else:
        s = ['zcat %(infile)s' ]

    if PARAMS["process_artifacts"]:
        s.append( 'fastx_artifacts_filter -Q %(offset)i -v %(artifacts_options)s 2>> %(outfile)s_artifacts.log' )
        do_sth = True

    if PARAMS["process_trim"]:
        s.append( 'fastx_trimmer -Q %(offset)i -v %(trim_options)s 2>> %(outfile)s_trim.log' )
        do_sth = True

    # NICK - may replace fastx trimmer
    if PARAMS["process_trim_quality"]:
        s.append( 'fastq_quality_trimmer -Q %(offset)i -v %(trim_quality_options)s 2>> %(outfile)s_trim.log' )
        do_sth = True

    if PARAMS["process_filter"]:
        s.append( 'fastq_quality_filter -Q %(offset)i -v %(filter_options)s 2>> %(outfile)s_filter.log')
        do_sth = True

    # NOTE(review): sampling does not set do_sth, so enabling only
    # process_sample ends in the "nothing done" branch below - confirm
    # this is intended.
    if PARAMS["process_sample"]:
        s.append( 'python %(scriptsdir)s/fastq2fastq.py --sample=%(sample_proportion)f --log=%(outfile)s_sample.log' )

    if not do_sth:
        E.warn( "no filtering specified for %s - nothing done" % infile )
        return

    s.append( "gzip" )
    if not infile2:
        statement = " | ".join( s ) + " > %(outfile)s"
        P.run()
    else:
        tmpfile = P.getTempFilename(".")
        tmpfile1 = tmpfile + ".fastq.1.gz"
        tmpfile2 = tmpfile + ".fastq.2.gz"

        E.warn( "processing first of pair")
        # first read pair
        statement = " | ".join( s ) + " > %(tmpfile1)s"
        P.run()

        # second read pair
        E.warn( "processing second of pair")
        # the pipe stages refer to %(infile)s, so rebinding infile makes
        # the same pipe read the mate file
        infile = infile2
        statement = " | ".join( s ) + " > %(tmpfile2)s"
        P.run()

        # reconcile
        E.info("starting reconciliation" )
        statement = """python %(scriptsdir)s/fastqs2fastqs.py --method=reconcile --output-pattern=%(track)s.fastq.%%s.gz %(tmpfile1)s %(tmpfile2)s > %(outfile)s_reconcile.log"""
        P.run()

        # remove the intermediate files (not reached if a step above fails)
        os.unlink( tmpfile1 )
        os.unlink( tmpfile2 )
        os.unlink( tmpfile )
def processReads( infiles, outfile ):
    '''Filter, trim or sample the reads of one fastq file (pair).

    `infiles` is unpacked into ``(infile, contaminant_file)``. The mate
    file, if any, is whatever ``checkPairs(infile)`` returns.

    Depending on PARAMS one of two things happens:

    * ``process_sample`` with paired data: a single
      ``fastq2fastq.py --sample`` command is run.
    * otherwise: the enabled tools are chained with ``|``, terminated by
      ``gzip``, and run once (single end) or once per mate followed by
      ``fastqs2fastqs.py --method=reconcile`` (paired end).

    Every command is assigned to the local ``statement`` and launched with
    a bare ``P.run()``.

    NOTE(review): P.run() presumably picks up ``statement``,
    ``to_cluster`` and the ``%(name)s`` values from this frame's locals
    and PARAMS - confirm in P before renaming or removing any local.

    Returns None.
    '''
    infile, contaminant_file = infiles

    # becomes True as soon as one filtering/trimming step is enabled
    do_sth = False
    # not referenced again in this function; presumably read by P.run()
    # - TODO confirm
    to_cluster = True

    infile2 = checkPairs( infile )

    if infile2:
        track = P.snip( outfile, ".fastq.1.gz" )
        outfile2 = P.snip( outfile, ".fastq.1.gz" ) + ".fastq.2.gz"
    else:
        track = P.snip( outfile, ".fastq.gz" )

    if PARAMS["process_sample"] and infile2:
        E.warn( "sampling can not be combined with other processing for paired ended reads")
        statement = '''zcat %(infile)s | python %(scriptsdir)s/fastq2fastq.py --sample=%(sample_proportion)f --pair=%(infile2)s --outfile-pair=%(outfile2)s --log=%(outfile)s_sample.log | gzip > %(outfile)s '''
        P.run()
        return

    # fastx does not like quality scores below 64 (Illumina 1.3 format)
    # need to detect the scores and convert
    format = Fastq.guessFormat( IOTools.openFile(infile ) , raises = False)
    E.info( "%s: format guess: %s" % (infile, format))
    offset = Fastq.getOffset( format, raises = False )

    # s collects the stages of the shell pipe; the first stage produces
    # the uncompressed reads
    if PARAMS["process_remove_contaminants"]:
        adaptors = listAdaptors(contaminant_file)
        # %(contamination_trim_type)s
        s = [ ''' cutadapt %(adaptors)s --overlap=%(contamination_min_overlap_length)i --format=fastq %(contamination_options)s <( zcat < %(infile)s ) 2>> %(outfile)s_contaminants.log ''' ]
        do_sth = True
    else:
        s = ['zcat %(infile)s' ]

    if PARAMS["process_artifacts"]:
        s.append( 'fastx_artifacts_filter -Q %(offset)i -v %(artifacts_options)s 2>> %(outfile)s_artifacts.log' )
        do_sth = True

    if PARAMS["process_trim"]:
        s.append( 'fastx_trimmer -Q %(offset)i -v %(trim_options)s 2>> %(outfile)s_trim.log' )
        do_sth = True

    # NICK - may replace fastx trimmer
    # NOTE(review): the quality trimmer is given %(trim_options)s, the
    # same options as fastx_trimmer above - confirm whether a separate
    # option set was intended.
    if PARAMS["process_trim_quality"]:
        s.append( 'fastq_quality_trimmer -Q %(offset)i -v %(trim_options)s 2>> %(outfile)s_trim.log' )
        do_sth = True

    if PARAMS["process_filter"]:
        s.append( 'fastq_quality_filter -Q %(offset)i -v %(filter_options)s 2>> %(outfile)s_filter.log')
        do_sth = True

    # NOTE(review): sampling does not set do_sth, so enabling only
    # process_sample ends in the "nothing done" branch below - confirm
    # this is intended.
    if PARAMS["process_sample"]:
        s.append( 'python %(scriptsdir)s/fastq2fastq.py --sample=%(sample_proportion)f --log=%(outfile)s_sample.log' )

    if not do_sth:
        E.warn( "no filtering specified for %s - nothing done" % infile )
        return

    s.append( "gzip" )
    if not infile2:
        statement = " | ".join( s ) + " > %(outfile)s"
        P.run()
    else:
        tmpfile = P.getTempFilename(".")
        tmpfile1 = tmpfile + ".fastq.1.gz"
        tmpfile2 = tmpfile + ".fastq.2.gz"

        E.warn( "processing first of pair")
        # first read pair
        statement = " | ".join( s ) + " > %(tmpfile1)s"
        P.run()

        # second read pair
        E.warn( "processing second of pair")
        # the pipe stages refer to %(infile)s, so rebinding infile makes
        # the same pipe read the mate file
        infile = infile2
        statement = " | ".join( s ) + " > %(tmpfile2)s"
        P.run()

        # reconcile
        E.info("starting reconciliation" )
        statement = """python %(scriptsdir)s/fastqs2fastqs.py --method=reconcile --output-pattern=%(track)s.fastq.%%i.gz %(tmpfile1)s %(tmpfile2)s > %(outfile)s_reconcile.log"""
        P.run()

        # remove the intermediate files (not reached if a step above fails)
        os.unlink( tmpfile1 )
        os.unlink( tmpfile2 )
        os.unlink( tmpfile )