def main():
    """Convert a FASTQ file into tab-separated text, one read per line.

    Reads ``sys.argv``: input FASTQ, output file, number of description
    columns (1 or more) and the FASTQ type (empty string means 'sanger').
    Each output row is: description column(s), sequence, quality.  Tabs
    inside any field are replaced by spaces so they cannot break the row.
    """
    if len(sys.argv) != 5:
        stop_err("Wrong number of arguments. Expect: fasta tabular desrc_split [type]")
    input_filename = sys.argv[1]
    output_filename = sys.argv[2]
    # Stored as the maxsplit value: N requested columns need N - 1 splits.
    descr_split = int(sys.argv[3]) - 1
    if descr_split < 0:
        stop_err("Bad description split value (should be 1 or more)")
    input_type = sys.argv[4] or 'sanger'  # input type should ordinarily be unnecessary
    num_reads = None
    # Both handles are context-managed so they are closed even if a read
    # fails to parse half way through the file.
    with open(output_filename, 'wb') as out:
        with open(input_filename) as in_handle:
            for num_reads, fastq_read in enumerate(fastqReader(in_handle, format=input_type)):
                identifier = fastq_read.identifier[1:].replace('\t', ' ')
                if descr_split == 0:
                    # Don't divide the description into multiple columns
                    description = identifier
                else:
                    words = identifier.split(None, descr_split)
                    # Pad with empty columns so every row has exactly
                    # descr_split + 1 description columns (the previous
                    # code padded to one column too few, giving ragged rows).
                    words += [""] * (descr_split + 1 - len(words))
                    description = "\t".join(words)
                out.write("%s\t%s\t%s\n" % (
                    description,
                    fastq_read.sequence.replace('\t', ' '),
                    fastq_read.quality.replace('\t', ' ')))
    if num_reads is None:
        print("No valid FASTQ reads could be processed.")
    else:
        print("%i FASTQ reads were converted to Tabular." % (num_reads + 1))
def main():
    """Interlace two mate FASTQ files into a pairs file and a singles file.

    Reads ``sys.argv``: mate1 file, mate1 type, mate2 file, mate2 type,
    pairs output, singles output (an empty type means 'sanger').  When the
    two types differ a warning is printed and nothing is opened or written.
    """
    argv = sys.argv
    mate1_filename = argv[1]
    mate1_type = argv[2] or 'sanger'
    mate2_filename = argv[3]
    mate2_type = argv[4] or 'sanger'
    outfile_pairs = argv[5]
    outfile_singles = argv[6]

    if mate1_type != mate2_type:
        print("WARNING: You are trying to interlace files of two different types: %s and %s." % (mate1_type, mate2_type))
        return

    fmt = mate1_type
    joiner = fastqJoiner(fmt)
    pair_total = 0
    single_total = 0
    last_mate1_index = None
    last_mate2_index = None

    pairs_writer = fastqWriter(path=outfile_pairs, format=fmt)
    singles_writer = fastqWriter(path=outfile_singles, format=fmt)
    mate2_lookup = fastqNamedReader(path=mate2_filename, format=fmt)
    mate1_lookup = fastqNamedReader(path=mate1_filename, format=fmt)
    mate1_reads = fastqReader(path=mate1_filename, format=fmt)
    mate2_reads = fastqReader(path=mate2_filename, format=fmt)

    with pairs_writer, singles_writer, mate2_lookup, mate1_lookup, mate1_reads, mate2_reads:
        # First pass: every mate1 read is either paired or a single.
        for last_mate1_index, first in enumerate(mate1_reads):
            second = mate2_lookup.get(joiner.get_paired_identifier(first))
            if not second:
                singles_writer.write(first)
                single_total += 1
                continue
            pairs_writer.write(first)
            pairs_writer.write(second)
            pair_total += 1

        # Second pass: mate2 reads whose partner is absent from mate1.
        for last_mate2_index, second in enumerate(mate2_reads):
            if mate1_lookup.get(joiner.get_paired_identifier(second)):
                continue
            singles_writer.write(second)
            single_total += 1

    if last_mate1_index is None and last_mate2_index is None:
        print("Your input files contained no valid FASTQ sequences.")
    else:
        print('There were %s single reads.' % (single_total))
        print('Interlaced %s pairs of sequences.' % (pair_total))
def main():
    """Interlace two mate FASTQ files into a pairs file and a singles file.

    Reads ``sys.argv``: mate1 file, mate1 type, mate2 file, mate2 type,
    pairs output, singles output (an empty type means 'sanger').  When the
    two types differ a warning is printed and the function returns before
    any file is opened.
    """
    mate1_filename = sys.argv[1]
    mate1_type = sys.argv[2] or 'sanger'
    mate2_filename = sys.argv[3]
    mate2_type = sys.argv[4] or 'sanger'
    outfile_pairs = sys.argv[5]
    outfile_singles = sys.argv[6]

    if mate1_type != mate2_type:
        print("WARNING: You are trying to interlace files of two different types: %s and %s." % (mate1_type, mate2_type))
        return

    fmt = mate1_type
    joiner = fastqJoiner(fmt)
    nof_singles = 0
    nof_pairs = 0
    i = None
    j = None

    # Everything appended here exposes close(); the finally clause releases
    # them even when a read fails to parse, instead of only on success.
    opened = []
    try:
        out_pairs = fastqWriter(open(outfile_pairs, 'wb'), format=fmt)
        opened.append(out_pairs)
        out_singles = fastqWriter(open(outfile_singles, 'wb'), format=fmt)
        opened.append(out_singles)

        # Pairs + singles present in mate1
        mate2_input = fastqNamedReader(open(mate2_filename, 'rb'), format=fmt)
        opened.append(mate2_input)
        mate1_handle = open(mate1_filename, 'rb')
        opened.append(mate1_handle)
        for i, mate1 in enumerate(fastqReader(mate1_handle, format=fmt)):
            mate2 = mate2_input.get(joiner.get_paired_identifier(mate1))
            if mate2:
                out_pairs.write(mate1)
                out_pairs.write(mate2)
                nof_pairs += 1
            else:
                out_singles.write(mate1)
                nof_singles += 1

        # Singles present in mate2
        mate1_input = fastqNamedReader(open(mate1_filename, 'rb'), format=fmt)
        opened.append(mate1_input)
        mate2_handle = open(mate2_filename, 'rb')
        opened.append(mate2_handle)
        for j, mate2 in enumerate(fastqReader(mate2_handle, format=fmt)):
            mate1 = mate1_input.get(joiner.get_paired_identifier(mate2))
            if not mate1:
                out_singles.write(mate2)
                nof_singles += 1
    finally:
        for resource in reversed(opened):
            resource.close()

    if (i is None) and (j is None):
        print("Your input files contained no valid FASTQ sequences.")
    else:
        print('There were %s single reads.' % (nof_singles))
        print('Interlaced %s pairs of sequences.' % (nof_pairs))
def main():
    """Join each read of the first FASTQ file with its mate from the second.

    Reads ``sys.argv``: input1 file, input1 type, input2 file, input2 type,
    output file (an empty type means 'sanger').  Reads of input1 without a
    partner in input2 are skipped and counted.
    """
    # Read command line arguments
    input1_filename = sys.argv[1]
    input1_type = sys.argv[2] or 'sanger'
    input2_filename = sys.argv[3]
    input2_type = sys.argv[4] or 'sanger'
    output_filename = sys.argv[5]

    if input1_type != input2_type:
        print("WARNING: You are trying to join files of two different types: %s and %s." % (input1_type, input2_type))

    input2 = fastqNamedReader(open(input2_filename, 'rb'), input2_type)
    try:
        joiner = fastqJoiner(input1_type)
        out = fastqWriter(open(output_filename, 'wb'), format=input1_type)
        i = None
        skip_count = 0
        try:
            with open(input1_filename, 'rb') as input1_handle:
                for i, fastq_read in enumerate(fastqReader(input1_handle, format=input1_type)):
                    identifier = joiner.get_paired_identifier(fastq_read)
                    fastq_paired = input2.get(identifier)
                    if fastq_paired is None:
                        skip_count += 1
                    else:
                        out.write(joiner.join(fastq_read, fastq_paired))
        finally:
            # Close the writer even if a read fails to parse or join.
            out.close()

        if i is None:
            print("Your file contains no valid FASTQ reads.")
        else:
            # input2 must still be open here: has_data() is queried on it.
            print(input2.has_data())
            joined = i - skip_count + 1
            print('Joined %s of %s read pairs (%.2f%%).' % (joined, i + 1, float(joined) / float(i + 1) * 100.0))
    finally:
        # The named reader was previously never closed.
        input2.close()
def main():
    """De-interlace one FASTQ file into two mate files and two singles files.

    Reads ``sys.argv``: input file, input type (empty means 'sanger'),
    mate1 output, mate2 output, single1 output, single2 output.
    """
    input_filename = sys.argv[1]
    input_type = sys.argv[2] or 'sanger'
    mate1_filename = sys.argv[3]
    mate2_filename = sys.argv[4]
    single1_filename = sys.argv[5]
    single2_filename = sys.argv[6]

    fmt = input_type
    i = None
    skip_count = 0
    # Identifiers already written as the second half of a pair; they are
    # skipped (and forgotten) when the sequential reader reaches them.
    found = set()

    # Everything appended here exposes close(); closing happens in the
    # finally clause so an error part-way through no longer leaks handles.
    opened = []
    try:
        named_reader = fastqNamedReader(open(input_filename, 'rb'), format=fmt)
        opened.append(named_reader)
        mate1_out = fastqWriter(open(mate1_filename, 'wb'), format=fmt)
        opened.append(mate1_out)
        mate2_out = fastqWriter(open(mate2_filename, 'wb'), format=fmt)
        opened.append(mate2_out)
        single1_out = fastqWriter(open(single1_filename, 'wb'), format=fmt)
        opened.append(single1_out)
        single2_out = fastqWriter(open(single2_filename, 'wb'), format=fmt)
        opened.append(single2_out)
        joiner = fastqJoiner(fmt)
        in_handle = open(input_filename, 'rb')
        opened.append(in_handle)

        for i, read in enumerate(fastqReader(in_handle, format=fmt)):
            if read.identifier in found:
                found.remove(read.identifier)
                continue

            mate1 = named_reader.get(read.identifier)
            mate2 = named_reader.get(joiner.get_paired_identifier(mate1))

            if mate2:
                # This is a mate pair
                found.add(mate2.identifier)
                if joiner.is_first_mate(mate1):
                    mate1_out.write(mate1)
                    mate2_out.write(mate2)
                else:
                    mate1_out.write(mate2)
                    mate2_out.write(mate1)
            else:
                # This is a single
                skip_count += 1
                if joiner.is_first_mate(mate1):
                    single1_out.write(mate1)
                else:
                    single2_out.write(mate1)
    finally:
        for resource in opened:
            resource.close()

    if i is None:
        print("Your input file contained no valid FASTQ sequences.")
    else:
        if skip_count:
            print('There were %i reads with no mate.' % skip_count)
        # Floor division keeps the pair count an integer on any Python version.
        print('De-interlaced %s pairs of sequences.' % ((i - skip_count + 1) // 2))
def main():
    """Apply a user-supplied manipulation script to every read of a FASTQ file.

    Reads ``sys.argv``: input file, script file, output file, path for
    additional files, input type (empty means 'sanger').  The script is
    loaded as a module and must provide ``match_and_manipulate_read``.
    Reads for which it returns a falsy value are dropped from the output.
    """
    # Read command line arguments
    input_filename = sys.argv[1]
    script_filename = sys.argv[2]
    output_filename = sys.argv[3]
    additional_files_path = sys.argv[4]
    input_type = sys.argv[5] or 'sanger'

    # Save script file for debugging/verification info later
    os.mkdir(additional_files_path)
    shutil.copy(script_filename, os.path.join(additional_files_path, 'debug.txt'))

    # imp.load_module leaves closing the file to the caller; the handle
    # used to be opened inline and never closed.
    with open(script_filename) as script_handle:
        fastq_manipulator = imp.load_module('fastq_manipulator', script_handle, script_filename, ('', 'r', imp.PY_SOURCE))

    i = None
    reads_manipulated = 0

    writer = fastqWriter(path=output_filename, format=input_type)
    reader = fastqReader(path=input_filename, format=input_type)
    with writer, reader:
        for i, fastq_read in enumerate(reader):
            new_read = fastq_manipulator.match_and_manipulate_read(fastq_read)
            if new_read:
                writer.write(new_read)
            if new_read != fastq_read:
                reads_manipulated += 1

    if i is None:
        print("Your file contains no valid FASTQ reads.")
    else:
        print('Manipulated %s of %s reads (%.2f%%).' % (reads_manipulated, i + 1, float(reads_manipulated) / float(i + 1) * 100.0))
def load_ids(filename, filetype):
    """Yield the record identifiers found in *filename*.

    *filetype* selects the parser:

    * ``"tabular"`` - first tab-separated column of every non-empty line
      that does not start with ``#``;
    * ``"fasta"`` - first whitespace-delimited word after ``>``;
    * anything starting with ``"fastq"`` - first word of each identifier,
      without the leading ``@``;
    * ``"sff"`` - names from Biopython's ``Bio.SeqIO.index``.

    Any other file type terminates the program through ``sys.exit``.
    Files opened here are closed when the generator finishes, is closed,
    or raises.
    """
    if filetype == "tabular":
        with open(filename) as handle:
            for line in handle:
                line = line.rstrip("\n")
                if line and not line.startswith("#"):
                    yield line.split("\t", 1)[0]
    elif filetype == "fasta":
        with open(filename) as handle:
            for line in handle:
                if line.startswith(">"):
                    yield line[1:].rstrip("\n").split(None, 1)[0]
    elif filetype.startswith("fastq"):
        # Use the Galaxy library not Biopython to cope with CS
        from galaxy_utils.sequence.fastq import fastqReader
        # NOTE(review): the "U" mode flag is rejected by Python 3.11+;
        # kept unchanged for the interpreters this script already runs on.
        handle = open(filename, "rU")
        try:
            for record in fastqReader(handle):
                # The [1:] is because the fastqReader leaves the @ on the identifier.
                yield record.identifier.split()[0][1:]
        finally:
            handle.close()
    elif filetype == "sff":
        try:
            from Bio.SeqIO import index
        except ImportError:
            sys.exit("Require Biopython 1.54 or later (to read SFF files)")
        # This will read the SFF index block if present (very fast)
        for name in index(filename, "sff"):
            yield name
    else:
        sys.exit("Unexpected file type %s" % filetype)
def main():
    """Trim a fixed or percentage offset from both ends of every FASTQ read.

    Reads ``sys.argv``: input file, output file, left offset, right offset,
    'offsets_percent' flag, input type (empty means 'sanger') and the
    'keep_zero_length' flag.  Reads that end up empty are dropped unless
    the keep flag is given.
    """
    input_filename = sys.argv[1]
    output_filename = sys.argv[2]
    left_offset = sys.argv[3]
    right_offset = sys.argv[4]
    percent_offsets = sys.argv[5] == 'offsets_percent'
    input_type = sys.argv[6] or 'sanger'
    keep_zero_length = sys.argv[7] == 'keep_zero_length'

    out = fastqWriter(open(output_filename, 'wb'), format=input_type)
    num_reads_excluded = 0
    num_reads = None
    try:
        with open(input_filename) as in_handle:
            for num_reads, fastq_read in enumerate(fastqReader(in_handle, format=input_type)):
                if percent_offsets:
                    left_column_offset = int(round(float(left_offset) / 100.0 * float(len(fastq_read))))
                    right_column_offset = int(round(float(right_offset) / 100.0 * float(len(fastq_read))))
                else:
                    left_column_offset = int(left_offset)
                    right_column_offset = int(right_offset)
                # A positive right offset becomes a negative slice end;
                # anything else means "keep through the end of the read".
                if right_column_offset > 0:
                    right_column_offset = -right_column_offset
                else:
                    right_column_offset = None
                fastq_read = fastq_read.slice(left_column_offset, right_column_offset)
                if keep_zero_length or len(fastq_read):
                    out.write(fastq_read)
                else:
                    num_reads_excluded += 1
    finally:
        # Close the writer even when parsing or slicing a read fails.
        out.close()

    if num_reads is None:
        print("No valid fastq reads could be processed.")
    else:
        print("%i fastq reads were processed." % (num_reads + 1))
    if num_reads_excluded:
        print("%i reads of zero length were excluded from the output." % num_reads_excluded)
def main():
    """Filter FASTQ reads with a user-supplied script executed once per read.

    Reads ``sys.argv``: input file, script file, output file, path for
    additional files, input type (empty means 'sanger').  The script runs
    with ``fastq_read`` and ``ret_val`` as locals; the read is kept when
    the script leaves a truthy ``ret_val``.
    """
    # Read command line arguments
    input_filename = sys.argv[1]
    script_filename = sys.argv[2]
    output_filename = sys.argv[3]
    additional_files_path = sys.argv[4]
    input_type = sys.argv[5] or 'sanger'

    # Save script file for debugging/verification info later
    os.mkdir(additional_files_path)
    shutil.copy(script_filename, os.path.join(additional_files_path, 'debug.txt'))

    out = fastqWriter(open(output_filename, 'wb'), format=input_type)

    i = None
    reads_kept = 0
    try:
        with open(input_filename) as in_handle:
            for i, fastq_read in enumerate(fastqReader(in_handle, format=input_type)):
                local = {'fastq_read': fastq_read, 'ret_val': False}
                execfile(script_filename, {}, local)
                if local['ret_val']:
                    out.write(fastq_read)
                    reads_kept += 1
    finally:
        # The user script may raise; close the output regardless.
        out.close()

    if i is None:
        print("Your file contains no valid fastq reads.")
    else:
        print('Kept %s of %s reads (%.2f%%).' % (reads_kept, i + 1, float(reads_kept) / float(i + 1) * 100.0))
def main():
    """Keep only the FASTQ reads accepted by a user-supplied script.

    Reads ``sys.argv``: input file, script file, output file, path for
    additional files, input type (empty means 'sanger').  For every read
    the script is executed with ``fastq_read`` and ``ret_val`` in its
    local namespace; a truthy ``ret_val`` afterwards keeps the read.
    """
    # Read command line arguments
    input_filename = sys.argv[1]
    script_filename = sys.argv[2]
    output_filename = sys.argv[3]
    additional_files_path = sys.argv[4]
    input_type = sys.argv[5] or 'sanger'

    # Save script file for debugging/verification info later
    os.mkdir(additional_files_path)
    shutil.copy(script_filename, os.path.join(additional_files_path, 'debug.txt'))

    out = fastqWriter(open(output_filename, 'wb'), format=input_type)

    i = None
    reads_kept = 0
    try:
        with open(input_filename) as in_handle:
            for i, fastq_read in enumerate(fastqReader(in_handle, format=input_type)):
                local = {'fastq_read': fastq_read, 'ret_val': False}
                execfile(script_filename, {}, local)
                if not local['ret_val']:
                    continue
                out.write(fastq_read)
                reads_kept += 1
    finally:
        # Errors from the user script must not leave the output unclosed.
        out.close()

    if i is None:
        print("Your file contains no valid fastq reads.")
    else:
        print('Kept %s of %s reads (%.2f%%).' % (reads_kept, i + 1, float(reads_kept) / float(i + 1) * 100.0))
def main():
    """Filter FASTQ reads through ``fastq_read_pass_filter`` from a user script.

    Reads ``sys.argv``: input file, script file, output file, path for
    additional files, input type (empty means 'sanger').  The script is
    executed once into this module's globals and must define
    ``fastq_read_pass_filter(fastq_read)``; reads for which it returns a
    truthy value are written to the output.
    """
    # Read command line arguments
    input_filename = sys.argv[1]
    script_filename = sys.argv[2]
    output_filename = sys.argv[3]
    additional_files_path = sys.argv[4]
    input_type = sys.argv[5] or 'sanger'

    # Save script file for debugging/verification info later
    os.mkdir(additional_files_path)
    shutil.copy(script_filename, os.path.join(additional_files_path, 'debug.txt'))

    ## Dan, Others: Can we simply drop the "format=input_type" here since it is specified in reader.
    ## This optimization would cut runtime roughly in half (for my test case anyway). -John
    out = fastqWriter(open(output_filename, 'wb'), format=input_type)

    i = None
    reads_kept = 0
    try:
        execfile(script_filename, globals())
        with open(input_filename) as in_handle:
            for i, fastq_read in enumerate(fastqReader(in_handle, format=input_type)):
                ret_val = fastq_read_pass_filter(fastq_read)  ## fastq_read_pass_filter defined in script_filename
                if ret_val:
                    out.write(fastq_read)
                    reads_kept += 1
    finally:
        # Close the output even if the user script or the parser raises.
        out.close()

    if i is None:
        print("Your file contains no valid fastq reads.")
    else:
        print('Kept %s of %s reads (%.2f%%).' % (reads_kept, i + 1, float(reads_kept) / float(i + 1) * 100.0))
def main():
    """Filter FASTQ reads through ``fastq_read_pass_filter`` from a user script.

    Reads ``sys.argv``: input file, script file, output file, path for
    additional files, input type (empty means 'sanger').  The script is
    executed once into this module's globals and must define
    ``fastq_read_pass_filter(fastq_read)``; reads for which it returns a
    truthy value are written to the output.
    """
    # Read command line arguments
    input_filename = sys.argv[1]
    script_filename = sys.argv[2]
    output_filename = sys.argv[3]
    additional_files_path = sys.argv[4]
    input_type = sys.argv[5] or 'sanger'

    # Save script file for debugging/verification info later
    os.mkdir(additional_files_path)
    shutil.copy(script_filename, os.path.join(additional_files_path, 'debug.txt'))

    # Dan, Others: Can we simply drop the "format=input_type" here since it is specified in reader.
    # This optimization would cut runtime roughly in half (for my test case anyway). -John
    out = fastqWriter(path=output_filename, format=input_type)

    i = None
    reads_kept = 0
    try:
        execfile(script_filename, globals())
        reader = fastqReader(path=input_filename, format=input_type)
        # Context-managed like the sibling tools so the input is released
        # even when the user script raises.
        with reader:
            for i, fastq_read in enumerate(reader):
                ret_val = fastq_read_pass_filter(fastq_read)  # fastq_read_pass_filter defined in script_filename  # NOQA
                if ret_val:
                    out.write(fastq_read)
                    reads_kept += 1
    finally:
        out.close()

    if i is None:
        print("Your file contains no valid fastq reads.")
    else:
        print('Kept %s of %s reads (%.2f%%).' % (reads_kept, i + 1, float(reads_kept) / float(i + 1) * 100.0))
def test_invalid_header(self):
    """Iterating a file with an invalid header raises fastqFormatError."""
    bad_reader = fastqReader(path=_data_path('fastqreader_min_invalid-header'))
    with self.assertRaises(fastqFormatError):
        for _read in bad_reader:
            pass
def main():
    """Join each read of the first FASTQ file with its mate from the second.

    Reads ``sys.argv``: input1 file, input1 type, input2 file, input2 type,
    output file (an empty type means 'sanger').  Reads of input1 without a
    partner in input2 are skipped and counted.
    """
    # Read command line arguments
    input1_filename = sys.argv[1]
    input1_type = sys.argv[2] or 'sanger'
    input2_filename = sys.argv[3]
    input2_type = sys.argv[4] or 'sanger'
    output_filename = sys.argv[5]

    if input1_type != input2_type:
        print("WARNING: You are trying to join files of two different types: %s and %s." % (input1_type, input2_type))

    input2 = fastqNamedReader(open(input2_filename, 'rb'), input2_type)
    try:
        joiner = fastqJoiner(input1_type)
        out = fastqWriter(open(output_filename, 'wb'), format=input1_type)
        i = None
        skip_count = 0
        try:
            with open(input1_filename, 'rb') as input1_handle:
                for i, fastq_read in enumerate(fastqReader(input1_handle, format=input1_type)):
                    fastq_paired = input2.get(joiner.get_paired_identifier(fastq_read))
                    if fastq_paired is None:
                        skip_count += 1
                    else:
                        out.write(joiner.join(fastq_read, fastq_paired))
        finally:
            # The writer is closed on failure as well as on success.
            out.close()

        if i is None:
            print("Your file contains no valid FASTQ reads.")
        else:
            # has_data() is queried while input2 is still open.
            print(input2.has_data())
            joined = i - skip_count + 1
            print('Joined %s of %s read pairs (%.2f%%).' % (joined, i + 1, float(joined) / float(i + 1) * 100.0))
    finally:
        # The named reader used to be left open.
        input2.close()
def main():
    """Split every read of a FASTQ file into two reads written to two files.

    Reads ``sys.argv``: input file, input type (empty means 'sanger'),
    first output, second output.  Reads for which the splitter does not
    return two truthy halves are skipped and counted.
    """
    # Read command line arguments
    input_filename = sys.argv[1]
    input_type = sys.argv[2] or 'sanger'
    output1_filename = sys.argv[3]
    output2_filename = sys.argv[4]

    splitter = fastqSplitter()
    out1 = fastqWriter(path=output1_filename, format=input_type)
    out2 = fastqWriter(path=output2_filename, format=input_type)
    reader = fastqReader(path=input_filename, format=input_type)

    i = None
    skip_count = 0
    # Context-managed like the sibling tools: the reader used to be left
    # open and the writers were closed only on the success path.
    with out1, out2, reader:
        for i, fastq_read in enumerate(reader):
            read1, read2 = splitter.split(fastq_read)
            if read1 and read2:
                out1.write(read1)
                out2.write(read2)
            else:
                skip_count += 1

    if i is None:
        print("Your file contains no valid FASTQ reads.")
    else:
        print('Split %s of %s reads (%.2f%%).' % (i - skip_count + 1, i + 1, float(i - skip_count + 1) / float(i + 1) * 100.0))
def main():
    """Mask bases of FASTQ reads according to their quality score.

    Command line: ``[options] input_file output_file``.  Each base whose
    decimal quality score satisfies the chosen comparison against
    ``--quality_score`` is replaced by the mask character, or lower-cased
    when ``--lowercase`` is given.
    """
    usage = "usage: %prog [options] input_file output_file"
    parser = OptionParser(usage=usage)
    parser.add_option('-f', '--format', dest='format', type='choice', default='sanger', choices=('sanger', 'solexa', 'illumina', 'sanger.gz', 'solexa.gz', 'illumina.gz', 'sanger.bz2', 'solexa.bz2', 'illumina.bz2'), help='FASTQ variant type')
    parser.add_option('-m', '--mask_character', dest='mask_character', default='N', help='Mask Character to use')
    parser.add_option('-c', '--score_comparison', type="choice", dest='score_comparison', default='le', choices=('gt', 'ge', 'eq', 'lt', 'le', 'ne'), help='Mask base when score is')
    parser.add_option('-s', '--quality_score', type="float", dest='quality_score', default='0', help='Quality Score')
    parser.add_option("-l", "--lowercase", action="store_true", dest="lowercase", default=False, help="Use lowercase masking")
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error("Need to specify an input file and an output file")

    score_comparer = get_score_comparer(options.score_comparison)

    if options.lowercase:
        base_masker = str.lower
    else:
        base_masker = BaseReplacer(options.mask_character)

    out = fastqWriter(path=args[1], format=options.format)
    reader = fastqReader(path=args[0], format=options.format)

    num_reads = None
    # The writer was previously never closed, so the output relied on
    # interpreter shutdown to be flushed; close both deterministically.
    with out, reader:
        for num_reads, fastq_read in enumerate(reader):
            sequence_list = list(fastq_read.sequence)
            for i, quality_score in enumerate(fastq_read.get_decimal_quality_scores()):
                if score_comparer(quality_score, options.quality_score):
                    sequence_list[i] = base_masker(sequence_list[i])
            fastq_read.sequence = "".join(sequence_list)
            out.write(fastq_read)

    if num_reads is not None:
        print("Processed %i %s reads." % (num_reads + 1, options.format))
    else:
        print("No valid FASTQ reads were provided.")
def main():
    """Apply a user-supplied manipulation script to every read of a FASTQ file.

    Reads ``sys.argv``: input file, script file, output file, path for
    additional files, input type (empty means 'sanger').  The script is
    loaded as a module and must provide ``match_and_manipulate_read``.
    Reads for which it returns a falsy value are dropped from the output.
    """
    # Read command line arguments
    input_filename = sys.argv[1]
    script_filename = sys.argv[2]
    output_filename = sys.argv[3]
    additional_files_path = sys.argv[4]
    input_type = sys.argv[5] or 'sanger'

    # Save script file for debugging/verification info later
    os.mkdir(additional_files_path)
    shutil.copy(script_filename, os.path.join(additional_files_path, 'debug.txt'))

    # imp.load_module leaves closing the file to the caller; the handle
    # used to be opened inline and never closed.
    with open(script_filename) as script_handle:
        fastq_manipulator = imp.load_module('fastq_manipulator', script_handle, script_filename, ('', 'r', imp.PY_SOURCE))

    out = fastqWriter(path=output_filename, format=input_type)
    reader = fastqReader(path=input_filename, format=input_type)

    i = None
    reads_manipulated = 0
    # Writer and reader are released even if the user script raises.
    with out, reader:
        for i, fastq_read in enumerate(reader):
            new_read = fastq_manipulator.match_and_manipulate_read(fastq_read)
            if new_read:
                out.write(new_read)
            if new_read != fastq_read:
                reads_manipulated += 1

    if i is None:
        print("Your file contains no valid FASTQ reads.")
    else:
        print('Manipulated %s of %s reads (%.2f%%).' % (reads_manipulated, i + 1, float(reads_manipulated) / float(i + 1) * 100.0))
def test_fastq_reader_cleanup():
    """The reader closes the handle it was given once iteration is exhausted."""
    handle = open(_data_path("sanger_full_range_original_sanger.fastqsanger"))
    with _new_argv([handle]):
        for _read in fastqReader(handle):
            pass
    assert handle.closed
def main():
    """De-interlace one FASTQ file into two mate files and two singles files.

    Reads ``sys.argv``: input file, input type (empty means 'sanger'),
    mate1 output, mate2 output, single1 output, single2 output.
    """
    input_filename = sys.argv[1]
    input_type = sys.argv[2] or 'sanger'
    mate1_filename = sys.argv[3]
    mate2_filename = sys.argv[4]
    single1_filename = sys.argv[5]
    single2_filename = sys.argv[6]

    fmt = input_type
    named_reader = fastqNamedReader(path=input_filename, format=fmt)
    mate1_out = fastqWriter(path=mate1_filename, format=fmt)
    mate2_out = fastqWriter(path=mate2_filename, format=fmt)
    single1_out = fastqWriter(path=single1_filename, format=fmt)
    single2_out = fastqWriter(path=single2_filename, format=fmt)
    joiner = fastqJoiner(fmt)
    reader = fastqReader(path=input_filename, format=fmt)

    i = None
    skip_count = 0
    # Identifiers already written as the second half of a pair.
    found = set()

    # Context-managed so every handle is released even when a read fails;
    # previously they were closed only at the end of a successful run.
    with named_reader, mate1_out, mate2_out, single1_out, single2_out, reader:
        for i, read in enumerate(reader):
            if read.identifier in found:
                found.remove(read.identifier)
                continue

            mate1 = named_reader.get(read.identifier)
            mate2 = named_reader.get(joiner.get_paired_identifier(mate1))

            if mate2:
                # This is a mate pair
                found.add(mate2.identifier)
                if joiner.is_first_mate(mate1):
                    mate1_out.write(mate1)
                    mate2_out.write(mate2)
                else:
                    mate1_out.write(mate2)
                    mate2_out.write(mate1)
            else:
                # This is a single
                skip_count += 1
                if joiner.is_first_mate(mate1):
                    single1_out.write(mate1)
                else:
                    single2_out.write(mate1)

    if i is None:
        print("Your input file contained no valid FASTQ sequences.")
    else:
        if skip_count:
            print('There were %i reads with no mate.' % skip_count)
        # Floor division: true division would print e.g. "3.0 pairs" on Python 3.
        print('De-interlaced %s pairs of sequences.' % ((i - skip_count + 1) // 2))
def test_file_closed_on_completion(self):
    """Exhausting the reader closes the file handle it was constructed with."""
    handle = open(_data_path('fastqreader_min'))
    for _read in fastqReader(handle):
        pass
    self.assertTrue(handle.closed, 'File should be closed after iteration compeletes')
def main():
    """De-interlace one FASTQ file into two mate files and two singles files.

    Reads ``sys.argv``: input file, input type (empty means 'sanger'),
    mate1 output, mate2 output, single1 output, single2 output.
    """
    input_filename = sys.argv[1]
    input_type = sys.argv[2] or 'sanger'
    mate1_filename = sys.argv[3]
    mate2_filename = sys.argv[4]
    single1_filename = sys.argv[5]
    single2_filename = sys.argv[6]

    fmt = input_type
    joiner = fastqJoiner(fmt)
    i = None
    skip_count = 0
    # Identifiers already written as the second half of a pair.
    found = set()

    mate1_out = fastqWriter(path=mate1_filename, format=fmt)
    mate2_out = fastqWriter(path=mate2_filename, format=fmt)
    single1_out = fastqWriter(path=single1_filename, format=fmt)
    single2_out = fastqWriter(path=single2_filename, format=fmt)
    reader1 = fastqNamedReader(path=input_filename, format=fmt)
    reader2 = fastqReader(path=input_filename, format=fmt)

    with mate1_out, mate2_out, single1_out, single2_out, reader1, reader2:
        for i, read in enumerate(reader2):
            if read.identifier in found:
                found.remove(read.identifier)
                continue

            mate1 = reader1.get(read.identifier)
            mate2 = reader1.get(joiner.get_paired_identifier(mate1))

            if mate2:
                # This is a mate pair
                found.add(mate2.identifier)
                if joiner.is_first_mate(mate1):
                    mate1_out.write(mate1)
                    mate2_out.write(mate2)
                else:
                    mate1_out.write(mate2)
                    mate2_out.write(mate1)
            else:
                # This is a single
                skip_count += 1
                if joiner.is_first_mate(mate1):
                    single1_out.write(mate1)
                else:
                    single2_out.write(mate1)

    if i is None:
        print("Your input file contained no valid FASTQ sequences.")
    else:
        if skip_count:
            print('There were %i reads with no mate.' % skip_count)
        # Floor division: true division would print e.g. "3.0 pairs" on Python 3.
        print('De-interlaced %s pairs of sequences.' % ((i - skip_count + 1) // 2))
def test_file_closed_on_line3_error(self):
    """The handle ends up closed when iteration fails on an invalid line 3."""
    handle = open(_data_path('fastqreader_min_invalid-line3'))
    with fastqReader(handle) as reader:
        with self.assertRaises(fastqFormatError):
            for _read in reader:
                pass
    self.assertTrue(
        handle.closed,
        'File should be closed if exception occurs due to invalid line3')
def test_read_sequence_multiline(self):
    """Sequences spread over several lines are read as one sequence each."""
    reads = list(fastqReader(path=_data_path('fastqreader_min-multiline')))
    first_seq = 'ACGTACGTACGTACGTACGT'
    second_seq = 'CATGCATGCATGCATGCATG'
    self.assertEqual(2, len(reads))
    self.assertEqual(first_seq, reads[0].get_sequence())
    self.assertEqual(second_seq, reads[1].get_sequence())
def test_read_sequence(self):
    """The minimal fixture yields two reads with the expected sequences."""
    with fastqReader(path=_data_path('fastqreader_min')) as reader:
        reads = list(reader)
    first_seq = 'ACGTACGTAC'
    second_seq = 'CATGCATGCA'
    self.assertEqual(2, len(reads))
    self.assertEqual(first_seq, reads[0].get_sequence())
    self.assertEqual(second_seq, reads[1].get_sequence())
def test_read_qualityscores(self):
    """The minimal fixture yields two reads with the expected quality strings."""
    reads = list(fastqReader(path=_data_path('fastqreader_min')))
    first_quality = '!##$%&&()*'
    second_quality = '~}|{zyxwvu'
    self.assertEqual(2, len(reads))
    self.assertEqual(first_quality, reads[0].quality)
    self.assertEqual(second_quality, reads[1].quality)
def test_read_qualityscores_multiline(self):
    """Quality strings spread over several lines are read as one string each."""
    reads = list(fastqReader(path=_data_path('fastqreader_min-multiline')))
    first_quality = '!##$%&&()**,-./01234'
    second_quality = '~}|{zyxwvutsrqponmlk'
    self.assertEqual(2, len(reads))
    self.assertEqual(first_quality, reads[0].quality)
    self.assertEqual(second_quality, reads[1].quality)
def fastq_filter(in_file, out_file, iterator_filter):
    """Write the reads selected by *iterator_filter* and return how many.

    *iterator_filter* is called with a ``fastqReader`` over *in_file* and
    must return an iterable of records; each one is written to *out_file*.
    Reader and writer are closed even if the filter or the parser raises.
    """
    count = 0
    # from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
    # NOTE(review): the "U" mode flag is rejected by Python 3.11+;
    # kept unchanged for the interpreters this script already runs on.
    reader = fastqReader(open(in_file, "rU"))
    try:
        writer = fastqWriter(open(out_file, "w"))
        try:
            for record in iterator_filter(reader):
                count += 1
                writer.write(record)
        finally:
            writer.close()
    finally:
        reader.close()
    return count
def test_read_header(self):
    """Identifiers are returned with their leading '@' intact."""
    reads = list(fastqReader(path=_data_path('fastqreader_min')))
    first_header = '@FAKE-1'
    second_header = '@FAKE-2'
    self.assertEqual(2, len(reads))
    self.assertEqual(first_header, reads[0].identifier)
    self.assertEqual(second_header, reads[1].identifier)
def main():
    """Write per-column summary statistics for a FASTQ file.

    Reads ``sys.argv``: input file, output file, input type (empty means
    'sanger').  One tab-separated row is written per read column with the
    summary statistics, the outliers and the per-base counts.  Afterwards
    the read count, the valid formats and the observed ASCII/decimal
    quality ranges are printed.
    """
    input_filename = sys.argv[1]
    output_filename = sys.argv[2]
    input_type = sys.argv[3] or 'sanger'

    aggregator = fastqAggregator()
    num_reads = None
    fastq_read = None
    # Context-managed so the input is released even on a parse error.
    with open(input_filename) as in_handle:
        for num_reads, fastq_read in enumerate(fastqReader(in_handle, format=input_type)):
            aggregator.consume_read(fastq_read)

    # Context-managed so the report file is closed even if a statistic
    # lookup fails part-way through.
    with open(output_filename, 'wb') as out:
        valid_nucleotides = VALID_NUCLEOTIDES
        # The last read consumed decides which header/alphabet is used.
        if fastq_read:
            if fastq_read.sequence_space == 'base':
                out.write('#column\tcount\tmin\tmax\tsum\tmean\tQ1\tmed\tQ3\tIQR\tlW\trW\toutliers\tA_Count\tC_Count\tG_Count\tT_Count\tN_Count\tother_bases\tother_base_count\n')
            else:
                out.write('#column\tcount\tmin\tmax\tsum\tmean\tQ1\tmed\tQ3\tIQR\tlW\trW\toutliers\t0_Count\t1_Count\t2_Count\t3_Count\t4_Count\t5_Count\t6_Count\t._Count\tother_bases\tother_base_count\n')
                valid_nucleotides = VALID_COLOR_SPACE
        for i in range(aggregator.get_max_read_length()):
            column_stats = aggregator.get_summary_statistics_for_column(i)
            out.write('%i\t' % (i + 1))
            out.write('%s\t' * len(SUMMARY_STAT_ORDER) % tuple([column_stats[key] for key in SUMMARY_STAT_ORDER]))
            out.write('%s\t' % ','.join(map(str, column_stats['outliers'])))
            base_counts = aggregator.get_base_counts_for_column(i)
            for nuc in valid_nucleotides:
                out.write("%s\t" % base_counts.get(nuc, 0))
            extra_nucs = sorted([nuc for nuc in base_counts.keys() if nuc not in valid_nucleotides])
            out.write("%s\t%s\n" % (','.join(extra_nucs), ','.join(str(base_counts[nuc]) for nuc in extra_nucs)))

    if num_reads is None:
        print("No valid fastq reads could be processed.")
    else:
        print("%i fastq reads were processed." % (num_reads + 1))
        print("Based upon quality values and sequence characters, the input data is valid for: %s" % (", ".join(aggregator.get_valid_formats()) or "None"))
        ascii_range = aggregator.get_ascii_range()
        decimal_range = aggregator.get_decimal_range()
        # print using repr, since \x00 (null) causes info truncation in galaxy when printed
        print("Input ASCII range: %s(%i) - %s(%i)" % (repr(ascii_range[0]), ord(ascii_range[0]), repr(ascii_range[1]), ord(ascii_range[1])))
        print("Input decimal range: %i - %i" % (decimal_range[0], decimal_range[1]))
def test_read_qualityscores_edgecase_multiline(self):
    """Multi-line quality strings containing '+' and '@' are parsed intact."""
    # Quality score input designed to confuse the parser
    reads = list(fastqReader(path=_data_path('fastqreader_min-multiline-edgecase')))
    first_quality = '+##$%&&()*+##$%&&()*@,-./01234'
    second_quality = '@}|{zyxwvu@}|{zyxwvu+srqponmlk'
    self.assertEqual(2, len(reads))
    self.assertEqual(first_quality, reads[0].quality)
    self.assertEqual(second_quality, reads[1].quality)
def test_read_line3(self):
    """A line 3 that repeats line 1 does not disturb sequence or quality."""
    # Separate test case for line3 containing a copy of line1
    reads = list(fastqReader(path=_data_path('fastqreader_min-line3')))
    wanted = (
        ('ACGTACGTAC', '!##$%&&()*'),
        ('CATGCATGCA', '~}|{zyxwvu'),
    )
    self.assertEqual(2, len(reads))
    for (sequence, quality), read in zip(wanted, reads):
        self.assertEqual(quality, read.quality)
        self.assertEqual(sequence, read.get_sequence())
def main():
    """Convert a FASTQ file to FASTA.

    Reads ``sys.argv``: input file, output file, input type (empty means
    'sanger').  Every read is passed to a ``fastaWriter``.
    """
    input_filename = sys.argv[1]
    output_filename = sys.argv[2]
    input_type = sys.argv[3] or 'sanger'  # input type should ordinarily be unnecessary

    num_reads = None
    out = fastaWriter(path=output_filename, format="fasta")
    try:
        reader = fastqReader(path=input_filename, format=input_type)
        # The reader used to be left open; the writer is closed in the
        # finally clause so a parse error cannot leave it unclosed.
        with reader:
            for num_reads, fastq_read in enumerate(reader):
                out.write(fastq_read)
    finally:
        out.close()

    if num_reads is None:
        print("No valid FASTQ reads could be processed.")
    else:
        print("%i FASTQ reads were converted to FASTA." % (num_reads + 1))
def test_invalid_line3_stripped(self):
    """With fix_id=True an inconsistent line 3 is reduced to a bare '+'."""
    # fix_id=True: fix inconsistent identifiers (source: SRA data dumps)
    reader = fastqReader(path=_data_path('fastqreader_min_invalid-line3'), fix_id=True)
    reads = list(reader)
    wanted = (
        ('ACGTACGTAC', '!##$%&&()*'),
        ('CATGCATGCA', '~}|{zyxwvu'),
    )
    self.assertEqual(2, len(reads))
    for (sequence, quality), read in zip(wanted, reads):
        self.assertEqual(quality, read.quality)
        self.assertEqual(sequence, read.get_sequence())
        self.assertEqual('+', read.description)
def main():
    """Join each read of the first FASTQ file with its mate from the second.

    Reads ``sys.argv``: input1 file, input1 type, input2 file, input2 type,
    output file, FASTQ header style ('new' or anything else for the old
    style; empty means 'old') and the paste string placed between the two
    reads (may be empty).  An empty type means 'sanger'.
    """
    # Read command line arguments
    input1_filename = sys.argv[1]
    input1_type = sys.argv[2] or 'sanger'
    input2_filename = sys.argv[3]
    input2_type = sys.argv[4] or 'sanger'
    output_filename = sys.argv[5]

    fastq_style = sys.argv[6] or 'old'

    paste = sys.argv[7] or ''
    # --
    if input1_type != input2_type:
        print("WARNING: You are trying to join files of two different types: %s and %s." % (input1_type, input2_type))

    if fastq_style == 'new':
        sep = sniff_sep(input1_filename)
        joiner = FastqJoiner(input1_type, sep=sep, paste=paste)
    else:
        joiner = fq.fastqJoiner(input1_type, paste=paste)
    # --

    i = None
    skip_count = 0

    writer = fq.fastqWriter(path=output_filename, format=input1_type)
    reader1 = fq.fastqReader(path=input1_filename, format=input1_type)
    reader2 = fq.fastqNamedReader(path=input2_filename, format=input2_type)

    with writer, reader1, reader2:
        for i, fastq_read in enumerate(reader1):
            identifier = joiner.get_paired_identifier(fastq_read)
            fastq_paired = reader2.get(identifier)
            if fastq_paired is None:
                skip_count += 1
            else:
                writer.write(joiner.join(fastq_read, fastq_paired))

        # this indent is correct: we still need access to reader2
        if i is None:
            print("Your file contains no valid FASTQ reads.")
        else:
            print(reader2.has_data())
            joined = i - skip_count + 1
            # Explicit float() keeps the percentage right under Python 2
            # integer division, where int / int would truncate to 0 or 1.
            print('Joined %s of %s read pairs (%.2f%%).' % (joined, i + 1, float(joined) / float(i + 1) * 100.0))
def main():
    """Convert a FASTQ file to FASTA.

    Reads ``sys.argv``: input file, output file, input type (empty means
    'sanger').  Every read is passed to a ``fastaWriter``.
    """
    input_filename = sys.argv[1]
    output_filename = sys.argv[2]
    input_type = sys.argv[3] or 'sanger'  # input type should ordinarily be unnecessary

    num_reads = None
    out = fastaWriter(open(output_filename, 'wb'))
    try:
        with open(input_filename) as in_handle:
            for num_reads, fastq_read in enumerate(fastqReader(in_handle, format=input_type)):
                out.write(fastq_read)
    finally:
        # Close the writer even when a read fails to parse.
        out.close()

    if num_reads is None:
        print("No valid FASTQ reads could be processed.")
    else:
        print("%i FASTQ reads were converted to FASTA." % (num_reads + 1))
def main():
    """Convert a FASTQ file to FASTA.

    Reads ``sys.argv``: input file, output file, input type (empty means
    'sanger').  Every read is passed to a ``fastaWriter``.
    """
    source_path, target_path = sys.argv[1], sys.argv[2]
    fastq_format = sys.argv[3] or 'sanger'  # input type should ordinarily be unnecessary

    last_index = None
    fasta_out = fastaWriter(path=target_path, format="fasta")
    fastq_in = fastqReader(path=source_path, format=fastq_format)
    with fastq_in:
        for last_index, record in enumerate(fastq_in):
            fasta_out.write(record)
    fasta_out.close()

    if last_index is not None:
        print("%i FASTQ reads were converted to FASTA." % (last_index + 1))
    else:
        print("No valid FASTQ reads could be processed.")
def main():
    """Convert a FASTQ file to three-column SFQ text (name, sequence, quality).

    Reads ``sys.argv``: input file, output file, input type (empty means
    'sanger') and a 0/1 renumber flag.  With renumbering the name column
    is the zero-based read index in hexadecimal; otherwise it is the read
    identifier without its leading character.
    """
    input_filename = sys.argv[1]
    output_filename = sys.argv[2]
    input_type = sys.argv[3] or 'sanger'  # input type should ordinarily be unnecessary
    renum = bool(int(sys.argv[4]))

    num_reads = None
    # Both handles are context-managed so they are closed on errors too.
    with open(output_filename, "w") as out:
        with open(input_filename) as in_handle:
            for num_reads, fastq_read in enumerate(fastqReader(in_handle, format=input_type)):
                if not renum:
                    out.write("%s\t%s\t%s\n" % (fastq_read.identifier[1:], fastq_read.sequence, fastq_read.quality))
                else:
                    out.write("%x\t%s\t%s\n" % (num_reads, fastq_read.sequence, fastq_read.quality))

    if num_reads is None:
        print("No valid FASTQ reads could be processed.")
    else:
        print("%i FASTQ reads were converted to SFQ." % (num_reads + 1))
def main():
    """Trim a left and a right offset from every read of a FASTQ file.

    Positional command-line arguments, in order: input path, output path,
    left offset, right offset, offset mode ('offsets_percent' makes the
    offsets percentages of each read's length), FASTQ variant (an empty
    value falls back to 'sanger') and the zero-length policy
    ('keep_zero_length' writes reads that end up empty).

    Prints how many reads were processed and, if any, how many were
    dropped for having zero length after trimming.
    """
    argv = sys.argv
    input_filename = argv[1]
    output_filename = argv[2]
    left_offset = argv[3]
    right_offset = argv[4]
    percent_offsets = argv[5] == 'offsets_percent'
    input_type = argv[6] or 'sanger'
    keep_zero_length = argv[7] == 'keep_zero_length'

    out = fastqWriter(path=output_filename, format=input_type)
    processed = 0
    excluded = 0
    for fastq_read in fastqReader(path=input_filename, format=input_type):
        processed += 1
        if not percent_offsets:
            start = int(left_offset)
            from_right = int(right_offset)
        else:
            # Percentages are resolved against the length of this read.
            start = int(round(float(left_offset) / 100.0 * float(len(fastq_read))))
            from_right = int(round(float(right_offset) / 100.0 * float(len(fastq_read))))

        # The right offset is passed as a negative end index; zero becomes
        # None (presumably "no trimming on the right" - depends on slice()).
        stop = -from_right if from_right != 0 else None
        trimmed = fastq_read.slice(start, stop)

        if keep_zero_length or len(trimmed):
            out.write(trimmed)
        else:
            excluded += 1
    out.close()

    if processed:
        print("%i fastq reads were processed." % processed)
    else:
        print("No valid fastq reads could be processed.")
    if excluded:
        print("%i reads of zero length were excluded from the output." % excluded)
def main():
    """Join paired reads from two FASTQ files into a single FASTQ file.

    Positional command-line arguments, in order: first input path, first
    input type, second input path, second input type, output path, FASTQ
    header style ('new' selects the separator-sniffing joiner) and the
    paste sequence handed to the joiner. Empty type, style and paste
    arguments fall back to 'sanger', 'old' and '' respectively.

    Reads from the first input that have no partner in the second input
    are counted as skipped and are not written.
    """
    # Read command line arguments
    input1_filename = sys.argv[1]
    input1_type = sys.argv[2] or 'sanger'
    input2_filename = sys.argv[3]
    input2_type = sys.argv[4] or 'sanger'
    output_filename = sys.argv[5]
    fastq_style = sys.argv[6] or 'old'
    paste = sys.argv[7] or ''

    # A type mismatch is only reported; processing continues regardless.
    if input1_type != input2_type:
        print("WARNING: You are trying to join files of two different types: %s and %s." % (input1_type, input2_type))

    if fastq_style == 'new':
        sep = sniff_sep(input1_filename)
        joiner = FastqJoiner(input1_type, sep=sep, paste=paste)
    else:
        joiner = fq.fastqJoiner(input1_type, paste=paste)

    input2 = fq.fastqNamedReader(path=input2_filename, format=input2_type)
    out = fq.fastqWriter(path=output_filename, format=input1_type)
    i = None
    skip_count = 0
    try:
        for i, fastq_read in enumerate(fq.fastqReader(path=input1_filename, format=input1_type)):
            identifier = joiner.get_paired_identifier(fastq_read)
            fastq_paired = input2.get(identifier)
            if fastq_paired is None:
                skip_count += 1
            else:
                out.write(joiner.join(fastq_read, fastq_paired))
    finally:
        # Close the writer even when a read fails to parse or join, so the
        # output handle is never left dangling.
        out.close()

    if i is None:
        print("Your file contains no valid FASTQ reads.")
    else:
        # input2 is deliberately still in use here for its status message.
        print(input2.has_data())
        print('Joined %s of %s read pairs (%.2f%%).' % (i - skip_count + 1, i + 1, (i - skip_count + 1) / (i + 1) * 100.0))
def main():
    """Write per-column summary statistics for a FASTQ file as a table.

    Positional command-line arguments, in order: input FASTQ path, output
    table path and FASTQ variant (an empty value falls back to 'sanger').

    All reads are fed to a fastqAggregator. One tab-separated row is then
    written per column up to the aggregator's maximum read length: the
    1-based column number, the statistics named in SUMMARY_STAT_ORDER, the
    outliers, one count per expected symbol and finally any unexpected
    symbols with their counts. A summary of the input is printed to
    stdout.
    """
    input_filename = sys.argv[1]
    output_filename = sys.argv[2]
    input_type = sys.argv[3] or 'sanger'

    aggregator = fastqAggregator()
    num_reads = None
    fastq_read = None
    for num_reads, fastq_read in enumerate(fastqReader(path=input_filename, format=input_type)):
        aggregator.consume_read(fastq_read)

    valid_nucleotides = VALID_NUCLEOTIDES
    # The context manager closes the table even if a column fails to format.
    with open(output_filename, 'w') as out:
        # Compare against None rather than relying on truthiness: reads
        # support len(), so a zero-length final read would otherwise
        # suppress the header and the colour-space symbol set.
        if fastq_read is not None:
            # The header (and symbol set) follows the last read seen.
            if fastq_read.sequence_space == 'base':
                out.write('#column\tcount\tmin\tmax\tsum\tmean\tQ1\tmed\tQ3\tIQR\tlW\trW\toutliers\tA_Count\tC_Count\tG_Count\tT_Count\tN_Count\tother_bases\tother_base_count\n')
            else:
                out.write('#column\tcount\tmin\tmax\tsum\tmean\tQ1\tmed\tQ3\tIQR\tlW\trW\toutliers\t0_Count\t1_Count\t2_Count\t3_Count\t4_Count\t5_Count\t6_Count\t._Count\tother_bases\tother_base_count\n')
                valid_nucleotides = VALID_COLOR_SPACE

        for i in range(aggregator.get_max_read_length()):
            column_stats = aggregator.get_summary_statistics_for_column(i)
            out.write('%d\t' % (i + 1))
            out.write("%d\t%d\t%d\t%d\t%f\t%f\t%f\t%f\t%f\t%d\t%d\t" % tuple(column_stats[key] for key in SUMMARY_STAT_ORDER))
            out.write('%s\t' % ','.join(map(str, column_stats['outliers'])))
            base_counts = aggregator.get_base_counts_for_column(i)
            for nuc in valid_nucleotides:
                out.write("%s\t" % base_counts.get(nuc, 0))
            # Symbols outside the expected set go into the last two columns.
            extra_nucs = sorted(nuc for nuc in base_counts.keys() if nuc not in valid_nucleotides)
            out.write("%s\t%s\n" % (','.join(extra_nucs), ','.join(str(base_counts[nuc]) for nuc in extra_nucs)))

    if num_reads is None:
        print("No valid fastq reads could be processed.")
    else:
        print("%i fastq reads were processed." % (num_reads + 1))
        print("Based upon quality values and sequence characters, the input data is valid for: %s" % (", ".join(aggregator.get_valid_formats()) or "None"))
        ascii_range = aggregator.get_ascii_range()
        decimal_range = aggregator.get_decimal_range()
        # repr is used because \x00 (null) causes info truncation in galaxy when printed
        print("Input ASCII range: %s(%i) - %s(%i)" % (repr(ascii_range[0]), ord(ascii_range[0]), repr(ascii_range[1]), ord(ascii_range[1])))
        print("Input decimal range: %i - %i" % (decimal_range[0], decimal_range[1]))
short_neg += 1 in_handle = open(in_file, "rb") try: manifest = ReadRocheXmlManifest(in_handle) except ValueError: manifest = None in_handle.seek(0) out_handle = open(out_file, "wb") writer = SffWriter(out_handle, xml=manifest) writer.write_file(process(SffIterator(in_handle))) #End of SFF code elif seq_format.lower().startswith("fastq"): in_handle = open(in_file, "rU") out_handle = open(out_file, "w") reader = fastqReader(in_handle) writer = fastqWriter(out_handle) if forward: for record in reader: seq = record.sequence.upper() result = primer.search(seq) if result: #Forward primer, take everything after it cut = result.end() record.sequence = seq[cut:] if len(record.sequence) >= min_len: record.quality = record.quality[cut:] clipped += 1 writer.write(record) else: short_clipped += 1
def main():
    """Trim FASTQ reads from either end using a sliding quality window.

    Command line: "[options] input_file output_file" (see the options
    defined below). For each read and for each end named in --trim_ends, a
    window of --window_size quality scores is moved inwards from that end
    in steps of --window_step until exclude_and_compare() accepts it; the
    read is then sliced at that window. If no window is accepted, the
    read's sequence and quality are emptied. Reads left with zero length
    are written only when --keep_zero_length is given.

    NOTE(review): the exact acceptance rule lives in exclude_and_compare(),
    which is defined outside this block - confirm its semantics there.
    """
    usage = "usage: %prog [options] input_file output_file"
    parser = OptionParser( usage=usage )
    parser.add_option( '-f', '--format', dest='format', type='choice', default='sanger', choices=( 'sanger', 'cssanger', 'solexa', 'illumina' ), help='FASTQ variant type' )
    parser.add_option( '-s', '--window_size', type="int", dest='window_size', default='1', help='Window size' )
    parser.add_option( '-t', '--window_step', type="int", dest='window_step', default='1', help='Window step' )
    parser.add_option( '-e', '--trim_ends', type="choice", dest='trim_ends', default='53', choices=('5','3','53','35' ), help='Ends to Trim' )
    parser.add_option( '-a', '--aggregation_action', type="choice", dest='aggregation_action', default='min', choices=('min','max','sum','mean' ), help='Aggregate action for window' )
    parser.add_option( '-x', '--exclude_count', type="int", dest='exclude_count', default='0', help='Maximum number of bases to exclude from the window during aggregation' )
    parser.add_option( '-c', '--score_comparison', type="choice", dest='score_comparison', default='>=', choices=('>','>=','==','<', '<=', '!=' ), help='Keep read when aggregate score is' )
    parser.add_option( '-q', '--quality_score', type="float", dest='quality_score', default='0', help='Quality Score' )
    parser.add_option( "-k", "--keep_zero_length", action="store_true", dest="keep_zero_length", default=False, help="Keep reads with zero length")
    ( options, args ) = parser.parse_args()
    # Exactly two positional arguments are required: input and output paths.
    if len ( args ) != 2:
        parser.error( "Need to specify an input file and an output file" )

    if options.window_size < 1:
        parser.error( 'You must specify a strictly positive window size' )

    if options.window_step < 1:
        parser.error( 'You must specify a strictly positive step size' )

    #determine an exhaustive list of window indexes that can be excluded from aggregation
    # Pass 1 seeds every single-index list; each later pass extends the
    # previous pass's lists by one more index, keeping each list sorted and
    # skipping duplicates. The number of passes is capped at the window size.
    exclude_window_indexes = []
    last_exclude_indexes = []
    for exclude_count in range( min( options.exclude_count, options.window_size ) ):
        if last_exclude_indexes:
            new_exclude_indexes = []
            for exclude_list in last_exclude_indexes:
                for window_index in range( options.window_size ):
                    if window_index not in exclude_list:
                        new_exclude = sorted( exclude_list + [ window_index ] )
                        if new_exclude not in exclude_window_indexes + new_exclude_indexes:
                            new_exclude_indexes.append( new_exclude )
            exclude_window_indexes += new_exclude_indexes
            last_exclude_indexes = new_exclude_indexes
        else:
            for window_index in range( options.window_size ):
                last_exclude_indexes.append( [ window_index ] )
            exclude_window_indexes = list( last_exclude_indexes )
    out = fastqWriter( open( args[1], 'wb' ), format = options.format )
    # Look up the aggregation callable for the chosen action name.
    action = ACTION_METHODS[ options.aggregation_action ]

    num_reads = None
    num_reads_excluded = 0
    for num_reads, fastq_read in enumerate( fastqReader( open( args[0] ), format = options.format ) ):
        # trim_ends is a string such as '53'; iterating it handles one end
        # per character, in the order given.
        for trim_end in options.trim_ends:
            # Re-read the scores each time: the read may already have been
            # sliced while trimming the previous end.
            quality_list = fastq_read.get_decimal_quality_scores()
            if trim_end == '5':
                lwindow_position = 0 #left position of window
                while True:
                    # Ran off the end without an accepted window: empty the read.
                    if lwindow_position >= len( quality_list ):
                        fastq_read.sequence = ''
                        fastq_read.quality = ''
                        break
                    if exclude_and_compare( action, quality_list[ lwindow_position:lwindow_position + options.window_size ], options.score_comparison, options.quality_score, exclude_window_indexes ):
                        # Keep everything from the accepted window's left edge onwards.
                        fastq_read = fastq_read.slice( lwindow_position, None )
                        break
                    lwindow_position += options.window_step
            else:
                rwindow_position = len( quality_list ) #right position of window
                while True:
                    lwindow_position = rwindow_position - options.window_size #left position of window
                    # No full window fits any more: empty the read.
                    if rwindow_position <= 0 or lwindow_position < 0:
                        fastq_read.sequence = ''
                        fastq_read.quality = ''
                        break
                    if exclude_and_compare( action, quality_list[ lwindow_position:rwindow_position ], options.score_comparison, options.quality_score, exclude_window_indexes ):
                        # Keep everything up to the accepted window's right edge.
                        fastq_read = fastq_read.slice( None, rwindow_position )
                        break
                    rwindow_position -= options.window_step
        if options.keep_zero_length or len( fastq_read ):
            out.write( fastq_read )
        else:
            num_reads_excluded += 1
    out.close()
    # num_reads is still None only when the reader yielded nothing.
    if num_reads is None:
        print "No valid FASTQ reads could be processed."
    else:
        print "%i FASTQ reads were processed." % ( num_reads + 1 )
    if num_reads_excluded:
        print "%i reads of zero length were excluded from the output." % num_reads_excluded
writer = SffWriter(out_handle, xml=manifest) in_handle.seek(0) #start again after getting manifest count = writer.write_file(rename_seqrecords(SffIterator(in_handle), rename)) out_handle.close() in_handle.close() else: #Use Galaxy for FASTA, QUAL or FASTQ if seq_format.lower() in ["fasta", "csfasta"] \ or seq_format.lower().startswith("qual"): from galaxy_utils.sequence.fasta import fastaReader, fastaWriter reader = fastaReader(open(in_file, "rU")) writer = fastaWriter(open(out_file, "w")) marker = ">" elif seq_format.lower().startswith("fastq"): from galaxy_utils.sequence.fastq import fastqReader, fastqWriter reader = fastqReader(open(in_file, "rU")) writer = fastqWriter(open(out_file, "w")) marker = "@" else: sys.exit("Unsupported file type %r" % seq_format) #Now do the renaming count = 0 renamed = 0 for record in reader: #The [1:] is because the fastaReader leaves the > on the identifier, #likewise the fastqReader leaves the @ on the identifier try: idn, descr = record.identifier[1:].split(None, 1) except ValueError: idn = record.identifier[1:] descr = None
def _write_filtered_records(reader, ids, positive_writer, negative_writer):
    """Send each record to the matching or the non-matching writer.

    A record matches when its identifier is non-empty and its first
    whitespace-delimited word, minus the leading marker character, is in
    ids. Either writer may be None, in which case records destined for it
    are dropped.
    """
    for record in reader:
        # The [1:] is because the Galaxy readers leave the > (FASTA) or
        # @ (FASTQ) on the identifier.
        if record.identifier and record.identifier.split()[0][1:] in ids:
            if positive_writer is not None:
                positive_writer.write(record)
        elif negative_writer is not None:
            negative_writer.write(record)


def _filter_text_sequences(in_file, ids, reader_class, writer_class, label,
                           out_positive_file, out_negative_file):
    """Filter a FASTA or FASTQ file by ID into the requested outputs.

    reader_class and writer_class wrap open file handles; label ("FASTA"
    or "FASTQ") only appears in the progress message. An output filename
    of "-" means that output was not requested.
    """
    want_positive = out_positive_file != "-"
    want_negative = out_negative_file != "-"
    in_handle = open(in_file, "rU")
    try:
        reader = reader_class(in_handle)
        if want_positive and want_negative:
            print("Generating two %s files" % label)
        elif want_positive:
            print("Generating matching %s file" % label)
        elif want_negative:
            print("Generating non-matching %s file" % label)
        else:
            # main() rejects this combination up front; nothing to write.
            return
        positive_writer = None
        negative_writer = None
        try:
            if want_positive:
                positive_writer = writer_class(open(out_positive_file, "w"))
            if want_negative:
                negative_writer = writer_class(open(out_negative_file, "w"))
            _write_filtered_records(reader, ids, positive_writer, negative_writer)
        finally:
            # Close whichever writers were opened, even on failure.
            for writer in (positive_writer, negative_writer):
                if writer is not None:
                    writer.close()
    finally:
        in_handle.close()


def main():
    """Split a FASTA, FASTQ or SFF file by a list of IDs from a tabular file.

    Expects six command-line arguments: tabular file, comma-separated
    1-based column numbers, sequence file, sequence format, output file
    for records whose ID is listed and output file for the rest. Either
    output may be "-" to skip it, but not both. Lines of the tabular file
    starting with "#" are ignored.
    """
    # Parse Command Line
    try:
        tabular_file, cols_arg, in_file, seq_format, out_positive_file, out_negative_file = sys.argv[1:]
    except ValueError:
        stop_err("Expected six arguments, got %i:\n%s" % (len(sys.argv) - 1, " ".join(sys.argv)))
    try:
        columns = [int(arg) - 1 for arg in cols_arg.split(",")]
    except ValueError:
        stop_err("Expected list of columns (comma separated integers), got %s" % cols_arg)

    if out_positive_file == "-" and out_negative_file == "-":
        stop_err("Neither output file requested")

    # Read tabular file and record all specified identifiers
    ids = set()
    handle = open(tabular_file, "rU")
    try:
        if len(columns) > 1:
            # General case of many columns
            for line in handle:
                if line.startswith("#"):
                    # Ignore comments
                    continue
                parts = line.rstrip("\n").split("\t")
                for col in columns:
                    ids.add(parts[col])
            print("Using %i IDs from %i columns of tabular file" % (len(ids), len(columns)))
        else:
            # Single column, special case speed up
            col = columns[0]
            for line in handle:
                if not line.startswith("#"):
                    ids.add(line.rstrip("\n").split("\t")[col])
            print("Using %i IDs from tabular file" % (len(ids)))
    finally:
        handle.close()

    if seq_format.lower() == "sff":
        # Now write filtered SFF file based on IDs from BLAST file
        try:
            from Bio.SeqIO.SffIO import SffIterator, SffWriter
        except ImportError:
            stop_err("Requires Biopython 1.54 or later")

        try:
            from Bio.SeqIO.SffIO import ReadRocheXmlManifest
        except ImportError:
            # Prior to Biopython 1.56 this was a private function
            from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest
        in_handle = open(in_file, "rb")  # must be binary mode!
        try:
            try:
                manifest = ReadRocheXmlManifest(in_handle)
            except ValueError:
                manifest = None
            # This makes two passes through the SFF file, which isn't so
            # efficient, but this makes the code simple.
            if out_positive_file != "-":
                out_handle = open(out_positive_file, "wb")
                try:
                    writer = SffWriter(out_handle, xml=manifest)
                    in_handle.seek(0)  # start again after getting manifest
                    pos_count = writer.write_file(rec for rec in SffIterator(in_handle) if rec.id in ids)
                finally:
                    out_handle.close()
            if out_negative_file != "-":
                out_handle = open(out_negative_file, "wb")
                try:
                    writer = SffWriter(out_handle, xml=manifest)
                    in_handle.seek(0)  # start again
                    neg_count = writer.write_file(rec for rec in SffIterator(in_handle) if rec.id not in ids)
                finally:
                    out_handle.close()
        finally:
            # And we're done
            in_handle.close()
        # At the time of writing, Galaxy doesn't show SFF file read counts,
        # so it is useful to put them in stdout and thus shown in job info.
        if out_positive_file != "-" and out_negative_file != "-":
            print("%i with and %i without specified IDs" % (pos_count, neg_count))
        elif out_positive_file != "-":
            print("%i with specified IDs" % pos_count)
        elif out_negative_file != "-":
            print("%i without specified IDs" % neg_count)
    elif seq_format.lower() == "fasta":
        # Write filtered FASTA file based on IDs from tabular file
        _filter_text_sequences(in_file, ids, fastaReader, fastaWriter, "FASTA",
                               out_positive_file, out_negative_file)
    elif seq_format.lower().startswith("fastq"):
        # Write filtered FASTQ file based on IDs from tabular file
        from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
        _filter_text_sequences(in_file, ids, fastqReader, fastqWriter, "FASTQ",
                               out_positive_file, out_negative_file)
    else:
        stop_err("Unsupported file type %r" % seq_format)
"@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 2:N:0:TGNCCA") assert not re_illumina_r.match( "@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 1:N:0:TGNCCA") count, forward, reverse, neither, pairs, singles = 0, 0, 0, 0, 0, 0 in_handle = open(input_fastq) if pairs_fastq: pairs_f_writer = fastqWriter(open(pairs_fastq, "w"), format) pairs_r_writer = pairs_f_writer else: pairs_f_writer = fastqWriter(open(pairs_f_fastq, "w"), format) pairs_r_writer = fastqWriter(open(pairs_r_fastq, "w"), format) singles_writer = fastqWriter(open(singles_fastq, "w"), format) last_template, buffered_reads = None, [] for record in fastqReader(in_handle, format): count += 1 name = record.identifier.split(None, 1)[0] assert name[0] == "@", record.identifier #Quirk of the Galaxy parser is_forward = False suffix = re_f.search(name) if suffix: #============ #Forward read #============ template = name[:suffix.start()] is_forward = True elif re_illumina_f.match(record.identifier): template = name #No suffix is_forward = True if is_forward:
count = 0 pairs = set() # Will this scale OK? forward = 0 reverse = 0 neither = 0 out_pairs = open(output_pairs, "w") out_nonpairs = open(output_nonpairs, "w") for input_fastq in input_fastq_filenames: if not os.path.isfile(input_fastq): sys.exit("Missing input FASTQ file %r" % input_fastq) in_handle = open(input_fastq) # Don't care about the FASTQ type really... for record in fastqReader(in_handle, "sanger"): count += 1 name = record.identifier.split(None, 1)[0] assert name[0] == "@", record.identifier # Quirk of the Galaxy parser name = name[1:] is_forward = False suffix = re_f.search(name) if suffix: # ============ # Forward read # ============ template = name[: suffix.start()] is_forward = True elif re_illumina_f.match(record.identifier): template = name # No suffix is_forward = True