def main():
    """De-interlace a paired-end FASTQ file into mate and singleton files.

    Command line arguments (``sys.argv``):
        1: interlaced input FASTQ file
        2: FASTQ variant; an empty string falls back to 'sanger'
        3: output file for first mates that have a partner
        4: output file for second mates that have a partner
        5: output file for first mates without a partner
        6: output file for second mates without a partner

    A short summary is printed to standard output.
    """
    input_filename = sys.argv[1]
    fastq_format = sys.argv[2] or 'sanger'
    mate1_filename = sys.argv[3]
    mate2_filename = sys.argv[4]
    single1_filename = sys.argv[5]
    single2_filename = sys.argv[6]

    joiner = fastqJoiner(fastq_format)
    last_index = None
    skip_count = 0
    # Identifiers of mates already written as the partner of an earlier read;
    # they are skipped when the sequential pass reaches them.
    already_written = set()

    # Everything that gets opened is registered here so that it is closed
    # even when processing fails part-way through.
    opened = []
    try:
        lookup = fastqNamedReader(open(input_filename, 'rb'), format=fastq_format)
        opened.append(lookup)
        mate1_out = fastqWriter(open(mate1_filename, 'wb'), format=fastq_format)
        opened.append(mate1_out)
        mate2_out = fastqWriter(open(mate2_filename, 'wb'), format=fastq_format)
        opened.append(mate2_out)
        single1_out = fastqWriter(open(single1_filename, 'wb'), format=fastq_format)
        opened.append(single1_out)
        single2_out = fastqWriter(open(single2_filename, 'wb'), format=fastq_format)
        opened.append(single2_out)
        sequential = fastqReader(open(input_filename, 'rb'), format=fastq_format)
        opened.append(sequential)

        for last_index, read in enumerate(sequential):
            if read.identifier in already_written:
                already_written.remove(read.identifier)
                continue
            mate1 = lookup.get(read.identifier)
            mate2 = lookup.get(joiner.get_paired_identifier(mate1))
            if mate2:
                # This is a mate pair
                already_written.add(mate2.identifier)
                if joiner.is_first_mate(mate1):
                    mate1_out.write(mate1)
                    mate2_out.write(mate2)
                else:
                    mate1_out.write(mate2)
                    mate2_out.write(mate1)
            else:
                # This is a single
                skip_count += 1
                if joiner.is_first_mate(mate1):
                    single1_out.write(mate1)
                else:
                    single2_out.write(mate1)

        if last_index is None:
            print("Your input file contained no valid FASTQ sequences.")
        else:
            if skip_count:
                print('There were %i reads with no mate.' % skip_count)
            # Floor division keeps the pair count an integer on every
            # Python version.
            print('De-interlaced %s pairs of sequences.' % ((last_index - skip_count + 1) // 2))
    finally:
        for resource in opened:
            resource.close()
def main():
    """Split every FASTQ read of the input into two reads in separate files.

    Command line arguments (``sys.argv``):
        1: input FASTQ file
        2: FASTQ variant; an empty string falls back to 'sanger'
        3: output file receiving the first part of each split read
        4: output file receiving the second part of each split read

    Reads for which the splitter does not return two truthy parts are
    counted as skipped. A summary is printed to standard output.
    """
    # Read command line arguments
    input_filename = sys.argv[1]
    input_type = sys.argv[2] or 'sanger'
    output1_filename = sys.argv[3]
    output2_filename = sys.argv[4]

    splitter = fastqSplitter()
    last_index = None
    skip_count = 0

    # Registered resources are closed even if splitting raises.
    opened = []
    try:
        out1 = fastqWriter(path=output1_filename, format=input_type)
        opened.append(out1)
        out2 = fastqWriter(path=output2_filename, format=input_type)
        opened.append(out2)
        reader = fastqReader(path=input_filename, format=input_type)
        opened.append(reader)

        for last_index, fastq_read in enumerate(reader):
            read1, read2 = splitter.split(fastq_read)
            if read1 and read2:
                out1.write(read1)
                out2.write(read2)
            else:
                skip_count += 1
    finally:
        for resource in opened:
            resource.close()

    if last_index is None:
        print("Your file contains no valid FASTQ reads.")
    else:
        total = last_index + 1
        split_count = total - skip_count
        print('Split %s of %s reads (%.2f%%).' % (split_count, total, float(split_count) / float(total) * 100.0))
def main():
    """De-interlace a paired-end FASTQ file into mate and singleton files.

    Command line arguments (``sys.argv``):
        1: interlaced input FASTQ file
        2: FASTQ variant; an empty string falls back to 'sanger'
        3: output file for first mates that have a partner
        4: output file for second mates that have a partner
        5: output file for first mates without a partner
        6: output file for second mates without a partner

    A short summary is printed to standard output.
    """
    input_filename = sys.argv[1]
    fastq_format = sys.argv[2] or 'sanger'
    mate1_filename = sys.argv[3]
    mate2_filename = sys.argv[4]
    single1_filename = sys.argv[5]
    single2_filename = sys.argv[6]

    joiner = fastqJoiner(fastq_format)
    last_index = None
    skip_count = 0
    # Identifiers of mates already written as the partner of an earlier read;
    # they are skipped when the sequential pass reaches them.
    already_written = set()

    # Everything that gets opened is registered here so that it is closed
    # even when processing fails part-way through.
    opened = []
    try:
        lookup = fastqNamedReader(path=input_filename, format=fastq_format)
        opened.append(lookup)
        mate1_out = fastqWriter(path=mate1_filename, format=fastq_format)
        opened.append(mate1_out)
        mate2_out = fastqWriter(path=mate2_filename, format=fastq_format)
        opened.append(mate2_out)
        single1_out = fastqWriter(path=single1_filename, format=fastq_format)
        opened.append(single1_out)
        single2_out = fastqWriter(path=single2_filename, format=fastq_format)
        opened.append(single2_out)
        sequential = fastqReader(path=input_filename, format=fastq_format)
        opened.append(sequential)

        for last_index, read in enumerate(sequential):
            if read.identifier in already_written:
                already_written.remove(read.identifier)
                continue
            mate1 = lookup.get(read.identifier)
            mate2 = lookup.get(joiner.get_paired_identifier(mate1))
            if mate2:
                # This is a mate pair
                already_written.add(mate2.identifier)
                if joiner.is_first_mate(mate1):
                    mate1_out.write(mate1)
                    mate2_out.write(mate2)
                else:
                    mate1_out.write(mate2)
                    mate2_out.write(mate1)
            else:
                # This is a single
                skip_count += 1
                if joiner.is_first_mate(mate1):
                    single1_out.write(mate1)
                else:
                    single2_out.write(mate1)

        if last_index is None:
            print("Your input file contained no valid FASTQ sequences.")
        else:
            if skip_count:
                print('There were %i reads with no mate.' % skip_count)
            # Floor division: true division would report e.g. "5.0 pairs"
            # on Python 3.
            print('De-interlaced %s pairs of sequences.' % ((last_index - skip_count + 1) // 2))
    finally:
        for resource in opened:
            resource.close()
def main():
    """De-interlace a paired-end FASTQ file into mate and singleton files.

    Command line arguments (``sys.argv``):
        1: interlaced input FASTQ file
        2: FASTQ variant; an empty string falls back to 'sanger'
        3: output file for first mates that have a partner
        4: output file for second mates that have a partner
        5: output file for first mates without a partner
        6: output file for second mates without a partner

    A short summary is printed to standard output.
    """
    input_filename = sys.argv[1]
    fastq_format = sys.argv[2] or 'sanger'
    mate1_filename = sys.argv[3]
    mate2_filename = sys.argv[4]
    single1_filename = sys.argv[5]
    single2_filename = sys.argv[6]

    joiner = fastqJoiner(fastq_format)
    last_index = None
    skip_count = 0
    # Identifiers of mates already written as the partner of an earlier read;
    # they are skipped when the sequential pass reaches them.
    already_written = set()

    mate1_out = fastqWriter(path=mate1_filename, format=fastq_format)
    mate2_out = fastqWriter(path=mate2_filename, format=fastq_format)
    single1_out = fastqWriter(path=single1_filename, format=fastq_format)
    single2_out = fastqWriter(path=single2_filename, format=fastq_format)
    # The same file is read twice: by identifier (lookup) and in order.
    lookup = fastqNamedReader(path=input_filename, format=fastq_format)
    sequential = fastqReader(path=input_filename, format=fastq_format)

    with mate1_out, mate2_out, single1_out, single2_out, lookup, sequential:
        for last_index, read in enumerate(sequential):
            if read.identifier in already_written:
                already_written.remove(read.identifier)
                continue
            mate1 = lookup.get(read.identifier)
            mate2 = lookup.get(joiner.get_paired_identifier(mate1))
            if mate2:
                # This is a mate pair
                already_written.add(mate2.identifier)
                if joiner.is_first_mate(mate1):
                    mate1_out.write(mate1)
                    mate2_out.write(mate2)
                else:
                    mate1_out.write(mate2)
                    mate2_out.write(mate1)
            else:
                # This is a single
                skip_count += 1
                if joiner.is_first_mate(mate1):
                    single1_out.write(mate1)
                else:
                    single2_out.write(mate1)

    if last_index is None:
        print("Your input file contained no valid FASTQ sequences.")
    else:
        if skip_count:
            print('There were %i reads with no mate.' % skip_count)
        # Floor division: true division would report e.g. "5.0 pairs" on
        # Python 3.
        print('De-interlaced %s pairs of sequences.' % ((last_index - skip_count + 1) // 2))
def main():
    """Interlace two mate FASTQ files into a pairs file and a singles file.

    Command line arguments (``sys.argv``):
        1: FASTQ file holding the first mates
        2: variant of file 1; an empty string falls back to 'sanger'
        3: FASTQ file holding the second mates
        4: variant of file 2; an empty string falls back to 'sanger'
        5: output file for paired reads (mate 1 followed by mate 2)
        6: output file for reads whose partner was not found

    When the two variants differ a warning is printed and nothing is
    written.
    """
    (mate1_filename, mate1_type, mate2_filename,
     mate2_type, outfile_pairs, outfile_singles) = sys.argv[1:7]
    mate1_type = mate1_type or 'sanger'
    mate2_type = mate2_type or 'sanger'

    if mate1_type != mate2_type:
        print(
            "WARNING: You are trying to interlace files of two different types: %s and %s."
            % (mate1_type, mate2_type))
        return

    fastq_format = mate1_type
    joiner = fastqJoiner(fastq_format)
    single_count = 0
    pair_count = 0
    first_index = None
    second_index = None

    pairs_writer = fastqWriter(path=outfile_pairs, format=fastq_format)
    singles_writer = fastqWriter(path=outfile_singles, format=fastq_format)
    mate2_lookup = fastqNamedReader(path=mate2_filename, format=fastq_format)
    mate1_lookup = fastqNamedReader(path=mate1_filename, format=fastq_format)
    mate1_sequential = fastqReader(path=mate1_filename, format=fastq_format)
    mate2_sequential = fastqReader(path=mate2_filename, format=fastq_format)

    with pairs_writer, singles_writer, mate2_lookup, mate1_lookup, mate1_sequential, mate2_sequential:
        # First pass: every read of file 1 is either paired or a single.
        for first_index, forward in enumerate(mate1_sequential):
            partner = mate2_lookup.get(joiner.get_paired_identifier(forward))
            if not partner:
                singles_writer.write(forward)
                single_count += 1
                continue
            pairs_writer.write(forward)
            pairs_writer.write(partner)
            pair_count += 1

        # Second pass: reads of file 2 that have no partner in file 1.
        for second_index, reverse in enumerate(mate2_sequential):
            if mate1_lookup.get(joiner.get_paired_identifier(reverse)):
                continue
            singles_writer.write(reverse)
            single_count += 1

    if first_index is None and second_index is None:
        print("Your input files contained no valid FASTQ sequences.")
        return
    print('There were %s single reads.' % single_count)
    print('Interlaced %s pairs of sequences.' % pair_count)
def main():
    """Interlace two mate FASTQ files into a pairs file and a singles file.

    Command line arguments (``sys.argv``):
        1: FASTQ file holding the first mates
        2: variant of file 1; an empty string falls back to 'sanger'
        3: FASTQ file holding the second mates
        4: variant of file 2; an empty string falls back to 'sanger'
        5: output file for paired reads (mate 1 followed by mate 2)
        6: output file for reads whose partner was not found

    When the two variants differ a warning is printed and nothing is
    written.
    """
    mate1_filename = sys.argv[1]
    mate1_type = sys.argv[2] or 'sanger'
    mate2_filename = sys.argv[3]
    mate2_type = sys.argv[4] or 'sanger'
    outfile_pairs = sys.argv[5]
    outfile_singles = sys.argv[6]

    if mate1_type != mate2_type:
        print("WARNING: You are trying to interlace files of two different types: %s and %s." % (mate1_type, mate2_type))
        return

    fastq_format = mate1_type
    joiner = fastqJoiner(fastq_format)
    nof_singles = 0
    nof_pairs = 0
    i = None
    j = None

    # Everything that gets opened is registered here so that it is closed
    # even when processing fails part-way through.
    opened = []
    try:
        out_pairs = fastqWriter(open(outfile_pairs, 'wb'), format=fastq_format)
        opened.append(out_pairs)
        out_singles = fastqWriter(open(outfile_singles, 'wb'), format=fastq_format)
        opened.append(out_singles)

        # Pairs + singles present in mate1
        mate2_input = fastqNamedReader(open(mate2_filename, 'rb'), format=fastq_format)
        opened.append(mate2_input)
        reader1 = fastqReader(open(mate1_filename, 'rb'), format=fastq_format)
        opened.append(reader1)
        for i, mate1 in enumerate(reader1):
            mate2 = mate2_input.get(joiner.get_paired_identifier(mate1))
            if mate2:
                out_pairs.write(mate1)
                out_pairs.write(mate2)
                nof_pairs += 1
            else:
                out_singles.write(mate1)
                nof_singles += 1

        # Singles present in mate2
        mate1_input = fastqNamedReader(open(mate1_filename, 'rb'), format=fastq_format)
        opened.append(mate1_input)
        reader2 = fastqReader(open(mate2_filename, 'rb'), format=fastq_format)
        opened.append(reader2)
        for j, mate2 in enumerate(reader2):
            mate1 = mate1_input.get(joiner.get_paired_identifier(mate2))
            if not mate1:
                out_singles.write(mate2)
                nof_singles += 1

        if i is None and j is None:
            print("Your input files contained no valid FASTQ sequences.")
        else:
            print('There were %s single reads.' % nof_singles)
            print('Interlaced %s pairs of sequences.' % nof_pairs)
    finally:
        for resource in opened:
            resource.close()
def main():
    """Keep only the FASTQ reads accepted by a user-supplied Python script.

    Command line arguments (``sys.argv``):
        1: input FASTQ file
        2: Python script run once per read; it sees the read as
           ``fastq_read`` and accepts it by setting ``ret_val`` to a truthy
           value
        3: output FASTQ file
        4: directory to create; a copy of the script is stored there as
           ``debug.txt``
        5: FASTQ variant; an empty string falls back to 'sanger'

    A summary of kept reads is printed to standard output.
    """
    # Read command line arguments
    input_filename = sys.argv[1]
    script_filename = sys.argv[2]
    output_filename = sys.argv[3]
    additional_files_path = sys.argv[4]
    input_type = sys.argv[5] or 'sanger'

    # Save script file for debugging/verification info later
    os.mkdir(additional_files_path)
    shutil.copy(script_filename, os.path.join(additional_files_path, 'debug.txt'))

    # NOTE(security): the script is executed as arbitrary Python code and
    # must therefore come from a trusted source.
    # It is compiled once here instead of being re-read and re-parsed for
    # every single read.
    with open(script_filename) as script_handle:
        filter_code = compile(script_handle.read(), script_filename, 'exec')

    last_index = None
    reads_kept = 0
    out = fastqWriter(open(output_filename, 'wb'), format=input_type)
    try:
        with open(input_filename) as in_handle:
            for last_index, fastq_read in enumerate(fastqReader(in_handle, format=input_type)):
                # Fresh namespace per read so state cannot leak between reads.
                local = {'fastq_read': fastq_read, 'ret_val': False}
                exec(filter_code, {}, local)
                if local['ret_val']:
                    out.write(fastq_read)
                    reads_kept += 1
    finally:
        out.close()

    if last_index is None:
        print("Your file contains no valid fastq reads.")
    else:
        total = last_index + 1
        print('Kept %s of %s reads (%.2f%%).' % (reads_kept, total, float(reads_kept) / float(total) * 100.0))
def main():
    """Groom a FASTQ file from one variant into another.

    Command line arguments (``sys.argv``):
        1: input FASTQ file
        2: input FASTQ variant
        3: output FASTQ file
        4: output FASTQ variant
        5: quality encoding to force on output; the literal string 'None'
           means no forcing
        6: 'summarize_input' to collect and print statistics about the input

    A summary is printed to standard output.
    """
    input_filename = sys.argv[1]
    input_type = sys.argv[2]
    output_filename = sys.argv[3]
    output_type = sys.argv[4]
    force_quality_encoding = sys.argv[5]
    summarize_input = sys.argv[6] == 'summarize_input'
    if force_quality_encoding == 'None':
        force_quality_encoding = None

    aggregator = fastqAggregator()
    if summarize_input:
        reader_class = fastqVerboseErrorReader
    else:
        reader_class = fastqReader

    read_count = None
    out = fastqWriter(open(output_filename, 'wb'), format=output_type, force_quality_encoding=force_quality_encoding)
    try:
        with open(input_filename) as in_handle:
            reader = reader_class(in_handle, format=input_type, apply_galaxy_conventions=True)
            for read_count, fastq_read in enumerate(reader):
                if summarize_input:
                    aggregator.consume_read(fastq_read)
                out.write(fastq_read)
    finally:
        # Close the output even when a malformed read aborts the loop.
        out.close()

    if read_count is None:
        print("No valid FASTQ reads were provided.")
        return

    print("Groomed %i %s reads into %s reads." % (read_count + 1, input_type, output_type))
    if input_type != output_type and 'solexa' in [input_type, output_type]:
        print("Converted between Solexa and PHRED scores.")
    if summarize_input:
        print("Based upon quality and sequence, the input data is valid for: %s" % (", ".join(aggregator.get_valid_formats()) or "None"))
        ascii_range = aggregator.get_ascii_range()
        decimal_range = aggregator.get_decimal_range()
        # print using repr, since \x00 (null) causes info truncation in galaxy when printed
        print("Input ASCII range: %s(%i) - %s(%i)" % (repr(ascii_range[0]), ord(ascii_range[0]), repr(ascii_range[1]), ord(ascii_range[1])))
        print("Input decimal range: %i - %i" % (decimal_range[0], decimal_range[1]))
def main():
    """Join each read of one FASTQ file with its partner from a second file.

    Command line arguments (``sys.argv``):
        1: first input FASTQ file (read sequentially)
        2: variant of file 1; an empty string falls back to 'sanger'
        3: second input FASTQ file (looked up by identifier)
        4: variant of file 2; an empty string falls back to 'sanger'
        5: output FASTQ file for the joined reads

    Reads of file 1 without a partner are counted as skipped. A summary is
    printed to standard output.
    """
    # Read command line arguments
    input1_filename = sys.argv[1]
    input1_type = sys.argv[2] or 'sanger'
    input2_filename = sys.argv[3]
    input2_type = sys.argv[4] or 'sanger'
    output_filename = sys.argv[5]

    if input1_type != input2_type:
        print("WARNING: You are trying to join files of two different types: %s and %s." % (input1_type, input2_type))

    last_index = None
    skip_count = 0
    input2 = fastqNamedReader(open(input2_filename, 'rb'), input2_type)
    try:
        joiner = fastqJoiner(input1_type)
        out = fastqWriter(open(output_filename, 'wb'), format=input1_type)
        try:
            with open(input1_filename, 'rb') as in_handle:
                for last_index, fastq_read in enumerate(fastqReader(in_handle, format=input1_type)):
                    identifier = joiner.get_paired_identifier(fastq_read)
                    fastq_paired = input2.get(identifier)
                    if fastq_paired is None:
                        skip_count += 1
                    else:
                        out.write(joiner.join(fastq_read, fastq_paired))
        finally:
            out.close()

        # The report runs before input2 is closed because has_data() is
        # called on the still-open reader.
        if last_index is None:
            print("Your file contains no valid FASTQ reads.")
        else:
            total = last_index + 1
            joined = total - skip_count
            print(input2.has_data())
            print('Joined %s of %s read pairs (%.2f%%).' % (joined, total, float(joined) / float(total) * 100.0))
    finally:
        input2.close()
def main():
    """Keep only the FASTQ reads accepted by a user-supplied Python script.

    Command line arguments (``sys.argv``):
        1: input FASTQ file
        2: Python script run once per read; it sees the read as
           ``fastq_read`` and accepts it by setting ``ret_val`` to a truthy
           value
        3: output FASTQ file
        4: directory to create; a copy of the script is stored there as
           ``debug.txt``
        5: FASTQ variant; an empty string falls back to 'sanger'

    A summary of kept reads is printed to standard output.
    """
    # Read command line arguments
    input_filename = sys.argv[1]
    script_filename = sys.argv[2]
    output_filename = sys.argv[3]
    additional_files_path = sys.argv[4]
    input_type = sys.argv[5] or 'sanger'

    # Save script file for debugging/verification info later
    os.mkdir(additional_files_path)
    shutil.copy(script_filename, os.path.join(additional_files_path, 'debug.txt'))

    # NOTE(security): the script is executed as arbitrary Python code and
    # must therefore come from a trusted source.
    # It is compiled once here instead of being re-read and re-parsed for
    # every single read.
    with open(script_filename) as script_handle:
        filter_code = compile(script_handle.read(), script_filename, 'exec')

    last_index = None
    reads_kept = 0
    out = fastqWriter(open(output_filename, 'wb'), format=input_type)
    try:
        with open(input_filename) as in_handle:
            for last_index, fastq_read in enumerate(fastqReader(in_handle, format=input_type)):
                # Fresh namespace per read so state cannot leak between reads.
                local = {'fastq_read': fastq_read, 'ret_val': False}
                exec(filter_code, {}, local)
                if local['ret_val']:
                    out.write(fastq_read)
                    reads_kept += 1
    finally:
        out.close()

    if last_index is None:
        print("Your file contains no valid fastq reads.")
    else:
        total = last_index + 1
        print('Kept %s of %s reads (%.2f%%).' % (reads_kept, total, float(reads_kept) / float(total) * 100.0))
def main():
    """Rewrite FASTQ reads using a user-supplied manipulation module.

    Command line arguments (``sys.argv``):
        1: input FASTQ file
        2: Python source file defining ``match_and_manipulate_read(read)``
        3: output FASTQ file
        4: directory to create; a copy of the script is stored there as
           ``debug.txt``
        5: FASTQ variant; an empty string falls back to 'sanger'

    A read is written when the script returns a truthy value for it, and it
    counts as manipulated when that value differs from the original read.
    """
    # Read command line arguments
    input_filename = sys.argv[1]
    script_filename = sys.argv[2]
    output_filename = sys.argv[3]
    additional_files_path = sys.argv[4]
    input_type = sys.argv[5] or 'sanger'

    # Save script file for debugging/verification info later
    os.mkdir(additional_files_path)
    shutil.copy(script_filename, os.path.join(additional_files_path, 'debug.txt'))

    # NOTE(security): the script is imported and run as arbitrary Python.
    # imp.load_module leaves closing the file to the caller.
    with open(script_filename) as script_handle:
        fastq_manipulator = imp.load_module('fastq_manipulator', script_handle, script_filename, ('', 'r', imp.PY_SOURCE))

    last_index = None
    reads_manipulated = 0
    writer = fastqWriter(path=output_filename, format=input_type)
    reader = fastqReader(path=input_filename, format=input_type)
    with writer, reader:
        for last_index, fastq_read in enumerate(reader):
            new_read = fastq_manipulator.match_and_manipulate_read(fastq_read)
            if new_read:
                writer.write(new_read)
            if new_read != fastq_read:
                reads_manipulated += 1

    if last_index is None:
        print("Your file contains no valid FASTQ reads.")
    else:
        total = last_index + 1
        print('Manipulated %s of %s reads (%.2f%%).' % (reads_manipulated, total, float(reads_manipulated) / float(total) * 100.0))
def main():
    """Join each read of one FASTQ file with its partner from a second file.

    Command line arguments (``sys.argv``):
        1: first input FASTQ file (read sequentially)
        2: variant of file 1; an empty string falls back to 'sanger'
        3: second input FASTQ file (looked up by identifier)
        4: variant of file 2; an empty string falls back to 'sanger'
        5: output FASTQ file for the joined reads

    Reads of file 1 without a partner are counted as skipped. A summary is
    printed to standard output.
    """
    # Read command line arguments
    input1_filename = sys.argv[1]
    input1_type = sys.argv[2] or 'sanger'
    input2_filename = sys.argv[3]
    input2_type = sys.argv[4] or 'sanger'
    output_filename = sys.argv[5]

    if input1_type != input2_type:
        print("WARNING: You are trying to join files of two different types: %s and %s." % (input1_type, input2_type))

    last_index = None
    skip_count = 0
    input2 = fastqNamedReader(open(input2_filename, 'rb'), input2_type)
    try:
        joiner = fastqJoiner(input1_type)
        out = fastqWriter(open(output_filename, 'wb'), format=input1_type)
        try:
            with open(input1_filename, 'rb') as in_handle:
                for last_index, fastq_read in enumerate(fastqReader(in_handle, format=input1_type)):
                    identifier = joiner.get_paired_identifier(fastq_read)
                    fastq_paired = input2.get(identifier)
                    if fastq_paired is None:
                        skip_count += 1
                    else:
                        out.write(joiner.join(fastq_read, fastq_paired))
        finally:
            out.close()

        # The report runs before input2 is closed because has_data() is
        # called on the still-open reader.
        if last_index is None:
            print("Your file contains no valid FASTQ reads.")
        else:
            total = last_index + 1
            joined = total - skip_count
            print(input2.has_data())
            print('Joined %s of %s read pairs (%.2f%%).' % (joined, total, float(joined) / float(total) * 100.0))
    finally:
        input2.close()
def main():
    """Keep only the FASTQ reads accepted by a user-supplied filter script.

    Command line arguments (``sys.argv``):
        1: input FASTQ file
        2: Python script that defines ``fastq_read_pass_filter(read)``
        3: output FASTQ file
        4: directory to create; a copy of the script is stored there as
           ``debug.txt``
        5: FASTQ variant; an empty string falls back to 'sanger'

    A summary of kept reads is printed to standard output.
    """
    # Read command line arguments
    input_filename = sys.argv[1]
    script_filename = sys.argv[2]
    output_filename = sys.argv[3]
    additional_files_path = sys.argv[4]
    input_type = sys.argv[5] or 'sanger'

    # Save script file for debugging/verification info later
    os.mkdir(additional_files_path)
    shutil.copy(script_filename, os.path.join(additional_files_path, 'debug.txt'))

    # Dan, Others: Can we simply drop the "format=input_type" here since it is specified in reader.
    # This optimization would cut runtime roughly in half (for my test case anyway). -John
    out = fastqWriter(path=output_filename, format=input_type)

    last_index = None
    reads_kept = 0
    try:
        # NOTE(security): the script runs as arbitrary Python inside this
        # module's global namespace; it must come from a trusted source.
        execfile(script_filename, globals())
        reader = fastqReader(path=input_filename, format=input_type)
        try:
            for last_index, fastq_read in enumerate(reader):
                # fastq_read_pass_filter defined in script_filename
                ret_val = fastq_read_pass_filter(fastq_read)  # NOQA
                if ret_val:
                    out.write(fastq_read)
                    reads_kept += 1
        finally:
            reader.close()
    finally:
        out.close()

    if last_index is None:
        print("Your file contains no valid fastq reads.")
    else:
        total = last_index + 1
        print('Kept %s of %s reads (%.2f%%).' % (reads_kept, total, float(reads_kept) / float(total) * 100.0))
def main():
    """Trim a number or percentage of columns from both ends of FASTQ reads.

    Command line arguments (``sys.argv``):
        1: input FASTQ file
        2: output FASTQ file
        3: left offset
        4: right offset
        5: 'offsets_percent' to interpret the offsets as percentages of the
           read length; anything else means absolute column counts
        6: FASTQ variant; an empty string falls back to 'sanger'
        7: 'keep_zero_length' to also write reads that end up empty

    A right offset that is not greater than zero leaves the right end
    untouched. A summary is printed to standard output.
    """
    input_filename = sys.argv[1]
    output_filename = sys.argv[2]
    left_offset = sys.argv[3]
    right_offset = sys.argv[4]
    percent_offsets = sys.argv[5] == 'offsets_percent'
    input_type = sys.argv[6] or 'sanger'
    keep_zero_length = sys.argv[7] == 'keep_zero_length'

    num_reads_excluded = 0
    num_reads = None
    out = fastqWriter(open(output_filename, 'wb'), format=input_type)
    try:
        with open(input_filename) as in_handle:
            for num_reads, fastq_read in enumerate(fastqReader(in_handle, format=input_type)):
                if percent_offsets:
                    left_column_offset = int(round(float(left_offset) / 100.0 * float(len(fastq_read))))
                    right_column_offset = int(round(float(right_offset) / 100.0 * float(len(fastq_read))))
                else:
                    left_column_offset = int(left_offset)
                    right_column_offset = int(right_offset)
                # Turn the right offset into a negative slice end; None
                # means "up to the end of the read".
                if right_column_offset > 0:
                    right_column_offset = -right_column_offset
                else:
                    right_column_offset = None
                fastq_read = fastq_read.slice(left_column_offset, right_column_offset)
                if keep_zero_length or len(fastq_read):
                    out.write(fastq_read)
                else:
                    num_reads_excluded += 1
    finally:
        out.close()

    if num_reads is None:
        print("No valid fastq reads could be processed.")
    else:
        print("%i fastq reads were processed." % (num_reads + 1))
    if num_reads_excluded:
        print("%i reads of zero length were excluded from the output." % num_reads_excluded)
def main():
    """Rewrite FASTQ reads using a user-supplied manipulation module.

    Command line arguments (``sys.argv``):
        1: input FASTQ file
        2: Python source file defining ``match_and_manipulate_read(read)``
        3: output FASTQ file
        4: directory to create; a copy of the script is stored there as
           ``debug.txt``
        5: FASTQ variant; an empty string falls back to 'sanger'

    A read is written when the script returns a truthy value for it, and it
    counts as manipulated when that value differs from the original read.
    """
    # Read command line arguments
    input_filename = sys.argv[1]
    script_filename = sys.argv[2]
    output_filename = sys.argv[3]
    additional_files_path = sys.argv[4]
    input_type = sys.argv[5] or 'sanger'

    # Save script file for debugging/verification info later
    os.mkdir(additional_files_path)
    shutil.copy(script_filename, os.path.join(additional_files_path, 'debug.txt'))

    # NOTE(security): the script is imported and run as arbitrary Python.
    # imp.load_module leaves closing the file to the caller.
    with open(script_filename) as script_handle:
        fastq_manipulator = imp.load_module('fastq_manipulator', script_handle, script_filename, ('', 'r', imp.PY_SOURCE))

    last_index = None
    reads_manipulated = 0
    out = fastqWriter(path=output_filename, format=input_type)
    try:
        reader = fastqReader(path=input_filename, format=input_type)
        try:
            for last_index, fastq_read in enumerate(reader):
                new_read = fastq_manipulator.match_and_manipulate_read(fastq_read)
                if new_read:
                    out.write(new_read)
                if new_read != fastq_read:
                    reads_manipulated += 1
        finally:
            reader.close()
    finally:
        out.close()

    if last_index is None:
        print("Your file contains no valid FASTQ reads.")
    else:
        total = last_index + 1
        print('Manipulated %s of %s reads (%.2f%%).' % (reads_manipulated, total, float(reads_manipulated) / float(total) * 100.0))
def main():
    """Mask the bases of FASTQ reads whose quality score matches a test.

    Usage: ``prog [options] input_file output_file``. Options select the
    FASTQ variant, the comparison operator and threshold applied to each
    base's decimal quality score, and whether matching bases are replaced
    by a mask character or lower-cased.

    The number of processed reads is printed to standard output.
    """
    usage = "usage: %prog [options] input_file output_file"
    parser = OptionParser(usage=usage)
    parser.add_option('-f', '--format', dest='format', type='choice', default='sanger',
                      choices=('sanger', 'solexa', 'illumina',
                               'sanger.gz', 'solexa.gz', 'illumina.gz',
                               'sanger.bz2', 'solexa.bz2', 'illumina.bz2'),
                      help='FASTQ variant type')
    parser.add_option('-m', '--mask_character', dest='mask_character', default='N',
                      help='Mask Character to use')
    parser.add_option('-c', '--score_comparison', type="choice", dest='score_comparison', default='le',
                      choices=('gt', 'ge', 'eq', 'lt', 'le', 'ne'),
                      help='Mask base when score is')
    parser.add_option('-s', '--quality_score', type="float", dest='quality_score', default='0',
                      help='Quality Score')
    parser.add_option("-l", "--lowercase", action="store_true", dest="lowercase", default=False,
                      help="Use lowercase masking")
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error("Need to specify an input file and an output file")

    score_comparer = get_score_comparer(options.score_comparison)
    if options.lowercase:
        base_masker = str.lower
    else:
        base_masker = BaseReplacer(options.mask_character)

    num_reads = None
    out = fastqWriter(path=args[1], format=options.format)
    try:
        reader = fastqReader(path=args[0], format=options.format)
        try:
            for num_reads, fastq_read in enumerate(reader):
                sequence_list = list(fastq_read.sequence)
                for i, quality_score in enumerate(fastq_read.get_decimal_quality_scores()):
                    if score_comparer(quality_score, options.quality_score):
                        sequence_list[i] = base_masker(sequence_list[i])
                fastq_read.sequence = "".join(sequence_list)
                out.write(fastq_read)
        finally:
            reader.close()
    finally:
        # The writer was previously never closed, so buffered output relied
        # on interpreter shutdown to reach the file.
        out.close()

    if num_reads is not None:
        print("Processed %i %s reads." % (num_reads + 1, options.format))
    else:
        print("No valid FASTQ reads were provided.")
def main():
    """Keep only the FASTQ reads accepted by a user-supplied filter script.

    Command line arguments (``sys.argv``):
        1: input FASTQ file
        2: Python script that defines ``fastq_read_pass_filter(read)``
        3: output FASTQ file
        4: directory to create; a copy of the script is stored there as
           ``debug.txt``
        5: FASTQ variant; an empty string falls back to 'sanger'

    A summary of kept reads is printed to standard output.
    """
    # Read command line arguments
    input_filename = sys.argv[1]
    script_filename = sys.argv[2]
    output_filename = sys.argv[3]
    additional_files_path = sys.argv[4]
    input_type = sys.argv[5] or 'sanger'

    # Save script file for debugging/verification info later
    os.mkdir(additional_files_path)
    shutil.copy(script_filename, os.path.join(additional_files_path, 'debug.txt'))

    # Dan, Others: Can we simply drop the "format=input_type" here since it is specified in reader.
    # This optimization would cut runtime roughly in half (for my test case anyway). -John
    out = fastqWriter(open(output_filename, 'wb'), format=input_type)

    last_index = None
    reads_kept = 0
    try:
        # NOTE(security): the script runs as arbitrary Python inside this
        # module's global namespace; it must come from a trusted source.
        execfile(script_filename, globals())
        with open(input_filename) as in_handle:
            for last_index, fastq_read in enumerate(fastqReader(in_handle, format=input_type)):
                # fastq_read_pass_filter defined in script_filename
                ret_val = fastq_read_pass_filter(fastq_read)
                if ret_val:
                    out.write(fastq_read)
                    reads_kept += 1
    finally:
        out.close()

    if last_index is None:
        print("Your file contains no valid fastq reads.")
    else:
        total = last_index + 1
        print('Kept %s of %s reads (%.2f%%).' % (reads_kept, total, float(reads_kept) / float(total) * 100.0))
def fastq_filter(in_file, out_file, iterator_filter):
    """Copy the FASTQ records selected by *iterator_filter* to *out_file*.

    Args:
        in_file: path of the FASTQ file to read.
        out_file: path of the FASTQ file to write.
        iterator_filter: callable that receives the reader and yields the
            records to keep.

    Returns:
        The number of records written.
    """
    count = 0
    # fastqReader / fastqWriter: galaxy_utils.sequence.fastq
    reader = fastqReader(open(in_file, "rU"))
    try:
        writer = fastqWriter(open(out_file, "w"))
        try:
            for record in iterator_filter(reader):
                count += 1
                writer.write(record)
        finally:
            # Close the output even if the filter or the parser raises.
            writer.close()
    finally:
        reader.close()
    return count
def main():
    """Groom a FASTQ file from one variant into another.

    Command line arguments (``sys.argv``):
        1: input FASTQ file
        2: input FASTQ variant
        3: output FASTQ file
        4: output FASTQ variant
        5: quality encoding to force on output; the literal string 'None'
           means no forcing
        6: 'summarize_input' to collect and print statistics about the input

    A summary is printed to standard output.
    """
    input_filename = sys.argv[1]
    input_type = sys.argv[2]
    output_filename = sys.argv[3]
    output_type = sys.argv[4]
    force_quality_encoding = sys.argv[5]
    summarize_input = sys.argv[6] == 'summarize_input'
    if force_quality_encoding == 'None':
        force_quality_encoding = None

    aggregator = fastqAggregator()
    if summarize_input:
        reader_class = fastqVerboseErrorReader
    else:
        reader_class = fastqReader

    read_count = None
    out = fastqWriter(path=output_filename, format=output_type, force_quality_encoding=force_quality_encoding)
    try:
        reader = reader_class(path=input_filename, format=input_type, apply_galaxy_conventions=True)
        try:
            for read_count, fastq_read in enumerate(reader):
                if summarize_input:
                    aggregator.consume_read(fastq_read)
                out.write(fastq_read)
        finally:
            reader.close()
    finally:
        # Close the output even when a malformed read aborts the loop.
        out.close()

    if read_count is None:
        print("No valid FASTQ reads were provided.")
        return

    print("Groomed %i %s reads into %s reads." % (read_count + 1, input_type, output_type))
    if input_type != output_type and 'solexa' in [input_type, output_type]:
        print("Converted between Solexa and PHRED scores.")
    if summarize_input:
        print("Based upon quality and sequence, the input data is valid for: %s" % (", ".join(aggregator.get_valid_formats()) or "None"))
        ascii_range = aggregator.get_ascii_range()
        decimal_range = aggregator.get_decimal_range()
        # print using repr, since \x00 (null) causes info truncation in galaxy when printed
        print("Input ASCII range: %s(%i) - %s(%i)" % (repr(ascii_range[0]), ord(ascii_range[0]), repr(ascii_range[1]), ord(ascii_range[1])))
        print("Input decimal range: %i - %i" % (decimal_range[0], decimal_range[1]))
def main():
    """Join each read of one FASTQ file with its partner from a second file.

    Command line arguments (``sys.argv``):
        1: first input FASTQ file (read sequentially)
        2: variant of file 1; an empty string falls back to 'sanger'
        3: second input FASTQ file (looked up by identifier)
        4: variant of file 2; an empty string falls back to 'sanger'
        5: output FASTQ file for the joined reads
        6: header style, 'new' or 'old'; an empty string falls back to 'old'
        7: paste string handed to the joiner; may be empty

    Reads of file 1 without a partner are counted as skipped. A summary is
    printed to standard output.
    """
    # Read command line arguments
    input1_filename = sys.argv[1]
    input1_type = sys.argv[2] or 'sanger'
    input2_filename = sys.argv[3]
    input2_type = sys.argv[4] or 'sanger'
    output_filename = sys.argv[5]

    fastq_style = sys.argv[6] or 'old'

    paste = sys.argv[7] or ''
    # --

    if input1_type != input2_type:
        print("WARNING: You are trying to join files of two different types: %s and %s." % (input1_type, input2_type))

    if fastq_style == 'new':
        sep = sniff_sep(input1_filename)
        joiner = FastqJoiner(input1_type, sep=sep, paste=paste)
    else:
        joiner = fq.fastqJoiner(input1_type, paste=paste)
    # --

    last_index = None
    skip_count = 0

    writer = fq.fastqWriter(path=output_filename, format=input1_type)
    reader1 = fq.fastqReader(path=input1_filename, format=input1_type)
    reader2 = fq.fastqNamedReader(path=input2_filename, format=input2_type)

    with writer, reader1, reader2:
        for last_index, fastq_read in enumerate(reader1):
            identifier = joiner.get_paired_identifier(fastq_read)
            fastq_paired = reader2.get(identifier)
            if fastq_paired is None:
                skip_count += 1
            else:
                writer.write(joiner.join(fastq_read, fastq_paired))

        # this indent is correct: we still need access to reader2
        if last_index is None:
            print("Your file contains no valid FASTQ reads.")
        else:
            total = last_index + 1
            joined = total - skip_count
            print(reader2.has_data())
            # float() keeps the ratio fractional even where "/" is integer
            # division.
            print('Joined %s of %s read pairs (%.2f%%).' % (joined, total, float(joined) / float(total) * 100.0))
def main():
    """Combine a FASTA file and an optional QUAL file into a FASTQ file.

    Command line arguments (``sys.argv``):
        1: FASTA file with the sequences
        2: FASTA type; an empty string falls back to 'fasta'
        3: QUAL file, or the literal string 'None' to use generated scores
        4: QUAL type; an empty string falls back to 'qualsanger'
        5: output FASTQ file
        6: quality encoding to force on output; the literal string 'None'
           means no forcing

    The output variant is derived from the FASTA and QUAL types. Sequences
    without quality scores are counted as skipped. A summary is printed to
    standard output.
    """
    # Read command line arguments
    fasta_filename = sys.argv[1]
    fasta_type = sys.argv[2] or 'fasta'  # should always be fasta or csfasta? what if txt?
    qual_filename = sys.argv[3]
    qual_type = sys.argv[4] or 'qualsanger'  # qual454 qualsolid
    output_filename = sys.argv[5]
    force_quality_encoding = sys.argv[6]
    if force_quality_encoding == 'None':
        force_quality_encoding = None

    fastq_format = 'sanger'
    if fasta_type == 'csfasta' or qual_type == 'qualsolid':
        fastq_format = 'cssanger'
    elif qual_type == 'qualsolexa':
        fastq_format = 'solexa'
    elif qual_type == 'qualillumina':
        fastq_format = 'illumina'

    last_index = None
    skip_count = 0
    # Plain file handles opened here; closed in the finally block below.
    open_handles = []
    out = fastqWriter(path=output_filename, format=fastq_format, force_quality_encoding=force_quality_encoding)
    try:
        if qual_filename == 'None':
            qual_input = fastqFakeFastaScoreReader(fastq_format, quality_encoding=force_quality_encoding)
        else:
            qual_handle = open(qual_filename, 'rt')
            open_handles.append(qual_handle)
            qual_input = fastaNamedReader(qual_handle)

        fastq_combiner = fastqCombiner(fastq_format)
        fasta_handle = open(fasta_filename, 'rt')
        open_handles.append(fasta_handle)
        for last_index, sequence in enumerate(fastaReader(fasta_handle)):
            quality = qual_input.get(sequence)
            if quality:
                fastq_read = fastq_combiner.combine(sequence, quality)
                out.write(fastq_read)
            else:
                skip_count += 1

        # The report runs before the handles are closed because has_data()
        # is called on the still-open QUAL reader.
        if last_index is None:
            print("Your file contains no valid FASTA sequences.")
        else:
            total = last_index + 1
            combined = total - skip_count
            print(qual_input.has_data())
            print('Combined %s of %s sequences with quality scores (%.2f%%).' % (combined, total, float(combined) / float(total) * 100.0))
    finally:
        out.close()
        for handle in open_handles:
            handle.close()
def main():
    """Combine a FASTA file and an optional QUAL file into a FASTQ file.

    Command line arguments (``sys.argv``):
        1: FASTA file with the sequences
        2: FASTA type; an empty string falls back to 'fasta'
        3: QUAL file, or the literal string 'None' to use generated scores
        4: QUAL type; an empty string falls back to 'qualsanger'
        5: output FASTQ file
        6: quality encoding to force on output; the literal string 'None'
           means no forcing

    The output variant is derived from the FASTA and QUAL types. Sequences
    without quality scores are counted as skipped. A summary is printed to
    standard output.
    """
    # Read command line arguments
    fasta_filename = sys.argv[1]
    fasta_type = sys.argv[2] or 'fasta'  # should always be fasta or csfasta? what if txt?
    qual_filename = sys.argv[3]
    qual_type = sys.argv[4] or 'qualsanger'  # qual454 qualsolid
    output_filename = sys.argv[5]
    force_quality_encoding = sys.argv[6]
    if force_quality_encoding == 'None':
        force_quality_encoding = None

    fastq_format = 'sanger'
    if fasta_type == 'csfasta' or qual_type == 'qualsolid':
        fastq_format = 'cssanger'
    elif qual_type == 'qualsolexa':
        fastq_format = 'solexa'
    elif qual_type == 'qualillumina':
        fastq_format = 'illumina'

    last_index = None
    skip_count = 0
    # Plain file handles opened here; closed in the finally block below.
    open_handles = []
    out = fastqWriter(open(output_filename, 'wb'), format=fastq_format, force_quality_encoding=force_quality_encoding)
    try:
        if qual_filename == 'None':
            qual_input = fastqFakeFastaScoreReader(fastq_format, quality_encoding=force_quality_encoding)
        else:
            qual_handle = open(qual_filename, 'rb')
            open_handles.append(qual_handle)
            qual_input = fastaNamedReader(qual_handle)

        fastq_combiner = fastqCombiner(fastq_format)
        fasta_handle = open(fasta_filename, 'rb')
        open_handles.append(fasta_handle)
        for last_index, sequence in enumerate(fastaReader(fasta_handle)):
            quality = qual_input.get(sequence)
            if quality:
                fastq_read = fastq_combiner.combine(sequence, quality)
                out.write(fastq_read)
            else:
                skip_count += 1

        # The report runs before the handles are closed because has_data()
        # is called on the still-open QUAL reader.
        if last_index is None:
            print("Your file contains no valid FASTA sequences.")
        else:
            total = last_index + 1
            combined = total - skip_count
            print(qual_input.has_data())
            print('Combined %s of %s sequences with quality scores (%.2f%%).' % (combined, total, float(combined) / float(total) * 100.0))
    finally:
        out.close()
        for handle in open_handles:
            handle.close()
def partition(input_filename, temp_output_filename, fileCount, quality_encoding, verbose):
    """Groom one partition of a FASTQ file into a temporary output file.

    The input and output variants are taken from the module-level ``ARGV``
    (positions 1 and 3).

    Args:
        input_filename: path of the partition to read.
        temp_output_filename: path of the groomed output. When *verbose* is
            set and at least one read was processed, the pickled aggregator
            is stored next to it with the suffix ``_summary``.
        fileCount: not used by this function.
        quality_encoding: quality encoding to force on output; the literal
            string 'None' means no forcing.
        verbose: when truthy, use the verbose-error reader and aggregate
            statistics over the reads.
    """
    input_type = ARGV[1]
    output_type = ARGV[3]
    force_quality_encoding = quality_encoding
    summarize_input = verbose
    if force_quality_encoding == 'None':
        force_quality_encoding = None

    aggregator = fastqAggregator()
    if summarize_input:
        reader_class = fastqVerboseErrorReader
    else:
        reader_class = fastqReader

    read_count = None
    temp_process_file = fastqWriter(open(temp_output_filename, 'wb'), format=output_type, force_quality_encoding=force_quality_encoding)
    try:
        with open(input_filename, 'rb') as in_handle:
            reader = reader_class(in_handle, format=input_type, apply_galaxy_conventions=True)
            for read_count, fastq_read in enumerate(reader):
                if summarize_input:
                    aggregator.consume_read(fastq_read)
                temp_process_file.write(fastq_read)
    finally:
        # Close the output even when a malformed read aborts the loop.
        temp_process_file.close()

    if read_count is None:
        print("No valid FASTQ reads were provided.")
        return

    if input_type != output_type and 'solexa' in [input_type, output_type]:
        print("Converted between Solexa and PHRED scores.")
    if summarize_input:
        # NOTE(review): text mode is kept as in the original; whoever reads
        # this summary must open it in the matching mode - confirm.
        with open(temp_output_filename + "_summary", 'w') as summaryLogFile:
            pickle.dump(aggregator, summaryLogFile)
def main():
    """Trim a number or percentage of columns from both ends of FASTQ reads.

    Command line arguments (``sys.argv``):
        1: input FASTQ file
        2: output FASTQ file
        3: left offset
        4: right offset
        5: 'offsets_percent' to interpret the offsets as percentages of the
           read length; anything else means absolute column counts
        6: FASTQ variant; an empty string falls back to 'sanger'
        7: 'keep_zero_length' to also write reads that end up empty

    A right offset of zero leaves the right end untouched. A summary is
    printed to standard output.
    """
    input_filename = sys.argv[1]
    output_filename = sys.argv[2]
    left_offset = sys.argv[3]
    right_offset = sys.argv[4]
    percent_offsets = sys.argv[5] == 'offsets_percent'
    input_type = sys.argv[6] or 'sanger'
    keep_zero_length = sys.argv[7] == 'keep_zero_length'

    num_reads_excluded = 0
    num_reads = None
    out = fastqWriter(path=output_filename, format=input_type)
    try:
        reader = fastqReader(path=input_filename, format=input_type)
        try:
            for num_reads, fastq_read in enumerate(reader):
                if percent_offsets:
                    left_column_offset = int(round(float(left_offset) / 100.0 * float(len(fastq_read))))
                    right_column_offset = int(round(float(right_offset) / 100.0 * float(len(fastq_read))))
                else:
                    left_column_offset = int(left_offset)
                    right_column_offset = int(right_offset)
                # Negate the right offset to use it as a slice end; None
                # means "up to the end of the read".
                if right_column_offset != 0:
                    right_column_offset = -right_column_offset
                else:
                    right_column_offset = None
                fastq_read = fastq_read.slice(left_column_offset, right_column_offset)
                if keep_zero_length or len(fastq_read):
                    out.write(fastq_read)
                else:
                    num_reads_excluded += 1
        finally:
            reader.close()
    finally:
        out.close()

    if num_reads is None:
        print("No valid fastq reads could be processed.")
    else:
        print("%i fastq reads were processed." % (num_reads + 1))
    if num_reads_excluded:
        print("%i reads of zero length were excluded from the output." % num_reads_excluded)
def main():
    """Join each read of one FASTQ file with its partner from a second file.

    Command line arguments (``sys.argv``):
        1: first input FASTQ file (read sequentially)
        2: variant of file 1; an empty string falls back to 'sanger'
        3: second input FASTQ file (looked up by identifier)
        4: variant of file 2; an empty string falls back to 'sanger'
        5: output FASTQ file for the joined reads
        6: header style, 'new' or 'old'; an empty string falls back to 'old'
        7: paste string handed to the joiner; may be empty

    Reads of file 1 without a partner are counted as skipped. A summary is
    printed to standard output.
    """
    # Read command line arguments
    input1_filename = sys.argv[1]
    input1_type = sys.argv[2] or 'sanger'
    input2_filename = sys.argv[3]
    input2_type = sys.argv[4] or 'sanger'
    output_filename = sys.argv[5]

    fastq_style = sys.argv[6] or 'old'

    paste = sys.argv[7] or ''
    # --

    if input1_type != input2_type:
        print("WARNING: You are trying to join files of two different types: %s and %s." % (input1_type, input2_type))

    if fastq_style == 'new':
        sep = sniff_sep(input1_filename)
        joiner = FastqJoiner(input1_type, sep=sep, paste=paste)
    else:
        joiner = fq.fastqJoiner(input1_type, paste=paste)
    # --

    last_index = None
    skip_count = 0
    input2 = fq.fastqNamedReader(path=input2_filename, format=input2_type)
    try:
        out = fq.fastqWriter(path=output_filename, format=input1_type)
        try:
            reader = fq.fastqReader(path=input1_filename, format=input1_type)
            try:
                for last_index, fastq_read in enumerate(reader):
                    identifier = joiner.get_paired_identifier(fastq_read)
                    fastq_paired = input2.get(identifier)
                    if fastq_paired is None:
                        skip_count += 1
                    else:
                        out.write(joiner.join(fastq_read, fastq_paired))
            finally:
                reader.close()
        finally:
            out.close()

        # The report runs before input2 is closed because has_data() is
        # called on the still-open reader.
        if last_index is None:
            print("Your file contains no valid FASTQ reads.")
        else:
            total = last_index + 1
            joined = total - skip_count
            print(input2.has_data())
            # float() keeps the ratio fractional even where "/" is integer
            # division.
            print('Joined %s of %s read pairs (%.2f%%).' % (joined, total, float(joined) / float(total) * 100.0))
    finally:
        input2.close()
def run(self):
    """Groom the configured input into the configured output, then report.

    Reads come from ``self.file_handle`` / ``self.input_filename`` in the
    ``self.input_type`` variant and are written to ``self.output_filename``
    in the ``self.output_type`` variant. With ``self.summarize_input`` set,
    the verbose-error reader is used and every read is fed to an
    aggregator. The index of the last read (``None`` when there was none)
    and the aggregator are handed to ``self._print_output``.
    """
    if self.summarize_input:
        make_reader = fastqVerboseErrorReader
    else:
        make_reader = fastqReader

    stats = fastqAggregator()
    last_index = None

    sink = fastqWriter(
        path=self.output_filename,
        format=self.output_type,
        force_quality_encoding=self.force_quality_encoding,
    )
    source = make_reader(
        fh=self.file_handle,
        path=self.input_filename,
        format=self.input_type,
        apply_galaxy_conventions=True,
        fix_id=self.fix_id,
    )

    with sink, source:
        for last_index, record in enumerate(source):
            if self.summarize_input:
                stats.consume_read(record)
            sink.write(record)

    self._print_output(last_index, stats)
def main():
    """Groom a FASTQ file: re-write it in the requested output format.

    Positional command line arguments:
        1. input FASTQ path
        2. input format
        3. output FASTQ path
        4. output format
        5. forced quality encoding, or the literal string ``'None'``
        6. ``'summarize_input'`` to collect statistics on the input reads
        7. (optional) ``'fix_id'`` to fix inconsistent identifiers
           (SRA data dumps)

    The final report is delegated to ``_print_output``.
    """
    args = sys.argv
    input_filename = args[1]
    input_type = args[2]
    output_filename = args[3]
    output_type = args[4]
    force_quality_encoding = args[5]
    summarize_input = args[6] == 'summarize_input'
    if force_quality_encoding == 'None':
        force_quality_encoding = None

    # The seventh argument is optional; without it identifiers are left alone
    fix_id = len(args) > 7 and args[7] == 'fix_id'

    aggregator = fastqAggregator()
    writer = fastqWriter(path=output_filename,
                         format=output_type,
                         force_quality_encoding=force_quality_encoding)

    reader_type = fastqVerboseErrorReader if summarize_input else fastqReader
    reader = reader_type(path=input_filename,
                         format=input_type,
                         apply_galaxy_conventions=True,
                         fix_id=fix_id)

    last_index = None  # remains None when no read could be parsed
    for last_index, record in enumerate(reader):
        if summarize_input:
            aggregator.consume_read(record)
        writer.write(record)
    writer.close()

    _print_output(last_index, input_type, output_type, summarize_input, aggregator)
in_handle.close() else: # Use Galaxy for FASTA, QUAL or FASTQ if seq_format.lower() in ["fasta", "csfasta"] or seq_format.lower().startswith( "qual" ): from galaxy_utils.sequence.fasta import fastaReader, fastaWriter reader = fastaReader(open(in_file, "rU")) writer = fastaWriter(open(out_file, "w")) marker = ">" elif seq_format.lower().startswith("fastq"): from galaxy_utils.sequence.fastq import fastqReader, fastqWriter reader = fastqReader(open(in_file, "rU")) writer = fastqWriter(open(out_file, "w")) marker = "@" else: sys.exit("Unsupported file type %r" % seq_format) # Now do the renaming count = 0 renamed = 0 for record in reader: # The [1:] is because the fastaReader leaves the > on the identifier, # likewise the fastqReader leaves the @ on the identifier try: idn, descr = record.identifier[1:].split(None, 1) except ValueError: idn = record.identifier[1:] descr = None if idn in rename:
re_illumina_f = re.compile(r"^@[a-zA-Z0-9_:-]+ 1:.*$") re_illumina_r = re.compile(r"^@[a-zA-Z0-9_:-]+ 2:.*$") assert re_illumina_f.match( "@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 1:N:0:TGNCCA") assert re_illumina_r.match( "@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 2:N:0:TGNCCA") assert not re_illumina_f.match( "@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 2:N:0:TGNCCA") assert not re_illumina_r.match( "@HWI-ST916:79:D04M5ACXX:1:1101:10000:100326 1:N:0:TGNCCA") count, forward, reverse, neither, pairs, singles = 0, 0, 0, 0, 0, 0 in_handle = open(input_fastq) if pairs_fastq: pairs_f_writer = fastqWriter(open(pairs_fastq, "w"), format) pairs_r_writer = pairs_f_writer else: pairs_f_writer = fastqWriter(open(pairs_f_fastq, "w"), format) pairs_r_writer = fastqWriter(open(pairs_r_fastq, "w"), format) singles_writer = fastqWriter(open(singles_fastq, "w"), format) last_template, buffered_reads = None, [] for record in fastqReader(in_handle, format): count += 1 name = record.identifier.split(None, 1)[0] assert name[0] == "@", record.identifier #Quirk of the Galaxy parser is_forward = False suffix = re_f.search(name) if suffix: #============
def main():
    """Trim FASTQ read ends with a sliding quality-score window.

    Command line: ``[options] input_file output_file`` (see the option
    help strings below). For each requested end ('5' and/or '3', processed
    in the order given by ``--trim_ends``) a window slides inwards from that
    end until ``exclude_and_compare`` accepts it; everything outside the
    accepted window position is trimmed. If no window is accepted the read
    is emptied. Empty reads are written only with ``--keep_zero_length``.

    A summary of processed / excluded reads is printed to stdout.
    """
    # Local import so this script entry point carries its own dependency
    import itertools

    usage = "usage: %prog [options] input_file output_file"
    parser = OptionParser(usage=usage)
    parser.add_option('-f', '--format', dest='format', type='choice', default='sanger', choices=('sanger', 'cssanger', 'solexa', 'illumina'), help='FASTQ variant type')
    parser.add_option('-s', '--window_size', type="int", dest='window_size', default='1', help='Window size')
    parser.add_option('-t', '--window_step', type="int", dest='window_step', default='1', help='Window step')
    parser.add_option('-e', '--trim_ends', type="choice", dest='trim_ends', default='53', choices=('5', '3', '53', '35'), help='Ends to Trim')
    parser.add_option('-a', '--aggregation_action', type="choice", dest='aggregation_action', default='min', choices=('min', 'max', 'sum', 'mean'), help='Aggregate action for window')
    parser.add_option('-x', '--exclude_count', type="int", dest='exclude_count', default='0', help='Maximum number of bases to exclude from the window during aggregation')
    parser.add_option('-c', '--score_comparison', type="choice", dest='score_comparison', default='>=', choices=('>', '>=', '==', '<', '<=', '!='), help='Keep read when aggregate score is')
    parser.add_option('-q', '--quality_score', type="float", dest='quality_score', default='0', help='Quality Score')
    parser.add_option("-k", "--keep_zero_length", action="store_true", dest="keep_zero_length", default=False, help="Keep reads with zero length")
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error("Need to specify an input file and an output file")

    if options.window_size < 1:
        parser.error('You must specify a strictly positive window size')

    if options.window_step < 1:
        parser.error('You must specify a strictly positive step size')

    # Exhaustive list of window index sets that may be excluded from
    # aggregation: every combination of 1..max_excluded indexes, smallest
    # sets first and lexicographically ordered within each size. This is the
    # same content and order the previous hand-rolled loop produced, without
    # its quadratic duplicate checks.
    max_excluded = min(options.exclude_count, options.window_size)
    exclude_window_indexes = [
        list(combo)
        for size in range(1, max_excluded + 1)
        for combo in itertools.combinations(range(options.window_size), size)
    ]

    out = fastqWriter(open(args[1], 'wb'), format=options.format)
    action = ACTION_METHODS[options.aggregation_action]

    num_reads = None  # stays None when the input holds no valid read
    num_reads_excluded = 0
    try:
        in_handle = open(args[0])
        try:
            for num_reads, fastq_read in enumerate(fastqReader(in_handle, format=options.format)):
                for trim_end in options.trim_ends:
                    # Re-read the scores: a previous pass may have trimmed the read
                    quality_list = fastq_read.get_decimal_quality_scores()
                    if trim_end == '5':
                        lwindow_position = 0  # left position of window
                        while True:
                            if lwindow_position >= len(quality_list):
                                # Ran off the end without an acceptable window
                                fastq_read.sequence = ''
                                fastq_read.quality = ''
                                break
                            if exclude_and_compare(action, quality_list[lwindow_position:lwindow_position + options.window_size], options.score_comparison, options.quality_score, exclude_window_indexes):
                                fastq_read = fastq_read.slice(lwindow_position, None)
                                break
                            lwindow_position += options.window_step
                    else:
                        rwindow_position = len(quality_list)  # right position of window
                        while True:
                            lwindow_position = rwindow_position - options.window_size  # left position of window
                            if rwindow_position <= 0 or lwindow_position < 0:
                                # No full window fits any more: empty the read
                                fastq_read.sequence = ''
                                fastq_read.quality = ''
                                break
                            if exclude_and_compare(action, quality_list[lwindow_position:rwindow_position], options.score_comparison, options.quality_score, exclude_window_indexes):
                                fastq_read = fastq_read.slice(None, rwindow_position)
                                break
                            rwindow_position -= options.window_step
                if options.keep_zero_length or len(fastq_read):
                    out.write(fastq_read)
                else:
                    num_reads_excluded += 1
        finally:
            in_handle.close()
    finally:
        # Close the output even when a read fails to parse
        out.close()

    # Single-argument print(...) emits the same text under Python 2 and 3
    if num_reads is None:
        print("No valid FASTQ reads could be processed.")
    else:
        print("%i FASTQ reads were processed." % (num_reads + 1))
    if num_reads_excluded:
        print("%i reads of zero length were excluded from the output." % num_reads_excluded)
in_handle.seek(0) #start again after getting manifest count = writer.write_file(rename_seqrecords(SffIterator(in_handle), rename)) out_handle.close() in_handle.close() else: #Use Galaxy for FASTA, QUAL or FASTQ if seq_format.lower() in ["fasta", "csfasta"] \ or seq_format.lower().startswith("qual"): from galaxy_utils.sequence.fasta import fastaReader, fastaWriter reader = fastaReader(open(in_file, "rU")) writer = fastaWriter(open(out_file, "w")) marker = ">" elif seq_format.lower().startswith("fastq"): from galaxy_utils.sequence.fastq import fastqReader, fastqWriter reader = fastqReader(open(in_file, "rU")) writer = fastqWriter(open(out_file, "w")) marker = "@" else: sys.exit("Unsupported file type %r" % seq_format) #Now do the renaming count = 0 renamed = 0 for record in reader: #The [1:] is because the fastaReader leaves the > on the identifier, #likewise the fastqReader leaves the @ on the identifier try: idn, descr = record.identifier[1:].split(None, 1) except ValueError: idn = record.identifier[1:] descr = None if idn in rename:
in_handle = open(in_file, "rb") try: manifest = ReadRocheXmlManifest(in_handle) except ValueError: manifest = None in_handle.seek(0) out_handle = open(out_file, "wb") writer = SffWriter(out_handle, xml=manifest) writer.write_file(process(SffIterator(in_handle))) #End of SFF code elif seq_format.lower().startswith("fastq"): in_handle = open(in_file, "rU") out_handle = open(out_file, "w") reader = fastqReader(in_handle) writer = fastqWriter(out_handle) if forward: for record in reader: seq = record.sequence.upper() result = primer.search(seq) if result: #Forward primer, take everything after it cut = result.end() record.sequence = seq[cut:] if len(record.sequence) >= min_len: record.quality = record.quality[cut:] clipped += 1 writer.write(record) else: short_clipped += 1 elif keep_negatives:
def main():
    """Split a sequence file by whether each record's ID is in a tabular file.

    Expects exactly six command line arguments: the tabular file, a
    comma-separated list of 1-based column numbers holding identifiers, the
    sequence file, its format (``sff``, ``fasta`` or ``fastq*``), and the
    output paths for matching and non-matching records. Either output path
    may be ``-`` to skip that output, but not both.
    """
    # Parse Command Line
    try:
        (tabular_file, cols_arg, in_file, seq_format,
         out_positive_file, out_negative_file) = sys.argv[1:]
    except ValueError:
        stop_err("Expected six arguments, got %i:\n%s" % (len(sys.argv) - 1, " ".join(sys.argv)))
    try:
        columns = [int(arg) - 1 for arg in cols_arg.split(",")]
    except ValueError:
        stop_err("Expected list of columns (comma separated integers), got %s" % cols_arg)
    want_positive = out_positive_file != "-"
    want_negative = out_negative_file != "-"
    if not (want_positive or want_negative):
        stop_err("Neither output file requested")

    # Read tabular file and record all specified identifiers
    ids = set()
    handle = open(tabular_file, "rU")
    if len(columns) > 1:
        # General case of many columns
        for line in handle:
            if not line.startswith("#"):  # lines starting with '#' are comments
                fields = line.rstrip("\n").split("\t")
                ids.update(fields[col] for col in columns)
        print("Using %i IDs from %i columns of tabular file" % (len(ids), len(columns)))
    else:
        # Single column, special case speed up
        col = columns[0]
        ids.update(line.rstrip("\n").split("\t")[col]
                   for line in handle if not line.startswith("#"))
        print("Using %i IDs from tabular file" % (len(ids)))
    handle.close()

    def is_listed(record):
        # The [1:] is because the readers leave the leading marker
        # character ('>' or '@') on the identifier.
        identifier = record.identifier
        return identifier and identifier.split()[0][1:] in ids

    def split_records(reader, writer_class, both_msg, positive_msg, negative_msg):
        # Route each record to the requested output(s), announcing the mode first.
        if want_positive and want_negative:
            print(both_msg)
            positive_writer = writer_class(open(out_positive_file, "w"))
            negative_writer = writer_class(open(out_negative_file, "w"))
            for record in reader:
                target = positive_writer if is_listed(record) else negative_writer
                target.write(record)
            positive_writer.close()
            negative_writer.close()
        elif want_positive:
            print(positive_msg)
            positive_writer = writer_class(open(out_positive_file, "w"))
            for record in reader:
                if is_listed(record):
                    positive_writer.write(record)
            positive_writer.close()
        elif want_negative:
            print(negative_msg)
            negative_writer = writer_class(open(out_negative_file, "w"))
            for record in reader:
                if not is_listed(record):
                    negative_writer.write(record)
            negative_writer.close()

    format_name = seq_format.lower()
    if format_name == "sff":
        # Now write filtered SFF file based on IDs from BLAST file
        try:
            from Bio.SeqIO.SffIO import SffIterator, SffWriter
        except ImportError:
            stop_err("Requires Biopython 1.54 or later")
        try:
            from Bio.SeqIO.SffIO import ReadRocheXmlManifest
        except ImportError:
            # Prior to Biopython 1.56 this was a private function
            from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest
        in_handle = open(in_file, "rb")  # must be binary mode!
        try:
            manifest = ReadRocheXmlManifest(in_handle)
        except ValueError:
            manifest = None

        def filter_sff(out_path, wanted):
            # One full pass over the SFF input, keeping the records whose
            # membership in ids equals `wanted`; returns write_file's result.
            out_handle = open(out_path, "wb")
            writer = SffWriter(out_handle, xml=manifest)
            in_handle.seek(0)  # start again after getting manifest / previous pass
            written = writer.write_file(
                rec for rec in SffIterator(in_handle) if (rec.id in ids) == wanted)
            out_handle.close()
            return written

        # This makes two passes though the SFF file which isn't so efficient,
        # but this makes the code simple.
        if want_positive:
            pos_count = filter_sff(out_positive_file, True)
        if want_negative:
            neg_count = filter_sff(out_negative_file, False)
        # And we're done
        in_handle.close()
        # At the time of writing, Galaxy doesn't show SFF file read counts,
        # so it is useful to put them in stdout and thus shown in job info.
        if want_positive and want_negative:
            print("%i with and %i without specified IDs" % (pos_count, neg_count))
        elif want_positive:
            print("%i with specified IDs" % pos_count)
        elif want_negative:
            print("%i without specified IDs" % neg_count)
    elif format_name == "fasta":
        # Write filtered FASTA file based on IDs from tabular file
        reader = fastaReader(open(in_file, "rU"))
        split_records(reader, fastaWriter,
                      "Generating two FASTA files",
                      "Generating matching FASTA file",
                      "Generating non-matching FASTA file")
    elif format_name.startswith("fastq"):
        # Write filtered FASTQ file based on IDs from tabular file
        from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
        reader = fastqReader(open(in_file, "rU"))
        split_records(reader, fastqWriter,
                      "Generating two FASTQ files",
                      "Generating matching FASTQ file",
                      "Generating non-matching FASTQ file")
    else:
        stop_err("Unsupported file type %r" % seq_format)
def main():
    """Mask bases of FASTQ reads whose quality score meets a comparison.

    Command line: ``[options] input_file output_file`` (see the option
    help strings below). Every base whose decimal quality score satisfies
    the chosen comparison against ``--quality_score`` is either lowercased
    (``--lowercase``) or replaced by ``--mask_character``. All reads are
    written; a count of processed reads is printed to stdout.
    """
    usage = "usage: %prog [options] input_file output_file"
    parser = OptionParser(usage=usage)
    parser.add_option('-f', '--format', dest='format', type='choice', default='sanger', choices=('sanger', 'solexa', 'illumina'), help='FASTQ variant type')
    parser.add_option('-m', '--mask_character', dest='mask_character', default='N', help='Mask Character to use')
    parser.add_option('-c', '--score_comparison', type="choice", dest='score_comparison', default='le', choices=('gt', 'ge', 'eq', 'lt', 'le', 'ne'), help='Mask base when score is')
    parser.add_option('-s', '--quality_score', type="float", dest='quality_score', default='0', help='Quality Score')
    parser.add_option("-l", "--lowercase", action="store_true", dest="lowercase", default=False, help="Use lowercase masking")
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error("Need to specify an input file and an output file")

    score_comparer = get_score_comparer(options.score_comparison)

    if options.lowercase:
        # Method call instead of string.lower, which exists only in Python 2
        def base_masker(base):
            return base.lower()
    else:
        base_masker = BaseReplacer(options.mask_character)

    out = fastqWriter(open(args[1], 'wb'), format=options.format)

    num_reads = None  # stays None when the input holds no valid read
    try:
        in_handle = open(args[0])
        try:
            for num_reads, fastq_read in enumerate(fastqReader(in_handle, format=options.format)):
                sequence_list = list(fastq_read.sequence)
                for i, quality_score in enumerate(fastq_read.get_decimal_quality_scores()):
                    if score_comparer(quality_score, options.quality_score):
                        sequence_list[i] = base_masker(sequence_list[i])
                fastq_read.sequence = "".join(sequence_list)
                out.write(fastq_read)
        finally:
            in_handle.close()
    finally:
        # The writer was previously never closed; close it so buffered
        # output reaches disk even when a read fails.
        out.close()

    # Single-argument print(...) emits the same text under Python 2 and 3
    if num_reads is not None:
        print("Processed %i %s reads." % (num_reads + 1, options.format))
    else:
        print("No valid FASTQ reads were provided.")
ids.add(parts[col]) print "Using %i IDs from %i columns of tabular file" % (len(ids), len(columns)) else: # Single column, special case speed up col = columns[0] for line in handle: if not line.startswith("#"): ids.add(line.rstrip("\n").split("\t")[col]) print "Using %i IDs from tabular file" % (len(ids)) handle.close() # Write filtered FASTQ file based on IDs from tabular file reader = fastqReader(open(in_file, "rU")) if out_positive_file != "-" and out_negative_file != "-": print "Generating two FASTQ files" positive_writer = fastqWriter(open(out_positive_file, "w")) negative_writer = fastqWriter(open(out_negative_file, "w")) for record in reader: # The [1:] is because the fastaReader leaves the @ on the identifer. if record.identifier and record.identifier.split()[0][1:] in ids: positive_writer.write(record) else: negative_writer.write(record) positive_writer.close() negative_writer.close() elif out_positive_file != "-": print "Generating matching FASTQ file" positive_writer = fastqWriter(open(out_positive_file, "w")) for record in reader: # The [1:] is because the fastaReader leaves the @ on the identifer. if record.identifier and record.identifier.split()[0][1:] in ids:
def main():
    """Trim FASTQ read ends with a sliding quality-score window.

    Command line: ``[options] input_file output_file`` (see the option
    help strings below). For each requested end ('5' and/or '3', processed
    in the order given by ``--trim_ends``) a window slides inwards from that
    end until ``exclude_and_compare`` accepts it; everything outside the
    accepted window position is trimmed. If no window is accepted the read
    is emptied. Empty reads are written only with ``--keep_zero_length``.

    A summary of processed / excluded reads is printed to stdout.
    """
    # Local import so this script entry point carries its own dependency
    import itertools

    usage = "usage: %prog [options] input_file output_file"
    parser = OptionParser(usage=usage)
    parser.add_option('-f', '--format', dest='format', type='choice', default='sanger',
                      choices=('sanger', 'cssanger', 'solexa', 'illumina',
                               'sanger.gz', 'cssanger.gz', 'solexa.gz', 'illumina.gz',
                               'sanger.bz2', 'cssanger.bz2', 'solexa.bz2', 'illumina.bz2'),
                      help='FASTQ variant type')
    parser.add_option('-s', '--window_size', type="int", dest='window_size', default='1', help='Window size')
    parser.add_option('-t', '--window_step', type="int", dest='window_step', default='1', help='Window step')
    parser.add_option('-e', '--trim_ends', type="choice", dest='trim_ends', default='53', choices=('5', '3', '53', '35'), help='Ends to Trim')
    parser.add_option('-a', '--aggregation_action', type="choice", dest='aggregation_action', default='min', choices=('min', 'max', 'sum', 'mean'), help='Aggregate action for window')
    parser.add_option('-x', '--exclude_count', type="int", dest='exclude_count', default='0', help='Maximum number of bases to exclude from the window during aggregation')
    parser.add_option('-c', '--score_comparison', type="choice", dest='score_comparison', default='>=', choices=('>', '>=', '==', '<', '<=', '!='), help='Keep read when aggregate score is')
    parser.add_option('-q', '--quality_score', type="float", dest='quality_score', default='0', help='Quality Score')
    parser.add_option("-k", "--keep_zero_length", action="store_true", dest="keep_zero_length", default=False, help="Keep reads with zero length")
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error("Need to specify an input file and an output file")

    if options.window_size < 1:
        parser.error('You must specify a strictly positive window size')

    if options.window_step < 1:
        parser.error('You must specify a strictly positive step size')

    # Exhaustive list of window index sets that may be excluded from
    # aggregation: every combination of 1..max_excluded indexes, smallest
    # sets first and lexicographically ordered within each size. This is the
    # same content and order the previous hand-rolled loop produced, without
    # its quadratic duplicate checks.
    max_excluded = min(options.exclude_count, options.window_size)
    exclude_window_indexes = [
        list(combo)
        for size in range(1, max_excluded + 1)
        for combo in itertools.combinations(range(options.window_size), size)
    ]

    out = fastqWriter(path=args[1], format=options.format)
    action = ACTION_METHODS[options.aggregation_action]

    num_reads = None  # stays None when the input holds no valid read
    num_reads_excluded = 0
    try:
        for num_reads, fastq_read in enumerate(fastqReader(path=args[0], format=options.format)):
            for trim_end in options.trim_ends:
                # Re-read the scores: a previous pass may have trimmed the read
                quality_list = fastq_read.get_decimal_quality_scores()
                if trim_end == '5':
                    lwindow_position = 0  # left position of window
                    while True:
                        if lwindow_position >= len(quality_list):
                            # Ran off the end without an acceptable window
                            fastq_read.sequence = ''
                            fastq_read.quality = ''
                            break
                        if exclude_and_compare(action, quality_list[lwindow_position:lwindow_position + options.window_size], options.score_comparison, options.quality_score, exclude_window_indexes):
                            fastq_read = fastq_read.slice(lwindow_position, None)
                            break
                        lwindow_position += options.window_step
                else:
                    rwindow_position = len(quality_list)  # right position of window
                    while True:
                        lwindow_position = rwindow_position - options.window_size  # left position of window
                        if rwindow_position <= 0 or lwindow_position < 0:
                            # No full window fits any more: empty the read
                            fastq_read.sequence = ''
                            fastq_read.quality = ''
                            break
                        if exclude_and_compare(action, quality_list[lwindow_position:rwindow_position], options.score_comparison, options.quality_score, exclude_window_indexes):
                            fastq_read = fastq_read.slice(None, rwindow_position)
                            break
                        rwindow_position -= options.window_step
            if options.keep_zero_length or len(fastq_read):
                out.write(fastq_read)
            else:
                num_reads_excluded += 1
    finally:
        # Close the output even when a read fails to parse
        out.close()

    if num_reads is None:
        print("No valid FASTQ reads could be processed.")
    else:
        print("%i FASTQ reads were processed." % (num_reads + 1))
    if num_reads_excluded:
        print("%i reads of zero length were excluded from the output." % num_reads_excluded)
#And we're done in_handle.close() #At the time of writing, Galaxy doesn't show SFF file read counts, #so it is useful to put them in stdout and thus shown in job info. print "%i with and %i without specified IDs" % (pos_count, neg_count) elif seq_format.lower()=="fasta": #Write filtered FASTA file based on IDs from tabular file pos_count, neg_count = fasta_filter(in_file, out_positive_file, out_negative_file, ids) print "%i with and %i without specified IDs" % (pos_count, neg_count) elif seq_format.lower().startswith("fastq"): #Write filtered FASTQ file based on IDs from tabular file from galaxy_utils.sequence.fastq import fastqReader, fastqWriter reader = fastqReader(open(in_file, "rU")) if out_positive_file is not None and out_negative_file is not None: print "Generating two FASTQ files" positive_writer = fastqWriter(open(out_positive_file, "w")) negative_writer = fastqWriter(open(out_negative_file, "w")) for record in reader: #The [1:] is because the fastaReader leaves the > on the identifier. if record.identifier and clean_name(record.identifier.split()[0][1:]) in ids: positive_writer.write(record) else: negative_writer.write(record) positive_writer.close() negative_writer.close() elif out_positive_file is not None: print "Generating matching FASTQ file" positive_writer = fastqWriter(open(out_positive_file, "w")) for record in reader: #The [1:] is because the fastaReader leaves the > on the identifier. if record.identifier and clean_name(record.identifier.split()[0][1:]) in ids: