def write_synced_barcodes_fastq(joined_fp, index_fp):
    """Write the index reads that match the surviving assembled paired-ends.

    -joined_fp : file path to paired-end assembled fastq file
    -index_fp : file path to index / barcode reads fastq file

    The joined-reads file and the index-reads file are walked in parallel.
    For every joined read, index reads are consumed until one with an
    identical header is found; that index read is written to a new fastq
    file named after joined_fp, with its extension replaced by
    '_barcodes.fastq'.

    Returns the path of the barcode fastq file that was written.

    Raises StopIteration (with an explanatory message) if the index-reads
    file is exhausted before every joined read has been matched.

    WARNING: Assumes reads are in the same order in both files,
    except for cases in which the corresponding read in the joined_fp
    file is missing (i.e. pairs failed to assemble).
    """
    # base new index file name on joined paired-end file name:
    filtered_bc_outfile_path = \
        os.path.splitext(joined_fp)[0] + '_barcodes.fastq'

    end_of_index_msg = (
        "\n\nReached end of index-reads file"
        " before iterating through joined paired-end-reads file!"
        " Except for missing paired-end reads that did not survive"
        " assembly, your index and paired-end reads files must be in"
        " the same order! Also, check that the index-reads and"
        " paired-end reads have identical headers. The last joined"
        " paired-end ID processed was:\n\'%s\'\n")

    # Every handle opened so far; all of them are closed in the finally
    # clause, even when an error interrupts the synchronisation.
    open_handles = []
    try:
        # open files (handles normal / gzipped data)
        jh = qiime_open(joined_fp)
        open_handles.append(jh)
        ih = qiime_open(index_fp)
        open_handles.append(ih)
        fbc_fh = open(filtered_bc_outfile_path, 'w')
        open_handles.append(fbc_fh)

        # Set up iterators
        index_fastq_iter = MinimalFastqParser(ih, strict=False)
        joined_fastq_iter = MinimalFastqParser(jh, strict=False)

        # Write barcodes / index reads that we observed within
        # the joined paired-ends.
        for joined_label, joined_seq, joined_qual in joined_fastq_iter:
            # Skip index reads until the header matches the joined read.
            # Every advance of the index iterator is guarded, so running
            # out of index reads always produces the explanatory message.
            while True:
                try:
                    index_label, index_seq, index_qual = \
                        next(index_fastq_iter)
                except StopIteration:
                    raise StopIteration(end_of_index_msg % joined_label)
                if index_label == joined_label:
                    break

            fbc_fh.write('@%s\n%s\n+\n%s\n'
                         % (index_label, index_seq, index_qual))
    finally:
        for handle in open_handles:
            handle.close()

    return filtered_bc_outfile_path
def write_synced_barcodes_fastq(joined_fp, index_fp):
    """Writes new index file based on surviving assembled paired-ends.

    -joined_fp : file path to paired-end assembled fastq file
    -index_fp : file path to index / barcode reads fastq file

    This function iterates through the joined reads file and index file.
    Only those index-reads within the file at index_fp, that have headers
    matching those within the joined-pairs at joined_fp, are written
    to file. The output file is named after joined_fp, with the extension
    replaced by '_barcodes.fastq'; its path is returned.

    Raises StopIteration, carrying an explanatory message, when the
    index-reads file ends before all joined reads have been matched.

    WARNING: Assumes reads are in the same order in both files,
    except for cases in which the corresponding read in the joined_fp
    file is missing (i.e. pairs failed to assemble).
    """
    # base new index file name on joined paired-end file name:
    j_path = os.path.splitext(joined_fp)[0]
    filtered_bc_outfile_path = j_path + '_barcodes.fastq'

    out_of_index_reads = (
        "\n\nReached end of index-reads file"
        " before iterating through joined paired-end-reads file!"
        " Except for missing paired-end reads that did not survive"
        " assembly, your index and paired-end reads files must be in"
        " the same order! Also, check that the index-reads and"
        " paired-end reads have identical headers. The last joined"
        " paired-end ID processed was:\n\'%s\'\n")

    # Handles are registered as they are opened so that the finally clause
    # closes exactly those that exist, whatever goes wrong.
    handles = []
    try:
        # open files (handles normal / gzipped data)
        jh = qiime_open(joined_fp)
        handles.append(jh)
        ih = qiime_open(index_fp)
        handles.append(ih)
        fbc_fh = open(filtered_bc_outfile_path, 'w')
        handles.append(fbc_fh)

        # Set up iterators
        index_fastq_iter = MinimalFastqParser(ih, strict=False)
        joined_fastq_iter = MinimalFastqParser(jh, strict=False)

        for joined_label, joined_seq, joined_qual in joined_fastq_iter:
            # Consume index reads up to and including the one whose header
            # equals the current joined header. The guard covers the very
            # first read as well, so an exhausted index file is always
            # reported with the full message.
            index_label = None
            while index_label != joined_label:
                try:
                    index_label, index_seq, index_qual = \
                        next(index_fastq_iter)
                except StopIteration:
                    raise StopIteration(out_of_index_reads % joined_label)

            fastq_string = '@%s\n%s\n+\n%s\n' \
                % (index_label, index_seq, index_qual)
            fbc_fh.write(fastq_string)
    finally:
        for fh in handles:
            fh.close()

    return filtered_bc_outfile_path
def main():
    """Count the sequences belonging to each sample and write a table.

    Reads the fasta or fastq file given on the command line, extracts the
    sample ID from every sequence label (the text between 'barcodelabel='
    and the trailing ';'), and writes one 'sample<TAB>count' line per
    sample to the output file. When no output path is given, the output is
    written to '<input basename>_counts.txt'.

    Raises ValueError if a sequence label does not carry a
    'barcodelabel=<sample>;' suffix.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_seqs_fp = opts.input_seqs_fp
    fileSize = get_file_size(input_seqs_fp)

    if opts.file_type:
        fileType = opts.file_type
    else:
        # Extension without its leading dot; an input path without any
        # extension yields '' and is reported below as an invalid type.
        fileType = splitext(input_seqs_fp)[1].lstrip('.')

    output_fp = opts.output_fp
    # if the output fp isn't specified, create one
    if not output_fp:
        input_file_basename = splitext(split(input_seqs_fp)[1])[0]
        output_fp = '%s_counts.txt' % (input_file_basename)

    sample_pattern = re.compile('^.*barcodelabel=(.*);$')

    input_seqs = open(input_seqs_fp, "U")
    try:
        output = open(output_fp, "w")
        try:
            # Both parsers yield the label as the first field of a record,
            # so a single counting loop serves fasta and fastq input.
            if fileType == 'fasta' or fileType == 'fa':
                records = MinimalFastaParser(input_seqs)
            elif fileType == 'fastq' or fileType == 'fq':
                records = MinimalFastqParser(input_seqs, strict=False)
            else:
                print("Invalid file type")
                records = []

            # count the number of seqs for each unique sample
            number_seqs_bySample = {}
            printcounter = 0
            for record in records:
                label = record[0]
                matchID = sample_pattern.match(label)
                if matchID is None:
                    raise ValueError(
                        "Sequence label does not end with "
                        "'barcodelabel=<sample>;': %s" % label)
                sampleID = matchID.group(1)
                number_seqs_bySample[sampleID] = \
                    number_seqs_bySample.get(sampleID, 0) + 1

                # Report progress once the counter reaches 1000, then
                # restart the counter.
                if printcounter == 1000:
                    pos = input_seqs.tell()
                    display_progress(pos, fileSize)
                    printcounter = 0
                printcounter += 1

            for key in number_seqs_bySample:
                output.write('%s\t%s\n' % (key, number_seqs_bySample[key]))
            sys.stdout.write('\n')
        finally:
            output.close()
    finally:
        input_seqs.close()
def filter_fastq_fp(input_seqs_fp, output_seqs_fp, seqs_to_keep, negate=False):
    """Filter a fastq file to include only sequences listed in seqs_to_keep

    input_seqs_fp: path of the fastq file to read.
    output_seqs_fp: path of the fastq file to write.
    seqs_to_keep, negate: passed unchanged to filter_fastq.

    Returns whatever filter_fastq returns. Both files are closed before
    this function returns, including when filter_fastq raises.
    """
    input_f = open(input_seqs_fp, 'U')
    try:
        output_f = open(output_seqs_fp, 'w')
        try:
            input_seqs = MinimalFastqParser(input_f, strict=False)
            # NOTE(review): assumes filter_fastq consumes input_seqs and
            # writes output_f before returning (i.e. is not lazy) --
            # confirm against its definition.
            return filter_fastq(input_seqs, output_f, seqs_to_keep, negate)
        finally:
            # Closing an already-closed file is a no-op, so this is safe
            # even if filter_fastq closes the output itself.
            output_f.close()
    finally:
        input_f.close()
def convert_fastaqual(fasta_file_path,
                      output_directory='.',
                      multiple_output_files=False,
                      ascii_increment=33,
                      full_fastq=False,
                      full_fasta_headers=False,
                      per_file_buffer_size=100000):
    '''Takes a FASTQfile, generates FASTA and QUAL file(s)

    fasta_file_path:  filepath of input FASTQ file.
    output_directory:  Directory to output converted files.
    multiple_output_files:  Make one file per SampleID.
    ascii_increment:  Conversion value for fastq ascii character to numeric
     quality score.
    full_fastq:  Write labels to both sequence and quality score lines
     (not used by this function).
    full_fasta_headers:  Retain all data on fasta label, instead of breaking
     at first whitespace.
    per_file_buffer_size:  When writing multiple output files, the number of
     buffered FASTA characters per output file that triggers writing that
     file's buffered FASTA and QUAL records to disk.

    Raises ValueError if a quality character maps to a negative score with
    the given ascii_increment.'''

    # rename this to avoid confusion...
    fastq_fp = fasta_file_path

    def append_to_file(out_fp, text):
        # Open, append and close immediately so that only one output handle
        # is open at any time, however many samples there are.
        out_f = open(out_fp, 'a')
        try:
            out_f.write(text)
        finally:
            out_f.close()

    fastq_f = fasta_out_f = qual_out_f = None
    try:
        # if we are NOT using multiple output files, then open our two (and
        # only) output files here
        if not multiple_output_files:
            fasta_out_fp = get_filename_with_new_ext(fastq_fp,
                                                     '.fna',
                                                     output_directory)
            qual_out_fp = get_filename_with_new_ext(fastq_fp,
                                                    '.qual',
                                                    output_directory)
            fasta_out_f = open(fasta_out_fp, 'w')
            qual_out_f = open(qual_out_fp, 'w')
        else:
            # Per-output-path text buffers, keyed by output file path.
            # NOTE: per-sample files are opened in append mode only, so
            # files left over from an earlier run are appended to.
            fasta_out_lookup = defaultdict(str)
            qual_out_lookup = defaultdict(str)

        fastq_f = open(fastq_fp, 'U')
        for header, sequence, qual in MinimalFastqParser(fastq_f,
                                                         strict=False):
            label = header.split()[0]
            sample_id = label.split('_')[0]

            if multiple_output_files:
                fasta_out_fp = get_filename_with_new_ext(
                    fastq_fp, '_' + sample_id + '.fna', output_directory)
                qual_out_fp = get_filename_with_new_ext(
                    fastq_fp, '_' + sample_id + '.qual', output_directory)

            if full_fasta_headers:
                label = header

            # convert quality scores
            qual_scores = []
            for qual_char in qual:
                score = ord(qual_char) - ascii_increment
                if score < 0:
                    raise ValueError(
                        "Output qual scores are negative values. "
                        "Use different ascii_increment value than %s" %
                        str(ascii_increment))
                qual_scores.append(str(score))

            # write QUAL file, 60 qual scores per line
            qual_lines = ['>' + label + '\n']
            for i in range(0, len(qual_scores), 60):
                qual_lines.append(' '.join(qual_scores[i:i + 60]) + '\n')
            qual_record = ''.join(qual_lines)

            # write FASTA file
            fasta_record = '>%s\n%s\n' % (label, sequence)

            if not multiple_output_files:
                qual_out_f.write(qual_record)
                fasta_out_f.write(fasta_record)
                continue

            qual_out_lookup[qual_out_fp] += qual_record
            fasta_out_lookup[fasta_out_fp] += fasta_record

            # Flush this sample's buffers once enough text has accumulated.
            # The buffer LENGTH is compared with the threshold; comparing
            # the string itself with an int is always true on Python 2,
            # which defeated the buffering entirely.
            if len(fasta_out_lookup[fasta_out_fp]) >= per_file_buffer_size:
                append_to_file(fasta_out_fp, fasta_out_lookup[fasta_out_fp])
                fasta_out_lookup[fasta_out_fp] = ''
                append_to_file(qual_out_fp, qual_out_lookup[qual_out_fp])
                qual_out_lookup[qual_out_fp] = ''

        # with multiple output files, write out whatever is still buffered
        if multiple_output_files:
            for fasta_out_fp, records in fasta_out_lookup.items():
                if records:
                    append_to_file(fasta_out_fp, records)
            for qual_out_fp, records in qual_out_lookup.items():
                if records:
                    append_to_file(qual_out_fp, records)
    finally:
        # Close the input and, in single-file mode, the two output files,
        # whether or not the conversion succeeded.
        for handle in (fastq_f, fasta_out_f, qual_out_f):
            if handle is not None:
                handle.close()
basename = splitext(filein)[0] #make directory to store cleaned sequences in if not exists(folderout + basename): mkdir(folderout + basename) else: print "Round", basename, "already cleaned" continue currfolder = ''.join([folderout, basename, "/", basename]) print "==ROUND " + basename + "==" #convert fastq to fasta if needed if args.q: print "==Converting to FASTA==" f = open(''.join([folderout, basename, ".fasta"]), 'w') for header, seq, qual in MinimalFastqParser(folderin + filein, strict=False): f.write(''.join([">", header, '\n', seq, '\n'])) f.close() filein = ''.join([folderout, basename, ".fasta"]) print "==Cleaning input sequences==" log = open(currfolder + "-cleanup.log", 'w') log.write(''.join([ "====================\nFile in: ", folderin, filein, "\nOutput Folder: ", currfolder, "\n3' primer: ", args.ep, "\nMin length: ", str(args.l), "\nMin duplicates: ", str(args.d), "\n====================\n" ])) #parse in sequences as (header, seq) tuples
def process_fastq_single_end_read_file(fastq_read_f,
                                       fastq_barcode_f,
                                       barcode_to_sample_id,
                                       store_unassigned=False,
                                       max_bad_run_length=0,
                                       phred_quality_threshold=2,
                                       min_per_read_length_fraction=0.75,
                                       rev_comp=False,
                                       rev_comp_barcode=False,
                                       seq_max_N=0,
                                       start_seq_id=0,
                                       filter_bad_illumina_qual_digit=False,
                                       log_f=None,
                                       histogram_f=None,
                                       barcode_correction_fn=None,
                                       max_barcode_errors=1.5,
                                       strict_header_match=True,
                                       phred_to_ascii_f=None):
    """parses fastq single-end read file

    Generator. The read file and the barcode file are iterated in lockstep;
    for every read that is assigned to a sample and passes the quality
    filter, a tuple (fasta_header, sequence, quality, seq_id) is yielded,
    where seq_id counts up from start_seq_id.

    fastq_read_f: open fastq file of reads, or a list of its lines.
    fastq_barcode_f: fastq data holding the barcode read of each read.
    barcode_to_sample_id: dict mapping barcode sequence to sample id.
    store_unassigned: if True, reads whose barcode maps to no sample are
     kept under the sample id 'Unassigned' instead of being skipped.
    phred_quality_threshold: None disables the quality-character filter.
    rev_comp / rev_comp_barcode: reverse complement the read / the barcode.
    log_f, histogram_f: optional writable files; a summary log and a read
     length histogram are written to them once all input is consumed.
    phred_to_ascii_f: phred-to-character function; when None it is chosen
     from the header format of the first read.
    The remaining parameters are forwarded to correct_barcode and
    quality_filter_sequence.

    Raises FastqParseError when strict_header_match is True and a barcode
    header does not match its read header; ValueError when the quality
    filter returns an unknown result code.
    """
    header_index = 0
    sequence_index = 1
    quality_index = 2

    seq_id = start_seq_id
    # grab the first lines and then seek back to the beginning of the file
    try:
        fastq_read_f_line1 = fastq_read_f.readline()
        fastq_read_f_line2 = fastq_read_f.readline()
        fastq_read_f.seek(0)
    except AttributeError:
        # not a file object: treat the input as a sequence of lines
        fastq_read_f_line1 = fastq_read_f[0]
        fastq_read_f_line2 = fastq_read_f[1]

    # determine the version of casava that was used to generate the fastq
    # to determine how to compare header lines and decode ascii phred scores
    post_casava_v180 = is_casava_v180_or_later(fastq_read_f_line1)
    if post_casava_v180:
        check_header_match_f = check_header_match_180_or_later
        if phred_to_ascii_f is None:
            phred_to_ascii_f = phred_to_ascii33
    else:
        check_header_match_f = check_header_match_pre180
        if phred_to_ascii_f is None:
            phred_to_ascii_f = phred_to_ascii64

    # determine the last unacceptable quality character
    if phred_quality_threshold is not None:
        last_bad_quality_char = phred_to_ascii_f(phred_quality_threshold)
    else:
        # disable quality filter
        last_bad_quality_char = ''

    # compute the barcode length, if they are all the same.
    # this is useful for selecting a subset of the barcode read
    # if it's too long (e.g., for technical reasons on the sequencer)
    barcode_lengths = set(len(bc) for bc in barcode_to_sample_id)
    if len(barcode_lengths) == 1:
        barcode_length = barcode_lengths.pop()
    else:
        barcode_length = None

    # compute the minimum read length as a fraction of the length of the
    # input read. The line terminator left in place by readline() is not
    # part of the read, so it is stripped before measuring.
    min_per_read_length = min_per_read_length_fraction * \
        len(fastq_read_f_line2.rstrip('\r\n'))

    # prep data for logging
    input_sequence_count = 0
    count_barcode_not_in_map = 0
    count_too_short = 0
    count_too_many_N = 0
    count_bad_illumina_qual_digit = 0
    count_barcode_errors_exceed_max = 0
    sequence_lengths = []
    seqs_per_sample_counts = {}

    for bc_data, read_data in izip(
            MinimalFastqParser(fastq_barcode_f, strict=False),
            MinimalFastqParser(fastq_read_f, strict=False)):
        input_sequence_count += 1

        # Confirm match between barcode and read headers
        if strict_header_match and \
                (not check_header_match_f(bc_data[header_index],
                                          read_data[header_index])):
            raise FastqParseError(
                "Headers of barcode and read do not match. Can't continue. "
                "Confirm that the barcode fastq and read fastq that you are "
                "passing match one another.")
        else:
            header = read_data[header_index]

        # Grab the barcode sequence
        if barcode_length:
            # because thirteen cycles are sometimes used for
            # technical reasons, this step looks only at the
            # first twelve bases. note that the barcode is
            # rev-comp'ed after this step if requested since
            # the thirteenth base is a technical artefact, not
            # barcode sequence.
            barcode = bc_data[sequence_index][:barcode_length]
        else:
            barcode = bc_data[sequence_index]
        if rev_comp_barcode:
            barcode = DNA.rc(barcode)

        # Grab the read sequence and the read quality
        sequence = read_data[sequence_index]
        quality = read_data[quality_index]

        # correct the barcode (if applicable) and map to sample id
        num_barcode_errors, corrected_barcode, correction_attempted, \
            sample_id = correct_barcode(barcode,
                                        barcode_to_sample_id,
                                        barcode_correction_fn)

        # skip samples with too many errors
        if num_barcode_errors > max_barcode_errors:
            count_barcode_errors_exceed_max += 1
            continue

        # skip unassignable samples unless otherwise requested
        if sample_id is None:
            if not store_unassigned:
                count_barcode_not_in_map += 1
                continue
            else:
                sample_id = 'Unassigned'

        quality_filter_result, sequence, quality = \
            quality_filter_sequence(header,
                                    sequence,
                                    quality,
                                    max_bad_run_length,
                                    last_bad_quality_char,
                                    min_per_read_length,
                                    seq_max_N,
                                    filter_bad_illumina_qual_digit)

        # process quality result
        if quality_filter_result != 0:
            # if the quality filter didn't pass record why and
            # move on to the next record
            if quality_filter_result == 1:
                count_too_short += 1
            elif quality_filter_result == 2:
                count_too_many_N += 1
            elif quality_filter_result == 3:
                count_bad_illumina_qual_digit += 1
            else:
                raise ValueError(
                    "Unknown quality filter result: %d" %
                    quality_filter_result)
            continue

        sequence_lengths.append(len(sequence))
        seqs_per_sample_counts[sample_id] = \
            seqs_per_sample_counts.get(sample_id, 0) + 1

        if rev_comp:
            sequence = DNA.rc(sequence)
            quality = quality[::-1]

        fasta_header = '%s_%s %s orig_bc=%s new_bc=%s bc_diffs=%d' % \
            (sample_id, seq_id, header, barcode,
             corrected_barcode, num_barcode_errors)
        yield fasta_header, sequence, quality, seq_id
        seq_id += 1

    # Add sample IDs with zero counts to dictionary for logging
    for curr_sample_id in barcode_to_sample_id.values():
        seqs_per_sample_counts.setdefault(curr_sample_id, 0)

    if log_f is not None:
        log_str = format_split_libraries_fastq_log(
            count_barcode_not_in_map,
            count_too_short,
            count_too_many_N,
            count_bad_illumina_qual_digit,
            count_barcode_errors_exceed_max,
            input_sequence_count,
            sequence_lengths,
            seqs_per_sample_counts)
        log_f.write(log_str)

    if len(sequence_lengths) and histogram_f is not None:
        counts, bin_edges = make_histograms(sequence_lengths)
        histogram_str = format_histogram_one_count(counts, bin_edges)
        histogram_f.write(histogram_str)
        histogram_f.write('\n--\n\n')
def extract_barcodes(fastq1,
                     fastq2=None,
                     output_dir=".",
                     input_type="barcode_single_end",
                     bc1_len=6,
                     bc2_len=6,
                     rev_comp_bc1=False,
                     rev_comp_bc2=False,
                     char_delineator=":",
                     switch_bc_order=False,
                     map_fp=None,
                     attempt_read_orientation=False,
                     disable_header_match=False):
    """ Main program function for extracting barcodes from reads

    fastq1: Open fastq file 1.
    fastq2: None or open fastq file 2.
    output_dir: Directory to write output parses sequences to.
    input_type: Specifies the type of parsing to be done.
    bc1_len: Length of barcode 1 to be parsed from fastq1
    bc2_len: Length of barcode 2 to be parsed from fastq2, or from end of a
     stitched read.
    rev_comp_bc1: If True, reverse complement bc1 before writing.
    rev_comp_bc2: If True, reverse complement bc2 before writing.
    char_delineator: Specify character that immediately precedes the barcode
        for input_type of barcode_in_label.
    switch_bc_order: Normally, barcode 1 will be written first, followed by
        barcode 2 in a combined output fastq file. If True, the order will be
        reversed. Only applies to stitched reads processing, as other barcode
        orders are dictated by the parameter chosen for the fastq files.
    map_fp: open file object of mapping file, requires a LinkerPrimerSequence
        and ReversePrimer field to be present. Used for orienting reads.
    attempt_read_orientation: If True, will attempt to orient the reads
        according to the forward primers in the mapping file. If primer is
        detected in current orientation, leave the read as is, but if reverse
        complement is detected (or ReversePrimer is detected in the current
        orientation) the read will either be written to the forward (read 1)
        or reverse (read 2) reads for the case of paired files, or the read
        will be reverse complemented in the case of stitched reads.
    disable_header_match: if True, suppresses checks between fastq headers.

    Output files are written with an '.incomplete' suffix and renamed to
    their final names only after all reads have been processed.

    Raises FastqParseError if header checks are enabled and the headers of
    a read pair do not match.
    """

    # Turn off extra file creation for single read.
    if input_type == "barcode_single_end" and attempt_read_orientation:
        attempt_read_orientation = False

    if attempt_read_orientation:
        header, mapping_data, run_description, errors, warnings = \
            process_id_map(map_fp)
        forward_primers, reverse_primers = get_primers(header, mapping_data)
        output_bc_not_oriented = open(
            join(output_dir, "barcodes_not_oriented.fastq.incomplete"), "w")
        fastq1_out_not_oriented = open(
            join(output_dir, "reads1_not_oriented.fastq.incomplete"), "w")
        fastq2_out_not_oriented = open(
            join(output_dir, "reads2_not_oriented.fastq.incomplete"), "w")
    else:
        forward_primers = None
        reverse_primers = None
        output_bc_not_oriented = None
        fastq1_out_not_oriented = None
        fastq2_out_not_oriented = None

    output_bc_fastq = open(join(output_dir, "barcodes.fastq.incomplete"), "w")
    if input_type in ["barcode_single_end", "barcode_paired_stitched"]:
        output_fastq1 = open(join(output_dir, "reads.fastq.incomplete"), "w")
        output_fastq2 = None
        final_fastq1_name = join(output_dir, "reads.fastq")
    elif input_type in ["barcode_paired_end"]:
        output_fastq1 = open(join(output_dir, "reads1.fastq.incomplete"), "w")
        output_fastq2 = open(join(output_dir, "reads2.fastq.incomplete"), "w")
        final_fastq1_name = join(output_dir, "reads1.fastq")
    else:
        # barcode_in_label (or any other type): only barcodes are written
        output_fastq1 = None
        output_fastq2 = None
        final_fastq1_name = None

    if not fastq2:
        # No second file: supply an endless placeholder so that the paired
        # iteration below still works.
        fastq2 = cycle(["@", "AAAAAAAAAAAA", "+", "bbbbbbbbbbbb"])
        not_paired = True
    else:
        not_paired = False

    check_header_match_f = get_casava_version(fastq1)

    header_index = 0

    for read1_data, read2_data in izip(
            MinimalFastqParser(fastq1, strict=False),
            MinimalFastqParser(fastq2, strict=False)):
        if not disable_header_match:
            if not check_header_match_f(read1_data[header_index],
                                        read2_data[header_index]):
                raise FastqParseError(
                    "Headers of read1 and read2 do not match. Can't continue. "
                    "Confirm that the fastq sequences that you are "
                    "passing match one another. --disable_header_match can be "
                    "used to suppress header checks.")

        if input_type == "barcode_single_end":
            process_barcode_single_end_data(read1_data, output_bc_fastq,
                                            output_fastq1, bc1_len,
                                            rev_comp_bc1)

        elif input_type == "barcode_paired_end":
            process_barcode_paired_end_data(
                read1_data, read2_data, output_bc_fastq, output_fastq1,
                output_fastq2, bc1_len, bc2_len, rev_comp_bc1, rev_comp_bc2,
                attempt_read_orientation, forward_primers, reverse_primers,
                output_bc_not_oriented, fastq1_out_not_oriented,
                fastq2_out_not_oriented)

        elif input_type == "barcode_paired_stitched":
            process_barcode_paired_stitched(
                read1_data, output_bc_fastq, output_fastq1, bc1_len, bc2_len,
                rev_comp_bc1, rev_comp_bc2, attempt_read_orientation,
                forward_primers, reverse_primers, output_bc_not_oriented,
                fastq1_out_not_oriented, switch_bc_order)

        elif input_type == "barcode_in_label":
            if not_paired:
                curr_read2_data = False
            else:
                curr_read2_data = read2_data
            process_barcode_in_label(read1_data, curr_read2_data,
                                     output_bc_fastq, bc1_len, bc2_len,
                                     rev_comp_bc1, rev_comp_bc2,
                                     char_delineator)

    output_bc_fastq.close()
    rename(output_bc_fastq.name, join(output_dir, "barcodes.fastq"))
    if output_fastq1:
        output_fastq1.close()
        rename(output_fastq1.name, final_fastq1_name)
    if output_fastq2:
        output_fastq2.close()
        rename(output_fastq2.name, join(output_dir, "reads2.fastq"))

    # The not-oriented outputs must be closed before they are renamed:
    # otherwise buffered data may never reach disk, and renaming a file
    # that is still open fails on some platforms.
    not_oriented_outputs = (
        (output_bc_not_oriented, "barcodes_not_oriented.fastq"),
        (fastq1_out_not_oriented, "reads1_not_oriented.fastq"),
        (fastq2_out_not_oriented, "reads2_not_oriented.fastq"))
    for out_f, final_name in not_oriented_outputs:
        if out_f:
            out_f.close()
            rename(out_f.name, join(output_dir, final_name))
These contain information on the read number, so can differ """ header1 = header1.split(':') header2 = header2.split(':') for e1, e2 in zip(header1, header2): if e1.split(' ')[0] != e2.split(' ')[0]: return False return True print "Printing labels that do not match before space character:" mismatched_labels_found = False for read1_data, read2_data in izip(MinimalFastqParser(read1, strict=False), MinimalFastqParser(read2, strict=False)): if not check_header_match_180_or_later(read1_data[header_index], read2_data[header_index]): print "Mismatched labels: %s, %s" % (read1_data[header_index], read2_data[header_index]) mismatched_labels_found = True if not (len(read1_data[sequence_index]) == len(read1_data[quality_index])): print "Sequence and quality score lengths do not match for read 1 label %s " % read1_data[ header_index] if not (len(read2_data[sequence_index]) == len(read2_data[quality_index])): print "Sequence and quality score lengths do not match for read 2 label %s " % read2_data[ header_index] if not mismatched_labels_found:
def test_parse(self):
    """every parsed record should agree with the expected sequence data"""
    parsed_records = MinimalFastqParser('data/fastq.txt')
    for record in parsed_records:
        label, seq, qual = record
        # the label must be one of the known records ...
        self.assertTrue(label in data)
        # ... and its sequence and quality must match what is expected
        expected = data[label]
        self.assertEqual(seq, expected["seq"])
        self.assertEqual(qual, expected["qual"])
def convert_fastaqual(fasta_file_path,
                      output_directory='.',
                      multiple_output_files=False,
                      ascii_increment=33,
                      full_fastq=False,
                      full_fasta_headers=False):
    '''Takes a FASTQfile, generates FASTA and QUAL file(s)

    fasta_file_path:  filepath of input FASTQ file.
    output_directory:  Directory to output converted files.
    multiple_output_files:  Make one file per SampleID.
    ascii_increment:  Conversion value for fastq ascii character to numeric
     quality score.
    full_fastq:  Write labels to both sequence and quality score lines
     (not used by this function).
    full_fasta_headers:  Retain all data on fasta label, instead of breaking
     at first whitespace.

    Output files are named after the input file, with the extension
    replaced by '.fna' / '.qual' (preceded by '_<SampleID>' when
    multiple_output_files is True). Each output file is created afresh the
    first time it is written during a call and appended to afterwards.

    Raises KeyError if a record's sequence and quality string differ in
    length, ValueError if a quality character maps to a negative score,
    and IOError if an output file cannot be opened.'''

    base_name = path.splitext(path.split(fasta_file_path)[1])[0]

    # Output paths already written during this call. Need to open a file
    # the first time as "w", thereafter open as "a".
    written_paths = set()

    def write_record(out_fp, record, open_error_message):
        # Open, write and close for every record, so that the number of
        # open handles stays constant however many samples there are.
        if out_fp in written_paths:
            mode = 'a'
        else:
            mode = 'w'
        try:
            out_f = open(out_fp, mode)
        except IOError:
            raise IOError(open_error_message)
        try:
            out_f.write(record)
        finally:
            out_f.close()
        written_paths.add(out_fp)

    fastq_file = open(fasta_file_path, 'U')
    try:
        for header, sequence, qual in MinimalFastqParser(fastq_file,
                                                         strict=False):
            label = header.split()[0]
            sample_id = label.split('_')[0]

            if len(sequence) != len(qual):
                raise KeyError(
                    "Number of quality scores "
                    "(%d) does not match number of positions (%d) "
                    "for label: %s" % (len(qual), len(sequence), label))

            if multiple_output_files:
                file_stem = base_name + '_' + sample_id
                output_fasta = path.join(output_directory,
                                         file_stem + '.fna')
                output_qual = path.join(output_directory,
                                        file_stem + '.qual')
                fasta_error = ("Could not open output FASTA file: %s" %
                               output_fasta + '\n')
                qual_error = ("Could not open QUAL file for writing: %s" %
                              output_qual + '\n')
            else:
                output_fasta = path.join(output_directory,
                                         base_name + '.fna')
                output_qual = path.join(output_directory,
                                        base_name + '.qual')
                fasta_error = qual_error = (
                    "Could not open output FASTA or QUAL files, "
                    "please check file permissions.")

            if full_fasta_headers:
                label = header

            # convert quality scores; done before anything is written so
            # that an invalid record leaves no partial output behind
            qual_scores = []
            for qual_char in qual:
                score = ord(qual_char) - ascii_increment
                if score < 0:
                    raise ValueError(
                        "Output qual scores are negative values. "
                        "Use different ascii_increment value than %s" %
                        str(ascii_increment))
                qual_scores.append(str(score))

            # QUAL record: label line, then the scores separated by single
            # spaces, 60 per line
            score_lines = [' '.join(qual_scores[i:i + 60])
                           for i in range(0, len(qual_scores), 60)]
            qual_record = '>' + label + '\n' + '\n'.join(score_lines) + '\n'

            # write Fasta file
            write_record(output_fasta,
                         '>' + label + '\n' + sequence + '\n',
                         fasta_error)
            # write QUAL file
            write_record(output_qual, qual_record, qual_error)
    finally:
        fastq_file.close()