def get_casava_version(fastq1):
    """Return the header-match function appropriate for a fastq's CASAVA version.

    NOTE: despite the name, this does not return a version number. It inspects
    the first header line of the fastq to decide whether the file was produced
    by CASAVA >= 1.8 and returns the corresponding header-comparison function
    (check_header_match_180_or_later or check_header_match_pre180), since the
    header format changed at 1.8.

    fastq1: open fastq file object, or a list of fastq lines
    """
    # Peek at the first line without consuming input: list input is indexed
    # directly; file input is read, then rewound to the start.
    if isinstance(fastq1, list):
        first_header_line = fastq1[0]
    else:
        first_header_line = fastq1.readline()
        fastq1.seek(0)

    # CASAVA >= 1.8 changed the fastq header layout, so header comparison
    # must be version-aware.
    if is_casava_v180_or_later(first_header_line):
        return check_header_match_180_or_later
    return check_header_match_pre180
def process_fastq_single_end_read_file(fastq_read_f,
                                       fastq_barcode_f,
                                       barcode_to_sample_id,
                                       store_unassigned=False,
                                       max_bad_run_length=0,
                                       phred_quality_threshold=2,
                                       min_per_read_length_fraction=0.75,
                                       rev_comp=False,
                                       rev_comp_barcode=False,
                                       seq_max_N=0,
                                       start_seq_id=0,
                                       filter_bad_illumina_qual_digit=False,
                                       log_f=None,
                                       histogram_f=None,
                                       barcode_correction_fn=None,
                                       max_barcode_errors=1.5,
                                       strict_header_match=True,
                                       phred_to_ascii_f=None):
    """Demultiplex and quality-filter a single-end fastq read file (generator).

    Iterates the read fastq and the barcode fastq in lockstep, maps each
    barcode to a sample id, applies quality filtering, and yields
    (fasta_header, sequence, quality, seq_id) tuples for reads that pass.

    NOTE(review): this file contains a second, later definition of this same
    function (taking ``phred_offset`` instead of ``phred_to_ascii_f``) which
    shadows this one at import time.

    fastq_read_f / fastq_barcode_f: open file objects or lists of fastq lines
    barcode_to_sample_id: dict mapping barcode sequence -> sample id
    store_unassigned: if True, reads whose barcode maps to no sample are
        yielded under the sample id 'Unassigned' instead of being dropped
    max_bad_run_length, seq_max_N, filter_bad_illumina_qual_digit: forwarded
        to quality_filter_sequence (semantics defined there)
    phred_quality_threshold: phred score used to derive the last unacceptable
        quality character; None disables quality-character filtering
    min_per_read_length_fraction: minimum read length to retain, as a
        fraction of the input read length
    rev_comp: if True, reverse-complement the sequence (and reverse the
        quality string) before yielding
    rev_comp_barcode: if True, reverse-complement the barcode before lookup
    start_seq_id: integer id assigned to the first yielded read
    log_f / histogram_f: optional writable file objects for run statistics
    barcode_correction_fn: forwarded to correct_barcode for error correction
    max_barcode_errors: reads whose barcode required more corrections than
        this are skipped
    strict_header_match: if True, raise FastqParseError when a barcode
        header and read header do not match
    phred_to_ascii_f: phred-int -> ascii-char function; autodetected from
        the CASAVA version of the first header line when None

    Raises FastqParseError on barcode/read header mismatch and ValueError on
    an unknown quality-filter result code.
    """
    # indexes into the (header, sequence, quality) records produced by
    # MinimalFastqParser
    header_index = 0
    sequence_index = 1
    quality_index = 2
    seq_id = start_seq_id
    # grab the first lines and then seek back to the beginning of the file
    # (list input has no readline/seek, hence the AttributeError fallback)
    try:
        fastq_read_f_line1 = fastq_read_f.readline()
        fastq_read_f_line2 = fastq_read_f.readline()
        fastq_read_f.seek(0)
    except AttributeError:
        fastq_read_f_line1 = fastq_read_f[0]
        fastq_read_f_line2 = fastq_read_f[1]

    # determine the version of casava that was used to generate the fastq
    # to determine how to compare header lines and decode ascii phred scores
    post_casava_v180 = is_casava_v180_or_later(fastq_read_f_line1)
    if post_casava_v180:
        check_header_match_f = check_header_match_180_or_later
        if phred_to_ascii_f == None:
            # CASAVA >= 1.8 encodes phred scores with the +33 offset
            phred_to_ascii_f = phred_to_ascii33
    else:
        check_header_match_f = check_header_match_pre180
        if phred_to_ascii_f == None:
            # pre-1.8 CASAVA uses the +64 offset
            phred_to_ascii_f = phred_to_ascii64

    # determine the last unacceptable quality character
    if phred_quality_threshold != None:
        last_bad_quality_char = phred_to_ascii_f(phred_quality_threshold)
    else:
        # disable quality filter
        last_bad_quality_char = ''

    # compute the barcode length, if they are all the same.
    # this is useful for selecting a subset of the barcode read
    # if it's too long (e.g., for technical reasons on the sequencer)
    barcode_lengths = set([len(bc) for bc, sid in barcode_to_sample_id.items()])
    if len(barcode_lengths) == 1:
        barcode_length = barcode_lengths.pop()
    else:
        # mixed barcode lengths: use each barcode read in full
        barcode_length = None

    # compute the minimum read length as a fraction of the length of the
    # input read (NOTE(review): fastq_read_f_line2 likely still carries its
    # trailing newline here, so this length is off by one — confirm intended)
    min_per_read_length = min_per_read_length_fraction * len(fastq_read_f_line2)

    # prep data for logging
    input_sequence_count = 0
    count_barcode_not_in_map = 0
    count_too_short = 0
    count_too_many_N = 0
    count_bad_illumina_qual_digit = 0
    count_barcode_errors_exceed_max = 0
    sequence_lengths = []
    seqs_per_sample_counts = {}

    # walk the barcode file and the read file in lockstep; records are
    # assumed to be in the same order in both files
    for bc_data,read_data in izip(MinimalFastqParser(fastq_barcode_f,strict=False),
                                  MinimalFastqParser(fastq_read_f,strict=False)):
        input_sequence_count += 1
        # Confirm match between barcode and read headers
        if strict_header_match and \
           (not check_header_match_f(bc_data[header_index],read_data[header_index])):
            raise FastqParseError,\
                ("Headers of barcode and read do not match. Can't continue. "
                 "Confirm that the barcode fastq and read fastq that you are "
                 "passing match one another.")
        else:
            header = read_data[header_index]

        # Grab the barcode sequence
        if barcode_length:
            # because thirteen cycles are sometimes used for
            # techical reasons, this step looks only at the
            # first tweleve bases. note that the barcode is
            # rev-comp'ed after this step if requested since
            # the thirteen base is a technical artefact, not
            # barcode sequence.
            barcode = bc_data[sequence_index][:barcode_length]
        else:
            barcode = bc_data[sequence_index]
        if rev_comp_barcode:
            barcode = DNA.rc(barcode)

        # Grab the read sequence
        sequence = read_data[1]
        # Grab the read quality
        quality = read_data[2]

        # correct the barcode (if applicable) and map to sample id
        num_barcode_errors, corrected_barcode, correction_attempted, sample_id = \
            correct_barcode(barcode,barcode_to_sample_id,barcode_correction_fn)
        # skip samples with too many errors
        if (num_barcode_errors > max_barcode_errors):
            count_barcode_errors_exceed_max += 1
            continue

        # skip unassignable samples unless otherwise requested
        if sample_id == None:
            if not store_unassigned:
                count_barcode_not_in_map += 1
                continue
            else:
                sample_id = 'Unassigned'

        # quality_filter_sequence may also trim the sequence/quality,
        # hence the rebinding of both here
        quality_filter_result, sequence, quality =\
            quality_filter_sequence(header,
                                    sequence,
                                    quality,
                                    max_bad_run_length,
                                    last_bad_quality_char,
                                    min_per_read_length,
                                    seq_max_N,
                                    filter_bad_illumina_qual_digit)

        # process quality result
        if quality_filter_result != 0:
            # if the quality filter didn't pass record why and
            # move on to the next record
            if quality_filter_result == 1:
                count_too_short += 1
            elif quality_filter_result == 2:
                count_too_many_N += 1
            elif quality_filter_result == 3:
                count_bad_illumina_qual_digit += 1
            else:
                raise ValueError,\
                    "Unknown quality filter result: %d" % quality_filter_result
            continue

        sequence_lengths.append(len(sequence))

        try:
            seqs_per_sample_counts[sample_id] += 1
        except KeyError:
            seqs_per_sample_counts[sample_id] = 1

        if rev_comp:
            sequence = DNA.rc(sequence)
            # reverse (not complement) the quality string to match
            quality = quality[::-1]

        fasta_header = '%s_%s %s orig_bc=%s new_bc=%s bc_diffs=%d' %\
            (sample_id,seq_id,header,barcode,corrected_barcode,num_barcode_errors)
        yield fasta_header, sequence, quality, seq_id
        seq_id += 1

    # Add sample IDs with zero counts to dictionary for logging
    for curr_sample_id in barcode_to_sample_id.values():
        if curr_sample_id not in seqs_per_sample_counts.keys():
            seqs_per_sample_counts[curr_sample_id] = 0

    if log_f != None:
        log_str = format_split_libraries_fastq_log(count_barcode_not_in_map,
                                                   count_too_short,
                                                   count_too_many_N,
                                                   count_bad_illumina_qual_digit,
                                                   count_barcode_errors_exceed_max,
                                                   input_sequence_count,
                                                   sequence_lengths,
                                                   seqs_per_sample_counts)
        log_f.write(log_str)

    if len(sequence_lengths) and histogram_f != None:
        counts, bin_edges = make_histograms(sequence_lengths)
        histogram_str = format_histogram_one_count(counts,bin_edges)
        histogram_f.write(histogram_str)
        histogram_f.write('\n--\n\n')
def process_fastq_single_end_read_file(fastq_read_f,
                                       fastq_barcode_f,
                                       barcode_to_sample_id,
                                       store_unassigned=False,
                                       max_bad_run_length=0,
                                       phred_quality_threshold=2,
                                       min_per_read_length_fraction=0.75,
                                       rev_comp=False,
                                       rev_comp_barcode=False,
                                       seq_max_N=0,
                                       start_seq_id=0,
                                       filter_bad_illumina_qual_digit=False,
                                       log_f=None,
                                       histogram_f=None,
                                       barcode_correction_fn=None,
                                       max_barcode_errors=1.5,
                                       strict_header_match=True,
                                       phred_offset=None):
    """Demultiplex and quality-filter a single-end fastq read file (generator).

    Iterates the read fastq and the barcode fastq in lockstep, maps each
    barcode to a sample id, applies quality filtering, and yields
    (fasta_header, sequence, quality, seq_id) tuples for reads that pass.

    fastq_read_f / fastq_barcode_f: open file objects or lists of fastq lines
    barcode_to_sample_id: dict mapping barcode sequence -> sample id
    store_unassigned: if True, reads whose barcode maps to no sample are
        yielded under the sample id 'Unassigned' instead of being dropped
    max_bad_run_length, phred_quality_threshold, seq_max_N,
    filter_bad_illumina_qual_digit: forwarded to quality_filter_sequence
        (semantics defined there)
    min_per_read_length_fraction: minimum read length to retain, as a
        fraction of the input read length
    rev_comp: if True, reverse-complement the sequence (and reverse the
        quality) before yielding
    rev_comp_barcode: if True, reverse-complement the barcode before lookup
    start_seq_id: integer id assigned to the first yielded read
    log_f / histogram_f: optional writable file objects for run statistics
    barcode_correction_fn: forwarded to correct_barcode for error correction
    max_barcode_errors: reads whose barcode required more corrections than
        this are skipped
    strict_header_match: if True, raise FastqParseError when a barcode
        header and read header do not match
    phred_offset: 33 or 64; autodetected from the CASAVA version of the
        first header line when None

    Raises FastqParseError on barcode/read header mismatch; ValueError on an
    invalid phred offset or an unknown quality-filter result code.
    """
    # indexes into the (header, sequence, quality) records from parse_fastq
    header_index = 0
    sequence_index = 1
    quality_index = 2
    seq_id = start_seq_id

    # grab the first two lines and then seek back to the beginning of the
    # file (list input has no readline/seek, hence the AttributeError fallback)
    try:
        fastq_read_f_line1 = fastq_read_f.readline()
        fastq_read_f_line2 = fastq_read_f.readline()
        fastq_read_f.seek(0)
    except AttributeError:
        fastq_read_f_line1 = fastq_read_f[0]
        fastq_read_f_line2 = fastq_read_f[1]

    if phred_offset is None:
        # CASAVA >= 1.8 writes phred+33; earlier versions write phred+64
        post_casava_v180 = is_casava_v180_or_later(fastq_read_f_line1)
        if post_casava_v180:
            phred_offset = 33
        else:
            phred_offset = 64

    # the header format changed at CASAVA 1.8 along with the phred offset,
    # so the offset also selects the header-comparison function
    if phred_offset == 33:
        check_header_match_f = check_header_match_180_or_later
    elif phred_offset == 64:
        check_header_match_f = check_header_match_pre180
    else:
        raise ValueError("Invalid PHRED offset: %d" % phred_offset)

    # compute the barcode length, if they are all the same.
    # this is useful for selecting a subset of the barcode read
    # if it's too long (e.g., for technical reasons on the sequencer)
    barcode_lengths = {len(bc) for bc in barcode_to_sample_id}
    if len(barcode_lengths) == 1:
        barcode_length = barcode_lengths.pop()
    else:
        # mixed barcode lengths: use each barcode read in full
        barcode_length = None

    # compute the minimum read length as a fraction of the length of the
    # input read (NOTE(review): fastq_read_f_line2 likely still carries its
    # trailing newline here, so this length is off by one — confirm intended)
    min_per_read_length = min_per_read_length_fraction * \
        len(fastq_read_f_line2)

    # prep data for logging
    input_sequence_count = 0
    count_barcode_not_in_map = 0
    count_too_short = 0
    count_too_many_N = 0
    count_bad_illumina_qual_digit = 0
    count_barcode_errors_exceed_max = 0
    sequence_lengths = []
    seqs_per_sample_counts = {}

    # walk the barcode file and the read file in lockstep; records are
    # assumed to be in the same order in both files
    for bc_data, read_data in izip(
            parse_fastq(fastq_barcode_f, strict=False,
                        phred_offset=phred_offset),
            parse_fastq(fastq_read_f, strict=False,
                        phred_offset=phred_offset)):
        input_sequence_count += 1
        # Confirm match between barcode and read headers
        if strict_header_match and \
           (not check_header_match_f(bc_data[header_index],
                                     read_data[header_index])):
            raise FastqParseError(
                "Headers of barcode and read do not match. Can't continue. "
                "Confirm that the barcode fastq and read fastq that you are "
                "passing match one another.")
        else:
            header = read_data[header_index]

        # Grab the barcode sequence
        if barcode_length:
            # because thirteen cycles are sometimes used for
            # techical reasons, this step looks only at the
            # first tweleve bases. note that the barcode is
            # rev-comp'ed after this step if requested since
            # the thirteen base is a technical artefact, not
            # barcode sequence.
            barcode = bc_data[sequence_index][:barcode_length]
        else:
            barcode = bc_data[sequence_index]
        if rev_comp_barcode:
            barcode = str(DNA(barcode).rc())

        # Grab the read sequence and quality (use the declared index
        # constants rather than magic literals)
        sequence = read_data[sequence_index]
        quality = read_data[quality_index]

        # correct the barcode (if applicable) and map to sample id
        num_barcode_errors, corrected_barcode, correction_attempted, sample_id = \
            correct_barcode(
                barcode, barcode_to_sample_id, barcode_correction_fn)
        # skip samples with too many errors
        if num_barcode_errors > max_barcode_errors:
            count_barcode_errors_exceed_max += 1
            continue

        # skip unassignable samples unless otherwise requested
        if sample_id is None:
            if not store_unassigned:
                count_barcode_not_in_map += 1
                continue
            else:
                sample_id = 'Unassigned'

        # quality_filter_sequence may also trim the sequence/quality,
        # hence the rebinding of both here
        quality_filter_result, sequence, quality =\
            quality_filter_sequence(header,
                                    sequence,
                                    quality,
                                    max_bad_run_length,
                                    phred_quality_threshold,
                                    min_per_read_length,
                                    seq_max_N,
                                    filter_bad_illumina_qual_digit)

        # process quality result
        if quality_filter_result != 0:
            # if the quality filter didn't pass record why and
            # move on to the next record
            if quality_filter_result == 1:
                count_too_short += 1
            elif quality_filter_result == 2:
                count_too_many_N += 1
            elif quality_filter_result == 3:
                count_bad_illumina_qual_digit += 1
            else:
                raise ValueError("Unknown quality filter result: %d" %
                                 quality_filter_result)
            continue

        sequence_lengths.append(len(sequence))

        try:
            seqs_per_sample_counts[sample_id] += 1
        except KeyError:
            seqs_per_sample_counts[sample_id] = 1

        if rev_comp:
            sequence = str(DNA(sequence).rc())
            # reverse (not complement) the quality to match
            quality = quality[::-1]

        fasta_header = '%s_%s %s orig_bc=%s new_bc=%s bc_diffs=%d' %\
            (sample_id, seq_id, header, barcode,
             corrected_barcode, num_barcode_errors)
        yield fasta_header, sequence, quality, seq_id
        seq_id += 1

    # Add sample IDs with zero counts to dictionary for logging
    # (membership test on the dict itself; no need for .keys())
    for curr_sample_id in barcode_to_sample_id.values():
        if curr_sample_id not in seqs_per_sample_counts:
            seqs_per_sample_counts[curr_sample_id] = 0

    if log_f is not None:
        log_str = format_split_libraries_fastq_log(
            count_barcode_not_in_map,
            count_too_short,
            count_too_many_N,
            count_bad_illumina_qual_digit,
            count_barcode_errors_exceed_max,
            input_sequence_count,
            sequence_lengths,
            seqs_per_sample_counts)
        log_f.write(log_str)

    if len(sequence_lengths) and histogram_f is not None:
        counts, bin_edges = make_histograms(sequence_lengths)
        histogram_str = format_histogram_one_count(counts, bin_edges)
        histogram_f.write(histogram_str)
        histogram_f.write('\n--\n\n')