def process_uclust_pw_alignment_results(fasta_pairs_lines,uc_lines): """ Process results of uclust search and align """ alignments = get_next_two_fasta_records(fasta_pairs_lines) for hit in get_next_record_type(uc_lines,'H'): matching_strand = hit[4] if matching_strand == '-': strand_id = '-' target_rev_match = True elif matching_strand == '+': strand_id = '+' target_rev_match = False elif matching_strand == '.': # protein sequence, so no strand information strand_id = '' target_rev_match = False else: raise UclustParseError, "Unknown strand type: %s" % matching_strand uc_query_id = hit[8] uc_target_id = hit[9] percent_id = float(hit[3]) fasta_pair = alignments.next() fasta_query_id = fasta_pair[0][0] aligned_query = fasta_pair[0][1] if fasta_query_id != uc_query_id: raise UclustParseError,\ "Order of fasta and uc files do not match."+\ " Got query %s but expected %s." %\ (fasta_query_id, uc_query_id) fasta_target_id = fasta_pair[1][0] aligned_target = fasta_pair[1][1] if fasta_target_id != uc_target_id + strand_id: raise UclustParseError, \ "Order of fasta and uc files do not match."+\ " Got target %s but expected %s." %\ (fasta_target_id, uc_target_id + strand_id) if target_rev_match: query_id = uc_query_id + ' RC' aligned_query = DNA.rc(aligned_query) target_id = uc_target_id aligned_target = DNA.rc(aligned_target) else: query_id = uc_query_id aligned_query = aligned_query target_id = uc_target_id aligned_target = aligned_target yield (query_id, target_id, aligned_query, aligned_target, percent_id)
def process_uclust_pw_alignment_results(fasta_pairs_lines, uc_lines): """ Process results of uclust search and align """ alignments = get_next_two_fasta_records(fasta_pairs_lines) for hit in get_next_record_type(uc_lines, 'H'): matching_strand = hit[4] if matching_strand == '-': strand_id = '-' target_rev_match = True elif matching_strand == '+': strand_id = '+' target_rev_match = False elif matching_strand == '.': # protein sequence, so no strand information strand_id = '' target_rev_match = False else: raise UclustParseError, "Unknown strand type: %s" % matching_strand uc_query_id = hit[8] uc_target_id = hit[9] percent_id = float(hit[3]) fasta_pair = alignments.next() fasta_query_id = fasta_pair[0][0] aligned_query = fasta_pair[0][1] if fasta_query_id != uc_query_id: raise UclustParseError,\ "Order of fasta and uc files do not match."+\ " Got query %s but expected %s." %\ (fasta_query_id, uc_query_id) fasta_target_id = fasta_pair[1][0] aligned_target = fasta_pair[1][1] if fasta_target_id != uc_target_id + strand_id: raise UclustParseError, \ "Order of fasta and uc files do not match."+\ " Got target %s but expected %s." %\ (fasta_target_id, uc_target_id + strand_id) if target_rev_match: query_id = uc_query_id + ' RC' aligned_query = DNA.rc(aligned_query) target_id = uc_target_id aligned_target = DNA.rc(aligned_target) else: query_id = uc_query_id aligned_query = aligned_query target_id = uc_target_id aligned_target = aligned_target yield (query_id, target_id, aligned_query, aligned_target, percent_id)
def parse_illumina_single_end_read_file(read_file,barcode_length,\ max_bad_run_length,quality_threshold,min_per_read_length, rev_comp,rev_comp_barcode,barcode_in_seq,barcode_max_N=0,seq_max_N=0): """Parses Illumina single-end read file """ for read_line in read_file: read = parse_illumina_line(read_line,barcode_length, rev_comp_barcode,barcode_in_seq) read_desc = illumina_read_description_from_read_data(read) read_barcode = read['Barcode'] if read_barcode.count('N') > barcode_max_N: continue seq, qual = read_qual_score_filter(\ read['Sequence'], read['Quality Score'],\ max_bad_run_length, quality_threshold) if (len(seq) < min_per_read_length) or (seq.count('N') > seq_max_N): continue if rev_comp: seq = DNA.rc(seq) qual = qual[::-1] yield read_desc, read_barcode, seq, qual
def process_barcode_single_end_data(read1_data, output_bc_fastq, output_fastq1, bc1_len=6, rev_comp_bc1=False): """ Processes, writes single-end barcode data, parsed sequence read1_data: list of header, read, quality scores output_bc_fastq: open output fastq filepath output_fastq1: open output fastq reads filepath bc1_len: length of barcode to remove from beginning of data rev_comp_bc1: reverse complement barcode before writing. """ header_index = 0 sequence_index = 1 quality_index = 2 bc_read = read1_data[sequence_index][:bc1_len] bc_qual = read1_data[quality_index][:bc1_len] if rev_comp_bc1: bc_read = DNA.rc(bc_read) bc_qual = bc_qual[::-1] bc_lines = format_fastq_record(read1_data[header_index], bc_read, bc_qual) output_bc_fastq.write(bc_lines) seq_lines = format_fastq_record(read1_data[header_index], read1_data[sequence_index][bc1_len:], read1_data[quality_index][bc1_len:]) output_fastq1.write(seq_lines) return
def parse_illumina_line(l,barcode_length,rev_comp_barcode, barcode_in_sequence=False): """Parses a single line of Illumina data """ fields = l.strip().split(':') y_position_subfields = fields[4].split('#') y_position = int(y_position_subfields[0]) sequence = fields[5] qual_string = fields[6] if barcode_in_sequence: barcode = sequence[:barcode_length] sequence = sequence[barcode_length:] qual_string = qual_string[barcode_length:] else: barcode = y_position_subfields[1][:barcode_length] if rev_comp_barcode: barcode = DNA.rc(barcode) result = {\ 'Full description':':'.join(fields[:5]),\ 'Machine Name':fields[0],\ 'Channel Number':int(fields[1]),\ 'Tile Number':int(fields[2]),\ 'X Position':int(fields[3]),\ 'Y Position':y_position,\ 'Barcode':barcode,\ 'Full Y Position Field':fields[4],\ 'Sequence':sequence,\ 'Quality Score':qual_string} return result
def parse_illumina_line(l, barcode_length, rev_comp_barcode, barcode_in_sequence=False): """Parses a single line of Illumina data """ fields = l.strip().split(':') y_position_subfields = fields[4].split('#') y_position = int(y_position_subfields[0]) sequence = fields[5] qual_string = fields[6] if barcode_in_sequence: barcode = sequence[:barcode_length] sequence = sequence[barcode_length:] qual_string = qual_string[barcode_length:] else: barcode = y_position_subfields[1][:barcode_length] if rev_comp_barcode: barcode = DNA.rc(barcode) result = { 'Full description': ':'.join(fields[:5]), 'Machine Name': fields[0], 'Channel Number': int(fields[1]), 'Tile Number': int(fields[2]), 'X Position': int(fields[3]), 'Y Position': y_position, 'Barcode': barcode, 'Full Y Position Field': fields[4], 'Sequence': sequence, 'Quality Score': qual_string} return result
def process_barcode_single_end_data(read1_data, output_bc_fastq, output_fastq1, bc1_len=6, rev_comp_bc1=False): """ Processes, writes single-end barcode data, parsed sequence read1_data: list of header, read, quality scores output_bc_fastq: open output fastq filepath output_fastq1: open output fastq reads filepath bc1_len: length of barcode to remove from beginning of data rev_comp_bc1: reverse complement barcode before writing. """ header_index = 0 sequence_index = 1 quality_index = 2 bc_read = read1_data[sequence_index][:bc1_len] bc_qual = read1_data[quality_index][:bc1_len] if rev_comp_bc1: bc_read = DNA.rc(bc_read) bc_qual = bc_qual[::-1] bc_lines = format_fastq_record(read1_data[header_index], bc_read, bc_qual) output_bc_fastq.write(bc_lines) seq_lines = format_fastq_record( read1_data[header_index], read1_data[sequence_index][bc1_len:], read1_data[quality_index][bc1_len:] ) output_fastq1.write(seq_lines) return
def rc_fasta_lines(fasta_lines, seq_desc_mapper=append_rc): """ """ for seq_id, seq in parse_fasta(fasta_lines): seq_id = seq_desc_mapper(seq_id) seq = DNA.rc(seq.upper()) yield seq_id, seq return
def get_reverse_primers(id_map): """ Return a dictionary with barcodes and rev-complement of rev primers """ rev_primers = {} for n in id_map.items(): # Generate a dictionary with Barcode:reverse primer # Convert to reverse complement of the primer so its in the # proper orientation with the input fasta sequences rev_primers[n[1]['BarcodeSequence']]=DNA.rc(n[1]['ReversePrimer']) return rev_primers
def parse_illumina_paired_end_read_files(read1_file,read2_file,barcode_length,\ max_bad_run_length,quality_threshold,min_per_read_length,rev_comp_barcode,\ barcode_max_N=0,seq_max_N=0): """Parses Illumina paired-end read file pair """ for read1_line, read2_line in izip(read1_file,read2_file): read1 = parse_illumina_line(read1_line,barcode_length,rev_comp_barcode) read2 = parse_illumina_line(read2_line,barcode_length,rev_comp_barcode) read1_desc = illumina_read_description_from_read_data(read1) read2_desc = illumina_read_description_from_read_data(read2) read1_barcode = read1['Barcode'] read2_barcode = read2['Barcode'] if (read1_barcode.count('N') > barcode_max_N) or \ (read2_barcode.count('N') > barcode_max_N): continue if read1_desc != read2_desc: raise IlluminaParseError, \ "Error in sequence files, descriptions of"+\ " corresponding lines are not compatible: %s != %s" %\ (read1_desc, read2_desc) assert read1_barcode == read2_barcode seq1, qual1 = read_qual_score_filter(\ read1['Sequence'], read1['Quality Score'],\ max_bad_run_length, quality_threshold) if (len(seq1) < min_per_read_length): continue seq2, qual2 = read_qual_score_filter(\ read2['Sequence'], read2['Quality Score'],\ max_bad_run_length, quality_threshold) if (len(seq2) < min_per_read_length): continue seq = seq1 + DNA.rc(seq2) # If the total number of Ns is more than the max # allowed ignore this sequence if seq.count('N') > seq_max_N: continue qual = qual1 + qual2[::-1] yield read1_desc, read1_barcode, seq, qual
def get_rev_primer_seqs(mapping_fp): """ Parses mapping file to get dictionary of SampleID:Rev primer mapping_fp: mapping filepath """ hds, mapping_data, run_description, errors, warnings = \ process_id_map(mapping_fp, has_barcodes=False, disable_primer_check=True) if errors: for curr_err in errors: if curr_err.startswith("Duplicate SampleID"): raise ValueError,('Errors were found with mapping file, '+\ 'please run check_id_map.py to identify problems.') # create dict of dicts with SampleID:{each header:mapping data} id_map = {} for curr_data in mapping_data: id_map[curr_data[0]] = {} for header in range(len(hds)): for curr_data in mapping_data: id_map[curr_data[0]][hds[header]] = curr_data[header] reverse_primers = {} for curr_id in id_map.keys(): try: reverse_primers[curr_id] =\ [DNA.rc(curr_rev_primer) for curr_rev_primer in\ id_map[curr_id]['ReversePrimer'].split(',')] except KeyError: raise KeyError,("Reverse primer not found in mapping file, "+\ "please include a 'ReversePrimer' column.") # Check for valid reverse primers # Will have been detected as warnings from mapping file for curr_err in errors: if curr_err.startswith("Invalid DNA sequence detected"): raise ValueError,("Problems found with reverse primers, please "+\ "check mapping file with check_id_map.py") return reverse_primers
def main(): option_parser, opts, args =\ parse_command_line_parameters(**script_info) sequence_read_fps = opts.sequence_read_fps barcode_read_fps = opts.barcode_read_fps sample_id = opts.sample_id mapping_fps = opts.mapping_fps phred_quality_threshold = opts.phred_quality_threshold retain_unassigned_reads = opts.retain_unassigned_reads min_per_read_length_fraction = opts.min_per_read_length_fraction max_bad_run_length = opts.max_bad_run_length rev_comp = opts.rev_comp rev_comp_barcode = opts.rev_comp_barcode rev_comp_mapping_barcodes = opts.rev_comp_mapping_barcodes seq_max_N = opts.sequence_max_n start_seq_id = opts.start_seq_id # NEED TO FIX THIS FUNCTIONALITY - CURRENTLY READING THE WRONG FIELD filter_bad_illumina_qual_digit = False #opts.filter_bad_illumina_qual_digit store_qual_scores = opts.store_qual_scores store_demultiplexed_fastq = opts.store_demultiplexed_fastq barcode_type = opts.barcode_type max_barcode_errors = opts.max_barcode_errors # if this is not a demultiplexed run, if barcode_type == 'not-barcoded': if sample_id == None: option_parser.error("If not providing barcode reads (because " "your data is not multiplexed), must provide a --sample_id.") barcode_read_fps = [None] * len(sequence_read_fps) elif barcode_read_fps == None: option_parser.error("Must provide --barcode_fps if " "--barcode_type is not 'not-barcoded'") else: pass phred_offset = opts.phred_offset if phred_offset != None: try: phred_to_ascii_f = phred_to_ascii_fs[phred_offset] except KeyError: # shouldn't be able to get here, but we'll stay on the # safe side opption_parser.error(\ "Only valid phred offsets are: %s" %\ ' '.join(phred_to_ascii_fs.keys())) else: # let split_libraries_fastq.process_fastq_single_end_read_file # figure it out... phred_to_ascii_f = None if opts.last_bad_quality_char != None: option_parser.error('--last_bad_quality_char is no longer supported. ' 'Use -q instead (see option help text by passing -h)') if not (0 <= min_per_read_length_fraction <= 1): option_parser.error('--min_per_read_length_fraction must be between ' '0 and 1 (inclusive). You passed %1.5f' % min_per_read_length_fraction) try: barcode_correction_fn = BARCODE_DECODER_LOOKUP[barcode_type] except KeyError: barcode_correction_fn = None if len(mapping_fps) == 1 and len(sequence_read_fps) > 1: mapping_fps = mapping_fps * len(sequence_read_fps) if len(set([len(sequence_read_fps), len(barcode_read_fps), len(mapping_fps)])) > 1: option_parser.error("Same number of sequence, barcode and mapping files must be provided.") output_dir = opts.output_dir create_dir(output_dir) output_fp_temp = '%s/seqs.fna.incomplete' % output_dir output_fp = '%s/seqs.fna' % output_dir output_f = open(output_fp_temp,'w') qual_fp_temp = '%s/qual.fna.incomplete' % output_dir qual_fp = '%s/seqs.qual' % output_dir output_fastq_fp_temp = '%s/seqs.fastq.incomplete' % output_dir output_fastq_fp = '%s/seqs.fastq' % output_dir if store_qual_scores: qual_f = open(qual_fp_temp,'w') # define a qual writer whether we're storing # qual strings or not so we don't have to check # every time through the for loop below def qual_writer(h,q): qual_f.write('>%s\n%s\n' % (h,q)) else: def qual_writer(h,q): pass if store_demultiplexed_fastq: output_fastq_f = open(output_fastq_fp_temp,'w') # define a fastq writer whether we're storing # qual strings or not so we don't have to check # every time through the for loop below def fastq_writer(h,s,q): output_fastq_f.write('@%s\n%s\n+\n%s\n' % (h,s,q)) else: def fastq_writer(h,s,q): pass log_fp = '%s/split_library_log.txt' % output_dir log_f = open(log_fp,'w') histogram_fp = '%s/histograms.txt' % output_dir histogram_f = open(histogram_fp,'w') for sequence_read_fp, barcode_read_fp, mapping_fp in\ zip(sequence_read_fps, barcode_read_fps, mapping_fps): mapping_f = open(mapping_fp, 'U') h, i, barcode_to_sample_id, warnings, errors, p, a =\ check_map(mapping_f, disable_primer_check=True, has_barcodes=barcode_read_fp != None) if rev_comp_mapping_barcodes: barcode_to_sample_id = \ dict([(DNA.rc(k),v) for k,v in barcode_to_sample_id.items()]) if barcode_type == 'golay_12': invalid_golay_barcodes = \ get_invalid_golay_barcodes(barcode_to_sample_id.keys()) if len(invalid_golay_barcodes) > 0: option_parser.error("Some or all barcodes are not valid golay codes. "+\ "Do they need to be reverse complimented? If these are not "+\ "golay barcodes pass --barcode_type 12 to disable barcode "+\ "error correction, or pass --barcode_type # if the barcodes "+\ "are not 12 base pairs, where # is the size of the barcodes. "+ " Invalid codes:\n\t%s" % \ ' '.join(invalid_golay_barcodes)) log_f.write("Input file paths\n") log_f.write('Mapping filepath: %s (md5: %s)\n' %\ (mapping_fp,safe_md5(open(mapping_fp)).hexdigest())) log_f.write('Sequence read filepath: %s (md5: %s)\n' %\ (sequence_read_fp,str(safe_md5(open(sequence_read_fp)).hexdigest()))) if sequence_read_fp.endswith('.gz'): sequence_read_f = gzip_open(sequence_read_fp) else: sequence_read_f = open(sequence_read_fp,'U') seq_id = start_seq_id if barcode_read_fp != None: log_f.write('Barcode read filepath: %s (md5: %s)\n\n' %\ (barcode_read_fp,safe_md5(open(barcode_read_fp)).hexdigest())) if barcode_read_fp.endswith('.gz'): barcode_read_f = gzip_open(barcode_read_fp) else: barcode_read_f = open(barcode_read_fp,'U') seq_generator = process_fastq_single_end_read_file( sequence_read_f, barcode_read_f, barcode_to_sample_id, store_unassigned=retain_unassigned_reads, max_bad_run_length=max_bad_run_length, phred_quality_threshold=phred_quality_threshold, min_per_read_length_fraction=min_per_read_length_fraction, rev_comp=rev_comp, rev_comp_barcode=rev_comp_barcode, seq_max_N=seq_max_N, start_seq_id=start_seq_id, filter_bad_illumina_qual_digit=\ filter_bad_illumina_qual_digit, log_f=log_f, histogram_f=histogram_f, barcode_correction_fn=barcode_correction_fn, max_barcode_errors=max_barcode_errors, phred_to_ascii_f=phred_to_ascii_f) else: seq_generator = process_fastq_single_end_read_file_no_barcode( sequence_read_f, sample_id, store_unassigned=retain_unassigned_reads, max_bad_run_length=max_bad_run_length, phred_quality_threshold=phred_quality_threshold, min_per_read_length_fraction=min_per_read_length_fraction, rev_comp=rev_comp, seq_max_N=seq_max_N, start_seq_id=start_seq_id, filter_bad_illumina_qual_digit=\ filter_bad_illumina_qual_digit, log_f=log_f, histogram_f=histogram_f, phred_to_ascii_f=phred_to_ascii_f) for fasta_header, sequence, quality, seq_id in seq_generator: output_f.write('>%s\n%s\n' % (fasta_header,sequence)) qual_writer(fasta_header,quality) fastq_writer(fasta_header,sequence,quality) start_seq_id = seq_id + 1 log_f.write('\n---\n\n') output_f.close() rename(output_fp_temp,output_fp) # process the optional output files, as necessary if store_qual_scores: qual_f.close() rename(qual_fp_temp,qual_fp) if store_demultiplexed_fastq: output_fastq_f.close() rename(output_fastq_fp_temp,output_fastq_fp)
def main(): option_parser, opts, args = parse_command_line_parameters(**script_info) sequence_read_fps = opts.sequence_read_fps barcode_read_fps = opts.barcode_read_fps sample_ids = None if opts.sample_ids is not None: sample_ids = opts.sample_ids.split(',') mapping_fps = opts.mapping_fps phred_quality_threshold = opts.phred_quality_threshold retain_unassigned_reads = opts.retain_unassigned_reads min_per_read_length_fraction = opts.min_per_read_length_fraction max_bad_run_length = opts.max_bad_run_length rev_comp = opts.rev_comp rev_comp_barcode = opts.rev_comp_barcode rev_comp_mapping_barcodes = opts.rev_comp_mapping_barcodes seq_max_N = opts.sequence_max_n start_seq_id = opts.start_seq_id # NEED TO FIX THIS FUNCTIONALITY - CURRENTLY READING THE WRONG FIELD # opts.filter_bad_illumina_qual_digit filter_bad_illumina_qual_digit = False store_qual_scores = opts.store_qual_scores store_demultiplexed_fastq = opts.store_demultiplexed_fastq barcode_type = opts.barcode_type max_barcode_errors = opts.max_barcode_errors # if this is not a demultiplexed run, if barcode_type == 'not-barcoded': if sample_ids is None: option_parser.error( "If not providing barcode reads (because " "your data is not multiplexed), must provide --sample_ids.") if len(sample_ids) != len(sequence_read_fps): option_parser.error( "If providing --sample_ids (because " "your data is not multiplexed), must provide the same number " "of sample ids as sequence read filepaths.") barcode_read_fps = [None] * len(sequence_read_fps) mapping_fps = [None] * len(sequence_read_fps) elif barcode_read_fps is None: option_parser.error("Must provide --barcode_read_fps if " "--barcode_type is not 'not-barcoded'") elif mapping_fps is None: option_parser.error("Must provide --mapping_fps if " "--barcode_type is not 'not-barcoded'") phred_offset = opts.phred_offset if phred_offset is not None: try: phred_to_ascii_f = phred_to_ascii_fs[phred_offset] except KeyError: # shouldn't be able to get here, but we'll stay on the # safe side option_parser.error("Only valid phred offsets are: %s" % ' '.join(phred_to_ascii_fs.keys())) else: # let split_libraries_fastq.process_fastq_single_end_read_file # figure it out... phred_to_ascii_f = None if opts.last_bad_quality_char is not None: option_parser.error( '--last_bad_quality_char is no longer supported. ' 'Use -q instead (see option help text by passing -h)') if not (0 <= min_per_read_length_fraction <= 1): option_parser.error('--min_per_read_length_fraction must be between ' '0 and 1 (inclusive). You passed %1.5f' % min_per_read_length_fraction) barcode_correction_fn = BARCODE_DECODER_LOOKUP.get(barcode_type, None) if len(mapping_fps) == 1 and len(sequence_read_fps) > 1: mapping_fps = mapping_fps * len(sequence_read_fps) if len( set([ len(sequence_read_fps), len(barcode_read_fps), len(mapping_fps) ])) > 1: option_parser.error("Same number of sequence, barcode, and mapping " "files must be provided.") output_dir = opts.output_dir create_dir(output_dir) output_fp_temp = '%s/seqs.fna.incomplete' % output_dir output_fp = '%s/seqs.fna' % output_dir output_f = open(output_fp_temp, 'w') qual_fp_temp = '%s/qual.fna.incomplete' % output_dir qual_fp = '%s/seqs.qual' % output_dir output_fastq_fp_temp = '%s/seqs.fastq.incomplete' % output_dir output_fastq_fp = '%s/seqs.fastq' % output_dir if store_qual_scores: qual_f = open(qual_fp_temp, 'w') # define a qual writer whether we're storing # qual strings or not so we don't have to check # every time through the for loop below def qual_writer(h, q): qual_f.write('>%s\n%s\n' % (h, q)) else: def qual_writer(h, q): pass if store_demultiplexed_fastq: output_fastq_f = open(output_fastq_fp_temp, 'w') # define a fastq writer whether we're storing # qual strings or not so we don't have to check # every time through the for loop below def fastq_writer(h, s, q): output_fastq_f.write('@%s\n%s\n+\n%s\n' % (h, s, q)) else: def fastq_writer(h, s, q): pass log_fp = '%s/split_library_log.txt' % output_dir log_f = open(log_fp, 'w') histogram_fp = '%s/histograms.txt' % output_dir histogram_f = open(histogram_fp, 'w') for i in range(len(sequence_read_fps)): sequence_read_fp = sequence_read_fps[i] barcode_read_fp = barcode_read_fps[i] mapping_fp = mapping_fps[i] if mapping_fp is not None: mapping_f = open(mapping_fp, 'U') _, _, barcode_to_sample_id, _, _, _, _ = check_map( mapping_f, disable_primer_check=True, has_barcodes=barcode_read_fp is not None) else: mapping_f = None barcode_to_sample_id = {} if rev_comp_mapping_barcodes: barcode_to_sample_id = { DNA.rc(k): v for k, v in barcode_to_sample_id.iteritems() } if barcode_type == 'golay_12': invalid_golay_barcodes = get_invalid_golay_barcodes( barcode_to_sample_id.keys()) if len(invalid_golay_barcodes) > 0: option_parser.error( "Some or all barcodes are not valid golay " "codes. Do they need to be reverse complemented? If these " "are not golay barcodes pass --barcode_type 12 to disable " "barcode error correction, or pass --barcode_type # if " "the barcodes are not 12 base pairs, where # is the size " "of the barcodes. Invalid codes:\n\t%s" % ' '.join(invalid_golay_barcodes)) log_f.write("Input file paths\n") if mapping_fp is not None: log_f.write('Mapping filepath: %s (md5: %s)\n' % (mapping_fp, safe_md5(open(mapping_fp)).hexdigest())) log_f.write('Sequence read filepath: %s (md5: %s)\n' % (sequence_read_fp, str(safe_md5(open(sequence_read_fp)).hexdigest()))) if sequence_read_fp.endswith('.gz'): sequence_read_f = gzip_open(sequence_read_fp) else: sequence_read_f = open(sequence_read_fp, 'U') seq_id = start_seq_id if barcode_read_fp is not None: log_f.write( 'Barcode read filepath: %s (md5: %s)\n\n' % (barcode_read_fp, safe_md5(open(barcode_read_fp)).hexdigest())) if barcode_read_fp.endswith('.gz'): barcode_read_f = gzip_open(barcode_read_fp) else: barcode_read_f = open(barcode_read_fp, 'U') seq_generator = process_fastq_single_end_read_file( sequence_read_f, barcode_read_f, barcode_to_sample_id, store_unassigned=retain_unassigned_reads, max_bad_run_length=max_bad_run_length, phred_quality_threshold=phred_quality_threshold, min_per_read_length_fraction=min_per_read_length_fraction, rev_comp=rev_comp, rev_comp_barcode=rev_comp_barcode, seq_max_N=seq_max_N, start_seq_id=start_seq_id, filter_bad_illumina_qual_digit=filter_bad_illumina_qual_digit, log_f=log_f, histogram_f=histogram_f, barcode_correction_fn=barcode_correction_fn, max_barcode_errors=max_barcode_errors, phred_to_ascii_f=phred_to_ascii_f) else: seq_generator = process_fastq_single_end_read_file_no_barcode( sequence_read_f, sample_ids[i], store_unassigned=retain_unassigned_reads, max_bad_run_length=max_bad_run_length, phred_quality_threshold=phred_quality_threshold, min_per_read_length_fraction=min_per_read_length_fraction, rev_comp=rev_comp, seq_max_N=seq_max_N, start_seq_id=start_seq_id, filter_bad_illumina_qual_digit=filter_bad_illumina_qual_digit, log_f=log_f, histogram_f=histogram_f, phred_to_ascii_f=phred_to_ascii_f) for fasta_header, sequence, quality, seq_id in seq_generator: output_f.write('>%s\n%s\n' % (fasta_header, sequence)) qual_writer(fasta_header, quality) fastq_writer(fasta_header, sequence, quality) start_seq_id = seq_id + 1 log_f.write('\n---\n\n') output_f.close() rename(output_fp_temp, output_fp) # process the optional output files, as necessary if store_qual_scores: qual_f.close() rename(qual_fp_temp, qual_fp) if store_demultiplexed_fastq: output_fastq_f.close() rename(output_fastq_fp_temp, output_fastq_fp)
def process_fastq_single_end_read_file(fastq_read_f, fastq_barcode_f, barcode_to_sample_id, store_unassigned=False, max_bad_run_length=0, phred_quality_threshold=2, min_per_read_length_fraction=0.75, rev_comp=False, rev_comp_barcode=False, seq_max_N=0, start_seq_id=0, filter_bad_illumina_qual_digit=False, log_f=None, histogram_f=None, barcode_correction_fn=None, max_barcode_errors=1.5, strict_header_match=True, phred_to_ascii_f=None): """parses fastq single-end read file """ header_index = 0 sequence_index = 1 quality_index = 2 seq_id = start_seq_id # grab the first lines and then seek back to the beginning of the file try: fastq_read_f_line1 = fastq_read_f.readline() fastq_read_f_line2 = fastq_read_f.readline() fastq_read_f.seek(0) except AttributeError: fastq_read_f_line1 = fastq_read_f[0] fastq_read_f_line2 = fastq_read_f[1] # determine the version of casava that was used to generate the fastq # to determine how to compare header lines and decode ascii phred scores post_casava_v180 = is_casava_v180_or_later(fastq_read_f_line1) if post_casava_v180: check_header_match_f = check_header_match_180_or_later if phred_to_ascii_f == None: phred_to_ascii_f = phred_to_ascii33 else: check_header_match_f = check_header_match_pre180 if phred_to_ascii_f == None: phred_to_ascii_f = phred_to_ascii64 # determine the last unacceptable quality character if phred_quality_threshold != None: last_bad_quality_char = phred_to_ascii_f(phred_quality_threshold) else: # disable quality filter last_bad_quality_char = '' # compute the barcode length, if they are all the same. # this is useful for selecting a subset of the barcode read # if it's too long (e.g., for technical reasons on the sequencer) barcode_lengths = set( [len(bc) for bc, sid in barcode_to_sample_id.items()]) if len(barcode_lengths) == 1: barcode_length = barcode_lengths.pop() else: barcode_length = None # compute the minimum read length as a fraction of the length of the input read min_per_read_length = min_per_read_length_fraction * len( fastq_read_f_line2) # prep data for logging input_sequence_count = 0 count_barcode_not_in_map = 0 count_too_short = 0 count_too_many_N = 0 count_bad_illumina_qual_digit = 0 count_barcode_errors_exceed_max = 0 sequence_lengths = [] seqs_per_sample_counts = {} for bc_data, read_data in izip( MinimalFastqParser(fastq_barcode_f, strict=False), MinimalFastqParser(fastq_read_f, strict=False)): input_sequence_count += 1 # Confirm match between barcode and read headers if strict_header_match and \ (not check_header_match_f(bc_data[header_index],read_data[header_index])): raise FastqParseError,\ ("Headers of barcode and read do not match. Can't continue. " "Confirm that the barcode fastq and read fastq that you are " "passing match one another.") else: header = read_data[header_index] # Grab the barcode sequence if barcode_length: # because thirteen cycles are sometimes used for # techical reasons, this step looks only at the # first tweleve bases. note that the barcode is # rev-comp'ed after this step if requested since # the thirteen base is a technical artefact, not # barcode sequence. barcode = bc_data[sequence_index][:barcode_length] else: barcode = bc_data[sequence_index] if rev_comp_barcode: barcode = DNA.rc(barcode) # Grab the read sequence sequence = read_data[1] # Grab the read quality quality = read_data[2] # correct the barcode (if applicable) and map to sample id num_barcode_errors, corrected_barcode, correction_attempted, sample_id = \ correct_barcode(barcode,barcode_to_sample_id,barcode_correction_fn) # skip samples with too many errors if (num_barcode_errors > max_barcode_errors): count_barcode_errors_exceed_max += 1 continue # skip unassignable samples unless otherwise requested if sample_id == None: if not store_unassigned: count_barcode_not_in_map += 1 continue else: sample_id = 'Unassigned' quality_filter_result, sequence, quality =\ quality_filter_sequence(header, sequence, quality, max_bad_run_length, last_bad_quality_char, min_per_read_length, seq_max_N, filter_bad_illumina_qual_digit) # process quality result if quality_filter_result != 0: # if the quality filter didn't pass record why and # move on to the next record if quality_filter_result == 1: count_too_short += 1 elif quality_filter_result == 2: count_too_many_N += 1 elif quality_filter_result == 3: count_bad_illumina_qual_digit += 1 else: raise ValueError,\ "Unknown quality filter result: %d" % quality_filter_result continue sequence_lengths.append(len(sequence)) try: seqs_per_sample_counts[sample_id] += 1 except KeyError: seqs_per_sample_counts[sample_id] = 1 if rev_comp: sequence = DNA.rc(sequence) quality = quality[::-1] fasta_header = '%s_%s %s orig_bc=%s new_bc=%s bc_diffs=%d' %\ (sample_id,seq_id,header,barcode,corrected_barcode,num_barcode_errors) yield fasta_header, sequence, quality, seq_id seq_id += 1 # Add sample IDs with zero counts to dictionary for logging for curr_sample_id in barcode_to_sample_id.values(): if curr_sample_id not in seqs_per_sample_counts.keys(): seqs_per_sample_counts[curr_sample_id] = 0 if log_f != None: log_str = format_split_libraries_fastq_log( count_barcode_not_in_map, count_too_short, count_too_many_N, count_bad_illumina_qual_digit, count_barcode_errors_exceed_max, input_sequence_count, sequence_lengths, seqs_per_sample_counts) log_f.write(log_str) if len(sequence_lengths) and histogram_f != None: counts, bin_edges = make_histograms(sequence_lengths) histogram_str = format_histogram_one_count(counts, bin_edges) histogram_f.write(histogram_str) histogram_f.write('\n--\n\n')
def process_fastq_single_end_read_file(fastq_read_f, fastq_barcode_f, barcode_to_sample_id, store_unassigned=False, max_bad_run_length=0, last_bad_quality_char='B', min_per_read_length=75, rev_comp=False, rev_comp_barcode=False, seq_max_N=0, start_seq_id=0, filter_bad_illumina_qual_digit=True, log_f=None, histogram_f=None, barcode_correction_fn=None, max_barcode_errors=1.5): """parses fastq single-end read file """ header_index = 0 sequence_index = 1 quality_index = 2 seq_id = start_seq_id # prep data for logging input_sequence_count = 0 count_barcode_not_in_map = 0 count_too_short = 0 count_too_many_N = 0 count_bad_illumina_qual_digit = 0 count_barcode_errors_exceed_max = 0 sequence_lengths = [] seqs_per_sample_counts = {} for bc_data,read_data in izip(MinimalFastqParser(fastq_barcode_f,strict=False), MinimalFastqParser(fastq_read_f,strict=False)): input_sequence_count += 1 # Confirm match between barcode and read headers if not check_header_match(bc_data[header_index], read_data[header_index]): raise FastqParseError,\ ("Headers of barcode and read do not match. Can't continue. " "Confirm that the barcode fastq and read fastq that you are " "passing match one another.") else: header = read_data[header_index] # Grab the barcode sequence barcode = bc_data[sequence_index] if rev_comp_barcode: barcode = DNA.rc(barcode) # Grab the read sequence sequence = read_data[1] # Grab the read quality quality = read_data[2] # correct the barcode (if applicable) and map to sample id num_barcode_errors, corrected_barcode, correction_attempted, sample_id = \ correct_barcode(barcode,barcode_to_sample_id,barcode_correction_fn) # skip samples with too many errors if (num_barcode_errors > max_barcode_errors): count_barcode_errors_exceed_max += 1 continue # skip unassignable samples unless otherwise requested if sample_id == None: if not store_unassigned: count_barcode_not_in_map += 1 continue else: sample_id = 'Unassigned' quality_filter_result, sequence, quality =\ quality_filter_sequence(header, sequence, quality, max_bad_run_length, last_bad_quality_char, min_per_read_length, seq_max_N, filter_bad_illumina_qual_digit) # process quality result if quality_filter_result != 0: # if the quality filter didn't pass record why and # move on to the next record if quality_filter_result == 1: count_too_short += 1 elif quality_filter_result == 2: count_too_many_N += 1 elif quality_filter_result == 3: count_bad_illumina_qual_digit += 1 else: raise ValueError,\ "Unknown quality filter result: %d" % quality_filter_result continue sequence_lengths.append(len(sequence)) try: seqs_per_sample_counts[sample_id] += 1 except KeyError: seqs_per_sample_counts[sample_id] = 1 if rev_comp: sequence = DNA.rc(sequence) quality = quality[::-1] fasta_header = '%s_%s %s orig_bc=%s new_bc=%s bc_diffs=%d' %\ (sample_id,seq_id,header,barcode,corrected_barcode,num_barcode_errors) yield fasta_header, sequence, quality, seq_id seq_id += 1 if log_f != None: log_str = format_split_libraries_fastq_log(count_barcode_not_in_map, count_too_short, count_too_many_N, count_bad_illumina_qual_digit, count_barcode_errors_exceed_max, input_sequence_count, sequence_lengths, seqs_per_sample_counts) log_f.write(log_str) if len(sequence_lengths) and histogram_f != None: counts, bin_edges = make_histograms(sequence_lengths) histogram_str = format_histogram_one_count(counts,bin_edges) histogram_f.write(histogram_str) histogram_f.write('\n--\n\n')
def process_fastq_single_end_read_file(fastq_read_f, fastq_barcode_f, barcode_to_sample_id, store_unassigned=False, max_bad_run_length=0, phred_quality_threshold=2, min_per_read_length_fraction=0.75, rev_comp=False, rev_comp_barcode=False, seq_max_N=0, start_seq_id=0, filter_bad_illumina_qual_digit=False, log_f=None, histogram_f=None, barcode_correction_fn=None, max_barcode_errors=1.5, strict_header_match=True, phred_to_ascii_f=None): """parses fastq single-end read file """ header_index = 0 sequence_index = 1 quality_index = 2 seq_id = start_seq_id # grab the first lines and then seek back to the beginning of the file try: fastq_read_f_line1 = fastq_read_f.readline() fastq_read_f_line2 = fastq_read_f.readline() fastq_read_f.seek(0) except AttributeError: fastq_read_f_line1 = fastq_read_f[0] fastq_read_f_line2 = fastq_read_f[1] # determine the version of casava that was used to generate the fastq # to determine how to compare header lines and decode ascii phred scores post_casava_v180 = is_casava_v180_or_later(fastq_read_f_line1) if post_casava_v180: check_header_match_f = check_header_match_180_or_later if phred_to_ascii_f == None: phred_to_ascii_f = phred_to_ascii33 else: check_header_match_f = check_header_match_pre180 if phred_to_ascii_f == None: phred_to_ascii_f = phred_to_ascii64 # determine the last unacceptable quality character if phred_quality_threshold != None: last_bad_quality_char = phred_to_ascii_f(phred_quality_threshold) else: # disable quality filter last_bad_quality_char = '' # compute the barcode length, if they are all the same. # this is useful for selecting a subset of the barcode read # if it's too long (e.g., for technical reasons on the sequencer) barcode_lengths = set([len(bc) for bc, sid in barcode_to_sample_id.items()]) if len(barcode_lengths) == 1: barcode_length = barcode_lengths.pop() else: barcode_length = None # compute the minimum read length as a fraction of the length of the input read min_per_read_length = min_per_read_length_fraction * len(fastq_read_f_line2) # prep data for logging input_sequence_count = 0 count_barcode_not_in_map = 0 count_too_short = 0 count_too_many_N = 0 count_bad_illumina_qual_digit = 0 count_barcode_errors_exceed_max = 0 sequence_lengths = [] seqs_per_sample_counts = {} for bc_data,read_data in izip(MinimalFastqParser(fastq_barcode_f,strict=False), MinimalFastqParser(fastq_read_f,strict=False)): input_sequence_count += 1 # Confirm match between barcode and read headers if strict_header_match and \ (not check_header_match_f(bc_data[header_index],read_data[header_index])): raise FastqParseError,\ ("Headers of barcode and read do not match. Can't continue. " "Confirm that the barcode fastq and read fastq that you are " "passing match one another.") else: header = read_data[header_index] # Grab the barcode sequence if barcode_length: # because thirteen cycles are sometimes used for # techical reasons, this step looks only at the # first tweleve bases. note that the barcode is # rev-comp'ed after this step if requested since # the thirteen base is a technical artefact, not # barcode sequence. barcode = bc_data[sequence_index][:barcode_length] else: barcode = bc_data[sequence_index] if rev_comp_barcode: barcode = DNA.rc(barcode) # Grab the read sequence sequence = read_data[1] # Grab the read quality quality = read_data[2] # correct the barcode (if applicable) and map to sample id num_barcode_errors, corrected_barcode, correction_attempted, sample_id = \ correct_barcode(barcode,barcode_to_sample_id,barcode_correction_fn) # skip samples with too many errors if (num_barcode_errors > max_barcode_errors): count_barcode_errors_exceed_max += 1 continue # skip unassignable samples unless otherwise requested if sample_id == None: if not store_unassigned: count_barcode_not_in_map += 1 continue else: sample_id = 'Unassigned' quality_filter_result, sequence, quality =\ quality_filter_sequence(header, sequence, quality, max_bad_run_length, last_bad_quality_char, min_per_read_length, seq_max_N, filter_bad_illumina_qual_digit) # process quality result if quality_filter_result != 0: # if the quality filter didn't pass record why and # move on to the next record if quality_filter_result == 1: count_too_short += 1 elif quality_filter_result == 2: count_too_many_N += 1 elif quality_filter_result == 3: count_bad_illumina_qual_digit += 1 else: raise ValueError,\ "Unknown quality filter result: %d" % quality_filter_result continue sequence_lengths.append(len(sequence)) try: seqs_per_sample_counts[sample_id] += 1 except KeyError: seqs_per_sample_counts[sample_id] = 1 if rev_comp: sequence = DNA.rc(sequence) quality = quality[::-1] fasta_header = '%s_%s %s orig_bc=%s new_bc=%s bc_diffs=%d' %\ (sample_id,seq_id,header,barcode,corrected_barcode,num_barcode_errors) yield fasta_header, sequence, quality, seq_id seq_id += 1 # Add sample IDs with zero counts to dictionary for logging for curr_sample_id in barcode_to_sample_id.values(): if curr_sample_id not in seqs_per_sample_counts.keys(): seqs_per_sample_counts[curr_sample_id] = 0 if log_f != None: log_str = format_split_libraries_fastq_log(count_barcode_not_in_map, count_too_short, count_too_many_N, count_bad_illumina_qual_digit, count_barcode_errors_exceed_max, input_sequence_count, sequence_lengths, seqs_per_sample_counts) log_f.write(log_str) if len(sequence_lengths) and histogram_f != None: counts, bin_edges = make_histograms(sequence_lengths) histogram_str = format_histogram_one_count(counts,bin_edges) histogram_f.write(histogram_str) histogram_f.write('\n--\n\n')
def process_barcode_paired_end_data( read1_data, read2_data, output_bc_fastq, output_fastq1, output_fastq2, bc1_len=6, bc2_len=6, rev_comp_bc1=False, rev_comp_bc2=False, attempt_read_orientation=False, forward_primers=None, reverse_primers=None, output_bc_not_oriented=None, fastq1_out_not_oriented=None, fastq2_out_not_oriented=None, ): """ Processes, writes paired-end barcode data, parsed sequences read1_data: list of header, read, quality scores read2_data: list of header, read, quality scores output_bc_fastq: open output fastq filepath output_fastq1: open output fastq reads 1 filepath output_fastq2: open output fastq reads 2 filepath bc1_len: length of barcode to remove from beginning of read1 data bc2_len: length of barcode to remove from beginning of read2 data rev_comp_bc1: reverse complement barcode 1 before writing. rev_comp_bc2: reverse complement barcode 2 before writing. attempt_read_orientation: If True, will attempt to orient the reads according to the forward primers in the mapping file. If primer is detected in current orientation, leave the read as is, but if reverse complement is detected (or ReversePrimer is detected in the current orientation) the read will either be written to the forward (read 1) or reverse (read 2) reads for the case of paired files, or the read will be reverse complemented in the case of stitched reads. forward_primers: list of regular expression generators, forward primers reverse_primers: list of regular expression generators, reverse primers output_bc_not_oriented: Barcode output from reads that are not oriented fastq1_out_not_oriented: Open filepath to write reads 1 where primers can't be found when attempt_read_orientation is True. fastq2_out_not_oriented: Open filepath to write reads 2 where primers can't be found when attempt_read_orientation is True. """ header_index = 0 sequence_index = 1 quality_index = 2 found_primer_match = False # Break from orientation search as soon as a match is found if attempt_read_orientation: # First check forward primers for curr_primer in forward_primers: if curr_primer.search(read1_data[sequence_index]): read1 = read1_data read2 = read2_data found_primer_match = True break if curr_primer.search(read2_data[sequence_index]): read1 = read2_data read2 = read1_data found_primer_match = True break # Check reverse primers if forward primers not found if not found_primer_match: for curr_primer in reverse_primers: if curr_primer.search(read1_data[sequence_index]): read1 = read2_data read2 = read1_data found_primer_match = True break if curr_primer.search(read2_data[sequence_index]): read1 = read1_data read2 = read2_data found_primer_match = True break else: read1 = read1_data read2 = read2_data if not found_primer_match and attempt_read_orientation: read1 = read1_data read2 = read2_data output_bc = output_bc_not_oriented output_read1 = fastq1_out_not_oriented output_read2 = fastq2_out_not_oriented else: output_bc = output_bc_fastq output_read1 = output_fastq1 output_read2 = output_fastq2 bc_read1 = read1[sequence_index][0:bc1_len] bc_read2 = read2[sequence_index][0:bc2_len] bc_qual1 = read1[quality_index][0:bc1_len] bc_qual2 = read2[quality_index][0:bc2_len] if rev_comp_bc1: bc_read1 = DNA.rc(bc_read1) bc_qual1 = bc_qual1[::-1] if rev_comp_bc2: bc_read2 = DNA.rc(bc_read2) bc_qual2 = bc_qual2[::-1] bc_lines = format_fastq_record(read1[header_index], bc_read1 + bc_read2, bc_qual1 + bc_qual2) output_bc.write(bc_lines) seq1_lines = format_fastq_record( read1[header_index], read1[sequence_index][bc1_len:], read1[quality_index][bc1_len:] ) output_read1.write(seq1_lines) seq2_lines = format_fastq_record( read2[header_index], read2[sequence_index][bc2_len:], read2[quality_index][bc2_len:] ) output_read2.write(seq2_lines) return
def get_primers(header, mapping_data): """ Returns lists of forward/reverse primer regular expression generators header: list of strings of header data. mapping_data: list of lists of mapping data Will raise error if either the LinkerPrimerSequence or ReversePrimer fields are not present """ if "LinkerPrimerSequence" in header: primer_ix = header.index("LinkerPrimerSequence") else: raise IndexError, ( "Mapping file is missing LinkerPrimerSequence field.") if "ReversePrimer" in header: rev_primer_ix = header.index("ReversePrimer") else: raise IndexError, ("Mapping file is missing ReversePrimer field.") iupac = { 'A': 'A', 'T': 'T', 'G': 'G', 'C': 'C', 'R': '[AG]', 'Y': '[CT]', 'S': '[GC]', 'W': '[AT]', 'K': '[GT]', 'M': '[AC]', 'B': '[CGT]', 'D': '[AGT]', 'H': '[ACT]', 'V': '[ACG]', 'N': '[ACGT]' } raw_forward_primers = set([]) raw_forward_rc_primers = set([]) raw_reverse_primers = set([]) raw_reverse_rc_primers = set([]) for line in mapping_data: # Split on commas to handle pool of primers raw_forward_primers.update([upper(primer).strip() for\ primer in line[primer_ix].split(',')]) raw_forward_rc_primers.update([DNA.rc(primer) for\ primer in raw_forward_primers]) raw_reverse_primers.update([upper(primer).strip() for \ primer in line[rev_primer_ix].split(',')]) raw_reverse_rc_primers.update([DNA.rc(primer) for\ primer in raw_reverse_primers]) if not raw_forward_primers: raise ValueError, ("No forward primers detected in mapping file.") if not raw_reverse_primers: raise ValueError, ("No reverse primers detected in mapping file.") # Finding the forward primers, or rc of reverse primers indicates forward # read. Finding the reverse primer, or rc of the forward primers, indicates # the reverse read, so these sets are merged. raw_forward_primers.update(raw_reverse_rc_primers) raw_reverse_primers.update(raw_forward_rc_primers) forward_primers = [] reverse_primers = [] for curr_primer in raw_forward_primers: forward_primers.append(compile(''.join([iupac[symbol] for\ symbol in curr_primer]))) for curr_primer in raw_reverse_primers: reverse_primers.append(compile(''.join([iupac[symbol] for\ symbol in curr_primer]))) return forward_primers, reverse_primers
def process_barcode_in_label(read1_data, read2_data, output_bc_fastq, bc1_len=6, bc2_len=6, rev_comp_bc1=False, rev_comp_bc2=False, char_delineator=":"): """ Reads data from one or two fastq labels, writes output barcodes file. read1_data: list of header, read, quality scores read2_data: list of header, read, quality scores, False if no read 2. output_bc_fastq: open output fastq filepath bc1_len: length of barcode to remove from beginning of read1 data bc2_len: length of barcode to remove from beginning of read2 data rev_comp_bc1: reverse complement barcode 1 before writing. rev_comp_bc2: reverse complement barcode 2 before writing. char_delineator: Specify character that immediately precedes the barcode for input_type of barcode_in_label. """ header_index = 0 sequence_index = 1 quality_index = 2 # Check for char_delineator in sequence try: bc1_read = read1_data[header_index].split( char_delineator)[-1][0:bc1_len] # If there is an index error, it means the char_delineator wasn't found except IndexError: raise IndexError,("Found sequence lacking character delineator. " "Sequence header %s, character delineator %s" %\ (read1_data[header_index], char_delineator)) # Create fake quality scores bc1_qual = "F" * len(bc1_read) if rev_comp_bc1: bc1_read = DNA.rc(bc1_read) if read2_data: bc2_read =\ read2_data[header_index].strip().split(char_delineator)[-1][0:bc2_len] bc2_qual = "F" * len(bc2_read) if rev_comp_bc2: bc2_read = DNA.rc(bc2_read) else: bc2_read = "" bc2_qual = "" if not bc1_read and not bc2_read: raise ValueError, ("Came up with empty barcode sequence, please check " "character delineator with -s, and fastq label " "%s" % read1_data[header_index]) bc_lines = format_fastq_record(read1_data[header_index], bc1_read + bc2_read, bc1_qual + bc2_qual) output_bc_fastq.write(bc_lines) return
def process_barcode_paired_stitched(read_data, output_bc_fastq, output_fastq, bc1_len=6, bc2_len=6, rev_comp_bc1=False, rev_comp_bc2=False, attempt_read_orientation=False, forward_primers=None, reverse_primers=None, output_bc_not_oriented=None, fastq_out_not_oriented=None, switch_bc_order=False): """ Processes stitched barcoded reads, writes barcode, parsed stitched read read_data: list of header, read, quality scores output_bc_fastq: open output fastq filepath output_fastq: open output fastq reads filepath bc1_len: length of barcode to remove from beginning of read1 stitched data bc2_len: length of barcode to remove from end of read2 stitched data rev_comp_bc1: reverse complement barcode 1 before writing. rev_comp_bc2: reverse complement barcode 2 before writing. attempt_read_orientation: If True, will attempt to orient the reads according to the forward primers in the mapping file. If primer is detected in current orientation, leave the read as is, but if reverse complement is detected (or ReversePrimer is detected in the current orientation) the read will either be written to the forward (read 1) or reverse (read 2) reads for the case of paired files, or the read will be reverse complemented in the case of stitched reads. forward_primers: list of regular expression generators, forward primers reverse_primers: list of regular expression generators, reverse primers output_bc_not_oriented: Barcode output from reads that are not oriented fastq_out_not_oriented: Open filepath to write reads where primers can't be found when attempt_read_orientation is True. switch_bc_order: Normally, barcode 1 will be written first, followed by barcode 2 in a combined output fastq file. If True, the order will be reversed. Only applies to stitched reads processing, as other barcode orders are dictated by the the parameter chosen for the fastq files. """ header_index = 0 sequence_index = 1 quality_index = 2 read_seq = read_data[sequence_index] read_qual = read_data[quality_index] found_primer_match = False # Break from orientation search as soon as a match is found if attempt_read_orientation: for curr_primer in forward_primers: if curr_primer.search(read_data[sequence_index]): found_primer_match = True break if not found_primer_match: for curr_primer in reverse_primers: if curr_primer.search(read_data[sequence_index]): read_seq = DNA.rc(read_seq) read_qual = read_qual[::-1] found_primer_match = True break if not found_primer_match and attempt_read_orientation: output_bc = output_bc_not_oriented output_read = fastq_out_not_oriented else: output_bc = output_bc_fastq output_read = output_fastq bc_read1 = read_seq[0:bc1_len] bc_read2 = read_seq[-bc2_len:] bc_qual1 = read_qual[0:bc1_len] bc_qual2 = read_qual[-bc2_len:] if rev_comp_bc1: bc_read1 = DNA.rc(bc_read1) bc_qual1 = bc_qual1[::-1] if rev_comp_bc2: bc_read2 = DNA.rc(bc_read2) bc_qual2 = bc_qual2[::-1] if switch_bc_order: bc_read1, bc_read2 = bc_read2, bc_read1 bc_qual1, bc_qual2 = bc_qual2, bc_qual1 bc_lines = format_fastq_record(read_data[header_index], bc_read1 + bc_read2, bc_qual1 + bc_qual2) output_bc.write(bc_lines) seq_lines = format_fastq_record(read_data[header_index], read_seq[bc1_len:-bc2_len], read_qual[bc1_len:-bc2_len]) output_read.write(seq_lines) return
def process_barcode_paired_end_data(read1_data, read2_data, output_bc_fastq, output_fastq1, output_fastq2, bc1_len=6, bc2_len=6, rev_comp_bc1=False, rev_comp_bc2=False, attempt_read_orientation=False, forward_primers=None, reverse_primers=None, output_bc_not_oriented=None, fastq1_out_not_oriented=None, fastq2_out_not_oriented=None): """ Processes, writes paired-end barcode data, parsed sequences read1_data: list of header, read, quality scores read2_data: list of header, read, quality scores output_bc_fastq: open output fastq filepath output_fastq1: open output fastq reads 1 filepath output_fastq2: open output fastq reads 2 filepath bc1_len: length of barcode to remove from beginning of read1 data bc2_len: length of barcode to remove from beginning of read2 data rev_comp_bc1: reverse complement barcode 1 before writing. rev_comp_bc2: reverse complement barcode 2 before writing. attempt_read_orientation: If True, will attempt to orient the reads according to the forward primers in the mapping file. If primer is detected in current orientation, leave the read as is, but if reverse complement is detected (or ReversePrimer is detected in the current orientation) the read will either be written to the forward (read 1) or reverse (read 2) reads for the case of paired files, or the read will be reverse complemented in the case of stitched reads. forward_primers: list of regular expression generators, forward primers reverse_primers: list of regular expression generators, reverse primers output_bc_not_oriented: Barcode output from reads that are not oriented fastq1_out_not_oriented: Open filepath to write reads 1 where primers can't be found when attempt_read_orientation is True. fastq2_out_not_oriented: Open filepath to write reads 2 where primers can't be found when attempt_read_orientation is True. """ header_index = 0 sequence_index = 1 quality_index = 2 found_primer_match = False # Break from orientation search as soon as a match is found if attempt_read_orientation: # First check forward primers for curr_primer in forward_primers: if curr_primer.search(read1_data[sequence_index]): read1 = read1_data read2 = read2_data found_primer_match = True break if curr_primer.search(read2_data[sequence_index]): read1 = read2_data read2 = read1_data found_primer_match = True break # Check reverse primers if forward primers not found if not found_primer_match: for curr_primer in reverse_primers: if curr_primer.search(read1_data[sequence_index]): read1 = read2_data read2 = read1_data found_primer_match = True break if curr_primer.search(read2_data[sequence_index]): read1 = read1_data read2 = read2_data found_primer_match = True break else: read1 = read1_data read2 = read2_data if not found_primer_match and attempt_read_orientation: read1 = read1_data read2 = read2_data output_bc = output_bc_not_oriented output_read1 = fastq1_out_not_oriented output_read2 = fastq2_out_not_oriented else: output_bc = output_bc_fastq output_read1 = output_fastq1 output_read2 = output_fastq2 bc_read1 = read1[sequence_index][0:bc1_len] bc_read2 = read2[sequence_index][0:bc2_len] bc_qual1 = read1[quality_index][0:bc1_len] bc_qual2 = read2[quality_index][0:bc2_len] if rev_comp_bc1: bc_read1 = DNA.rc(bc_read1) bc_qual1 = bc_qual1[::-1] if rev_comp_bc2: bc_read2 = DNA.rc(bc_read2) bc_qual2 = bc_qual2[::-1] bc_lines = format_fastq_record(read1[header_index], bc_read1 + bc_read2, bc_qual1 + bc_qual2) output_bc.write(bc_lines) seq1_lines = format_fastq_record(read1[header_index], read1[sequence_index][bc1_len:], read1[quality_index][bc1_len:]) output_read1.write(seq1_lines) seq2_lines = format_fastq_record(read2[header_index], read2[sequence_index][bc2_len:], read2[quality_index][bc2_len:]) output_read2.write(seq2_lines) return
def process_barcode_in_label( read1_data, read2_data, output_bc_fastq, bc1_len=6, bc2_len=6, rev_comp_bc1=False, rev_comp_bc2=False, char_delineator=":", ): """ Reads data from one or two fastq labels, writes output barcodes file. read1_data: list of header, read, quality scores read2_data: list of header, read, quality scores, False if no read 2. output_bc_fastq: open output fastq filepath bc1_len: length of barcode to remove from beginning of read1 data bc2_len: length of barcode to remove from beginning of read2 data rev_comp_bc1: reverse complement barcode 1 before writing. rev_comp_bc2: reverse complement barcode 2 before writing. char_delineator: Specify character that immediately precedes the barcode for input_type of barcode_in_label. """ header_index = 0 sequence_index = 1 quality_index = 2 # Check for char_delineator in sequence try: bc1_read = read1_data[header_index].split(char_delineator)[-1][0:bc1_len] # If there is an index error, it means the char_delineator wasn't found except IndexError: raise IndexError, ( "Found sequence lacking character delineator. " "Sequence header %s, character delineator %s" % (read1_data[header_index], char_delineator) ) # Create fake quality scores bc1_qual = "F" * len(bc1_read) if rev_comp_bc1: bc1_read = DNA.rc(bc1_read) if read2_data: bc2_read = read2_data[header_index].strip().split(char_delineator)[-1][0:bc2_len] bc2_qual = "F" * len(bc2_read) if rev_comp_bc2: bc2_read = DNA.rc(bc2_read) else: bc2_read = "" bc2_qual = "" if not bc1_read and not bc2_read: raise ValueError, ( "Came up with empty barcode sequence, please check " "character delineator with -s, and fastq label " "%s" % read1_data[header_index] ) bc_lines = format_fastq_record(read1_data[header_index], bc1_read + bc2_read, bc1_qual + bc2_qual) output_bc_fastq.write(bc_lines) return
def get_primers(header, mapping_data): """ Returns lists of forward/reverse primer regular expression generators header: list of strings of header data. mapping_data: list of lists of mapping data Will raise error if either the LinkerPrimerSequence or ReversePrimer fields are not present """ if "LinkerPrimerSequence" in header: primer_ix = header.index("LinkerPrimerSequence") else: raise IndexError, ("Mapping file is missing LinkerPrimerSequence field.") if "ReversePrimer" in header: rev_primer_ix = header.index("ReversePrimer") else: raise IndexError, ("Mapping file is missing ReversePrimer field.") iupac = { "A": "A", "T": "T", "G": "G", "C": "C", "R": "[AG]", "Y": "[CT]", "S": "[GC]", "W": "[AT]", "K": "[GT]", "M": "[AC]", "B": "[CGT]", "D": "[AGT]", "H": "[ACT]", "V": "[ACG]", "N": "[ACGT]", } raw_forward_primers = set([]) raw_forward_rc_primers = set([]) raw_reverse_primers = set([]) raw_reverse_rc_primers = set([]) for line in mapping_data: # Split on commas to handle pool of primers raw_forward_primers.update([upper(primer).strip() for primer in line[primer_ix].split(",")]) raw_forward_rc_primers.update([DNA.rc(primer) for primer in raw_forward_primers]) raw_reverse_primers.update([upper(primer).strip() for primer in line[rev_primer_ix].split(",")]) raw_reverse_rc_primers.update([DNA.rc(primer) for primer in raw_reverse_primers]) if not raw_forward_primers: raise ValueError, ("No forward primers detected in mapping file.") if not raw_reverse_primers: raise ValueError, ("No reverse primers detected in mapping file.") # Finding the forward primers, or rc of reverse primers indicates forward # read. Finding the reverse primer, or rc of the forward primers, indicates # the reverse read, so these sets are merged. raw_forward_primers.update(raw_reverse_rc_primers) raw_reverse_primers.update(raw_forward_rc_primers) forward_primers = [] reverse_primers = [] for curr_primer in raw_forward_primers: forward_primers.append(compile("".join([iupac[symbol] for symbol in curr_primer]))) for curr_primer in raw_reverse_primers: reverse_primers.append(compile("".join([iupac[symbol] for symbol in curr_primer]))) return forward_primers, reverse_primers
def process_barcode_paired_stitched( read_data, output_bc_fastq, output_fastq, bc1_len=6, bc2_len=6, rev_comp_bc1=False, rev_comp_bc2=False, attempt_read_orientation=False, forward_primers=None, reverse_primers=None, output_bc_not_oriented=None, fastq_out_not_oriented=None, switch_bc_order=False, ): """ Processes stitched barcoded reads, writes barcode, parsed stitched read read_data: list of header, read, quality scores output_bc_fastq: open output fastq filepath output_fastq: open output fastq reads filepath bc1_len: length of barcode to remove from beginning of read1 stitched data bc2_len: length of barcode to remove from end of read2 stitched data rev_comp_bc1: reverse complement barcode 1 before writing. rev_comp_bc2: reverse complement barcode 2 before writing. attempt_read_orientation: If True, will attempt to orient the reads according to the forward primers in the mapping file. If primer is detected in current orientation, leave the read as is, but if reverse complement is detected (or ReversePrimer is detected in the current orientation) the read will either be written to the forward (read 1) or reverse (read 2) reads for the case of paired files, or the read will be reverse complemented in the case of stitched reads. forward_primers: list of regular expression generators, forward primers reverse_primers: list of regular expression generators, reverse primers output_bc_not_oriented: Barcode output from reads that are not oriented fastq_out_not_oriented: Open filepath to write reads where primers can't be found when attempt_read_orientation is True. switch_bc_order: Normally, barcode 1 will be written first, followed by barcode 2 in a combined output fastq file. If True, the order will be reversed. Only applies to stitched reads processing, as other barcode orders are dictated by the the parameter chosen for the fastq files. """ header_index = 0 sequence_index = 1 quality_index = 2 read_seq = read_data[sequence_index] read_qual = read_data[quality_index] found_primer_match = False # Break from orientation search as soon as a match is found if attempt_read_orientation: for curr_primer in forward_primers: if curr_primer.search(read_data[sequence_index]): found_primer_match = True break if not found_primer_match: for curr_primer in reverse_primers: if curr_primer.search(read_data[sequence_index]): read_seq = DNA.rc(read_seq) read_qual = read_qual[::-1] found_primer_match = True break if not found_primer_match and attempt_read_orientation: output_bc = output_bc_not_oriented output_read = fastq_out_not_oriented else: output_bc = output_bc_fastq output_read = output_fastq bc_read1 = read_seq[0:bc1_len] bc_read2 = read_seq[-bc2_len:] bc_qual1 = read_qual[0:bc1_len] bc_qual2 = read_qual[-bc2_len:] if rev_comp_bc1: bc_read1 = DNA.rc(bc_read1) bc_qual1 = bc_qual1[::-1] if rev_comp_bc2: bc_read2 = DNA.rc(bc_read2) bc_qual2 = bc_qual2[::-1] if switch_bc_order: bc_read1, bc_read2 = bc_read2, bc_read1 bc_qual1, bc_qual2 = bc_qual2, bc_qual1 bc_lines = format_fastq_record(read_data[header_index], bc_read1 + bc_read2, bc_qual1 + bc_qual2) output_bc.write(bc_lines) seq_lines = format_fastq_record(read_data[header_index], read_seq[bc1_len:-bc2_len], read_qual[bc1_len:-bc2_len]) output_read.write(seq_lines) return
def get_primers(header, mapping_data): """ Returns lists of forward/reverse primer regular expression generators header: list of strings of header data. mapping_data: list of lists of mapping data Will raise error if either the LinkerPrimerSequence or ReversePrimer fields are not present """ if "LinkerPrimerSequence" in header: primer_ix = header.index("LinkerPrimerSequence") else: raise IndexError( ("Mapping file is missing LinkerPrimerSequence field.")) if "ReversePrimer" in header: rev_primer_ix = header.index("ReversePrimer") else: raise IndexError(("Mapping file is missing ReversePrimer field.")) iupac = {'A': 'A', 'T': 'T', 'G': 'G', 'C': 'C', 'R': '[AG]', 'Y': '[CT]', 'S': '[GC]', 'W': '[AT]', 'K': '[GT]', 'M': '[AC]', 'B': '[CGT]', 'D': '[AGT]', 'H': '[ACT]', 'V': '[ACG]', 'N': '[ACGT]'} raw_forward_primers = set([]) raw_forward_rc_primers = set([]) raw_reverse_primers = set([]) raw_reverse_rc_primers = set([]) for line in mapping_data: # Split on commas to handle pool of primers raw_forward_primers.update([upper(primer).strip() for primer in line[primer_ix].split(',')]) raw_forward_rc_primers.update([DNA.rc(primer) for primer in raw_forward_primers]) raw_reverse_primers.update([upper(primer).strip() for primer in line[rev_primer_ix].split(',')]) raw_reverse_rc_primers.update([DNA.rc(primer) for primer in raw_reverse_primers]) if not raw_forward_primers: raise ValueError(("No forward primers detected in mapping file.")) if not raw_reverse_primers: raise ValueError(("No reverse primers detected in mapping file.")) # Finding the forward primers, or rc of reverse primers indicates forward # read. Finding the reverse primer, or rc of the forward primers, indicates # the reverse read, so these sets are merged. raw_forward_primers.update(raw_reverse_rc_primers) raw_reverse_primers.update(raw_forward_rc_primers) forward_primers = [] reverse_primers = [] for curr_primer in raw_forward_primers: forward_primers.append(compile(''.join([iupac[symbol] for symbol in curr_primer]))) for curr_primer in raw_reverse_primers: reverse_primers.append(compile(''.join([iupac[symbol] for symbol in curr_primer]))) return forward_primers, reverse_primers