Example #1
0
def process_uclust_pw_alignment_results(fasta_pairs_lines,uc_lines):
    """ Process results of uclust search and align """
    alignments = get_next_two_fasta_records(fasta_pairs_lines)
    for hit in get_next_record_type(uc_lines,'H'):
        matching_strand = hit[4]
        if matching_strand == '-':
            strand_id = '-'
            target_rev_match = True
        elif matching_strand == '+':
            strand_id = '+'
            target_rev_match = False
        elif matching_strand == '.':
            # protein sequence, so no strand information
            strand_id = ''
            target_rev_match = False
        else:
            raise UclustParseError, "Unknown strand type: %s" % matching_strand
        uc_query_id = hit[8]
        uc_target_id = hit[9]
        percent_id = float(hit[3])
        
        fasta_pair = alignments.next()
        
        fasta_query_id = fasta_pair[0][0]
        aligned_query = fasta_pair[0][1]
        
        if fasta_query_id != uc_query_id:
            raise UclustParseError,\
             "Order of fasta and uc files do not match."+\
             " Got query %s but expected %s." %\
              (fasta_query_id, uc_query_id)
            
        fasta_target_id = fasta_pair[1][0]
        aligned_target = fasta_pair[1][1]
            
        if fasta_target_id != uc_target_id + strand_id:
            raise UclustParseError, \
             "Order of fasta and uc files do not match."+\
             " Got target %s but expected %s." %\
              (fasta_target_id, uc_target_id + strand_id)
            
        if target_rev_match:
            query_id = uc_query_id + ' RC'
            aligned_query = DNA.rc(aligned_query)
            target_id = uc_target_id
            aligned_target = DNA.rc(aligned_target)
        else:
            query_id = uc_query_id
            aligned_query = aligned_query
            target_id = uc_target_id
            aligned_target = aligned_target
            
        yield (query_id, target_id, aligned_query, aligned_target, percent_id)
Example #2
0
def process_uclust_pw_alignment_results(fasta_pairs_lines, uc_lines):
    """ Process results of uclust search and align """
    alignments = get_next_two_fasta_records(fasta_pairs_lines)
    for hit in get_next_record_type(uc_lines, 'H'):
        matching_strand = hit[4]
        if matching_strand == '-':
            strand_id = '-'
            target_rev_match = True
        elif matching_strand == '+':
            strand_id = '+'
            target_rev_match = False
        elif matching_strand == '.':
            # protein sequence, so no strand information
            strand_id = ''
            target_rev_match = False
        else:
            raise UclustParseError, "Unknown strand type: %s" % matching_strand
        uc_query_id = hit[8]
        uc_target_id = hit[9]
        percent_id = float(hit[3])

        fasta_pair = alignments.next()

        fasta_query_id = fasta_pair[0][0]
        aligned_query = fasta_pair[0][1]

        if fasta_query_id != uc_query_id:
            raise UclustParseError,\
             "Order of fasta and uc files do not match."+\
             " Got query %s but expected %s." %\
              (fasta_query_id, uc_query_id)

        fasta_target_id = fasta_pair[1][0]
        aligned_target = fasta_pair[1][1]

        if fasta_target_id != uc_target_id + strand_id:
            raise UclustParseError, \
             "Order of fasta and uc files do not match."+\
             " Got target %s but expected %s." %\
              (fasta_target_id, uc_target_id + strand_id)

        if target_rev_match:
            query_id = uc_query_id + ' RC'
            aligned_query = DNA.rc(aligned_query)
            target_id = uc_target_id
            aligned_target = DNA.rc(aligned_target)
        else:
            query_id = uc_query_id
            aligned_query = aligned_query
            target_id = uc_target_id
            aligned_target = aligned_target

        yield (query_id, target_id, aligned_query, aligned_target, percent_id)
def parse_illumina_single_end_read_file(read_file,barcode_length,\
    max_bad_run_length,quality_threshold,min_per_read_length,
    rev_comp,rev_comp_barcode,barcode_in_seq,barcode_max_N=0,seq_max_N=0):
    """Parses Illumina single-end read file
    """
    
    for read_line in read_file:
        read = parse_illumina_line(read_line,barcode_length,
                                   rev_comp_barcode,barcode_in_seq)
        
        read_desc = illumina_read_description_from_read_data(read)
        
        read_barcode = read['Barcode']
        
        if read_barcode.count('N') > barcode_max_N:
           continue
        
        seq, qual = read_qual_score_filter(\
         read['Sequence'], read['Quality Score'],\
         max_bad_run_length, quality_threshold)
         
        if (len(seq) < min_per_read_length) or (seq.count('N') > seq_max_N):
            continue
            
        if rev_comp:
            seq = DNA.rc(seq)
            qual = qual[::-1]
        
        yield read_desc, read_barcode, seq, qual
Example #4
0
def process_barcode_single_end_data(read1_data,
                                    output_bc_fastq,
                                    output_fastq1,
                                    bc1_len=6,
                                    rev_comp_bc1=False):
    """ Processes, writes single-end barcode data, parsed sequence
    
    read1_data: list of header, read, quality scores
    output_bc_fastq: open output fastq filepath
    output_fastq1: open output fastq reads filepath
    bc1_len: length of barcode to remove from beginning of data
    rev_comp_bc1: reverse complement barcode before writing.
    """

    header_index = 0
    sequence_index = 1
    quality_index = 2

    bc_read = read1_data[sequence_index][:bc1_len]
    bc_qual = read1_data[quality_index][:bc1_len]
    if rev_comp_bc1:
        bc_read = DNA.rc(bc_read)
        bc_qual = bc_qual[::-1]

    bc_lines = format_fastq_record(read1_data[header_index], bc_read, bc_qual)
    output_bc_fastq.write(bc_lines)
    seq_lines = format_fastq_record(read1_data[header_index],
                                    read1_data[sequence_index][bc1_len:],
                                    read1_data[quality_index][bc1_len:])
    output_fastq1.write(seq_lines)

    return
Example #5
0
def parse_illumina_line(l,barcode_length,rev_comp_barcode,
                        barcode_in_sequence=False):
    """Parses a single line of Illumina data
    """
    fields = l.strip().split(':')
    
    y_position_subfields = fields[4].split('#')
    y_position = int(y_position_subfields[0])
    sequence = fields[5]
    qual_string = fields[6]
    
    if barcode_in_sequence:
        barcode = sequence[:barcode_length]
        sequence = sequence[barcode_length:]
        qual_string = qual_string[barcode_length:]
    else:
        barcode = y_position_subfields[1][:barcode_length]
    
    if rev_comp_barcode:
        barcode = DNA.rc(barcode)
    
    result = {\
     'Full description':':'.join(fields[:5]),\
     'Machine Name':fields[0],\
     'Channel Number':int(fields[1]),\
     'Tile Number':int(fields[2]),\
     'X Position':int(fields[3]),\
     'Y Position':y_position,\
     'Barcode':barcode,\
     'Full Y Position Field':fields[4],\
     'Sequence':sequence,\
     'Quality Score':qual_string}
     
    return result
Example #6
0
def parse_illumina_line(l, barcode_length, rev_comp_barcode,
                        barcode_in_sequence=False):
    """Parses a single line of Illumina data
    """
    fields = l.strip().split(':')

    y_position_subfields = fields[4].split('#')
    y_position = int(y_position_subfields[0])
    sequence = fields[5]
    qual_string = fields[6]

    if barcode_in_sequence:
        barcode = sequence[:barcode_length]
        sequence = sequence[barcode_length:]
        qual_string = qual_string[barcode_length:]
    else:
        barcode = y_position_subfields[1][:barcode_length]

    if rev_comp_barcode:
        barcode = DNA.rc(barcode)

    result = {
        'Full description': ':'.join(fields[:5]),
        'Machine Name': fields[0],
        'Channel Number': int(fields[1]),
        'Tile Number': int(fields[2]),
        'X Position': int(fields[3]),
        'Y Position': y_position,
        'Barcode': barcode,
        'Full Y Position Field': fields[4],
        'Sequence': sequence,
        'Quality Score': qual_string}

    return result
Example #7
0
def process_barcode_single_end_data(read1_data, output_bc_fastq, output_fastq1, bc1_len=6, rev_comp_bc1=False):
    """ Processes, writes single-end barcode data, parsed sequence
    
    read1_data: list of header, read, quality scores
    output_bc_fastq: open output fastq filepath
    output_fastq1: open output fastq reads filepath
    bc1_len: length of barcode to remove from beginning of data
    rev_comp_bc1: reverse complement barcode before writing.
    """

    header_index = 0
    sequence_index = 1
    quality_index = 2

    bc_read = read1_data[sequence_index][:bc1_len]
    bc_qual = read1_data[quality_index][:bc1_len]
    if rev_comp_bc1:
        bc_read = DNA.rc(bc_read)
        bc_qual = bc_qual[::-1]

    bc_lines = format_fastq_record(read1_data[header_index], bc_read, bc_qual)
    output_bc_fastq.write(bc_lines)
    seq_lines = format_fastq_record(
        read1_data[header_index], read1_data[sequence_index][bc1_len:], read1_data[quality_index][bc1_len:]
    )
    output_fastq1.write(seq_lines)

    return
Example #8
0
def rc_fasta_lines(fasta_lines, seq_desc_mapper=append_rc):
    """
    """
    for seq_id, seq in parse_fasta(fasta_lines):
        seq_id = seq_desc_mapper(seq_id)
        seq = DNA.rc(seq.upper())
        yield seq_id, seq
    return
Example #9
0
def get_reverse_primers(id_map):
    """ Return a dictionary with barcodes and rev-complement of rev primers """
    
    rev_primers = {}
    for n in id_map.items():
        # Generate a dictionary with Barcode:reverse primer
        # Convert to reverse complement of the primer so its in the 
        # proper orientation with the input fasta sequences
        rev_primers[n[1]['BarcodeSequence']]=DNA.rc(n[1]['ReversePrimer'])
        
    return rev_primers
def parse_illumina_paired_end_read_files(read1_file,read2_file,barcode_length,\
    max_bad_run_length,quality_threshold,min_per_read_length,rev_comp_barcode,\
    barcode_max_N=0,seq_max_N=0):
    """Parses Illumina paired-end read file pair
    """
    
    for read1_line, read2_line in izip(read1_file,read2_file):
        read1 = parse_illumina_line(read1_line,barcode_length,rev_comp_barcode)
        read2 = parse_illumina_line(read2_line,barcode_length,rev_comp_barcode)
        
        read1_desc = illumina_read_description_from_read_data(read1)
        read2_desc = illumina_read_description_from_read_data(read2)
        
        read1_barcode = read1['Barcode']
        read2_barcode = read2['Barcode']
        if (read1_barcode.count('N') > barcode_max_N) or \
           (read2_barcode.count('N') > barcode_max_N):
           continue
        
        if read1_desc != read2_desc:
            raise IlluminaParseError, \
              "Error in sequence files, descriptions of"+\
              " corresponding lines are not compatible: %s != %s" %\
              (read1_desc, read2_desc)
        assert read1_barcode == read2_barcode
        
        seq1, qual1 = read_qual_score_filter(\
         read1['Sequence'], read1['Quality Score'],\
         max_bad_run_length, quality_threshold)
        if (len(seq1) < min_per_read_length):
            continue
            
        seq2, qual2 = read_qual_score_filter(\
         read2['Sequence'], read2['Quality Score'],\
         max_bad_run_length, quality_threshold)
        if (len(seq2) < min_per_read_length):
            continue
            
        seq = seq1 + DNA.rc(seq2)
        # If the total number of Ns is more than the max 
        # allowed ignore this sequence
        if seq.count('N') > seq_max_N:
            continue
        qual = qual1 + qual2[::-1]
        
        yield read1_desc, read1_barcode, seq, qual
Example #11
0
def get_rev_primer_seqs(mapping_fp):
    """ Parses mapping file to get dictionary of SampleID:Rev primer
    mapping_fp:  mapping filepath
    """
    hds, mapping_data, run_description, errors, warnings = \
        process_id_map(mapping_fp, has_barcodes=False,
         disable_primer_check=True)
        
    if errors:
        for curr_err in errors:
            if curr_err.startswith("Duplicate SampleID"):
                raise ValueError,('Errors were found with mapping file, '+\
                 'please run check_id_map.py to identify problems.')
         
    # create dict of dicts with SampleID:{each header:mapping data}
    
    id_map = {}
    
    for curr_data in mapping_data:
        id_map[curr_data[0]] = {}
        
    
    for header in range(len(hds)):
        for curr_data in mapping_data:
            id_map[curr_data[0]][hds[header]] = curr_data[header]
    
    reverse_primers = {}
    
    for curr_id in id_map.keys():
        try:
            reverse_primers[curr_id] =\
             [DNA.rc(curr_rev_primer) for curr_rev_primer in\
             id_map[curr_id]['ReversePrimer'].split(',')]
        except KeyError:
            raise KeyError,("Reverse primer not found in mapping file, "+\
             "please include a 'ReversePrimer' column.")

             
    # Check for valid reverse primers
    # Will have been detected as warnings from mapping file
    for curr_err in errors:
        if curr_err.startswith("Invalid DNA sequence detected"):
            raise ValueError,("Problems found with reverse primers, please "+\
             "check mapping file with check_id_map.py")
    
    return reverse_primers
Example #12
0
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)
    
    sequence_read_fps = opts.sequence_read_fps
    barcode_read_fps = opts.barcode_read_fps
    sample_id = opts.sample_id
    mapping_fps = opts.mapping_fps
    phred_quality_threshold = opts.phred_quality_threshold
    retain_unassigned_reads = opts.retain_unassigned_reads
    min_per_read_length_fraction = opts.min_per_read_length_fraction
    max_bad_run_length = opts.max_bad_run_length
    rev_comp = opts.rev_comp
    rev_comp_barcode = opts.rev_comp_barcode
    rev_comp_mapping_barcodes = opts.rev_comp_mapping_barcodes
    seq_max_N = opts.sequence_max_n
    start_seq_id = opts.start_seq_id
    # NEED TO FIX THIS FUNCTIONALITY - CURRENTLY READING THE WRONG FIELD
    filter_bad_illumina_qual_digit = False #opts.filter_bad_illumina_qual_digit
    store_qual_scores = opts.store_qual_scores
    store_demultiplexed_fastq = opts.store_demultiplexed_fastq
    barcode_type = opts.barcode_type
    max_barcode_errors = opts.max_barcode_errors
    
    # if this is not a demultiplexed run, 
    if barcode_type == 'not-barcoded':
        if sample_id == None:
            option_parser.error("If not providing barcode reads (because "
            "your data is not multiplexed), must provide a --sample_id.")
        barcode_read_fps = [None] * len(sequence_read_fps)
    elif barcode_read_fps == None:
        option_parser.error("Must provide --barcode_fps if "
                            "--barcode_type is not 'not-barcoded'")
    else:
        pass
    
    phred_offset = opts.phred_offset
    if phred_offset != None:
        try:
            phred_to_ascii_f = phred_to_ascii_fs[phred_offset]
        except KeyError:
            # shouldn't be able to get here, but we'll stay on the
            # safe side
            opption_parser.error(\
             "Only valid phred offsets are: %s" %\
             ' '.join(phred_to_ascii_fs.keys()))
    else:
        # let split_libraries_fastq.process_fastq_single_end_read_file 
        # figure it out...
        phred_to_ascii_f = None
    
    if opts.last_bad_quality_char != None:
        option_parser.error('--last_bad_quality_char is no longer supported. '
         'Use -q instead (see option help text by passing -h)')
    
    if not (0 <= min_per_read_length_fraction <= 1):
        option_parser.error('--min_per_read_length_fraction must be between '
         '0 and 1 (inclusive). You passed %1.5f' % min_per_read_length_fraction)
    
    try:
        barcode_correction_fn = BARCODE_DECODER_LOOKUP[barcode_type]
    except KeyError:
        barcode_correction_fn = None
    
    if len(mapping_fps) == 1 and len(sequence_read_fps) > 1:
        mapping_fps = mapping_fps * len(sequence_read_fps)
    
    if len(set([len(sequence_read_fps), len(barcode_read_fps), len(mapping_fps)])) > 1:
        option_parser.error("Same number of sequence, barcode and mapping files must be provided.")
    
    output_dir = opts.output_dir
    create_dir(output_dir)
    
    output_fp_temp = '%s/seqs.fna.incomplete' % output_dir
    output_fp = '%s/seqs.fna' % output_dir
    output_f = open(output_fp_temp,'w')
    qual_fp_temp = '%s/qual.fna.incomplete' % output_dir
    qual_fp = '%s/seqs.qual' % output_dir
    output_fastq_fp_temp = '%s/seqs.fastq.incomplete' % output_dir
    output_fastq_fp = '%s/seqs.fastq' % output_dir
    
    if store_qual_scores:
        qual_f = open(qual_fp_temp,'w')
        # define a qual writer whether we're storing
        # qual strings or not so we don't have to check
        # every time through the for loop below
        def qual_writer(h,q):
            qual_f.write('>%s\n%s\n' % (h,q))
    else:
        def qual_writer(h,q):
            pass
    
    if store_demultiplexed_fastq:
        output_fastq_f = open(output_fastq_fp_temp,'w')
        # define a fastq writer whether we're storing
        # qual strings or not so we don't have to check
        # every time through the for loop below
        def fastq_writer(h,s,q):
            output_fastq_f.write('@%s\n%s\n+\n%s\n' % (h,s,q))
    else:
        def fastq_writer(h,s,q):
            pass
    
    log_fp = '%s/split_library_log.txt' % output_dir
    log_f = open(log_fp,'w')
    histogram_fp = '%s/histograms.txt' % output_dir
    histogram_f = open(histogram_fp,'w')
    
    for sequence_read_fp, barcode_read_fp, mapping_fp in\
      zip(sequence_read_fps, barcode_read_fps, mapping_fps):
        mapping_f = open(mapping_fp, 'U')
        h, i, barcode_to_sample_id, warnings, errors, p, a =\
           check_map(mapping_f, 
                     disable_primer_check=True, 
                     has_barcodes=barcode_read_fp != None)
        
        if rev_comp_mapping_barcodes:
            barcode_to_sample_id = \
             dict([(DNA.rc(k),v) for k,v in barcode_to_sample_id.items()])
        
        if barcode_type == 'golay_12':
            invalid_golay_barcodes = \
             get_invalid_golay_barcodes(barcode_to_sample_id.keys())
            if len(invalid_golay_barcodes) > 0:
                option_parser.error("Some or all barcodes are not valid golay codes. "+\
                "Do they need to be reverse complimented? If these are not "+\
                "golay barcodes pass --barcode_type 12 to disable barcode "+\
                "error correction, or pass --barcode_type # if the barcodes "+\
                "are not 12 base pairs, where # is the size of the barcodes. "+
                "  Invalid codes:\n\t%s" % \
                ' '.join(invalid_golay_barcodes))
        
        log_f.write("Input file paths\n")
        log_f.write('Mapping filepath: %s (md5: %s)\n' %\
          (mapping_fp,safe_md5(open(mapping_fp)).hexdigest()))
        log_f.write('Sequence read filepath: %s (md5: %s)\n' %\
          (sequence_read_fp,str(safe_md5(open(sequence_read_fp)).hexdigest())))
       
        if sequence_read_fp.endswith('.gz'):
            sequence_read_f = gzip_open(sequence_read_fp)
        else:
            sequence_read_f = open(sequence_read_fp,'U')
        

        seq_id = start_seq_id
        
        if barcode_read_fp != None:
            
            log_f.write('Barcode read filepath: %s (md5: %s)\n\n' %\
              (barcode_read_fp,safe_md5(open(barcode_read_fp)).hexdigest()))
              
            if barcode_read_fp.endswith('.gz'):
                barcode_read_f = gzip_open(barcode_read_fp)
            else:
                barcode_read_f = open(barcode_read_fp,'U')
            seq_generator = process_fastq_single_end_read_file(
               sequence_read_f,
               barcode_read_f,
               barcode_to_sample_id,
               store_unassigned=retain_unassigned_reads,
               max_bad_run_length=max_bad_run_length,
               phred_quality_threshold=phred_quality_threshold,
               min_per_read_length_fraction=min_per_read_length_fraction,
               rev_comp=rev_comp,
               rev_comp_barcode=rev_comp_barcode,
               seq_max_N=seq_max_N,
               start_seq_id=start_seq_id,
               filter_bad_illumina_qual_digit=\
                filter_bad_illumina_qual_digit,
               log_f=log_f,
               histogram_f=histogram_f,
               barcode_correction_fn=barcode_correction_fn,
               max_barcode_errors=max_barcode_errors,
               phred_to_ascii_f=phred_to_ascii_f)
        else:
            seq_generator = process_fastq_single_end_read_file_no_barcode(
               sequence_read_f,
               sample_id,
               store_unassigned=retain_unassigned_reads,
               max_bad_run_length=max_bad_run_length,
               phred_quality_threshold=phred_quality_threshold,
               min_per_read_length_fraction=min_per_read_length_fraction,
               rev_comp=rev_comp,
               seq_max_N=seq_max_N,
               start_seq_id=start_seq_id,
               filter_bad_illumina_qual_digit=\
                filter_bad_illumina_qual_digit,
               log_f=log_f,
               histogram_f=histogram_f,
               phred_to_ascii_f=phred_to_ascii_f)
        
        for fasta_header, sequence, quality, seq_id in seq_generator:
            output_f.write('>%s\n%s\n' % (fasta_header,sequence))
            qual_writer(fasta_header,quality)
            fastq_writer(fasta_header,sequence,quality)
            
        start_seq_id = seq_id + 1
        log_f.write('\n---\n\n')
        
    output_f.close()
    rename(output_fp_temp,output_fp)
    
    # process the optional output files, as necessary
    if store_qual_scores:
        qual_f.close()
        rename(qual_fp_temp,qual_fp)
    
    if store_demultiplexed_fastq:
        output_fastq_f.close()
        rename(output_fastq_fp_temp,output_fastq_fp)
Example #13
0
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    sequence_read_fps = opts.sequence_read_fps
    barcode_read_fps = opts.barcode_read_fps
    sample_ids = None
    if opts.sample_ids is not None:
        sample_ids = opts.sample_ids.split(',')
    mapping_fps = opts.mapping_fps
    phred_quality_threshold = opts.phred_quality_threshold
    retain_unassigned_reads = opts.retain_unassigned_reads
    min_per_read_length_fraction = opts.min_per_read_length_fraction
    max_bad_run_length = opts.max_bad_run_length
    rev_comp = opts.rev_comp
    rev_comp_barcode = opts.rev_comp_barcode
    rev_comp_mapping_barcodes = opts.rev_comp_mapping_barcodes
    seq_max_N = opts.sequence_max_n
    start_seq_id = opts.start_seq_id
    # NEED TO FIX THIS FUNCTIONALITY - CURRENTLY READING THE WRONG FIELD
    # opts.filter_bad_illumina_qual_digit
    filter_bad_illumina_qual_digit = False
    store_qual_scores = opts.store_qual_scores
    store_demultiplexed_fastq = opts.store_demultiplexed_fastq
    barcode_type = opts.barcode_type
    max_barcode_errors = opts.max_barcode_errors

    # if this is not a demultiplexed run,
    if barcode_type == 'not-barcoded':
        if sample_ids is None:
            option_parser.error(
                "If not providing barcode reads (because "
                "your data is not multiplexed), must provide --sample_ids.")
        if len(sample_ids) != len(sequence_read_fps):
            option_parser.error(
                "If providing --sample_ids (because "
                "your data is not multiplexed), must provide the same number "
                "of sample ids as sequence read filepaths.")
        barcode_read_fps = [None] * len(sequence_read_fps)
        mapping_fps = [None] * len(sequence_read_fps)
    elif barcode_read_fps is None:
        option_parser.error("Must provide --barcode_read_fps if "
                            "--barcode_type is not 'not-barcoded'")
    elif mapping_fps is None:
        option_parser.error("Must provide --mapping_fps if "
                            "--barcode_type is not 'not-barcoded'")

    phred_offset = opts.phred_offset
    if phred_offset is not None:
        try:
            phred_to_ascii_f = phred_to_ascii_fs[phred_offset]
        except KeyError:
            # shouldn't be able to get here, but we'll stay on the
            # safe side
            option_parser.error("Only valid phred offsets are: %s" %
                                ' '.join(phred_to_ascii_fs.keys()))
    else:
        # let split_libraries_fastq.process_fastq_single_end_read_file
        # figure it out...
        phred_to_ascii_f = None

    if opts.last_bad_quality_char is not None:
        option_parser.error(
            '--last_bad_quality_char is no longer supported. '
            'Use -q instead (see option help text by passing -h)')

    if not (0 <= min_per_read_length_fraction <= 1):
        option_parser.error('--min_per_read_length_fraction must be between '
                            '0 and 1 (inclusive). You passed %1.5f' %
                            min_per_read_length_fraction)

    barcode_correction_fn = BARCODE_DECODER_LOOKUP.get(barcode_type, None)

    if len(mapping_fps) == 1 and len(sequence_read_fps) > 1:
        mapping_fps = mapping_fps * len(sequence_read_fps)

    if len(
            set([
                len(sequence_read_fps),
                len(barcode_read_fps),
                len(mapping_fps)
            ])) > 1:
        option_parser.error("Same number of sequence, barcode, and mapping "
                            "files must be provided.")

    output_dir = opts.output_dir
    create_dir(output_dir)

    output_fp_temp = '%s/seqs.fna.incomplete' % output_dir
    output_fp = '%s/seqs.fna' % output_dir
    output_f = open(output_fp_temp, 'w')
    qual_fp_temp = '%s/qual.fna.incomplete' % output_dir
    qual_fp = '%s/seqs.qual' % output_dir
    output_fastq_fp_temp = '%s/seqs.fastq.incomplete' % output_dir
    output_fastq_fp = '%s/seqs.fastq' % output_dir

    if store_qual_scores:
        qual_f = open(qual_fp_temp, 'w')

        # define a qual writer whether we're storing
        # qual strings or not so we don't have to check
        # every time through the for loop below

        def qual_writer(h, q):
            qual_f.write('>%s\n%s\n' % (h, q))
    else:

        def qual_writer(h, q):
            pass

    if store_demultiplexed_fastq:
        output_fastq_f = open(output_fastq_fp_temp, 'w')

        # define a fastq writer whether we're storing
        # qual strings or not so we don't have to check
        # every time through the for loop below

        def fastq_writer(h, s, q):
            output_fastq_f.write('@%s\n%s\n+\n%s\n' % (h, s, q))
    else:

        def fastq_writer(h, s, q):
            pass

    log_fp = '%s/split_library_log.txt' % output_dir
    log_f = open(log_fp, 'w')
    histogram_fp = '%s/histograms.txt' % output_dir
    histogram_f = open(histogram_fp, 'w')

    for i in range(len(sequence_read_fps)):
        sequence_read_fp = sequence_read_fps[i]
        barcode_read_fp = barcode_read_fps[i]
        mapping_fp = mapping_fps[i]
        if mapping_fp is not None:
            mapping_f = open(mapping_fp, 'U')
            _, _, barcode_to_sample_id, _, _, _, _ = check_map(
                mapping_f,
                disable_primer_check=True,
                has_barcodes=barcode_read_fp is not None)
        else:
            mapping_f = None
            barcode_to_sample_id = {}

        if rev_comp_mapping_barcodes:
            barcode_to_sample_id = {
                DNA.rc(k): v
                for k, v in barcode_to_sample_id.iteritems()
            }

        if barcode_type == 'golay_12':
            invalid_golay_barcodes = get_invalid_golay_barcodes(
                barcode_to_sample_id.keys())
            if len(invalid_golay_barcodes) > 0:
                option_parser.error(
                    "Some or all barcodes are not valid golay "
                    "codes. Do they need to be reverse complemented? If these "
                    "are not golay barcodes pass --barcode_type 12 to disable "
                    "barcode error correction, or pass --barcode_type # if "
                    "the barcodes are not 12 base pairs, where # is the size "
                    "of the barcodes. Invalid codes:\n\t%s" %
                    ' '.join(invalid_golay_barcodes))

        log_f.write("Input file paths\n")
        if mapping_fp is not None:
            log_f.write('Mapping filepath: %s (md5: %s)\n' %
                        (mapping_fp, safe_md5(open(mapping_fp)).hexdigest()))
        log_f.write('Sequence read filepath: %s (md5: %s)\n' %
                    (sequence_read_fp,
                     str(safe_md5(open(sequence_read_fp)).hexdigest())))

        if sequence_read_fp.endswith('.gz'):
            sequence_read_f = gzip_open(sequence_read_fp)
        else:
            sequence_read_f = open(sequence_read_fp, 'U')

        seq_id = start_seq_id

        if barcode_read_fp is not None:
            log_f.write(
                'Barcode read filepath: %s (md5: %s)\n\n' %
                (barcode_read_fp, safe_md5(open(barcode_read_fp)).hexdigest()))

            if barcode_read_fp.endswith('.gz'):
                barcode_read_f = gzip_open(barcode_read_fp)
            else:
                barcode_read_f = open(barcode_read_fp, 'U')

            seq_generator = process_fastq_single_end_read_file(
                sequence_read_f,
                barcode_read_f,
                barcode_to_sample_id,
                store_unassigned=retain_unassigned_reads,
                max_bad_run_length=max_bad_run_length,
                phred_quality_threshold=phred_quality_threshold,
                min_per_read_length_fraction=min_per_read_length_fraction,
                rev_comp=rev_comp,
                rev_comp_barcode=rev_comp_barcode,
                seq_max_N=seq_max_N,
                start_seq_id=start_seq_id,
                filter_bad_illumina_qual_digit=filter_bad_illumina_qual_digit,
                log_f=log_f,
                histogram_f=histogram_f,
                barcode_correction_fn=barcode_correction_fn,
                max_barcode_errors=max_barcode_errors,
                phred_to_ascii_f=phred_to_ascii_f)
        else:
            seq_generator = process_fastq_single_end_read_file_no_barcode(
                sequence_read_f,
                sample_ids[i],
                store_unassigned=retain_unassigned_reads,
                max_bad_run_length=max_bad_run_length,
                phred_quality_threshold=phred_quality_threshold,
                min_per_read_length_fraction=min_per_read_length_fraction,
                rev_comp=rev_comp,
                seq_max_N=seq_max_N,
                start_seq_id=start_seq_id,
                filter_bad_illumina_qual_digit=filter_bad_illumina_qual_digit,
                log_f=log_f,
                histogram_f=histogram_f,
                phred_to_ascii_f=phred_to_ascii_f)

        for fasta_header, sequence, quality, seq_id in seq_generator:
            output_f.write('>%s\n%s\n' % (fasta_header, sequence))
            qual_writer(fasta_header, quality)
            fastq_writer(fasta_header, sequence, quality)

        start_seq_id = seq_id + 1
        log_f.write('\n---\n\n')

    output_f.close()
    rename(output_fp_temp, output_fp)

    # process the optional output files, as necessary
    if store_qual_scores:
        qual_f.close()
        rename(qual_fp_temp, qual_fp)

    if store_demultiplexed_fastq:
        output_fastq_f.close()
        rename(output_fastq_fp_temp, output_fastq_fp)
def process_fastq_single_end_read_file(fastq_read_f,
                                       fastq_barcode_f,
                                       barcode_to_sample_id,
                                       store_unassigned=False,
                                       max_bad_run_length=0,
                                       phred_quality_threshold=2,
                                       min_per_read_length_fraction=0.75,
                                       rev_comp=False,
                                       rev_comp_barcode=False,
                                       seq_max_N=0,
                                       start_seq_id=0,
                                       filter_bad_illumina_qual_digit=False,
                                       log_f=None,
                                       histogram_f=None,
                                       barcode_correction_fn=None,
                                       max_barcode_errors=1.5,
                                       strict_header_match=True,
                                       phred_to_ascii_f=None):
    """parses fastq single-end read file
    """
    header_index = 0
    sequence_index = 1
    quality_index = 2

    seq_id = start_seq_id
    # grab the first lines and then seek back to the beginning of the file
    try:
        fastq_read_f_line1 = fastq_read_f.readline()
        fastq_read_f_line2 = fastq_read_f.readline()
        fastq_read_f.seek(0)
    except AttributeError:
        fastq_read_f_line1 = fastq_read_f[0]
        fastq_read_f_line2 = fastq_read_f[1]

    # determine the version of casava that was used to generate the fastq
    # to determine how to compare header lines and decode ascii phred scores
    post_casava_v180 = is_casava_v180_or_later(fastq_read_f_line1)
    if post_casava_v180:
        check_header_match_f = check_header_match_180_or_later
        if phred_to_ascii_f == None:
            phred_to_ascii_f = phred_to_ascii33
    else:
        check_header_match_f = check_header_match_pre180
        if phred_to_ascii_f == None:
            phred_to_ascii_f = phred_to_ascii64

    # determine the last unacceptable quality character
    if phred_quality_threshold != None:
        last_bad_quality_char = phred_to_ascii_f(phred_quality_threshold)
    else:
        # disable quality filter
        last_bad_quality_char = ''

    # compute the barcode length, if they are all the same.
    # this is useful for selecting a subset of the barcode read
    # if it's too long (e.g., for technical reasons on the sequencer)
    barcode_lengths = set(
        [len(bc) for bc, sid in barcode_to_sample_id.items()])
    if len(barcode_lengths) == 1:
        barcode_length = barcode_lengths.pop()
    else:
        barcode_length = None

    # compute the minimum read length as a fraction of the length of the input read
    min_per_read_length = min_per_read_length_fraction * len(
        fastq_read_f_line2)

    # prep data for logging
    input_sequence_count = 0
    count_barcode_not_in_map = 0
    count_too_short = 0
    count_too_many_N = 0
    count_bad_illumina_qual_digit = 0
    count_barcode_errors_exceed_max = 0
    sequence_lengths = []
    seqs_per_sample_counts = {}
    for bc_data, read_data in izip(
            MinimalFastqParser(fastq_barcode_f, strict=False),
            MinimalFastqParser(fastq_read_f, strict=False)):
        input_sequence_count += 1
        # Confirm match between barcode and read headers
        if strict_header_match and \
           (not check_header_match_f(bc_data[header_index],read_data[header_index])):
            raise FastqParseError,\
             ("Headers of barcode and read do not match. Can't continue. "
              "Confirm that the barcode fastq and read fastq that you are "
              "passing match one another.")
        else:
            header = read_data[header_index]

        # Grab the barcode sequence
        if barcode_length:
            # because thirteen cycles are sometimes used for
            # techical reasons, this step looks only at the
            # first tweleve bases. note that the barcode is
            # rev-comp'ed after this step if requested since
            # the thirteen base is a technical artefact, not
            # barcode sequence.
            barcode = bc_data[sequence_index][:barcode_length]
        else:
            barcode = bc_data[sequence_index]
        if rev_comp_barcode:
            barcode = DNA.rc(barcode)
        # Grab the read sequence
        sequence = read_data[1]
        # Grab the read quality
        quality = read_data[2]

        # correct the barcode (if applicable) and map to sample id
        num_barcode_errors, corrected_barcode, correction_attempted, sample_id = \
         correct_barcode(barcode,barcode_to_sample_id,barcode_correction_fn)
        # skip samples with too many errors
        if (num_barcode_errors > max_barcode_errors):
            count_barcode_errors_exceed_max += 1
            continue

        # skip unassignable samples unless otherwise requested
        if sample_id == None:
            if not store_unassigned:
                count_barcode_not_in_map += 1
                continue
            else:
                sample_id = 'Unassigned'

        quality_filter_result, sequence, quality =\
          quality_filter_sequence(header,
                                  sequence,
                                  quality,
                                  max_bad_run_length,
                                  last_bad_quality_char,
                                  min_per_read_length,
                                  seq_max_N,
                                  filter_bad_illumina_qual_digit)

        # process quality result
        if quality_filter_result != 0:
            # if the quality filter didn't pass record why and
            # move on to the next record
            if quality_filter_result == 1:
                count_too_short += 1
            elif quality_filter_result == 2:
                count_too_many_N += 1
            elif quality_filter_result == 3:
                count_bad_illumina_qual_digit += 1
            else:
                raise ValueError,\
                 "Unknown quality filter result: %d" % quality_filter_result
            continue

        sequence_lengths.append(len(sequence))

        try:
            seqs_per_sample_counts[sample_id] += 1
        except KeyError:
            seqs_per_sample_counts[sample_id] = 1

        if rev_comp:
            sequence = DNA.rc(sequence)
            quality = quality[::-1]

        fasta_header = '%s_%s %s orig_bc=%s new_bc=%s bc_diffs=%d' %\
          (sample_id,seq_id,header,barcode,corrected_barcode,num_barcode_errors)
        yield fasta_header, sequence, quality, seq_id
        seq_id += 1

    # Add sample IDs with zero counts to dictionary for logging
    for curr_sample_id in barcode_to_sample_id.values():
        if curr_sample_id not in seqs_per_sample_counts.keys():
            seqs_per_sample_counts[curr_sample_id] = 0

    if log_f != None:
        log_str = format_split_libraries_fastq_log(
            count_barcode_not_in_map, count_too_short, count_too_many_N,
            count_bad_illumina_qual_digit, count_barcode_errors_exceed_max,
            input_sequence_count, sequence_lengths, seqs_per_sample_counts)
        log_f.write(log_str)

    if len(sequence_lengths) and histogram_f != None:
        counts, bin_edges = make_histograms(sequence_lengths)
        histogram_str = format_histogram_one_count(counts, bin_edges)
        histogram_f.write(histogram_str)
        histogram_f.write('\n--\n\n')
def process_fastq_single_end_read_file(fastq_read_f,
                                       fastq_barcode_f,
                                       barcode_to_sample_id,
                                       store_unassigned=False,
                                       max_bad_run_length=0,
                                       last_bad_quality_char='B',
                                       min_per_read_length=75,
                                       rev_comp=False,
                                       rev_comp_barcode=False,
                                       seq_max_N=0,
                                       start_seq_id=0,
                                       filter_bad_illumina_qual_digit=True,
                                       log_f=None,
                                       histogram_f=None,
                                       barcode_correction_fn=None,
                                       max_barcode_errors=1.5):
    """parses fastq single-end read file
    """
    header_index = 0
    sequence_index = 1
    quality_index = 2
    
    seq_id = start_seq_id
    
    # prep data for logging
    input_sequence_count = 0
    count_barcode_not_in_map = 0
    count_too_short = 0
    count_too_many_N = 0
    count_bad_illumina_qual_digit = 0
    count_barcode_errors_exceed_max = 0
    sequence_lengths = []
    seqs_per_sample_counts = {}
    
    for bc_data,read_data in izip(MinimalFastqParser(fastq_barcode_f,strict=False),
                                  MinimalFastqParser(fastq_read_f,strict=False)):
        input_sequence_count += 1
        # Confirm match between barcode and read headers
        if not check_header_match(bc_data[header_index],
                                  read_data[header_index]):
            raise FastqParseError,\
             ("Headers of barcode and read do not match. Can't continue. "
              "Confirm that the barcode fastq and read fastq that you are "
              "passing match one another.")
        else:
            header = read_data[header_index]
        
        # Grab the barcode sequence
        barcode = bc_data[sequence_index]
        if rev_comp_barcode:
            barcode = DNA.rc(barcode)
        # Grab the read sequence
        sequence = read_data[1]
        # Grab the read quality
        quality = read_data[2]
        
        # correct the barcode (if applicable) and map to sample id
        num_barcode_errors, corrected_barcode, correction_attempted, sample_id = \
         correct_barcode(barcode,barcode_to_sample_id,barcode_correction_fn)
        # skip samples with too many errors
        if (num_barcode_errors > max_barcode_errors):
          count_barcode_errors_exceed_max += 1
          continue
        
        # skip unassignable samples unless otherwise requested
        if sample_id == None:
          if not store_unassigned:
              count_barcode_not_in_map += 1
              continue
          else:
              sample_id = 'Unassigned'
        
        quality_filter_result, sequence, quality =\
          quality_filter_sequence(header,
                                  sequence,
                                  quality,
                                  max_bad_run_length,
                                  last_bad_quality_char,
                                  min_per_read_length,
                                  seq_max_N,
                                  filter_bad_illumina_qual_digit)
        
        # process quality result
        if quality_filter_result != 0:
            # if the quality filter didn't pass record why and 
            # move on to the next record
            if quality_filter_result == 1:
                count_too_short += 1
            elif quality_filter_result == 2:
                count_too_many_N += 1
            elif quality_filter_result == 3:
                count_bad_illumina_qual_digit += 1
            else:
                raise ValueError,\
                 "Unknown quality filter result: %d" % quality_filter_result
            continue
        
        sequence_lengths.append(len(sequence))
        
        try:
            seqs_per_sample_counts[sample_id] += 1
        except KeyError:
            seqs_per_sample_counts[sample_id] = 1
        
        if rev_comp:
            sequence = DNA.rc(sequence)
            quality = quality[::-1]
        
        fasta_header = '%s_%s %s orig_bc=%s new_bc=%s bc_diffs=%d' %\
          (sample_id,seq_id,header,barcode,corrected_barcode,num_barcode_errors)
        yield fasta_header, sequence, quality, seq_id
        seq_id += 1

    if log_f != None:
        log_str = format_split_libraries_fastq_log(count_barcode_not_in_map,
                             count_too_short,
                             count_too_many_N,
                             count_bad_illumina_qual_digit,
                             count_barcode_errors_exceed_max,
                             input_sequence_count,
                             sequence_lengths,
                             seqs_per_sample_counts)
        log_f.write(log_str)
    
    if len(sequence_lengths) and histogram_f != None:
        counts, bin_edges = make_histograms(sequence_lengths)
        histogram_str = format_histogram_one_count(counts,bin_edges)
        histogram_f.write(histogram_str)
        histogram_f.write('\n--\n\n')
Example #16
0
def process_fastq_single_end_read_file(fastq_read_f,
                                       fastq_barcode_f,
                                       barcode_to_sample_id,
                                       store_unassigned=False,
                                       max_bad_run_length=0,
                                       phred_quality_threshold=2,
                                       min_per_read_length_fraction=0.75,
                                       rev_comp=False,
                                       rev_comp_barcode=False,
                                       seq_max_N=0,
                                       start_seq_id=0,
                                       filter_bad_illumina_qual_digit=False,
                                       log_f=None,
                                       histogram_f=None,
                                       barcode_correction_fn=None,
                                       max_barcode_errors=1.5,
                                       strict_header_match=True,
                                       phred_to_ascii_f=None):
    """parses fastq single-end read file
    """
    header_index = 0
    sequence_index = 1
    quality_index = 2
    
    seq_id = start_seq_id
    # grab the first lines and then seek back to the beginning of the file
    try:
        fastq_read_f_line1 = fastq_read_f.readline()
        fastq_read_f_line2 = fastq_read_f.readline()
        fastq_read_f.seek(0)
    except AttributeError:
        fastq_read_f_line1 = fastq_read_f[0]
        fastq_read_f_line2 = fastq_read_f[1]
    
    # determine the version of casava that was used to generate the fastq
    # to determine how to compare header lines and decode ascii phred scores
    post_casava_v180 = is_casava_v180_or_later(fastq_read_f_line1)
    if post_casava_v180:
        check_header_match_f = check_header_match_180_or_later
        if phred_to_ascii_f == None:
            phred_to_ascii_f = phred_to_ascii33
    else:
        check_header_match_f = check_header_match_pre180
        if phred_to_ascii_f == None:
            phred_to_ascii_f = phred_to_ascii64
    
    # determine the last unacceptable quality character
    if phred_quality_threshold != None:
        last_bad_quality_char = phred_to_ascii_f(phred_quality_threshold)
    else:
        # disable quality filter
        last_bad_quality_char = ''
    
    # compute the barcode length, if they are all the same. 
    # this is useful for selecting a subset of the barcode read
    # if it's too long (e.g., for technical reasons on the sequencer)
    barcode_lengths = set([len(bc) for bc, sid in barcode_to_sample_id.items()])
    if len(barcode_lengths) == 1:
        barcode_length = barcode_lengths.pop()
    else:
        barcode_length = None
    
    # compute the minimum read length as a fraction of the length of the input read
    min_per_read_length = min_per_read_length_fraction * len(fastq_read_f_line2)
    
    # prep data for logging
    input_sequence_count = 0
    count_barcode_not_in_map = 0
    count_too_short = 0
    count_too_many_N = 0
    count_bad_illumina_qual_digit = 0
    count_barcode_errors_exceed_max = 0
    sequence_lengths = []
    seqs_per_sample_counts = {}
    for bc_data,read_data in izip(MinimalFastqParser(fastq_barcode_f,strict=False),
                                  MinimalFastqParser(fastq_read_f,strict=False)):
        input_sequence_count += 1
        # Confirm match between barcode and read headers
        if strict_header_match and \
           (not check_header_match_f(bc_data[header_index],read_data[header_index])):
            raise FastqParseError,\
             ("Headers of barcode and read do not match. Can't continue. "
              "Confirm that the barcode fastq and read fastq that you are "
              "passing match one another.")
        else:
            header = read_data[header_index]
        
        # Grab the barcode sequence
        if barcode_length:
            # because thirteen cycles are sometimes used for 
            # techical reasons, this step looks only at the 
            # first tweleve bases. note that the barcode is
            # rev-comp'ed after this step if requested since
            # the thirteen base is a technical artefact, not
            # barcode sequence.
            barcode = bc_data[sequence_index][:barcode_length]
        else:
            barcode = bc_data[sequence_index]
        if rev_comp_barcode:
            barcode = DNA.rc(barcode)
        # Grab the read sequence
        sequence = read_data[1]
        # Grab the read quality
        quality = read_data[2]
        
        # correct the barcode (if applicable) and map to sample id
        num_barcode_errors, corrected_barcode, correction_attempted, sample_id = \
         correct_barcode(barcode,barcode_to_sample_id,barcode_correction_fn)
        # skip samples with too many errors
        if (num_barcode_errors > max_barcode_errors):
          count_barcode_errors_exceed_max += 1
          continue
        
        # skip unassignable samples unless otherwise requested
        if sample_id == None:
          if not store_unassigned:
              count_barcode_not_in_map += 1
              continue
          else:
              sample_id = 'Unassigned'
        
        quality_filter_result, sequence, quality =\
          quality_filter_sequence(header,
                                  sequence,
                                  quality,
                                  max_bad_run_length,
                                  last_bad_quality_char,
                                  min_per_read_length,
                                  seq_max_N,
                                  filter_bad_illumina_qual_digit)
        
        # process quality result
        if quality_filter_result != 0:
            # if the quality filter didn't pass record why and 
            # move on to the next record
            if quality_filter_result == 1:
                count_too_short += 1
            elif quality_filter_result == 2:
                count_too_many_N += 1
            elif quality_filter_result == 3:
                count_bad_illumina_qual_digit += 1
            else:
                raise ValueError,\
                 "Unknown quality filter result: %d" % quality_filter_result
            continue
        
        sequence_lengths.append(len(sequence))
        
        try:
            seqs_per_sample_counts[sample_id] += 1
        except KeyError:
            seqs_per_sample_counts[sample_id] = 1
        
        if rev_comp:
            sequence = DNA.rc(sequence)
            quality = quality[::-1]
        
        fasta_header = '%s_%s %s orig_bc=%s new_bc=%s bc_diffs=%d' %\
          (sample_id,seq_id,header,barcode,corrected_barcode,num_barcode_errors)
        yield fasta_header, sequence, quality, seq_id
        seq_id += 1
    
    # Add sample IDs with zero counts to dictionary for logging    
    for curr_sample_id in barcode_to_sample_id.values():
        if curr_sample_id not in seqs_per_sample_counts.keys():
            seqs_per_sample_counts[curr_sample_id] = 0
    

    if log_f != None:
        log_str = format_split_libraries_fastq_log(count_barcode_not_in_map,
                             count_too_short,
                             count_too_many_N,
                             count_bad_illumina_qual_digit,
                             count_barcode_errors_exceed_max,
                             input_sequence_count,
                             sequence_lengths,
                             seqs_per_sample_counts)
        log_f.write(log_str)
    
    if len(sequence_lengths) and histogram_f != None:
        counts, bin_edges = make_histograms(sequence_lengths)
        histogram_str = format_histogram_one_count(counts,bin_edges)
        histogram_f.write(histogram_str)
        histogram_f.write('\n--\n\n')
Example #17
0
def process_barcode_paired_end_data(
    read1_data,
    read2_data,
    output_bc_fastq,
    output_fastq1,
    output_fastq2,
    bc1_len=6,
    bc2_len=6,
    rev_comp_bc1=False,
    rev_comp_bc2=False,
    attempt_read_orientation=False,
    forward_primers=None,
    reverse_primers=None,
    output_bc_not_oriented=None,
    fastq1_out_not_oriented=None,
    fastq2_out_not_oriented=None,
):
    """ Processes, writes paired-end barcode data, parsed sequences
    
    read1_data: list of header, read, quality scores
    read2_data: list of header, read, quality scores
    output_bc_fastq: open output fastq filepath
    output_fastq1: open output fastq reads 1 filepath
    output_fastq2: open output fastq reads 2 filepath
    bc1_len: length of barcode to remove from beginning of read1 data
    bc2_len: length of barcode to remove from beginning of read2 data
    rev_comp_bc1: reverse complement barcode 1 before writing.
    rev_comp_bc2: reverse complement barcode 2 before writing.
    attempt_read_orientation: If True, will attempt to orient the reads 
        according to the forward primers in the mapping file. If primer is 
        detected in current orientation, leave the read as is, but if reverse
        complement is detected (or ReversePrimer is detected in the current 
        orientation) the read will either be written to the forward (read 1) or
        reverse (read 2) reads for the case of paired files, or the read will be
        reverse complemented in the case of stitched reads.
    forward_primers: list of regular expression generators, forward primers
    reverse_primers: list of regular expression generators, reverse primers
    output_bc_not_oriented: Barcode output from reads that are not oriented
    fastq1_out_not_oriented: Open filepath to write reads 1 where primers
        can't be found when attempt_read_orientation is True.
    fastq2_out_not_oriented: Open filepath to write reads 2 where primers
        can't be found when attempt_read_orientation is True.
    """

    header_index = 0
    sequence_index = 1
    quality_index = 2

    found_primer_match = False
    # Break from orientation search as soon as a match is found
    if attempt_read_orientation:
        # First check forward primers
        for curr_primer in forward_primers:
            if curr_primer.search(read1_data[sequence_index]):
                read1 = read1_data
                read2 = read2_data
                found_primer_match = True
                break
            if curr_primer.search(read2_data[sequence_index]):
                read1 = read2_data
                read2 = read1_data
                found_primer_match = True
                break
        # Check reverse primers if forward primers not found
        if not found_primer_match:
            for curr_primer in reverse_primers:
                if curr_primer.search(read1_data[sequence_index]):
                    read1 = read2_data
                    read2 = read1_data
                    found_primer_match = True
                    break
                if curr_primer.search(read2_data[sequence_index]):
                    read1 = read1_data
                    read2 = read2_data
                    found_primer_match = True
                    break
    else:
        read1 = read1_data
        read2 = read2_data

    if not found_primer_match and attempt_read_orientation:
        read1 = read1_data
        read2 = read2_data
        output_bc = output_bc_not_oriented
        output_read1 = fastq1_out_not_oriented
        output_read2 = fastq2_out_not_oriented
    else:
        output_bc = output_bc_fastq
        output_read1 = output_fastq1
        output_read2 = output_fastq2

    bc_read1 = read1[sequence_index][0:bc1_len]
    bc_read2 = read2[sequence_index][0:bc2_len]
    bc_qual1 = read1[quality_index][0:bc1_len]
    bc_qual2 = read2[quality_index][0:bc2_len]
    if rev_comp_bc1:
        bc_read1 = DNA.rc(bc_read1)
        bc_qual1 = bc_qual1[::-1]
    if rev_comp_bc2:
        bc_read2 = DNA.rc(bc_read2)
        bc_qual2 = bc_qual2[::-1]

    bc_lines = format_fastq_record(read1[header_index], bc_read1 + bc_read2, bc_qual1 + bc_qual2)
    output_bc.write(bc_lines)
    seq1_lines = format_fastq_record(
        read1[header_index], read1[sequence_index][bc1_len:], read1[quality_index][bc1_len:]
    )
    output_read1.write(seq1_lines)
    seq2_lines = format_fastq_record(
        read2[header_index], read2[sequence_index][bc2_len:], read2[quality_index][bc2_len:]
    )
    output_read2.write(seq2_lines)

    return
Example #18
0
def get_primers(header, mapping_data):
    """ Returns lists of forward/reverse primer regular expression generators
    
    header:  list of strings of header data.
    mapping_data:  list of lists of mapping data
    
    Will raise error if either the LinkerPrimerSequence or ReversePrimer fields
        are not present
    """

    if "LinkerPrimerSequence" in header:
        primer_ix = header.index("LinkerPrimerSequence")
    else:
        raise IndexError, (
            "Mapping file is missing LinkerPrimerSequence field.")
    if "ReversePrimer" in header:
        rev_primer_ix = header.index("ReversePrimer")
    else:
        raise IndexError, ("Mapping file is missing ReversePrimer field.")

    iupac = {
        'A': 'A',
        'T': 'T',
        'G': 'G',
        'C': 'C',
        'R': '[AG]',
        'Y': '[CT]',
        'S': '[GC]',
        'W': '[AT]',
        'K': '[GT]',
        'M': '[AC]',
        'B': '[CGT]',
        'D': '[AGT]',
        'H': '[ACT]',
        'V': '[ACG]',
        'N': '[ACGT]'
    }

    raw_forward_primers = set([])
    raw_forward_rc_primers = set([])
    raw_reverse_primers = set([])
    raw_reverse_rc_primers = set([])

    for line in mapping_data:
        # Split on commas to handle pool of primers
        raw_forward_primers.update([upper(primer).strip() for\
            primer in line[primer_ix].split(',')])
        raw_forward_rc_primers.update([DNA.rc(primer) for\
            primer in raw_forward_primers])
        raw_reverse_primers.update([upper(primer).strip() for \
            primer in line[rev_primer_ix].split(',')])
        raw_reverse_rc_primers.update([DNA.rc(primer) for\
            primer in raw_reverse_primers])

    if not raw_forward_primers:
        raise ValueError, ("No forward primers detected in mapping file.")
    if not raw_reverse_primers:
        raise ValueError, ("No reverse primers detected in mapping file.")

    # Finding the forward primers, or rc of reverse primers indicates forward
    # read. Finding the reverse primer, or rc of the forward primers, indicates
    # the reverse read, so these sets are merged.
    raw_forward_primers.update(raw_reverse_rc_primers)
    raw_reverse_primers.update(raw_forward_rc_primers)

    forward_primers = []
    reverse_primers = []
    for curr_primer in raw_forward_primers:
        forward_primers.append(compile(''.join([iupac[symbol] for\
            symbol in curr_primer])))
    for curr_primer in raw_reverse_primers:
        reverse_primers.append(compile(''.join([iupac[symbol] for\
            symbol in curr_primer])))

    return forward_primers, reverse_primers
Example #19
0
def process_barcode_in_label(read1_data,
                             read2_data,
                             output_bc_fastq,
                             bc1_len=6,
                             bc2_len=6,
                             rev_comp_bc1=False,
                             rev_comp_bc2=False,
                             char_delineator=":"):
    """ Reads data from one or two fastq labels, writes output barcodes file. 
    
    read1_data: list of header, read, quality scores
    read2_data: list of header, read, quality scores, False if no read 2.
    output_bc_fastq: open output fastq filepath
    bc1_len: length of barcode to remove from beginning of read1 data
    bc2_len: length of barcode to remove from beginning of read2 data
    rev_comp_bc1: reverse complement barcode 1 before writing.
    rev_comp_bc2: reverse complement barcode 2 before writing.
    char_delineator: Specify character that immediately precedes the barcode
        for input_type of barcode_in_label.
    """

    header_index = 0
    sequence_index = 1
    quality_index = 2

    # Check for char_delineator in sequence
    try:
        bc1_read = read1_data[header_index].split(
            char_delineator)[-1][0:bc1_len]
    # If there is an index error, it means the char_delineator wasn't found
    except IndexError:
        raise IndexError,("Found sequence lacking character delineator. "
            "Sequence header %s, character delineator %s" %\
            (read1_data[header_index], char_delineator))

    # Create fake quality scores
    bc1_qual = "F" * len(bc1_read)
    if rev_comp_bc1:
        bc1_read = DNA.rc(bc1_read)

    if read2_data:
        bc2_read =\
            read2_data[header_index].strip().split(char_delineator)[-1][0:bc2_len]
        bc2_qual = "F" * len(bc2_read)
        if rev_comp_bc2:
            bc2_read = DNA.rc(bc2_read)
    else:
        bc2_read = ""
        bc2_qual = ""

    if not bc1_read and not bc2_read:
        raise ValueError, ("Came up with empty barcode sequence, please check "
                           "character delineator with -s, and fastq label "
                           "%s" % read1_data[header_index])

    bc_lines = format_fastq_record(read1_data[header_index],
                                   bc1_read + bc2_read, bc1_qual + bc2_qual)

    output_bc_fastq.write(bc_lines)

    return
Example #20
0
def process_barcode_paired_stitched(read_data,
                                    output_bc_fastq,
                                    output_fastq,
                                    bc1_len=6,
                                    bc2_len=6,
                                    rev_comp_bc1=False,
                                    rev_comp_bc2=False,
                                    attempt_read_orientation=False,
                                    forward_primers=None,
                                    reverse_primers=None,
                                    output_bc_not_oriented=None,
                                    fastq_out_not_oriented=None,
                                    switch_bc_order=False):
    """ Processes stitched barcoded reads, writes barcode, parsed stitched read
    
    read_data: list of header, read, quality scores
    output_bc_fastq: open output fastq filepath
    output_fastq: open output fastq reads filepath
    bc1_len: length of barcode to remove from beginning of read1 stitched data
    bc2_len: length of barcode to remove from end of read2 stitched data
    rev_comp_bc1: reverse complement barcode 1 before writing.
    rev_comp_bc2: reverse complement barcode 2 before writing.
    attempt_read_orientation: If True, will attempt to orient the reads 
        according to the forward primers in the mapping file. If primer is 
        detected in current orientation, leave the read as is, but if reverse
        complement is detected (or ReversePrimer is detected in the current 
        orientation) the read will either be written to the forward (read 1) or
        reverse (read 2) reads for the case of paired files, or the read will be
        reverse complemented in the case of stitched reads.
    forward_primers: list of regular expression generators, forward primers
    reverse_primers: list of regular expression generators, reverse primers
    output_bc_not_oriented: Barcode output from reads that are not oriented
    fastq_out_not_oriented: Open filepath to write reads where primers
        can't be found when attempt_read_orientation is True.
    switch_bc_order: Normally, barcode 1 will be written first, followed by
        barcode 2 in a combined output fastq file. If True, the order will be 
        reversed. Only applies to stitched reads processing, as other barcode
        orders are dictated by the the parameter chosen for the fastq files.
    """

    header_index = 0
    sequence_index = 1
    quality_index = 2

    read_seq = read_data[sequence_index]
    read_qual = read_data[quality_index]

    found_primer_match = False
    # Break from orientation search as soon as a match is found
    if attempt_read_orientation:
        for curr_primer in forward_primers:
            if curr_primer.search(read_data[sequence_index]):
                found_primer_match = True
                break
        if not found_primer_match:
            for curr_primer in reverse_primers:
                if curr_primer.search(read_data[sequence_index]):
                    read_seq = DNA.rc(read_seq)
                    read_qual = read_qual[::-1]
                    found_primer_match = True
                    break

    if not found_primer_match and attempt_read_orientation:
        output_bc = output_bc_not_oriented
        output_read = fastq_out_not_oriented
    else:
        output_bc = output_bc_fastq
        output_read = output_fastq

    bc_read1 = read_seq[0:bc1_len]
    bc_read2 = read_seq[-bc2_len:]
    bc_qual1 = read_qual[0:bc1_len]
    bc_qual2 = read_qual[-bc2_len:]

    if rev_comp_bc1:
        bc_read1 = DNA.rc(bc_read1)
        bc_qual1 = bc_qual1[::-1]
    if rev_comp_bc2:
        bc_read2 = DNA.rc(bc_read2)
        bc_qual2 = bc_qual2[::-1]

    if switch_bc_order:
        bc_read1, bc_read2 = bc_read2, bc_read1
        bc_qual1, bc_qual2 = bc_qual2, bc_qual1

    bc_lines = format_fastq_record(read_data[header_index],
                                   bc_read1 + bc_read2, bc_qual1 + bc_qual2)
    output_bc.write(bc_lines)
    seq_lines = format_fastq_record(read_data[header_index],
                                    read_seq[bc1_len:-bc2_len],
                                    read_qual[bc1_len:-bc2_len])
    output_read.write(seq_lines)

    return
Example #21
0
def process_barcode_paired_end_data(read1_data,
                                    read2_data,
                                    output_bc_fastq,
                                    output_fastq1,
                                    output_fastq2,
                                    bc1_len=6,
                                    bc2_len=6,
                                    rev_comp_bc1=False,
                                    rev_comp_bc2=False,
                                    attempt_read_orientation=False,
                                    forward_primers=None,
                                    reverse_primers=None,
                                    output_bc_not_oriented=None,
                                    fastq1_out_not_oriented=None,
                                    fastq2_out_not_oriented=None):
    """ Processes, writes paired-end barcode data, parsed sequences
    
    read1_data: list of header, read, quality scores
    read2_data: list of header, read, quality scores
    output_bc_fastq: open output fastq filepath
    output_fastq1: open output fastq reads 1 filepath
    output_fastq2: open output fastq reads 2 filepath
    bc1_len: length of barcode to remove from beginning of read1 data
    bc2_len: length of barcode to remove from beginning of read2 data
    rev_comp_bc1: reverse complement barcode 1 before writing.
    rev_comp_bc2: reverse complement barcode 2 before writing.
    attempt_read_orientation: If True, will attempt to orient the reads 
        according to the forward primers in the mapping file. If primer is 
        detected in current orientation, leave the read as is, but if reverse
        complement is detected (or ReversePrimer is detected in the current 
        orientation) the read will either be written to the forward (read 1) or
        reverse (read 2) reads for the case of paired files, or the read will be
        reverse complemented in the case of stitched reads.
    forward_primers: list of regular expression generators, forward primers
    reverse_primers: list of regular expression generators, reverse primers
    output_bc_not_oriented: Barcode output from reads that are not oriented
    fastq1_out_not_oriented: Open filepath to write reads 1 where primers
        can't be found when attempt_read_orientation is True.
    fastq2_out_not_oriented: Open filepath to write reads 2 where primers
        can't be found when attempt_read_orientation is True.
    """

    header_index = 0
    sequence_index = 1
    quality_index = 2

    found_primer_match = False
    # Break from orientation search as soon as a match is found
    if attempt_read_orientation:
        # First check forward primers
        for curr_primer in forward_primers:
            if curr_primer.search(read1_data[sequence_index]):
                read1 = read1_data
                read2 = read2_data
                found_primer_match = True
                break
            if curr_primer.search(read2_data[sequence_index]):
                read1 = read2_data
                read2 = read1_data
                found_primer_match = True
                break
        # Check reverse primers if forward primers not found
        if not found_primer_match:
            for curr_primer in reverse_primers:
                if curr_primer.search(read1_data[sequence_index]):
                    read1 = read2_data
                    read2 = read1_data
                    found_primer_match = True
                    break
                if curr_primer.search(read2_data[sequence_index]):
                    read1 = read1_data
                    read2 = read2_data
                    found_primer_match = True
                    break
    else:
        read1 = read1_data
        read2 = read2_data

    if not found_primer_match and attempt_read_orientation:
        read1 = read1_data
        read2 = read2_data
        output_bc = output_bc_not_oriented
        output_read1 = fastq1_out_not_oriented
        output_read2 = fastq2_out_not_oriented
    else:
        output_bc = output_bc_fastq
        output_read1 = output_fastq1
        output_read2 = output_fastq2

    bc_read1 = read1[sequence_index][0:bc1_len]
    bc_read2 = read2[sequence_index][0:bc2_len]
    bc_qual1 = read1[quality_index][0:bc1_len]
    bc_qual2 = read2[quality_index][0:bc2_len]
    if rev_comp_bc1:
        bc_read1 = DNA.rc(bc_read1)
        bc_qual1 = bc_qual1[::-1]
    if rev_comp_bc2:
        bc_read2 = DNA.rc(bc_read2)
        bc_qual2 = bc_qual2[::-1]

    bc_lines = format_fastq_record(read1[header_index], bc_read1 + bc_read2,
                                   bc_qual1 + bc_qual2)
    output_bc.write(bc_lines)
    seq1_lines = format_fastq_record(read1[header_index],
                                     read1[sequence_index][bc1_len:],
                                     read1[quality_index][bc1_len:])
    output_read1.write(seq1_lines)
    seq2_lines = format_fastq_record(read2[header_index],
                                     read2[sequence_index][bc2_len:],
                                     read2[quality_index][bc2_len:])
    output_read2.write(seq2_lines)

    return
Example #22
0
def process_barcode_in_label(
    read1_data,
    read2_data,
    output_bc_fastq,
    bc1_len=6,
    bc2_len=6,
    rev_comp_bc1=False,
    rev_comp_bc2=False,
    char_delineator=":",
):
    """ Reads data from one or two fastq labels, writes output barcodes file. 
    
    read1_data: list of header, read, quality scores
    read2_data: list of header, read, quality scores, False if no read 2.
    output_bc_fastq: open output fastq filepath
    bc1_len: length of barcode to remove from beginning of read1 data
    bc2_len: length of barcode to remove from beginning of read2 data
    rev_comp_bc1: reverse complement barcode 1 before writing.
    rev_comp_bc2: reverse complement barcode 2 before writing.
    char_delineator: Specify character that immediately precedes the barcode
        for input_type of barcode_in_label.
    """

    header_index = 0
    sequence_index = 1
    quality_index = 2

    # Check for char_delineator in sequence
    try:
        bc1_read = read1_data[header_index].split(char_delineator)[-1][0:bc1_len]
    # If there is an index error, it means the char_delineator wasn't found
    except IndexError:
        raise IndexError, (
            "Found sequence lacking character delineator. "
            "Sequence header %s, character delineator %s" % (read1_data[header_index], char_delineator)
        )

    # Create fake quality scores
    bc1_qual = "F" * len(bc1_read)
    if rev_comp_bc1:
        bc1_read = DNA.rc(bc1_read)

    if read2_data:
        bc2_read = read2_data[header_index].strip().split(char_delineator)[-1][0:bc2_len]
        bc2_qual = "F" * len(bc2_read)
        if rev_comp_bc2:
            bc2_read = DNA.rc(bc2_read)
    else:
        bc2_read = ""
        bc2_qual = ""

    if not bc1_read and not bc2_read:
        raise ValueError, (
            "Came up with empty barcode sequence, please check "
            "character delineator with -s, and fastq label "
            "%s" % read1_data[header_index]
        )

    bc_lines = format_fastq_record(read1_data[header_index], bc1_read + bc2_read, bc1_qual + bc2_qual)

    output_bc_fastq.write(bc_lines)

    return
Example #23
0
def get_primers(header, mapping_data):
    """ Returns lists of forward/reverse primer regular expression generators
    
    header:  list of strings of header data.
    mapping_data:  list of lists of mapping data
    
    Will raise error if either the LinkerPrimerSequence or ReversePrimer fields
        are not present
    """

    if "LinkerPrimerSequence" in header:
        primer_ix = header.index("LinkerPrimerSequence")
    else:
        raise IndexError, ("Mapping file is missing LinkerPrimerSequence field.")
    if "ReversePrimer" in header:
        rev_primer_ix = header.index("ReversePrimer")
    else:
        raise IndexError, ("Mapping file is missing ReversePrimer field.")

    iupac = {
        "A": "A",
        "T": "T",
        "G": "G",
        "C": "C",
        "R": "[AG]",
        "Y": "[CT]",
        "S": "[GC]",
        "W": "[AT]",
        "K": "[GT]",
        "M": "[AC]",
        "B": "[CGT]",
        "D": "[AGT]",
        "H": "[ACT]",
        "V": "[ACG]",
        "N": "[ACGT]",
    }

    raw_forward_primers = set([])
    raw_forward_rc_primers = set([])
    raw_reverse_primers = set([])
    raw_reverse_rc_primers = set([])

    for line in mapping_data:
        # Split on commas to handle pool of primers
        raw_forward_primers.update([upper(primer).strip() for primer in line[primer_ix].split(",")])
        raw_forward_rc_primers.update([DNA.rc(primer) for primer in raw_forward_primers])
        raw_reverse_primers.update([upper(primer).strip() for primer in line[rev_primer_ix].split(",")])
        raw_reverse_rc_primers.update([DNA.rc(primer) for primer in raw_reverse_primers])

    if not raw_forward_primers:
        raise ValueError, ("No forward primers detected in mapping file.")
    if not raw_reverse_primers:
        raise ValueError, ("No reverse primers detected in mapping file.")

    # Finding the forward primers, or rc of reverse primers indicates forward
    # read. Finding the reverse primer, or rc of the forward primers, indicates
    # the reverse read, so these sets are merged.
    raw_forward_primers.update(raw_reverse_rc_primers)
    raw_reverse_primers.update(raw_forward_rc_primers)

    forward_primers = []
    reverse_primers = []
    for curr_primer in raw_forward_primers:
        forward_primers.append(compile("".join([iupac[symbol] for symbol in curr_primer])))
    for curr_primer in raw_reverse_primers:
        reverse_primers.append(compile("".join([iupac[symbol] for symbol in curr_primer])))

    return forward_primers, reverse_primers
Example #24
0
def process_barcode_paired_stitched(
    read_data,
    output_bc_fastq,
    output_fastq,
    bc1_len=6,
    bc2_len=6,
    rev_comp_bc1=False,
    rev_comp_bc2=False,
    attempt_read_orientation=False,
    forward_primers=None,
    reverse_primers=None,
    output_bc_not_oriented=None,
    fastq_out_not_oriented=None,
    switch_bc_order=False,
):
    """ Processes stitched barcoded reads, writes barcode, parsed stitched read
    
    read_data: list of header, read, quality scores
    output_bc_fastq: open output fastq filepath
    output_fastq: open output fastq reads filepath
    bc1_len: length of barcode to remove from beginning of read1 stitched data
    bc2_len: length of barcode to remove from end of read2 stitched data
    rev_comp_bc1: reverse complement barcode 1 before writing.
    rev_comp_bc2: reverse complement barcode 2 before writing.
    attempt_read_orientation: If True, will attempt to orient the reads 
        according to the forward primers in the mapping file. If primer is 
        detected in current orientation, leave the read as is, but if reverse
        complement is detected (or ReversePrimer is detected in the current 
        orientation) the read will either be written to the forward (read 1) or
        reverse (read 2) reads for the case of paired files, or the read will be
        reverse complemented in the case of stitched reads.
    forward_primers: list of regular expression generators, forward primers
    reverse_primers: list of regular expression generators, reverse primers
    output_bc_not_oriented: Barcode output from reads that are not oriented
    fastq_out_not_oriented: Open filepath to write reads where primers
        can't be found when attempt_read_orientation is True.
    switch_bc_order: Normally, barcode 1 will be written first, followed by
        barcode 2 in a combined output fastq file. If True, the order will be 
        reversed. Only applies to stitched reads processing, as other barcode
        orders are dictated by the the parameter chosen for the fastq files.
    """

    header_index = 0
    sequence_index = 1
    quality_index = 2

    read_seq = read_data[sequence_index]
    read_qual = read_data[quality_index]

    found_primer_match = False
    # Break from orientation search as soon as a match is found
    if attempt_read_orientation:
        for curr_primer in forward_primers:
            if curr_primer.search(read_data[sequence_index]):
                found_primer_match = True
                break
        if not found_primer_match:
            for curr_primer in reverse_primers:
                if curr_primer.search(read_data[sequence_index]):
                    read_seq = DNA.rc(read_seq)
                    read_qual = read_qual[::-1]
                    found_primer_match = True
                    break

    if not found_primer_match and attempt_read_orientation:
        output_bc = output_bc_not_oriented
        output_read = fastq_out_not_oriented
    else:
        output_bc = output_bc_fastq
        output_read = output_fastq

    bc_read1 = read_seq[0:bc1_len]
    bc_read2 = read_seq[-bc2_len:]
    bc_qual1 = read_qual[0:bc1_len]
    bc_qual2 = read_qual[-bc2_len:]

    if rev_comp_bc1:
        bc_read1 = DNA.rc(bc_read1)
        bc_qual1 = bc_qual1[::-1]
    if rev_comp_bc2:
        bc_read2 = DNA.rc(bc_read2)
        bc_qual2 = bc_qual2[::-1]

    if switch_bc_order:
        bc_read1, bc_read2 = bc_read2, bc_read1
        bc_qual1, bc_qual2 = bc_qual2, bc_qual1

    bc_lines = format_fastq_record(read_data[header_index], bc_read1 + bc_read2, bc_qual1 + bc_qual2)
    output_bc.write(bc_lines)
    seq_lines = format_fastq_record(read_data[header_index], read_seq[bc1_len:-bc2_len], read_qual[bc1_len:-bc2_len])
    output_read.write(seq_lines)

    return
Example #25
0
def get_primers(header,
                mapping_data):
    """ Returns lists of forward/reverse primer regular expression generators

    header:  list of strings of header data.
    mapping_data:  list of lists of mapping data

    Will raise error if either the LinkerPrimerSequence or ReversePrimer fields
        are not present
    """

    if "LinkerPrimerSequence" in header:
        primer_ix = header.index("LinkerPrimerSequence")
    else:
        raise IndexError(
            ("Mapping file is missing LinkerPrimerSequence field."))
    if "ReversePrimer" in header:
        rev_primer_ix = header.index("ReversePrimer")
    else:
        raise IndexError(("Mapping file is missing ReversePrimer field."))

    iupac = {'A': 'A', 'T': 'T', 'G': 'G', 'C': 'C', 'R': '[AG]', 'Y': '[CT]',
             'S': '[GC]', 'W': '[AT]', 'K': '[GT]', 'M': '[AC]', 'B': '[CGT]',
             'D': '[AGT]', 'H': '[ACT]', 'V': '[ACG]', 'N': '[ACGT]'}

    raw_forward_primers = set([])
    raw_forward_rc_primers = set([])
    raw_reverse_primers = set([])
    raw_reverse_rc_primers = set([])

    for line in mapping_data:
        # Split on commas to handle pool of primers
        raw_forward_primers.update([upper(primer).strip() for
                                    primer in line[primer_ix].split(',')])
        raw_forward_rc_primers.update([DNA.rc(primer) for
                                       primer in raw_forward_primers])
        raw_reverse_primers.update([upper(primer).strip() for
                                    primer in line[rev_primer_ix].split(',')])
        raw_reverse_rc_primers.update([DNA.rc(primer) for
                                       primer in raw_reverse_primers])

    if not raw_forward_primers:
        raise ValueError(("No forward primers detected in mapping file."))
    if not raw_reverse_primers:
        raise ValueError(("No reverse primers detected in mapping file."))

    # Finding the forward primers, or rc of reverse primers indicates forward
    # read. Finding the reverse primer, or rc of the forward primers, indicates
    # the reverse read, so these sets are merged.
    raw_forward_primers.update(raw_reverse_rc_primers)
    raw_reverse_primers.update(raw_forward_rc_primers)

    forward_primers = []
    reverse_primers = []
    for curr_primer in raw_forward_primers:
        forward_primers.append(compile(''.join([iupac[symbol] for
                                                symbol in curr_primer])))
    for curr_primer in raw_reverse_primers:
        reverse_primers.append(compile(''.join([iupac[symbol] for
                                                symbol in curr_primer])))

    return forward_primers, reverse_primers