def test_correct_barcode_golay_correction(self):
     """correct_barcode functions as expected with golay correction"""
     sample_ids_by_barcode = {
         "GGAGACAAGGGA": "s1",
         "ACACCTGGTGAT": "s2",
     }

     # Each case pairs an observed barcode with the tuple that
     # correct_barcode is expected to return for it.
     cases = [
         # one error, corrected barcode maps to s1
         ("GGAGACAAGGGT", (1, "GGAGACAAGGGA", True, "s1")),
         # one error, corrected barcode maps to s2
         ("ACACCTGGTGAC", (1, "ACACCTGGTGAT", True, "s2")),
         # valid code, but not in the barcode -> sample ID map
         ("CCAGTGTATGCA", (0, "CCAGTGTATGCA", True, None)),
         # invalid code, corrected form not in the barcode -> sample ID map
         ("CCTGTGTATGCA", (1, "CCAGTGTATGCA", True, None)),
     ]

     for observed, expected in cases:
         actual = correct_barcode(observed,
                                  sample_ids_by_barcode,
                                  decode_golay_12)
         self.assertEqual(actual, expected)
 def test_correct_barcode_exact_match(self):
     """correct_barcode functions as expected with an exact match"""
     barcode = "GGAGACAAGGGA"
     sample_ids_by_barcode = {
         "GGAGACAAGGGA": "s1",
         "ACACCTGGTGAT": "s2",
     }
     expected = (0, barcode, False, 's1')

     # The barcode is already present in the map, so the same result is
     # expected both without a correction function and with golay decoding.
     for correction_fn in (None, decode_golay_12):
         actual = correct_barcode(barcode,
                                  sample_ids_by_barcode,
                                  correction_fn)
         self.assertEqual(actual, expected)
 def test_correct_barcode_no_error_correction(self):
     """correct_barcode functions as expected with no error correction"""
     sample_ids_by_barcode = {
         "GGAGACAAGGGA": "s1",
         "ACACCTGGTGAT": "s2",
     }

     unmatched_barcodes = (
         "GGAGACAAGGGT",
         # barcode contains N
         "CCAGTGTANGCA",
     )

     # With no correction function, a barcode missing from the map is
     # expected back unchanged, with zero errors and no sample ID.
     for barcode in unmatched_barcodes:
         actual = correct_barcode(barcode, sample_ids_by_barcode, None)
         self.assertEqual(actual, (0, barcode, False, None))
Ejemplo n.º 4
0
def read_fwd_rev_read(fwd_read_f,
                      rev_read_f,
                      bc_to_sid,
                      barcode_len,
                      barcode_correction_fn,
                      bc_to_fwd_primers,
                      bc_to_rev_primers,
                      max_barcode_errors,
                      fwd_length,
                      rev_length):
    """
    Reads fwd and rev read fastq files in lockstep and bins the read pairs
    by sample ID and random barcode.

    Parameters
    ----------
    fwd_read_f: file
        forward read fastq file
    rev_read_f: file
        reverse read fastq file
    bc_to_sid: dict
        barcode -> sample ID map, handed to correct_barcode
    barcode_len: int
        barcode length; the barcode is taken from the end of the
        forward read
    barcode_correction_fn: function
        applicable only for golay_12 barcodes
    bc_to_fwd_primers: dict
        keyed by barcode; the keys of each value are the candidate
        forward primers
    bc_to_rev_primers: dict
        keyed by barcode; each value holds the candidate reverse primers
    max_barcode_errors: int
        maximum allowable errors in barcodes, applicable for golay_12
    fwd_length: int
        standard length, used for truncating of the forward sequence;
        forward sequences shorter than this are discarded
    rev_length: int
        standard length, used for truncating of the reverse sequence

    Returns
    ----------
    random_bc_lookup: defaultdict
        contains sample ID -> random barcode -> (fwd seq, rev seq)
        -> number of kept read pairs with those sequences
    random_bc_reads: defaultdict
        contains sample ID -> random barcode -> number of reads
    random_bcs: dict
        contains sample ID -> list of random barcodes, one entry per
        read whose forward primer matched (including reads that are
        discarded by a later filter)
    barcode_errors_exceed_max_count: int
    barcode_not_in_map_count: int
    primer_mismatch_count: int
    seq_too_short_count: int
    input_seqs_count: int
    total_seqs_kept: int

    Raises
    ----------
    PairedEndParseError
        if the headers of a forward/reverse read pair differ
    """
    random_bc_lookup = defaultdict(lambda:
                                   defaultdict(lambda:
                                               defaultdict(int)))

    random_bc_reads = defaultdict(lambda:
                                  defaultdict(int))

    random_bcs = {}

    # Counts for Quality Control:
    input_seqs_count = 0
    total_seqs_kept = 0
    barcode_errors_exceed_max_count = 0
    barcode_not_in_map_count = 0
    primer_mismatch_count = 0
    seq_too_short_count = 0

    # Positions of the fields used from each parsed fastq record.
    header_idx = 0
    seq_idx = 1

    # NOTE: izip stops as soon as either parser is exhausted, so surplus
    # records in the longer file are silently ignored.
    for fwd_read, rev_read in izip(parse_fastq(fwd_read_f, strict=False,
                                   enforce_qual_range=False),
                                   parse_fastq(rev_read_f,
                                   strict=False,
                                   enforce_qual_range=False)):

        input_seqs_count += 1

        # confirm match between headers
        if fwd_read[header_idx] != rev_read[header_idx]:
            raise PairedEndParseError(
                "Headers of forward and reverse reads "
                "do not match. Confirm that the forward "
                "and reverse read fastq files that you "
                "provided have headers that match one "
                "another.")

        fwd_seq = fwd_read[seq_idx]
        rev_seq = rev_read[seq_idx]

        #  Grab the barcode sequence. It is always at the very end of the
        #  forward read. Strip the barcode from the sequence.
        barcode = fwd_seq[-barcode_len:]
        fwd_seq = fwd_seq[:-barcode_len]

        #  Correct the barcode(if applicable) and map to sample ID.
        num_barcode_errors, corrected_barcode, _, sample_id =\
            correct_barcode(barcode, bc_to_sid, barcode_correction_fn)

        #  Skip barcodes with too many errors.
        if num_barcode_errors > max_barcode_errors:
            barcode_errors_exceed_max_count += 1
            continue

        if sample_id is None:
            barcode_not_in_map_count += 1
            continue

        # Extract the random barcode and primer from the forward read.
        possible_primers = bc_to_fwd_primers[corrected_barcode].keys()

        # Keep the try body to the single call that can raise
        # PrimerMismatchError, so that an unrelated KeyError is never
        # mistaken for "first read seen for this sample".
        try:
            random_bc, _, clean_fwd_seq = extract_primer(fwd_seq,
                                                         possible_primers,
                                                         min_idx=5,
                                                         max_idx=20)
        except PrimerMismatchError:
            primer_mismatch_count += 1
            continue

        # Recorded before the reverse-primer and length filters, so this
        # list also contains barcodes of reads that are dropped below.
        random_bcs.setdefault(sample_id, []).append(random_bc)

        # Use the corrected barcode here as well, consistent with the
        # forward-primer lookup above: the raw barcode differs from the
        # corrected one whenever an error was repaired, and would then
        # not be a valid lookup key.
        possible_primers = bc_to_rev_primers[corrected_barcode]

        try:
            _phase_seq, _, clean_rev_seq = extract_primer(rev_seq,
                                                          possible_primers)
        except PrimerMismatchError:
            primer_mismatch_count += 1
            continue

        # NOTE(review): only the forward read is length-checked; a reverse
        # read shorter than rev_length is kept at its own length.
        if len(clean_fwd_seq) < fwd_length:
            seq_too_short_count += 1
            continue

        clean_fwd_seq = clean_fwd_seq[:fwd_length]
        clean_rev_seq = clean_rev_seq[:rev_length]

        total_seqs_kept += 1
        random_bc_reads[sample_id][random_bc] += 1
        random_bc_lookup[sample_id][random_bc][
            (clean_fwd_seq, clean_rev_seq)] += 1

    return (random_bc_lookup,
            random_bc_reads,
            random_bcs,
            barcode_errors_exceed_max_count,
            barcode_not_in_map_count,
            primer_mismatch_count,
            seq_too_short_count,
            input_seqs_count,
            total_seqs_kept)