def generate_histogram(qual_fp,
                       output_dir,
                       score_min=25,
                       verbose=True):
    """ Main program function for generating quality score histogram

    Parses the quality score file, bins the scores, computes per-bin
    statistics, then plots the histogram and writes a text report.

    qual_fp: quality score filepath
    output_dir: output directory, handed to the plot and report writers
    score_min: minimum score to be considered a reliable base call, used
     to generate dotted line on histogram for easy visualization of poor
     quality scores. Also passed to get_qual_stats, which returns the
     suggested truncation position.
    verbose: if True, print the suggested truncation position to stdout.
    """

    # Use a context manager so the input handle is released even when parsing
    # fails. Assumes parse_qual_score fully consumes the handle before
    # returning (it returns a mapping of sequence id -> scores).
    # "U" keeps the original universal-newline behaviour of this Python 2 code.
    with open(qual_fp, "U") as qual_lines:
        qual_scores = parse_qual_score(qual_lines)

    # Sort bins according to base position
    qual_bins = bin_qual_scores(qual_scores)

    # Get average, std dev, and total nucleotide counts for each base position
    ave_bins, std_dev_bins, total_bases_bins, suggested_trunc_pos = \
        get_qual_stats(qual_bins, score_min)

    plot_qual_report(ave_bins, std_dev_bins, total_bases_bins, score_min,
                     output_dir)

    # Save values to output text file
    write_qual_report(ave_bins, std_dev_bins, total_bases_bins, output_dir,
                      suggested_trunc_pos)

    if verbose:
        message = ("Suggested nucleotide truncation position (None if quality "
                   "score average did not fall below the minimum score "
                   "parameter): %s\n" % suggested_trunc_pos)
        # Single-argument, parenthesised print produces the same output under
        # the Python 2 print statement and the Python 3 print function.
        print(message)
Ejemplo n.º 2
0
def truncate_fasta_qual(fasta_fp, qual_fp, output_dir, base_pos):
    """ Main program function for truncating fasta and quality score files

    Parses both inputs, verifies that they describe the same sequences,
    truncates them at base_pos and writes the truncated fasta and quality
    score files.

    fasta_fp: fasta filepath
    qual_fp: quality score filepath
    output_dir: output directory, used to build the output filepaths
    base_pos: Nucleotide position to truncate the fasta and quality score at.
    """

    # Both files are opened up front (same order as before, so a missing file
    # still fails before any parsing) and are closed on exit, even on error.
    # Assumes both parsers fully consume their handle before returning.
    with open(qual_fp, "U") as qual_lines, open(fasta_fp, "U") as fasta_lines:
        # Scores are kept as strings (value_cast_f=str) rather than numbers.
        qual_scores = parse_qual_score(qual_lines, value_cast_f=str)

        # Get dict of fasta label:seq, and the sequence order (so output can
        # be in the same order as the input sequences).
        fasta_seqs, seq_order = parse_fasta_file(fasta_lines)

    # Make sure the quality scores and fasta sequences have corresponding
    # labels and base numbers
    verify_equivalency(fasta_seqs, qual_scores)

    # Truncate seqs to base_pos index
    trunc_fasta_seqs, trunc_qual_scores = truncate_seqs(
        fasta_seqs, qual_scores, base_pos)

    # Get output filepaths
    fasta_out_fp, qual_out_fp = get_output_filepaths(output_dir, fasta_fp,
                                                     qual_fp)

    # Write truncated sequences out
    write_trunc_fasta(trunc_fasta_seqs, fasta_out_fp, seq_order)

    write_trunc_qual(trunc_qual_scores, qual_out_fp, seq_order)
Ejemplo n.º 3
0
    def test_parse_qual_score(self):
        """parse_qual_score should return dict of {id: qual_scores}"""
        # Scores for one record may span several lines; they are expected to
        # be collected into a single list per sequence id.
        scores = StringIO('>x\n5 10 5\n12\n>y\n30 40')
        expected = {'x': [5, 10, 5, 12], 'y': [30, 40]}
        self.assertEqual(parse_qual_score(scores), expected)

        # Check that a bad file, e.g. a fasta, raises an error.
        # The backslash before GATC is spelled "\\" so the literal keeps its
        # original value without relying on the invalid escape sequence "\G".
        bad_scores = StringIO('>x\nabcbd\n12\n>y\\GATC')
        self.assertRaises(QiimeParseError, parse_qual_score, bad_scores)
Ejemplo n.º 4
0
    def test_parse_qual_score(self):
        """parse_qual_score should return dict of {id: qual_scores}"""
        # Scores for one record may span several lines; they are expected to
        # be collected into a single list per sequence id.
        scores = StringIO('>x\n5 10 5\n12\n>y\n30 40')
        expected = {
            'x': [5, 10, 5, 12],
            'y': [30, 40],
        }
        self.assertEqual(parse_qual_score(scores), expected)

        # Check that a bad file, e.g. a fasta, raises an error.
        # The backslash before GATC is spelled "\\" so the literal keeps its
        # original value without relying on the invalid escape sequence "\G".
        bad_scores = StringIO('>x\nabcbd\n12\n>y\\GATC')
        self.assertRaises(QiimeParseError, parse_qual_score, bad_scores)
Ejemplo n.º 5
0
    def test_iter_fastq(self):
        """iter_fastq should iterate over correct # of fasta records"""
        from StringIO import StringIO
        fasta = """>M32Nstr_1 039732_1312_3088 orig_bc=CTCGTGGAGTAG new_bc=CTCGTGGAGTAG bc_diffs=0
CATGCTGCCTCCCGTAGGAGTCTGGGCCGTATCTCAGTCCCAATGTGGCCGGTCACCCTCTCAGGCCGGCTACCCGTCAAAGCCTTGGTAAGCCACTACCCCACCAACAAGCTGATAAGCCGCGAGTCCATCCCCAACCGCCGAAACTTTCCAACCCCCACCCATGCAGCAGGAGCTCCTATCCGGTATTAGCCCCAGTTTCCTGAAGTTATCCCAAAGTCAAGGGCAGGTTACTCACGTGTTACTCACCCGTTCGCCA
>F22Frhd_2 040027_1369_1966 orig_bc=CAAGTGAGAGAG new_bc=CAAGTGAGAGAG bc_diffs=0
CATGCTGCCTCCCGTAGGAGTCTGGGCCGTATCTCAGTCCCAATGTGGCCGGTCACCCTCTCAGGCCGGCTACCCGTCAAAGCCTTGGTAAGCCACTACCCCACCAACAAGCTGATAAGCCGCGAGTCCATCCCCAACCGCCGAAACTTTCCAACCCCCACCCATGCAGCAGGAGCTCCTATCCGGTATTAGCCCCAGTTTCCTGAAGTTATCCCAAAGTCAAGGGCAGGTTACTCACGTGTTACTCACCCGTTCGCCA
>F12Labi_3 040135_0934_1957 orig_bc=AGTTAGTGCGTC new_bc=AGTTAGTGCGTC bc_diffs=0
CATGCTGCCTCCCGTAGGAGTTTGGACCGTGTCTCAGTTCCAATGTGGGGGACCTTCCTCTCAGAACCCCTACTGATCGTTGCCTTGGTGGGCCGTTACCCCGCCAACAAGCTAATCAGACGCATCCCCATCCATAACCGATAAATCTTTATTCGTAATCTCATGAGATCAAACGAATACATAAGGTATTAGTCCAACTTTGCTGGGTTAGTCCCTTACGTTATTGGGCGAGGTTGGATACGCGTTACTCACCCGTGCGCCGGTCGCCG
""".splitlines()
        qual_raw = """>039695_0364_2008 length=49 uaccno=FFLHOYS01A5986
35 35 35 35 35 35 35 35 35 32 30 30 33 33 35 35 35 35 35 34 34 34 36 36 36 36 36 35 35 36 36 36 36 36 40 37 37 37 37 38 39 38 37 38 36 35 35 35 35
>039732_1312_3088 length=271 uaccno=FFLHOYS01DHI8I
37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37
37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37
37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 38 38 33 33 34 34 36 36 37 37 35 24 19 19 19 38 38 37 37 37
37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 38 38 38 38 38 37 38 38 38 38 38 38 38 37 37 38 38 38 31 31 33 36 33 33 33 36 36 36 36 24 25 25 28 31 36 36 36 36 36 36 36 38
38 38 40 40 38 32 31 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 30 30 30 31 32 32 32
>040027_1369_1966 length=271 uaccno=FFLHOYS01DMIIO
37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37
37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 34 34 34 34 37 37 37 37 37 37
37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 26 26 24 38 32 22 22 15 15 15 15 15 20 16 16 16 38 38 37 37 37
37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 38 38 34 34 34 37 37 38 28 28 27 36 33 33 33 36 36 36 36 32 32 32 33 36 36 36 38 37 37 36 37 38
38 38 38 38 38 31 31 32 32 32 32 32 32 32 32 32 32 32 32 31 28 28 28 32 31 31 31 31 32 32 32
>040135_0934_1957 length=281 uaccno=FFLHOYS01CKBO3
33 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 40 40 40 40 38 38 38 39 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 35 35 35 35 35 35 35 35 35 35 35 35 35 28 28
28 28 28 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 33 26 26 26 26 33 35 35 35 35 35
35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 26 26 26 30 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35
35 35 30 30 30 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 27 27 25 15 15 15 18 18 25 15 15 15 15 15 15 14 15 15 15 15 15 15 15 14 15 15 15 15 15 15 23 23 28
28 24 30 31 32 22 22 16 16 16 16 22 22 23 25 21 21 21 21 21 19 21 16 16 16 16 16 22 21 23 25 25 25 21 22 22 22 22 22 22 22
""".splitlines()
        qual = parse_qual_score(qual_raw)
        result = list(iter_fastq(fasta, qual))
        self.assertEqual(len(result), 3)
        self.assertEqual(result[0][1], 'M32Nstr_1')
        self.assertEqual(result[1][1], 'F22Frhd_2')
        self.assertEqual(result[2][1], 'F12Labi_3')

        lines = result[0][0].splitlines()
        self.assertEqual(lines[1][:5], 'CATGC')
        self.assertEqual(lines[3][:5], chr(33 + 37) * 5)
        self.assertEqual(
            lines[3][-5:],
            ''.join(map(chr, [33 + 30, 33 + 31, 33 + 32, 33 + 32, 33 + 32])))
Ejemplo n.º 6
0
    def test_iter_fastq(self):
        """iter_fastq should iterate over correct # of fasta records"""
        from StringIO import StringIO
        fasta = """>M32Nstr_1 039732_1312_3088 orig_bc=CTCGTGGAGTAG new_bc=CTCGTGGAGTAG bc_diffs=0
CATGCTGCCTCCCGTAGGAGTCTGGGCCGTATCTCAGTCCCAATGTGGCCGGTCACCCTCTCAGGCCGGCTACCCGTCAAAGCCTTGGTAAGCCACTACCCCACCAACAAGCTGATAAGCCGCGAGTCCATCCCCAACCGCCGAAACTTTCCAACCCCCACCCATGCAGCAGGAGCTCCTATCCGGTATTAGCCCCAGTTTCCTGAAGTTATCCCAAAGTCAAGGGCAGGTTACTCACGTGTTACTCACCCGTTCGCCA
>F22Frhd_2 040027_1369_1966 orig_bc=CAAGTGAGAGAG new_bc=CAAGTGAGAGAG bc_diffs=0
CATGCTGCCTCCCGTAGGAGTCTGGGCCGTATCTCAGTCCCAATGTGGCCGGTCACCCTCTCAGGCCGGCTACCCGTCAAAGCCTTGGTAAGCCACTACCCCACCAACAAGCTGATAAGCCGCGAGTCCATCCCCAACCGCCGAAACTTTCCAACCCCCACCCATGCAGCAGGAGCTCCTATCCGGTATTAGCCCCAGTTTCCTGAAGTTATCCCAAAGTCAAGGGCAGGTTACTCACGTGTTACTCACCCGTTCGCCA
>F12Labi_3 040135_0934_1957 orig_bc=AGTTAGTGCGTC new_bc=AGTTAGTGCGTC bc_diffs=0
CATGCTGCCTCCCGTAGGAGTTTGGACCGTGTCTCAGTTCCAATGTGGGGGACCTTCCTCTCAGAACCCCTACTGATCGTTGCCTTGGTGGGCCGTTACCCCGCCAACAAGCTAATCAGACGCATCCCCATCCATAACCGATAAATCTTTATTCGTAATCTCATGAGATCAAACGAATACATAAGGTATTAGTCCAACTTTGCTGGGTTAGTCCCTTACGTTATTGGGCGAGGTTGGATACGCGTTACTCACCCGTGCGCCGGTCGCCG
""".splitlines()
        qual_raw = """>039695_0364_2008 length=49 uaccno=FFLHOYS01A5986
35 35 35 35 35 35 35 35 35 32 30 30 33 33 35 35 35 35 35 34 34 34 36 36 36 36 36 35 35 36 36 36 36 36 40 37 37 37 37 38 39 38 37 38 36 35 35 35 35
>039732_1312_3088 length=271 uaccno=FFLHOYS01DHI8I
37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37
37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37
37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 38 38 33 33 34 34 36 36 37 37 35 24 19 19 19 38 38 37 37 37
37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 38 38 38 38 38 37 38 38 38 38 38 38 38 37 37 38 38 38 31 31 33 36 33 33 33 36 36 36 36 24 25 25 28 31 36 36 36 36 36 36 36 38
38 38 40 40 38 32 31 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 30 30 30 31 32 32 32
>040027_1369_1966 length=271 uaccno=FFLHOYS01DMIIO
37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37
37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 34 34 34 34 37 37 37 37 37 37
37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 26 26 24 38 32 22 22 15 15 15 15 15 20 16 16 16 38 38 37 37 37
37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 38 38 34 34 34 37 37 38 28 28 27 36 33 33 33 36 36 36 36 32 32 32 33 36 36 36 38 37 37 36 37 38
38 38 38 38 38 31 31 32 32 32 32 32 32 32 32 32 32 32 32 31 28 28 28 32 31 31 31 31 32 32 32
>040135_0934_1957 length=281 uaccno=FFLHOYS01CKBO3
33 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 40 40 40 40 38 38 38 39 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 35 35 35 35 35 35 35 35 35 35 35 35 35 28 28
28 28 28 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 33 26 26 26 26 33 35 35 35 35 35
35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 26 26 26 30 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35
35 35 30 30 30 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 27 27 25 15 15 15 18 18 25 15 15 15 15 15 15 14 15 15 15 15 15 15 15 14 15 15 15 15 15 15 23 23 28
28 24 30 31 32 22 22 16 16 16 16 22 22 23 25 21 21 21 21 21 19 21 16 16 16 16 16 22 21 23 25 25 25 21 22 22 22 22 22 22 22
""".splitlines()
        qual = parse_qual_score(qual_raw)
        result = list(iter_fastq(fasta, qual))
        self.assertEqual(len(result), 3)
        self.assertEqual(result[0][1], 'M32Nstr_1')
        self.assertEqual(result[1][1], 'F22Frhd_2')
        self.assertEqual(result[2][1], 'F12Labi_3')

        lines = result[0][0].splitlines()
        self.assertEqual(lines[1][:5], 'CATGC')
        self.assertEqual(lines[3][:5], chr(33 + 37) * 5)
        self.assertEqual(
            lines[3][-5:], ''.join(map(chr, [33 + 30, 33 + 31, 33 + 32, 33 + 32, 33 + 32])))
Ejemplo n.º 7
0
def truncate_fasta_qual(fasta_fp,
                        qual_fp,
                        output_dir,
                        base_pos):
    """ Main program function for truncating fasta and quality score files

    Parses both inputs, verifies that they describe the same sequences,
    truncates them at base_pos and writes the truncated fasta and quality
    score files.

    fasta_fp: fasta filepath
    qual_fp: quality score filepath
    output_dir: output directory, used to build the output filepaths
    base_pos: Nucleotide position to truncate the fasta and quality score at.
    """

    # Both files are opened up front (same order as before, so a missing file
    # still fails before any parsing) and are closed on exit, even on error.
    # Assumes both parsers fully consume their handle before returning.
    with open(qual_fp, "U") as qual_lines, open(fasta_fp, "U") as fasta_lines:
        # Scores are kept as strings (value_cast_f=str) rather than numbers.
        qual_scores = parse_qual_score(qual_lines, value_cast_f=str)

        # Get dict of fasta label:seq, and the sequence order (so output can
        # be in the same order as the input sequences).
        fasta_seqs, seq_order = parse_fasta_file(fasta_lines)

    # Make sure the quality scores and fasta sequences have corresponding
    # labels and base numbers
    verify_equivalency(fasta_seqs, qual_scores)

    # Truncate seqs to base_pos index
    trunc_fasta_seqs, trunc_qual_scores = truncate_seqs(fasta_seqs,
                                                        qual_scores, base_pos)

    # Get output filepaths
    fasta_out_fp, qual_out_fp = get_output_filepaths(output_dir, fasta_fp,
                                                     qual_fp)

    # Write truncated sequences out
    write_trunc_fasta(trunc_fasta_seqs, fasta_out_fp, seq_order)

    write_trunc_qual(trunc_qual_scores, qual_out_fp, seq_order)
Ejemplo n.º 8
0
def generate_histogram(qual_fp, output_dir, score_min=25):
    """ Main program function for generating quality score histogram

    Parses the quality score file, bins the scores, computes per-bin
    statistics, then plots the histogram and writes a text report.

    qual_fp: quality score filepath
    output_dir: output directory, handed to the plot and report writers
    score_min: minimum score to be considered a reliable base call, used
     to generate dotted line on histogram for easy visualization of poor
     quality scores."""

    # Use a context manager so the input handle is released even when parsing
    # fails. Assumes parse_qual_score fully consumes the handle before
    # returning (it returns a mapping of sequence id -> scores).
    # "U" keeps the original universal-newline behaviour of this Python 2 code.
    with open(qual_fp, "U") as qual_lines:
        qual_scores = parse_qual_score(qual_lines)

    # Sort bins according to base position
    qual_bins = bin_qual_scores(qual_scores)

    # Get average, std dev, and total nucleotide counts for each base position
    ave_bins, std_dev_bins, total_bases_bins = get_qual_stats(qual_bins)

    plot_qual_report(ave_bins, std_dev_bins, total_bases_bins, score_min,
                     output_dir)

    # Save values to output text file
    write_qual_report(ave_bins, std_dev_bins, total_bases_bins, output_dir)