Esempio n. 1
0
File: qa.py Progetto: wkopp/htseq
def main():

    try:
        import matplotlib
    except ImportError:
        sys.stderr.write("This script needs the 'matplotlib' library, which ")
        sys.stderr.write("was not found. Please install it." )
    matplotlib.use('PDF')
    from matplotlib import pyplot

    # Matplotlib <1.5 uses normalize, so this block will be deprecated
    try:
        from matplotlib.pyplot import Normalize
    except ImportError:
        from matplotlib.pyplot import normalize as Normalize


    # **** Parse command line ****

    optParser = optparse.OptionParser( usage = "%prog [options] read_file",
        description=
        "This script take a file with high-throughput sequencing reads " +
        "(supported formats: SAM, Solexa _export.txt, FASTQ, Solexa " +
        "_sequence.txt) and performs a simply quality assessment by " +
        "producing plots showing the distribution of called bases and " +
        "base-call quality scores by position within the reads. The " +
        "plots are output as a PDF file.",
        epilog =
        "Written by Simon Anders ([email protected]), European Molecular Biology " +
        " Laboratory (EMBL). (c) 2010. Released under the terms of the GNU General " +
        " Public License v3. Part of the 'HTSeq' framework, version %s." % HTSeq.__version__ )
    optParser.add_option( "-t", "--type", type="choice", dest="type",
        choices = ("sam", "bam", "solexa-export", "fastq", "solexa-fastq"),
        default = "sam", help="type of read_file (one of: sam [default], bam, " +
        "solexa-export, fastq, solexa-fastq)" )
    optParser.add_option( "-o", "--outfile", type="string", dest="outfile",
        help="output filename (default is <read_file>.pdf)" )
    optParser.add_option( "-r", "--readlength", type="int", dest="readlen",
        help="the maximum read length (when not specified, the script guesses from the file" )
    optParser.add_option( "-g", "--gamma", type="float", dest="gamma",
        default = 0.3,
        help="the gamma factor for the contrast adjustment of the quality score plot" )
    optParser.add_option( "-n", "--nosplit", action="store_true", dest="nosplit",
        help="do not split reads in unaligned and aligned ones" )
    optParser.add_option( "-m", "--maxqual", type="int", dest="maxqual", default=41,
        help="the maximum quality score that appears in the data (default: 41)" )

    if len( sys.argv ) == 1:
        optParser.print_help()
        sys.exit(1)

    (opts, args) = optParser.parse_args()

    if len( args ) != 1:
        sys.stderr.write( sys.argv[0] + ": Error: Please provide one argument (the read_file).\n" )
        sys.stderr.write( "  Call with '-h' to get usage information.\n" )
        sys.exit( 1 )

    readfilename = args[0]

    if opts.type == "sam":
        readfile = HTSeq.SAM_Reader( readfilename )
        isAlnmntFile = True
    elif opts.type == "bam":
        readfile = HTSeq.BAM_Reader( readfilename )
        isAlnmntFile = True
    elif opts.type == "solexa-export":
        readfile = HTSeq.SolexaExportReader( readfilename )
        isAlnmntFile = True
    elif opts.type == "fastq":
        readfile = HTSeq.FastqReader( readfilename )
        isAlnmntFile = False
    elif opts.type == "solexa-fastq":
        readfile = HTSeq.FastqReader( readfilename, "solexa" )
        isAlnmntFile = False
    else:
        sys.error( "Oops." )

    twoColumns = isAlnmntFile and not opts.nosplit

    if opts.outfile is None:
        outfilename = os.path.basename( readfilename ) + ".pdf"
    else:
        outfilename = opts.outfile


    # **** Get read length ****

    if opts.readlen is not None:
        readlen = opts.readlen
    else:
        readlen = 0
        if isAlnmntFile:
            reads = ( a.read for a in readfile )
        else:
            reads = readfile
        for r in islice( reads, 10000 ):
            if len( r ) > readlen:
                readlen = len( r )

    max_qual = opts.maxqual
    gamma = opts.gamma


    # **** Initialize count arrays ****

    base_arr_U = numpy.zeros( ( readlen, 5 ), numpy.int )
    qual_arr_U = numpy.zeros( ( readlen, max_qual+1 ), numpy.int )
    if twoColumns:
        base_arr_A = numpy.zeros( ( readlen, 5 ), numpy.int )
        qual_arr_A = numpy.zeros( ( readlen, max_qual+1 ), numpy.int )


    # **** Main counting loop ****

    i = 0
    try:
        for a in readfile:
            if isAlnmntFile:
                r = a.read
        else:
            r = a
        if twoColumns and (isAlnmntFile and a.aligned):
            r.add_bases_to_count_array( base_arr_A )
            r.add_qual_to_count_array( qual_arr_A )
        else:
            r.add_bases_to_count_array( base_arr_U )
            r.add_qual_to_count_array( qual_arr_U )
        i += 1
        if (i % 200000) == 0:
            print(i, "reads processed")
    except:
        sys.stderr.write( "Error occured in: %s\n" %
            readfile.get_line_number_string() )
        raise
    print(i, "reads processed")


    # **** Normalize result ****

    def norm_by_pos( arr ):
        arr = numpy.array( arr, numpy.float )
        arr_n = ( arr.T / arr.sum( 1 ) ).T
        arr_n[ arr == 0 ] = 0
        return arr_n

    def norm_by_start( arr ):
        arr = numpy.array( arr, numpy.float )
        arr_n = ( arr.T / arr.sum( 1 )[ 0 ] ).T
        arr_n[ arr == 0 ] = 0
        return arr_n


    base_arr_U_n = norm_by_pos( base_arr_U )
    qual_arr_U_n = norm_by_start( qual_arr_U )
    nreads_U = base_arr_U[0,:].sum()
    if twoColumns:
        base_arr_A_n = norm_by_pos( base_arr_A )
        qual_arr_A_n = norm_by_start( qual_arr_A )
        nreads_A = base_arr_A[0,:].sum()


    # **** Make plot ****

    def plot_bases( arr ):
        xg = numpy.arange( readlen )
        pyplot.plot( xg, arr[ : , 0 ], marker='.', color='red')
        pyplot.plot( xg, arr[ : , 1 ], marker='.', color='darkgreen')
        pyplot.plot( xg, arr[ : , 2 ], marker='.',color='lightgreen')
        pyplot.plot( xg, arr[ : , 3 ], marker='.',color='orange')
        pyplot.plot( xg, arr[ : , 4 ], marker='.',color='grey')
        pyplot.axis( (0, readlen-1, 0, 1 ) )
        pyplot.text( readlen*.70, .9, "A", color="red" )
        pyplot.text( readlen*.75, .9, "C", color="darkgreen" )
        pyplot.text( readlen*.80, .9, "G", color="lightgreen" )
        pyplot.text( readlen*.85, .9, "T", color="orange" )
        pyplot.text( readlen*.90, .9, "N", color="grey" )

    pyplot.figure()
    pyplot.subplots_adjust( top=.85 )
    pyplot.suptitle( os.path.basename(readfilename), fontweight='bold' )

    if twoColumns:

        pyplot.subplot( 221 )
        plot_bases( base_arr_U_n )
        pyplot.ylabel( "proportion of base" )
        pyplot.title( "non-aligned reads\n%.0f%% (%.3f million)" %
        ( 100. * nreads_U / (nreads_U+nreads_A), nreads_U / 1e6 ) )

        pyplot.subplot( 222 )
        plot_bases( base_arr_A_n )
        pyplot.title( "aligned reads\n%.0f%% (%.3f million)" %
        ( 100. * nreads_A / (nreads_U+nreads_A), nreads_A / 1e6 ) )

        pyplot.subplot( 223 )
        pyplot.pcolor( qual_arr_U_n.T ** gamma, cmap=pyplot.cm.Greens,
        norm=Normalize( 0, 1 ) )
        pyplot.axis( (0, readlen-1, 0, max_qual+1 ) )
        pyplot.xlabel( "position in read" )
        pyplot.ylabel( "base-call quality score" )

        pyplot.subplot( 224 )
        pyplot.pcolor( qual_arr_A_n.T ** gamma, cmap=pyplot.cm.Greens,
        norm=Normalize( 0, 1 ) )
        pyplot.axis( (0, readlen-1, 0, max_qual+1 ) )
        pyplot.xlabel( "position in read" )

    else:

        pyplot.subplot( 211 )
        plot_bases( base_arr_U_n )
        pyplot.ylabel( "proportion of base" )
        pyplot.title( "%.3f million reads" % ( nreads_U / 1e6 ) )

        pyplot.subplot( 212 )
        pyplot.pcolor( qual_arr_U_n.T ** gamma, cmap=pyplot.cm.Greens,
        norm=Normalize( 0, 1 ) )
        pyplot.axis( (0, readlen-1, 0, max_qual+1 ) )
        pyplot.xlabel( "position in read" )
        pyplot.ylabel( "base-call quality score" )


    pyplot.savefig( outfilename )
Esempio n. 2
0
def compute_quality(
        readfilename,
        file_type,
        nosplit,
        readlen,
        max_qual,
        gamma,
        primary_only=False,
        max_records=-1,
        ):

    if file_type in ("sam", "bam"):
        readfile = HTSeq.BAM_Reader(readfilename)
        isAlnmntFile = True
    elif file_type == "solexa-export":
        readfile = HTSeq.SolexaExportReader(readfilename)
        isAlnmntFile = True
    elif file_type == "fastq":
        readfile = HTSeq.FastqReader(readfilename)
        isAlnmntFile = False
    elif file_type == "solexa-fastq":
        readfile = HTSeq.FastqReader(readfilename, "solexa")
        isAlnmntFile = False
    else:
        raise ValueError('File format not recognized: {:}'.format(file_type))

    twoColumns = isAlnmntFile and (not nosplit)

    if readlen is None:
        readlen = get_read_length(readfile, isAlnmntFile)

    # Initialize count arrays
    base_arr_U = np.zeros((readlen, 5), np.int64)
    qual_arr_U = np.zeros((readlen, max_qual+1), np.int64)
    if twoColumns:
        base_arr_A = np.zeros((readlen, 5), np.int64)
        qual_arr_A = np.zeros((readlen, max_qual+1), np.int64)

    # Main counting loop
    i = 0
    try:
        for a in readfile:
            if isAlnmntFile:
                r = a.read
            else:
                r = a

            # Exclude non-primary alignments if requested
            if isAlnmntFile and primary_only:
                if a.aligned and a.not_primary_alignment:
                    continue

            if twoColumns and isAlnmntFile and a.aligned:
                r.add_bases_to_count_array(base_arr_A)
                r.add_qual_to_count_array(qual_arr_A)
            else:
                r.add_bases_to_count_array(base_arr_U)
                r.add_qual_to_count_array(qual_arr_U)

            i += 1

            if i == max_records:
                break

            if (i % 200000) == 0:
                if (not isAlnmntFile) or primary_only:
                    print(i, "reads processed")
                else:
                    print(i, "alignments processed")

    except:
        sys.stderr.write("Error occured in: %s\n" %
                         readfile.get_line_number_string())
        raise

    if (not isAlnmntFile) or primary_only:
        print(i, "reads processed")
    else:
        print(i, "alignments processed")

    # Normalize result
    def norm_by_pos(arr):
        arr = np.array(arr, np.float64)
        arr_n = (arr.T / arr.sum(1)).T
        arr_n[arr == 0] = 0
        return arr_n

    def norm_by_start(arr):
        arr = np.array(arr, np.float64)
        arr_n = (arr.T / arr.sum(1)[0]).T
        arr_n[arr == 0] = 0
        return arr_n

    result = {
        'isAlnmntFile': isAlnmntFile,
        'readlen': readlen,
        'twoColumns': twoColumns,
        'base_arr_U_n': norm_by_pos(base_arr_U),
        'qual_arr_U_n': norm_by_start(qual_arr_U),
        'nreads_U': base_arr_U[0, :].sum(),
        }

    if twoColumns:
        result['base_arr_A_n'] = norm_by_pos(base_arr_A)
        result['qual_arr_A_n'] = norm_by_start(qual_arr_A)
        result['nreads_A'] = base_arr_A[0, :].sum()

    return result