コード例 #1
def stats(fastq_file):
    """Generate basic stats from FASTQ file

    Loops over every read in the FASTQ file and prints to stdout:

      - the total number of reads
      - the read length distribution (number of reads per length)
      - the index sequence distribution (number of reads per index
        sequence; reads whose index sequence is None are not counted)

    Arguments:
      fastq_file: FASTQ file to read (passed to FASTQFile.FastqIterator)

    Returns:
      No return value
    """
    # Loop over all reads in the FASTQ
    n_reads = 0
    read_lengths = {}
    index_sequences = {}
    for read in FASTQFile.FastqIterator(fastq_file):
        # Count of reads
        n_reads += 1
        # Read length distribution
        # (dict.get with a default of 0 handles the first occurrence, so
        # an existing count is incremented rather than reset to 1)
        read_len = len(read.sequence)
        read_lengths[read_len] = read_lengths.get(read_len, 0) + 1
        # Index sequence distribution
        index_seq = read.seqid.index_sequence
        if index_seq is not None:
            index_sequences[index_seq] = index_sequences.get(index_seq, 0) + 1
    # Finished
    # Single-argument print(...) produces the same output whether print
    # is the Python 2 statement or the Python 3 function
    print("Total reads: %d" % n_reads)
    print("Read lengths")
    for len_ in read_lengths:
        print("\t%d: %d" % (len_, read_lengths[len_]))
    print("Index sequences")
    for seq in index_sequences:
        print("\t%s: %d" % (seq, index_sequences[seq]))
コード例 #2
ファイル: fastq_edit.py プロジェクト: mamanambiya/genomics
def edit_instrument_name(fastq_file,new_instrument_name):
    """Edit the instrument name for all records in FASTQ file

    Loop over all records in a supplied FASTQ file, update the sequence identifier
    (i.e. first line in the each record) by changing the instrument name, and write
    the updated records to stdout.

    Arguments:
      fastq_file: FASTQ file to read (passed to FASTQFile.FastqIterator)
      new_instrument_name: replacement instrument name; if this is empty
        or None then the records are echoed without modification

    Returns:
      No return value
    """
    # Loop over all reads in the FASTQ
    # Update the instrument name in the sequence identifier and echo to stdout
    for read in FASTQFile.FastqIterator(fastq_file):
        if new_instrument_name:
            # Modify the instrument name
            read.seqid.instrument_name = new_instrument_name
        # Echo updated read to stdout
        # (single-argument print(...) gives the same output under both
        # the Python 2 print statement and the Python 3 print function)
        print(read)
コード例 #3
        "N_SUBSET reads. (Quicker than using all reads but may not be accurate "
        "if subset is not representative of the file as a whole.)")

    # Process the command line
    options, arguments = p.parse_args()
    if len(arguments) != 1:
        p.error("input FASTQ file required")
        fastq_file = arguments[0]
        if not os.path.exists(fastq_file):
            p.error("Input file '%s' not found" % fastq_file)

    # Get broad format type
    print "Sniffing %s" % fastq_file
    print "\nData from first read:"
    for read in FASTQFile.FastqIterator(fastq_file):
        fastq_format = read.seqid.format
        if fastq_format is None and read.is_colorspace:
            fastq_format = 'colorspace'
        print "\tHeader format:\t%s" % str(fastq_format)
        print "\tSeq length:\t%d" % read.seqlen

    # Determine the quality score range (and count reads)
        n_subset = int(options.n_subset)
    except TypeError:
        n_subset = None
    n_reads = 0
    min_max_qual = (None, None)
    for read in FASTQFile.FastqIterator(fastq_file):
コード例 #4
def demultiplex_fastq(fastq_file, barcodes, nmismatches):
    """Perform demultiplexing of a FASTQ file

    Demultiplex reads in a FASTQ file given information about a set of
    barcode/index sequences.

    Produces a file for each barcode, plus another for 'unbinned'
    reads (i.e. reads which didn't match any of the barcodes). A read
    that matches more than one barcode is written to the file for each
    barcode that it matches.

    Arguments:
      fastq_file: FASTQ file to be demultiplexed (can be gzipped)
      barcodes: list of barcode sequences to use for demultiplexing;
        each entry is looked up with the keys 'lane', 'name', 'index'
        and 'matcher' (the 'matcher' value must provide a
        match(sequence, nmismatches) method)
      nmismatches: maximum number of mismatched bases allowed when
        testing whether barcode sequences match

    Returns:
      No return value

    Raises:
      SystemExit: if any of the output files already exists
    """
    # Start
    print("Processing %s" % fastq_file)
    info = IlluminaData.IlluminaFastq(fastq_file)
    # Set up output files
    output_files = {}
    nreads = 0
    try:
        # Weed out barcodes that aren't associated with this lane
        local_barcodes = []
        for barcode in barcodes:
            if barcode['lane'] != info.lane_number:
                continue
            output_file_name = "%s_%s_L%03d_R%d_%03d.fastq" % (
                barcode['name'], barcode['index'], info.lane_number,
                info.read_number, info.set_number)
            print("\t%s\t%s" % (barcode['index'], output_file_name))
            if os.path.exists(output_file_name):
                # Refuse to overwrite existing output
                # NOTE(review): exit status 1 is an assumption -- confirm
                print("\t%s: already exists,exiting" % output_file_name)
                raise SystemExit(1)
            output_files[barcode['index']] = open(output_file_name, 'w')
            local_barcodes.append(barcode)
        # Check if there's anything to do
        if not local_barcodes:
            return
        # Also make a file for unbinned reads
        unbinned_file_name = "unbinned_L%03d_R%d_%03d.fastq" % (
            info.lane_number, info.read_number, info.set_number)
        if os.path.exists(unbinned_file_name):
            print("\t%s: already exists,exiting" % unbinned_file_name)
            raise SystemExit(1)
        output_files['unbinned'] = open(unbinned_file_name, 'w')
        # Process reads
        for read in FASTQFile.FastqIterator(fastq_file):
            nreads += 1
            matched_read = False
            this_barcode = read.seqid.index_sequence
            for barcode in local_barcodes:
                if barcode['matcher'].match(this_barcode, nmismatches):
                    output_files[barcode['index']].write(str(read) + '\n')
                    matched_read = True
            # Put in unbinned if no match
            if not matched_read:
                output_files['unbinned'].write(str(read) + '\n')
    finally:
        # Close every file that was opened (including 'unbinned'), even
        # when bailing out early because an output file already exists
        for output_file in output_files.values():
            output_file.close()
    # NOTE(review): nreads counts every read processed, including those
    # written to 'unbinned', although the message says "Matched"
    print("\tMatched %d reads for %s" % (nreads, os.path.basename(fastq_file)))