Ejemplo n.º 1
0
def stats(fastq_file):
    """Generate basic stats from FASTQ file

    Reads every record in the FASTQ file and prints to stdout the total
    number of reads, the distribution of read lengths, and the number of
    reads seen for each index sequence. Reads whose index sequence is
    None are left out of the index sequence counts.

    Arguments:
      fastq_file: FASTQ file to report on (handed directly to
        FASTQFile.FastqIterator)

    Returns:
      No return value
    """
    # Loop over all reads in the FASTQ
    n_reads = 0
    read_lengths = {}
    index_sequences = {}
    for read in FASTQFile.FastqIterator(fastq_file):
        # Count of reads
        n_reads += 1
        # Read length distribution
        read_len = len(read.sequence)
        read_lengths[read_len] = read_lengths.get(read_len, 0) + 1
        # Index sequence distribution
        index_seq = read.seqid.index_sequence
        if index_seq is not None:
            index_sequences[index_seq] = index_sequences.get(index_seq, 0) + 1
    # Finished: report the results
    # (single-argument parenthesised prints behave the same whether
    # 'print' is a statement or a function)
    print("Total reads: %d" % n_reads)
    print("Read lengths")
    # Sort the keys so the report order is reproducible between runs
    for len_ in sorted(read_lengths):
        print("\t%d: %d" % (len_, read_lengths[len_]))
    print("Index sequences")
    for seq in sorted(index_sequences):
        print("\t%s: %d" % (seq, index_sequences[seq]))
Ejemplo n.º 2
0
def edit_instrument_name(fastq_file,new_instrument_name):
    """Rewrite the instrument name in every record of a FASTQ file

    Iterates over the records in the supplied FASTQ file. When a non-empty
    instrument name is given, it is assigned to the instrument name of each
    record's sequence identifier. Every record is then printed to stdout
    (unmodified if no new name was supplied).
    """
    # Decide once whether the records need to be modified
    replace_name = bool(new_instrument_name)
    for record in FASTQFile.FastqIterator(fastq_file):
        if replace_name:
            # Overwrite the instrument name in the sequence identifier
            record.seqid.instrument_name = new_instrument_name
        # Write the (possibly updated) record to stdout
        print(record)
Ejemplo n.º 3
0
                fastqs = sample.fastq_subset(read_number=1) + \
                         sample.fastq_subset(read_number=2)
                for fastq in fastqs:
                    print "\t\t%s" % fastq

    # Per-project report: one description per project, optionally followed
    # by a line of statistics for each fastq file in the project
    if options.report:
        for project in illumina_data.projects:
            print("%s" % describe_project(project))
            if options.stats:
                # One tab-separated line per file: name, file size and
                # number of reads
                for sample in project.samples:
                    for fastq in sample.fastq:
                        fq = os.path.join(sample.dirn, fastq)
                        nreads = FASTQFile.nreads(fq)
                        fsize = os.path.getsize(fq)
                        size_label = bcf_utils.format_file_size(fsize)
                        print("%s\t%s\t%d" % (fastq, size_label, nreads))
            # Blank line after each project
            print("")

    # Summary: print the short summary of the projects (intended for
    # logging files)
    if options.summary:
        print("%s" % summarise_projects(illumina_data))

    # Print number of undetermined reads
    if options.stats and illumina_data.undetermined is not None:
        print "Undetermined indices"
        for lane in illumina_data.undetermined.samples:
            for fastq in lane.fastq:
                fq = os.path.join(lane.dirn, fastq)
Ejemplo n.º 4
0
        "N_SUBSET reads. (Quicker than using all reads but may not be accurate "
        "if subset is not representative of the file as a whole.)")

    # Parse the command line: exactly one positional argument (the input
    # FASTQ file) is expected, and it must exist on disk
    options, arguments = p.parse_args()
    if len(arguments) == 1:
        (fastq_file,) = arguments
        if not os.path.exists(fastq_file):
            p.error("Input file '%s' not found" % fastq_file)
    else:
        p.error("input FASTQ file required")

    # Report the header format and sequence length of the first read
    print("Sniffing %s" % fastq_file)
    print("\nData from first read:")
    for read in FASTQFile.FastqIterator(fastq_file):
        fastq_format = read.seqid.format
        # When the header yields no format (None), fall back to
        # 'colorspace' if the read is flagged as colorspace
        if fastq_format is None and read.is_colorspace:
            fastq_format = 'colorspace'
        print("\tHeader format:\t%s" % str(fastq_format))
        print("\tSeq length:\t%d" % read.seqlen)
        # Only the first read is examined
        break

    # Determine the quality score range (and count reads)
    try:
        n_subset = int(options.n_subset)
    except TypeError:
        n_subset = None
    n_reads = 0
    min_max_qual = (None, None)
    for read in FASTQFile.FastqIterator(fastq_file):
Ejemplo n.º 5
0
#######################################################################
# Main program
#######################################################################

if __name__ == "__main__":

    # Set up the command line parser
    p = optparse.OptionParser(
        usage="%prog OPTIONS R1.fastq R2.fastq",
        version="%prog "+__version__,
        description="Check that read headers for R1 and R2 fastq files "
        "are in agreement, and that the files form an R1/2 pair.")
    options,args = p.parse_args()
    # Exactly two positional arguments are required: the R1 and R2
    # fastq files
    if len(args) != 2:
        p.error("expected two arguments (R1 and R2 fastq files to compare)")
    fastq_file_r1, fastq_file_r2 = args
    # The exit status carries the result: 0 if the files form an R1/R2
    # pair, 1 (plus a logged error) otherwise
    if not FASTQFile.fastqs_are_pair(fastq_file_r1,fastq_file_r2):
        logging.error("Not R1/R2 pair")
        sys.exit(1)
    sys.exit(0)
        
        
        
            
        
    
Ejemplo n.º 6
0
                fastqs = sample.fastq_subset(read_number=1) + \
                         sample.fastq_subset(read_number=2)
                for fastq in fastqs:
                    print "\t\t%s" % fastq

    # Per-project report: one description per project, optionally followed
    # by a line of statistics for each fastq file in the project
    if options.report:
        for project in illumina_data.projects:
            print("%s" % describe_project(project))
            if options.stats:
                # One tab-separated line per file: name, file size and
                # number of reads
                for sample in project.samples:
                    for fastq in sample.fastq:
                        fq = os.path.join(sample.dirn,fastq)
                        nreads = FASTQFile.nreads(fq)
                        fsize = os.path.getsize(fq)
                        size_label = bcf_utils.format_file_size(fsize)
                        print("%s\t%s\t%d" % (fastq, size_label, nreads))
            # Blank line after each project
            print("")

    # Summary: print the short summary of the projects (intended for
    # logging files)
    if options.summary:
        print("%s" % summarise_projects(illumina_data))

    # Print number of undetermined reads
    if options.stats and illumina_data.undetermined is not None:
        print "Undetermined indices"
        for lane in illumina_data.undetermined.samples:
            for fastq in lane.fastq:
Ejemplo n.º 7
0
def demultiplex_fastq(fastq_file, barcodes, nmismatches):
    """Perform demultiplexing of a FASTQ file

    Demultiplex reads in a FASTQ file given information about a set of
    barcode/index sequences.

    Produces a file for each barcode, plus another for 'unbinned'
    reads. Output file names are relative (no directory component). If
    any of them already exists the program exits with status 1 rather
    than overwrite it. If no barcode belongs to the lane of the input
    file the function returns without reading any reads.

    Arguments:
      fastq_file: FASTQ file to be demultiplexed (can be gzipped)
      barcodes: list of barcode definitions to use for demultiplexing;
        each one is indexed with the keys 'lane', 'name', 'index' and
        'matcher' (an object providing 'match(sequence, nmismatches)')
      nmismatches: maximum number of mismatched bases allowed when
        testing whether barcode sequences match

    Returns:
      No return value
    """
    # Start
    print("Processing %s" % fastq_file)
    info = IlluminaData.IlluminaFastq(fastq_file)
    # Open output files, keyed by barcode index (plus 'unbinned')
    output_files = {}
    try:
        # Weed out barcodes that aren't associated with this lane
        local_barcodes = []
        for barcode in barcodes:
            if barcode['lane'] != info.lane_number:
                continue
            local_barcodes.append(barcode)
            output_file_name = "%s_%s_L%03d_R%d_%03d.fastq" % (
                barcode['name'], barcode['index'], info.lane_number,
                info.read_number, info.set_number)
            print("\t%s\t%s" % (barcode['index'], output_file_name))
            if os.path.exists(output_file_name):
                print("\t%s: already exists,exiting" % output_file_name)
                sys.exit(1)
            output_files[barcode['index']] = open(output_file_name, 'w')
        # Check if there's anything to do
        if not local_barcodes:
            return
        # Also make a file for unbinned reads
        unbinned_file_name = "unbinned_L%03d_R%d_%03d.fastq" % (
            info.lane_number, info.read_number, info.set_number)
        if os.path.exists(unbinned_file_name):
            print("\t%s: already exists,exiting" % unbinned_file_name)
            sys.exit(1)
        output_files['unbinned'] = open(unbinned_file_name, 'w')
        # Process reads
        nreads = 0
        for read in FASTQFile.FastqIterator(fastq_file):
            nreads += 1
            matched_read = False
            this_barcode = read.seqid.index_sequence
            # A read goes to the first barcode that matches it
            for barcode in local_barcodes:
                if barcode['matcher'].match(this_barcode, nmismatches):
                    output_files[barcode['index']].write(str(read) + '\n')
                    matched_read = True
                    break
            # Put in unbinned if no match
            if not matched_read:
                output_files['unbinned'].write(str(read) + '\n')
    finally:
        # Close every file that was opened, including 'unbinned', and
        # also when leaving early via an exception or sys.exit
        for fp in output_files.values():
            fp.close()
    # NOTE: nreads counts every read processed, including those that
    # were written to the unbinned file
    print("\tMatched %d reads for %s" % (nreads, os.path.basename(fastq_file)))
Ejemplo n.º 8
0
    os.path.normpath(os.path.join(os.path.dirname(sys.argv[0]), '..',
                                  'share')))
sys.path.append(SHARE_DIR)
import FASTQFile

#######################################################################
# Main program
#######################################################################

if __name__ == "__main__":

    # Build the command line parser
    p = optparse.OptionParser(
        usage="%prog OPTIONS R1.fastq R2.fastq",
        version="%prog " + __version__,
        description="Check that read headers for R1 and R2 fastq files "
        "are in agreement, and that the files form an R1/2 pair.")
    options, args = p.parse_args()
    # The two positional arguments are the R1 and R2 fastq files
    if len(args) != 2:
        p.error("expected two arguments (R1 and R2 fastq files to compare)")
    fastq_file_r1, fastq_file_r2 = args
    # Check the files and report the outcome through the exit status:
    # 0 when they form an R1/R2 pair, 1 (with a logged error) when not
    is_pair = FASTQFile.fastqs_are_pair(fastq_file_r1, fastq_file_r2)
    if not is_pair:
        logging.error("Not R1/R2 pair")
    sys.exit(0 if is_pair else 1)