def stats(fastq_file):
    """Generate basic stats from FASTQ file

    Makes a single pass over all the reads in the FASTQ file and
    prints a report to stdout consisting of:

    - the total number of reads
    - the read length distribution (one 'length: count' line per
      distinct sequence length)
    - the index sequence distribution (one 'sequence: count' line
      per distinct index sequence); reads where the sequence
      identifier's 'index_sequence' is None are not counted here

    Both distributions are listed in sorted key order so that the
    report is the same from run to run.

    Arguments:
      fastq_file: FASTQ file to gather the statistics from

    Returns:
      No return value
    """
    # Loop over all reads in the FASTQ
    n_reads = 0
    read_lengths = {}
    index_sequences = {}
    for read in FASTQFile.FastqIterator(fastq_file):
        # Count of reads
        n_reads += 1
        # Read length distribution
        read_len = len(read.sequence)
        read_lengths[read_len] = read_lengths.get(read_len, 0) + 1
        # Index sequence distribution (skip reads without an index)
        index_seq = read.seqid.index_sequence
        if index_seq is not None:
            index_sequences[index_seq] = \
                index_sequences.get(index_seq, 0) + 1
    # Finished: write the report
    # (print is always given a single parenthesised expression, which
    # produces the same output as a print statement or print function)
    print("Total reads: %d" % n_reads)
    print("Read lengths")
    for len_ in sorted(read_lengths):
        print("\t%d: %d" % (len_, read_lengths[len_]))
    print("Index sequences")
    for seq in sorted(index_sequences):
        print("\t%s: %d" % (seq, index_sequences[seq]))
def edit_instrument_name(fastq_file, new_instrument_name):
    """Rewrite the instrument name of every record in a FASTQ file

    Reads each record from the supplied FASTQ file in turn. If a new
    instrument name was given (i.e. it is not empty/None) then the
    instrument name held in the record's sequence identifier (the
    first line of the record) is replaced with it. Every record is
    then written to stdout, whether it was modified or not.

    Arguments:
      fastq_file: FASTQ file to read the records from
      new_instrument_name: replacement instrument name; if this is
        false-like then records are echoed unchanged

    Returns:
      No return value
    """
    # Decide once whether the identifiers need to be altered
    replace_name = bool(new_instrument_name)
    for record in FASTQFile.FastqIterator(fastq_file):
        if replace_name:
            # Swap in the new instrument name
            record.seqid.instrument_name = new_instrument_name
        # Echo the (possibly updated) record to stdout
        print(record)
fastqs = sample.fastq_subset(read_number=1) + \ sample.fastq_subset(read_number=2) for fastq in fastqs: print "\t\t%s" % fastq # Report the names of the samples in each project if options.report: for project in illumina_data.projects: print "%s" % describe_project(project) # Report statistics for fastq files if options.stats: # Print number of reads for each file, and file size for sample in project.samples: for fastq in sample.fastq: fq = os.path.join(sample.dirn, fastq) nreads = FASTQFile.nreads(fq) fsize = os.path.getsize(fq) print "%s\t%s\t%d" % ( fastq, bcf_utils.format_file_size(fsize), nreads) print "" # Summary: short report suitable for logging file if options.summary: print "%s" % summarise_projects(illumina_data) # Print number of undetermined reads if options.stats and illumina_data.undetermined is not None: print "Undetermined indices" for lane in illumina_data.undetermined.samples: for fastq in lane.fastq: fq = os.path.join(lane.dirn, fastq)
"N_SUBSET reads. (Quicker than using all reads but may not be accurate " "if subset is not representative of the file as a whole.)") # Process the command line options, arguments = p.parse_args() if len(arguments) != 1: p.error("input FASTQ file required") else: fastq_file = arguments[0] if not os.path.exists(fastq_file): p.error("Input file '%s' not found" % fastq_file) # Get broad format type print "Sniffing %s" % fastq_file print "\nData from first read:" for read in FASTQFile.FastqIterator(fastq_file): fastq_format = read.seqid.format if fastq_format is None and read.is_colorspace: fastq_format = 'colorspace' print "\tHeader format:\t%s" % str(fastq_format) print "\tSeq length:\t%d" % read.seqlen break # Determine the quality score range (and count reads) try: n_subset = int(options.n_subset) except TypeError: n_subset = None n_reads = 0 min_max_qual = (None, None) for read in FASTQFile.FastqIterator(fastq_file):
#######################################################################
# Main program
#######################################################################

if __name__ == "__main__":
    # Build the command line parser (no options are defined here, the
    # script only takes two positional arguments)
    p = optparse.OptionParser(
        usage="%prog OPTIONS R1.fastq R2.fastq",
        version="%prog " + __version__,
        description="Check that read headers for R1 and R2 fastq files "
        "are in agreement, and that the files form an R1/2 pair.")
    options, args = p.parse_args()
    # Exactly two FASTQ file names must have been supplied
    if len(args) != 2:
        p.error("expected two arguments (R1 and R2 fastq files to compare)")
    fastq_file_r1, fastq_file_r2 = args
    # The exit status carries the result: 0 if the files form an
    # R1/R2 pair, 1 (plus a logged error) if they don't
    if not FASTQFile.fastqs_are_pair(fastq_file_r1, fastq_file_r2):
        logging.error("Not R1/R2 pair")
        sys.exit(1)
    sys.exit(0)
fastqs = sample.fastq_subset(read_number=1) + \ sample.fastq_subset(read_number=2) for fastq in fastqs: print "\t\t%s" % fastq # Report the names of the samples in each project if options.report: for project in illumina_data.projects: print "%s" % describe_project(project) # Report statistics for fastq files if options.stats: # Print number of reads for each file, and file size for sample in project.samples: for fastq in sample.fastq: fq = os.path.join(sample.dirn,fastq) nreads = FASTQFile.nreads(fq) fsize = os.path.getsize(fq) print "%s\t%s\t%d" % (fastq, bcf_utils.format_file_size(fsize), nreads) print "" # Summary: short report suitable for logging file if options.summary: print "%s" % summarise_projects(illumina_data) # Print number of undetermined reads if options.stats and illumina_data.undetermined is not None: print "Undetermined indices" for lane in illumina_data.undetermined.samples: for fastq in lane.fastq:
def demultiplex_fastq(fastq_file, barcodes, nmismatches):
    """Perform demultiplexing of a FASTQ file

    Demultiplex reads in a FASTQ file given information about a set of
    barcode/index sequences.

    Produces a file for each barcode, plus another for 'unbinned' reads
    (i.e. reads which didn't match any of the barcodes). The output
    files are created relative to the current working directory and
    are named after the barcode name and index plus the lane, read
    and set numbers taken from the input FASTQ name.

    Only barcodes associated with the same lane as the input FASTQ
    are used; if there are none then nothing is written.

    Each read is written to the file for the first barcode that it
    matches. On completion the number of reads that matched a barcode
    is reported.

    Arguments:
      fastq_file: FASTQ file to be demultiplexed (can be gzipped)
      barcodes: list of barcodes to use for demultiplexing; each one
        is indexed by the keys 'lane', 'name', 'index' and 'matcher'
        (where 'matcher' is an object providing a
        'match(sequence, nmismatches)' method)
      nmismatches: maximum number of mismatched bases allowed when
        testing whether barcode sequences match

    Returns:
      No return value

    Exits (via sys.exit) with status 1 if any of the output files
    already exists.
    """
    # Start
    print("Processing %s" % fastq_file)
    info = IlluminaData.IlluminaFastq(fastq_file)
    # Open output files, keyed by barcode index (plus 'unbinned')
    output_files = {}
    nmatched = 0
    try:
        # Weed out barcodes that aren't associated with this lane
        local_barcodes = []
        for barcode in barcodes:
            if barcode['lane'] != info.lane_number:
                continue
            local_barcodes.append(barcode)
            output_file_name = "%s_%s_L%03d_R%d_%03d.fastq" % (
                barcode['name'], barcode['index'], info.lane_number,
                info.read_number, info.set_number)
            print("\t%s\t%s" % (barcode['index'], output_file_name))
            if os.path.exists(output_file_name):
                print("\t%s: already exists,exiting" % output_file_name)
                sys.exit(1)
            output_files[barcode['index']] = open(output_file_name, 'w')
        # Check if there's anything to do
        if not local_barcodes:
            return
        # Also make a file for unbinned reads
        unbinned_file_name = "unbinned_L%03d_R%d_%03d.fastq" % (
            info.lane_number, info.read_number, info.set_number)
        if os.path.exists(unbinned_file_name):
            print("\t%s: already exists,exiting" % unbinned_file_name)
            sys.exit(1)
        output_files['unbinned'] = open(unbinned_file_name, 'w')
        # Process reads
        for read in FASTQFile.FastqIterator(fastq_file):
            this_barcode = read.seqid.index_sequence
            for barcode in local_barcodes:
                if barcode['matcher'].match(this_barcode, nmismatches):
                    output_files[barcode['index']].write(str(read) + '\n')
                    nmatched += 1
                    break
            else:
                # Put in unbinned if no match
                output_files['unbinned'].write(str(read) + '\n')
    finally:
        # Close every file that was opened, including the 'unbinned'
        # one, even when leaving early via return or sys.exit
        for fp in output_files.values():
            fp.close()
    # Report the number of reads assigned to a barcode (rather than
    # the total number of reads that were processed)
    print("\tMatched %d reads for %s" % (nmatched,
                                         os.path.basename(fastq_file)))
os.path.normpath(os.path.join(os.path.dirname(sys.argv[0]), '..', 'share'))) sys.path.append(SHARE_DIR) import FASTQFile ####################################################################### # Main program ####################################################################### if __name__ == "__main__": # Create command line parser p = optparse.OptionParser( usage="%prog OPTIONS R1.fastq R2.fastq", version="%prog " + __version__, description="Check that read headers for R1 and R2 fastq files " "are in agreement, and that the files form an R1/2 pair.") # Parse command line options, args = p.parse_args() # Get data directory name if len(args) != 2: p.error("expected two arguments (R1 and R2 fastq files to compare)") fastq_file_r1 = args[0] fastq_file_r2 = args[1] # Process the data if FASTQFile.fastqs_are_pair(fastq_file_r1, fastq_file_r2): sys.exit(0) else: logging.error("Not R1/R2 pair") sys.exit(1)