def split_paired_end(matcher, fastq_pairs, base_name=None, output_dir=None):
    """
    Demultiplex paired-end Fastqs by index sequence

    Each R1/R2 read pair from every entry of 'fastq_pairs' is looked
    up in the BarcodeMatcher 'matcher' using the index sequence of
    the R1 read, and both reads are written to the R1/R2 output
    Fastqs of the barcode they were assigned to. Pairs that the
    matcher does not assign go to the 'undetermined' Fastqs.

    Arguments:
      matcher (BarcodeMatcher): barcode matcher instance
      fastq_pairs (list): list of (R1, R2) Fastq pairs to split
      base_name (str): optional, base name to use for output
        Fastq files
      output_dir (str): optional, path to directory to write
        output Fastqs to

    Raises:
      Exception: if an R1 read has no index sequence, or if the
        index sequences of the R1 and R2 reads differ.
    """
    prefix = '' if base_name is None else "%s." % base_name
    # One R1 and one R2 output per known barcode, plus a catch-all pair
    outputs = OutputFiles(base_dir=output_dir)
    for barcode in matcher.sequences:
        outputs.open((barcode, 'R1'), "%s%s_R1.fastq" % (prefix, barcode))
        outputs.open((barcode, 'R2'), "%s%s_R2.fastq" % (prefix, barcode))
    outputs.open(('undetermined', 'R1'), "%sundetermined_R1.fastq" % prefix)
    outputs.open(('undetermined', 'R2'), "%sundetermined_R2.fastq" % prefix)
    # Running total of read pairs across all input pairs
    pairs_seen = 0
    for r1_fastq, r2_fastq in fastq_pairs:
        print("Processing reads from fastq pair %s %s" % (r1_fastq, r2_fastq))
        paired_reads = zip(FASTQFile.FastqIterator(r1_fastq),
                           FASTQFile.FastqIterator(r2_fastq))
        for r1_read, r2_read in paired_reads:
            pairs_seen += 1
            index_seq = r1_read.seqid.index_sequence
            if not index_seq:
                raise Exception("%s: no index sequence for read %d" %
                                (r1_fastq, pairs_seen))
            if index_seq != r2_read.seqid.index_sequence:
                raise Exception("Index sequence mismatch between R1 and "
                                "R2 reads")
            matched = matcher.match(index_seq)
            # Unassigned pairs are routed to the catch-all outputs
            target = 'undetermined' if matched is None else matched
            logging.debug("Assigned read #%d to %s" % (pairs_seen, target))
            outputs.write((target, 'R1'), r1_read)
            outputs.write((target, 'R2'), r2_read)
    print("Finished (%d read pairs processed)" % pairs_seen)
def stats(fastq_file):
    """Report basic statistics for a FASTQ file

    Iterates over every read in 'fastq_file' and prints to stdout
    the total number of reads, the distribution of read lengths,
    and the number of times each index sequence occurs. Reads whose
    index sequence is None are left out of the index sequence tally.

    Arguments:
      fastq_file: FASTQ file to read (passed to
        FASTQFile.FastqIterator)

    Returns:
      None; the report is written to stdout.
    """
    n_reads = 0
    read_lengths = {}
    index_sequences = {}
    for read in FASTQFile.FastqIterator(fastq_file):
        n_reads += 1
        # Read length distribution
        read_len = len(read.sequence)
        read_lengths[read_len] = read_lengths.get(read_len, 0) + 1
        # Index sequence distribution
        index_seq = read.seqid.index_sequence
        if index_seq is not None:
            index_sequences[index_seq] = \
                index_sequences.get(index_seq, 0) + 1
    # print() with one pre-formatted string gives the same output on
    # Python 2 and Python 3 (the bare print statement is Python 2 only)
    print("Total reads: %d" % n_reads)
    print("Read lengths")
    for len_ in read_lengths:
        print("\t%d: %d" % (len_, read_lengths[len_]))
    print("Index sequences")
    for seq in index_sequences:
        print("\t%s: %d" % (seq, index_sequences[seq]))
def split_single_end(matcher, fastqs, base_name=None, output_dir=None):
    """Split reads from single ended data

    For each fastq file in 'fastqs', check reads against the index
    sequences in the BarcodeMatcher 'matcher' and write to an
    appropriate file. Reads that the matcher does not assign are
    written to the 'undetermined' Fastq.

    If a read has no index sequence then an error is logged and the
    program exits with status 1.

    Arguments:
      matcher (BarcodeMatcher): barcode matcher instance
      fastqs (list): list of Fastqs to split
      base_name (str): optional, base name to use for output
        Fastq files
      output_dir (str): optional, path to directory to write
        output Fastqs to
    """
    if base_name is None:
        base_name = ''
    else:
        base_name = "%s." % base_name
    # One output Fastq per known barcode, plus a catch-all
    fp = OutputFiles(base_dir=output_dir)
    for barcode in matcher.sequences:
        fp.open(barcode, "%s%s.fastq" % (base_name, barcode))
    fp.open('undetermined', "%sundetermined.fastq" % base_name)
    # Filter reads; 'nread' is a running total across all input files
    nread = 0
    for fastq in fastqs:
        # print() with one pre-formatted string works on Python 2 and 3
        print("Processing reads from %s" % fastq)
        for read in FASTQFile.FastqIterator(fastq):
            nread += 1
            seq = read.seqid.index_sequence
            if not seq:
                logging.error("No index sequence for read!")
                sys.exit(1)
            assigned_index = matcher.match(seq)
            # Read not assigned
            if assigned_index is None:
                assigned_index = 'undetermined'
            logging.debug("Assigned read #%d to %s" % (nread, assigned_index))
            fp.write(assigned_index, read)
    print("Finished (%d reads processed)" % nread)
def count_barcodes_for_file(fastq):
    """Count the index sequences across a single Fastq file

    Arguments:
      fastq: Fastq file to read barcodes from

    Returns:
      'counts' dictionary where counts[SEQ] holds the number of
      times index sequence SEQ occurs.

    Raises:
      ValueError: if a read has no index sequence.
    """
    counts = dict()
    # print()/raise X(...) forms are valid on both Python 2 and 3
    print("Reading in data from %s" % fastq)
    for read in FASTQFile.FastqIterator(fastq):
        seq = read.seqid.index_sequence
        if not seq:
            raise ValueError("No index sequence for read! %s" % read)
        # Start novel sequences at zero, then count this occurrence
        counts[seq] = counts.get(seq, 0) + 1
    return counts
def split_paired_end(matcher, fastq_pairs, base_name=None, output_dir=None):
    """Split reads from paired end data

    For each fastq file pair in 'fastq_pairs', check reads against
    the index sequences in the BarcodeMatcher 'matcher' and write
    to an appropriate file. The index sequence of the R1 read is
    used for matching; pairs that the matcher does not assign are
    written to the 'undetermined' R1/R2 Fastqs.

    If an R1 read has no index sequence then an error is logged and
    the program exits with status 1.

    Arguments:
      matcher (BarcodeMatcher): barcode matcher instance
      fastq_pairs (list): list of (R1, R2) Fastq pairs to split
      base_name (str): optional, base name to use for output
        Fastq files
      output_dir (str): optional, path to directory to write
        output Fastqs to

    Raises:
      Exception: if the index sequences of the R1 and R2 reads in
        a pair differ.
    """
    if base_name is None:
        base_name = ''
    else:
        base_name = "%s." % base_name
    # One R1 and one R2 output per known barcode, plus a catch-all pair
    fp = OutputFiles(base_dir=output_dir)
    for barcode in matcher.sequences:
        fp.open((barcode, 'R1'), "%s%s_R1.fastq" % (base_name, barcode))
        fp.open((barcode, 'R2'), "%s%s_R2.fastq" % (base_name, barcode))
    fp.open(('undetermined', 'R1'), "%sundetermined_R1.fastq" % base_name)
    fp.open(('undetermined', 'R2'), "%sundetermined_R2.fastq" % base_name)
    # itertools.izip only exists on Python 2; on Python 3 the builtin
    # zip is already lazy, so fall back to it there
    lazy_zip = getattr(itertools, 'izip', zip)
    # Filter reads; 'nread' is a running total across all input pairs
    nread = 0
    for fq_r1, fq_r2 in fastq_pairs:
        print("Processing reads from fastq pair %s %s" % (fq_r1, fq_r2))
        for read1, read2 in lazy_zip(FASTQFile.FastqIterator(fq_r1),
                                     FASTQFile.FastqIterator(fq_r2)):
            nread += 1
            seq = read1.seqid.index_sequence
            if not seq:
                logging.error("No index sequence for read!")
                sys.exit(1)
            if seq != read2.seqid.index_sequence:
                raise Exception(
                    "Index sequence mismatch between R1 and R2 reads")
            assigned_index = matcher.match(seq)
            # Read not assigned
            if assigned_index is None:
                assigned_index = 'undetermined'
            logging.debug("Assigned read #%d to %s" % (nread, assigned_index))
            fp.write((assigned_index, 'R1'), read1)
            fp.write((assigned_index, 'R2'), read2)
    print("Finished (%d read pairs processed)" % nread)
def edit_instrument_name(fastq_file, new_instrument_name):
    """Edit the instrument name for all records in FASTQ file

    Loop over all records in the supplied FASTQ file, update the
    sequence identifier (i.e. the first line of each record) by
    changing the instrument name, and write the records to stdout.

    Arguments:
      fastq_file: FASTQ file to read records from
      new_instrument_name: instrument name to set on each record;
        if this is empty or None then the records are echoed
        unchanged
    """
    for read in FASTQFile.FastqIterator(fastq_file):
        if new_instrument_name:
            # Modify the instrument name
            read.seqid.instrument_name = new_instrument_name
        # Echo the read to stdout; print(obj) behaves the same on
        # Python 2 and 3 (the bare print statement is Python 2 only)
        print(read)
def simple(fastq=None, fp=None):
    """
    Count the reads in a FASTQ file

    Delegates the counting to the FASTQFile.nreads function.

    Arguments:
      fastq: fastq(.gz) file
      fp: open file descriptor for fastq file

    Returns:
      Number of reads
    """
    read_count = FASTQFile.nreads(fastq=fastq, fp=fp)
    return read_count
def load(self, fastq=None, fp=None):
    """Tally index sequences from fastq data

    Iterates over the reads supplied either as a file name (the
    'fastq' argument) or as a file-like object opened for line
    reading (the 'fp' argument), and increments the count held
    for each read's index sequence.

    Arguments:
      fastq: name of the FASTQ file to iterate through
      fp: file-like object opened for reading
    """
    tally = self._counts
    for record in FASTQFile.FastqIterator(fastq_file=fastq, fp=fp):
        index_seq = record.seqid.index_sequence
        if index_seq in tally:
            tally[index_seq] += 1
        else:
            # First occurrence of this index sequence
            tally[index_seq] = 1
def fastqiterator(fastq=None, fp=None):
    """
    Count the reads in a FASTQ file by iterating over it

    Steps through the file with the FASTQFile.FastqIterator class
    and counts one for every read it yields.

    Arguments:
      fastq: fastq(.gz) file
      fp: open file descriptor for fastq file

    Returns:
      Number of reads
    """
    reads = FASTQFile.FastqIterator(fastq_file=fastq, fp=fp)
    return sum(1 for _ in reads)
def split_single_end(matcher, fastqs, base_name=None, output_dir=None):
    """
    Demultiplex single-end Fastqs by index sequence

    Each read from every file in 'fastqs' is looked up in the
    BarcodeMatcher 'matcher' using its index sequence, and written
    to the output Fastq of the barcode it was assigned to. Reads
    that the matcher does not assign go to the 'undetermined' Fastq.

    Arguments:
      matcher (BarcodeMatcher): barcode matcher instance
      fastqs (list): list of Fastqs to split
      base_name (str): optional, base name to use for output
        Fastq files
      output_dir (str): optional, path to directory to write
        output Fastqs to

    Raises:
      Exception: if a read has no index sequence.
    """
    prefix = '' if base_name is None else "%s." % base_name
    # One output per known barcode, plus a catch-all
    outputs = OutputFiles(base_dir=output_dir)
    for barcode in matcher.sequences:
        outputs.open(barcode, "%s%s.fastq" % (prefix, barcode))
    outputs.open('undetermined', "%sundetermined.fastq" % prefix)
    # Running total of reads across all input files
    reads_seen = 0
    for fastq in fastqs:
        print("Processing reads from %s" % fastq)
        for record in FASTQFile.FastqIterator(fastq):
            reads_seen += 1
            index_seq = record.seqid.index_sequence
            if not index_seq:
                raise Exception("%s: no index sequence for read %d" %
                                (fastq, reads_seen))
            matched = matcher.match(index_seq)
            # Unassigned reads are routed to the catch-all output
            target = 'undetermined' if matched is None else matched
            logging.debug("Assigned read #%d to %s" % (reads_seen, target))
            outputs.write(target, record)
    print("Finished (%d reads processed)" % reads_seen)
def reads_per_lane(fastq=None, fp=None):
    """
    Count the reads belonging to each lane of a FASTQ file

    Steps through the file with the FASTQFile.FastqIterator class
    and tallies the reads against the flowcell lane recorded in
    each read's sequence identifier.

    Arguments:
      fastq: fastq(.gz) file
      fp: open file descriptor for fastq file

    Returns:
      Dictionary where keys are lane numbers (as integers)
      and values are number of reads in that lane.
    """
    lane_counts = {}
    for record in FASTQFile.FastqIterator(fastq_file=fastq, fp=fp):
        lane_no = int(record.seqid.flowcell_lane)
        lane_counts[lane_no] = lane_counts.get(lane_no, 0) + 1
    return lane_counts
#######################################################################
# Main program
#######################################################################

if __name__ == "__main__":
    # Set up the command line parser
    parser = optparse.OptionParser(
        usage="%prog OPTIONS R1.fastq R2.fastq",
        version="%prog " + __version__,
        description="Check that read headers for R1 and R2 fastq files "
        "are in agreement, and that the files form an R1/2 pair.")
    _opts, fastq_args = parser.parse_args()
    # Exactly two positional arguments (R1 and R2) are required
    if len(fastq_args) != 2:
        parser.error(
            "expected two arguments (R1 and R2 fastq files to compare)")
    r1_fastq, r2_fastq = fastq_args
    # Exit status reflects whether the files form a pair
    if not FASTQFile.fastqs_are_pair(r1_fastq, r2_fastq):
        logging.error("Not R1/R2 pair")
        sys.exit(1)
    sys.exit(0)
def demultiplex_fastq(fastq_file, barcodes, nmismatches):
    """Perform demultiplexing of a FASTQ file

    Demultiplex reads in a FASTQ file given information about a set
    of barcode/index sequences. Produces a file for each barcode
    associated with the lane of the input FASTQ, plus another for
    'unbinned' reads. Output files are created in the current
    directory; if any of them already exists then the program exits
    with status 1.

    Arguments:
      fastq_file: FASTQ file to be demultiplexed (can be gzipped)
      barcodes: list of barcode descriptions to use for
        demultiplexing; each one is indexed with the keys 'lane',
        'name', 'index' and 'matcher' (the 'matcher' value must
        provide a 'match(sequence, nmismatches)' method)
      nmismatches: maximum number of mismatched bases allowed when
        testing whether barcode sequences match

    Returns:
      No return value
    """
    # print() with one pre-formatted string works on Python 2 and 3
    print("Processing %s" % fastq_file)
    info = IlluminaData.IlluminaFastq(fastq_file)
    # Open file objects, keyed by barcode index (plus 'unbinned')
    output_files = {}
    local_barcodes = []
    try:
        # Weed out barcodes that aren't associated with this lane
        for barcode in barcodes:
            if barcode['lane'] != info.lane_number:
                continue
            local_barcodes.append(barcode)
            output_file_name = "%s_%s_L%03d_R%d_%03d.fastq" % (
                barcode['name'],
                barcode['index'],
                info.lane_number,
                info.read_number,
                info.set_number)
            print("\t%s\t%s" % (barcode['index'], output_file_name))
            if os.path.exists(output_file_name):
                print("\t%s: already exists,exiting" % output_file_name)
                sys.exit(1)
            output_files[barcode['index']] = open(output_file_name, 'w')
        # Check if there's anything to do
        if not local_barcodes:
            return
        # Also make a file for unbinned reads
        unbinned_file_name = "unbinned_L%03d_R%d_%03d.fastq" % (
            info.lane_number,
            info.read_number,
            info.set_number)
        if os.path.exists(unbinned_file_name):
            print("\t%s: already exists,exiting" % unbinned_file_name)
            sys.exit(1)
        output_files['unbinned'] = open(unbinned_file_name, 'w')
        # Process reads: each read goes to the first barcode it matches
        nreads = 0
        for read in FASTQFile.FastqIterator(fastq_file):
            nreads += 1
            matched_read = False
            this_barcode = read.seqid.index_sequence
            for barcode in local_barcodes:
                if barcode['matcher'].match(this_barcode, nmismatches):
                    output_files[barcode['index']].write(str(read) + '\n')
                    matched_read = True
                    break
            # Put in unbinned if no match
            if not matched_read:
                output_files['unbinned'].write(str(read) + '\n')
    finally:
        # Close every file that was opened, including 'unbinned', and
        # also when exiting early or on error, so that buffered reads
        # are flushed and no handles are leaked
        for output_file in output_files.values():
            output_file.close()
    # NOTE: the count reported here is the total number of reads
    # processed (binned and unbinned alike)
    print("\tMatched %d reads for %s" % (nreads,
                                         os.path.basename(fastq_file)))
#######################################################################
# Main program
#######################################################################

if __name__ == "__main__":
    # Create command line parser
    p = argparse.ArgumentParser(
        description="Check that read headers for R1 and R2 fastq files "
        "are in agreement, and that the files form an R1/2 pair.")
    # The 'version' keyword of ArgumentParser is not accepted on
    # Python 3, so register the version option explicitly; '-v' and
    # '--version' are the flags that keyword used to provide
    p.add_argument('-v', '--version', action='version',
                   version="%(prog)s " + __version__)
    p.add_argument('fastq_file_r1', metavar="R1.fastq",
                   help="Fastq file with R1 reads")
    p.add_argument('fastq_file_r2', metavar="R2.fastq",
                   help="Fastq file with R2 reads to check against "
                   "R1 reads")
    # Parse command line
    args = p.parse_args()
    # Process the data: exit status is 0 for a valid pair, 1 otherwise
    if FASTQFile.fastqs_are_pair(args.fastq_file_r1, args.fastq_file_r2):
        sys.exit(0)
    else:
        logging.error("Not R1/R2 pair")
        sys.exit(1)
def batch_fastqs(fastqs, nbatches, basename="batched", out_dir=None):
    """
    Splits reads from one or more Fastqs into batches

    Concatenates input Fastq files (with 'cat', or 'zcat' if the
    first Fastq name ends with '.gz') and then splits the reads
    into smaller Fastqs using the external 'split' utility. The
    command is written to a 'batch.sh' script in the output
    directory and run from there.

    Arguments:
      fastqs (list): list of paths to one or more Fastq files to
        take reads from
      nbatches (int): number of batches to output reads into
      basename (str): optional basename to use for the output
        Fastq files (default: 'batched')
      out_dir (str): optional path to a directory where the
        batched Fastqs will be written (default: current
        working directory)

    Returns:
      List of 'nbatches' paths to the batched Fastq files, named
      '<basename>.B<NNN>.r<read_number>.fastq'.

    Raises:
      Exception: if the batching script exits with a non-zero
        status.
    """
    # os.path.join cannot handle None, so fall back to the current
    # directory when no output directory is given
    if out_dir is None:
        out_dir = os.getcwd()
    # Count the total number of reads
    print("Fetching read counts:")
    nreads = 0
    for fq in fastqs:
        n = FASTQFile.nreads(fq)
        print("%s:\t%d" % (os.path.basename(fq), n))
        nreads += n
    print("Total reads: %d" % nreads)
    # Batch size is the read count divided by the number of batches,
    # rounded up (integer arithmetic, so that it is the same on
    # Python 2 and 3 and is safe when nreads < nbatches)
    batch_size = (nreads + nbatches - 1) // nbatches
    assert (batch_size * nbatches >= nreads)
    print("Creating batches of %d reads" % batch_size)
    # Check if fastqs are compressed (judged from the first file only)
    gzipped = fastqs[0].endswith('.gz')
    if gzipped:
        batch_cmd = Command('zcat')
    else:
        batch_cmd = Command('cat')
    # Get the read number
    read_number = get_read_number(fastqs[0])
    suffix = ".r%s.fastq" % read_number
    # Build and run the batching command; each read occupies four
    # lines, hence the line count passed to 'split'
    batch_cmd.add_args(*fastqs)
    batch_cmd.add_args('|',
                       'split',
                       '-l', batch_size * 4,
                       '-d',
                       '-a', 3,
                       '--additional-suffix=%s' % suffix,
                       '-',
                       os.path.join(out_dir, "%s.B" % basename))
    batch_script = os.path.join(out_dir, "batch.sh")
    batch_cmd.make_wrapper_script("/bin/bash", batch_script)
    # Check for successful exit code
    retcode = Command("/bin/bash",
                      batch_script).run_subprocess(working_dir=out_dir)
    if retcode != 0:
        raise Exception("Batching failed: exit code %s" % retcode)
    # Collect and return the batched Fastq names
    # NOTE(review): 'split' only creates as many files as it needs, so
    # when there are few reads relative to 'nbatches' some trailing
    # names in this list may not exist on disk - confirm callers cope
    batched_fastqs = [os.path.join(out_dir,
                                   "%s.B%03d%s" % (basename, i, suffix))
                      for i in range(nbatches)]
    return batched_fastqs
# Main program if __name__ == "__main__": # Collect input fastq file name if len(sys.argv) < 2: print("Usage: %s fastq" % os.path.basename(sys.argv[0])) sys.exit() fastq = sys.argv[1] # Output file names fastq_out = fastq + ".paired" singles_header = fastq + ".single.header" pairs_header = fastq + ".pair.header" # Loop over file and collect read names headers = set() pairs = set() n = 1 for read in FASTQFile.FastqIterator(fastq): seqid = str(read.seqid) if seqid in headers: # Part of a pair pairs.add(seqid) else: headers.add(seqid) n += 1 if not (n % 1000000): print("%s" % n) # Loop again outputing only paired reads fp = io.open(fastq_out, 'wt') fp_singles = io.open(singles_header, 'wt') fp_pairs = io.open(pairs_header, 'wt') n = 1 for read in FASTQFile.FastqIterator(fastq): seqid = str(read.seqid)
fastqs = sample.fastq_subset(read_number=1) + \ sample.fastq_subset(read_number=2) for fastq in fastqs: print "\t\t%s" % fastq # Report the names of the samples in each project if options.report: for project in illumina_data.projects: print "%s" % IlluminaData.describe_project(project) # Report statistics for fastq files if options.stats: # Print number of reads for each file, and file size for sample in project.samples: for fastq in sample.fastq: fq = os.path.join(sample.dirn, fastq) nreads = FASTQFile.nreads(fq) fsize = os.path.getsize(fq) print "%s\t%s\t%d" % ( fastq, bcf_utils.format_file_size(fsize), nreads) print "" # Summary: short report suitable for logging file if options.summary: print "%s" % IlluminaData.summarise_projects(illumina_data) # Print number of undetermined reads if options.stats and illumina_data.undetermined is not None: print "Undetermined indices" for lane in illumina_data.undetermined.samples: for fastq in lane.fastq: fq = os.path.join(lane.dirn, fastq)
# Make the parent of the script's directory importable
SHARE_DIR = os.path.abspath(
    os.path.normpath(
        os.path.join(os.path.dirname(sys.argv[0]), '..')))
sys.path.append(SHARE_DIR)
import bcftbx.FASTQFile as FASTQFile

#######################################################################
# Main program
#######################################################################

if __name__ == "__main__":
    # Set up the command line parser
    parser = optparse.OptionParser(
        usage="%prog OPTIONS R1.fastq R2.fastq",
        version="%prog " + __version__,
        description="Check that read headers for R1 and R2 fastq files "
        "are in agreement, and that the files form an R1/2 pair.")
    _opts, fastq_args = parser.parse_args()
    # Exactly two positional arguments (R1 and R2) are required
    if len(fastq_args) != 2:
        parser.error(
            "expected two arguments (R1 and R2 fastq files to compare)")
    r1_fastq, r2_fastq = fastq_args
    # Exit status reflects whether the files form a pair
    if not FASTQFile.fastqs_are_pair(r1_fastq, r2_fastq):
        logging.error("Not R1/R2 pair")
        sys.exit(1)
    sys.exit(0)
fastqs = sample.fastq_subset(read_number=1) + \ sample.fastq_subset(read_number=2) for fastq in fastqs: print "\t\t%s" % fastq # Report the names of the samples in each project if options.report: for project in illumina_data.projects: print "%s" % IlluminaData.describe_project(project) # Report statistics for fastq files if options.stats: # Print number of reads for each file, and file size for sample in project.samples: for fastq in sample.fastq: fq = os.path.join(sample.dirn,fastq) nreads = FASTQFile.nreads(fq) fsize = os.path.getsize(fq) print "%s\t%s\t%d" % (fastq, bcf_utils.format_file_size(fsize), nreads) print "" # Summary: short report suitable for logging file if options.summary: print "%s" % IlluminaData.summarise_projects(illumina_data) # Print number of undetermined reads if options.stats and illumina_data.undetermined is not None: print "Undetermined indices" for lane in illumina_data.undetermined.samples: for fastq in lane.fastq: