def split_paired_end(matcher, fastq_pairs, base_name=None, output_dir=None):
    """
    Demultiplex paired-end reads into per-barcode Fastq files

    Walks through each (R1, R2) Fastq pair in 'fastq_pairs' in lockstep,
    looks up the index sequence of the R1 read using 'matcher', and
    writes both mates to the outputs registered for the matched barcode
    (or to the 'undetermined' outputs when 'matcher.match' returns None).

    Arguments:
      matcher (BarcodeMatcher): barcode matcher instance; its
        'sequences' attribute supplies the barcodes to open outputs
        for and its 'match' method assigns an index sequence
      fastq_pairs (list): list of (R1, R2) Fastq pairs to split
      base_name (str): optional, base name to use for output Fastq
        files (prepended to the file names followed by a '.')
      output_dir (str): optional, path to directory to write output
        Fastqs to (passed to OutputFiles as 'base_dir')

    Raises:
      Exception: if an R1 read has no index sequence, or if the index
        sequences of the R1 and R2 reads of a pair differ.
    """
    # Leading part of every output file name
    prefix = '' if base_name is None else "%s." % base_name
    # Register one R1 and one R2 output per barcode, plus a pair of
    # outputs for reads that can't be assigned
    outputs = OutputFiles(base_dir=output_dir)
    for barcode in matcher.sequences:
        for mate in ('R1', 'R2'):
            outputs.open((barcode, mate),
                         "%s%s_%s.fastq" % (prefix, barcode, mate))
    for mate in ('R1', 'R2'):
        outputs.open(('undetermined', mate),
                     "%sundetermined_%s.fastq" % (prefix, mate))
    # Assign read pairs (the counter runs across all input pairs)
    npairs = 0
    for fq_r1, fq_r2 in fastq_pairs:
        print("Processing reads from fastq pair %s %s" % (fq_r1, fq_r2))
        # zip stops at the end of the shorter input
        mates = zip(FASTQFile.FastqIterator(fq_r1),
                    FASTQFile.FastqIterator(fq_r2))
        for read1, read2 in mates:
            npairs += 1
            index_seq = read1.seqid.index_sequence
            if not index_seq:
                raise Exception("%s: no index sequence for read %d" %
                                (fq_r1, npairs))
            if index_seq != read2.seqid.index_sequence:
                raise Exception("Index sequence mismatch between R1 and "
                                "R2 reads")
            target = matcher.match(index_seq)
            if target is None:
                # Read not assigned to any barcode
                target = 'undetermined'
            logging.debug("Assigned read #%d to %s" % (npairs, target))
            for mate, read in (('R1', read1), ('R2', read2)):
                outputs.write((target, mate), read)
    print("Finished (%d read pairs processed)" % npairs)
def count_barcodes_for_file(fastq):
    """Count the index sequences across a single Fastq file

    Arguments:
      fastq: Fastq file to read barcodes from

    Returns:
      'counts' dictionary where counts[SEQ] holds the number of
      times index sequence SEQ occurs.

    Raises:
      ValueError: if a read has no (or an empty) index sequence.
    """
    counts = dict()
    # print(...) with a single pre-formatted string produces the same
    # output under both Python 2 and Python 3
    print("Reading in data from %s" % fastq)
    for read in FASTQFile.FastqIterator(fastq):
        seq = read.seqid.index_sequence
        if not seq:
            raise ValueError("No index sequence for read! %s" % read)
        # First occurrence starts from zero, later ones increment
        counts[seq] = counts.get(seq, 0) + 1
    # Return the counts dictionary
    return counts
def stats(fastq_file):
    """Generate basic stats from FASTQ file

    Reads every record in the FASTQ file and prints to stdout the
    total number of reads, the distribution of read lengths, and the
    number of occurrences of each index sequence.

    Arguments:
      fastq_file: FASTQ file to generate the statistics for

    Returns:
      No return value (the statistics are printed)
    """
    # Loop over all reads in the FASTQ
    n_reads = 0
    read_lengths = {}
    index_sequences = {}
    for read in FASTQFile.FastqIterator(fastq_file):
        # Count of reads
        n_reads += 1
        # Read length distribution
        read_len = len(read.sequence)
        read_lengths[read_len] = read_lengths.get(read_len, 0) + 1
        # Index sequence distribution (reads where the index sequence
        # is None are left out of this tally)
        index_seq = read.seqid.index_sequence
        if index_seq is not None:
            index_sequences[index_seq] = \
                index_sequences.get(index_seq, 0) + 1
    # Finished: report the results
    # Each print(...) takes a single pre-formatted string so the output
    # is the same under both Python 2 and Python 3
    print("Total reads: %d" % n_reads)
    print("Read lengths")
    for len_ in read_lengths:
        print("\t%d: %d" % (len_, read_lengths[len_]))
    print("Index sequences")
    for seq in index_sequences:
        print("\t%s: %d" % (seq, index_sequences[seq]))
def split_single_end(matcher, fastqs, base_name=None, output_dir=None):
    """Split reads from single ended data

    For each fastq file in 'fastqs', check reads against the index
    sequences in the BarcodeMatcher 'matcher' and write to an
    appropriate file.

    If a read has no index sequence then an error is logged and the
    program is terminated via 'sys.exit(1)'.

    Arguments:
      matcher (BarcodeMatcher): barcode matcher instance; its
        'sequences' attribute supplies the barcodes to open outputs
        for and its 'match' method assigns an index sequence
      fastqs (list): list of Fastqs to split
      base_name (str): optional, base name to use for output Fastq
        files (prepended to the file names followed by a '.')
      output_dir (str): optional, path to directory to write output
        Fastqs to (passed to OutputFiles as 'base_dir')
    """
    if base_name is None:
        base_name = ''
    else:
        base_name = "%s." % base_name
    # One output per barcode plus one for unassigned reads
    fp = OutputFiles(base_dir=output_dir)
    for barcode in matcher.sequences:
        fp.open(barcode, "%s%s.fastq" % (base_name, barcode))
    fp.open('undetermined', "%sundetermined.fastq" % base_name)
    # Filter reads (the counter runs across all input files)
    # print(...) with a single pre-formatted string produces the same
    # output under both Python 2 and Python 3
    nread = 0
    for fastq in fastqs:
        print("Processing reads from %s" % fastq)
        for read in FASTQFile.FastqIterator(fastq):
            nread += 1
            seq = read.seqid.index_sequence
            if not seq:
                logging.error("No index sequence for read!")
                sys.exit(1)
            assigned_index = matcher.match(seq)
            # Read not assigned
            if assigned_index is None:
                assigned_index = 'undetermined'
            logging.debug("Assigned read #%d to %s" % (nread,
                                                       assigned_index))
            fp.write(assigned_index, read)
    print("Finished (%d reads processed)" % nread)
def split_paired_end(matcher, fastq_pairs, base_name=None, output_dir=None):
    """Split reads from paired end data

    For each fastq file pair in 'fastq_pairs', check reads against the
    index sequences in the BarcodeMatcher 'matcher' and write to an
    appropriate file.

    If an R1 read has no index sequence then an error is logged and
    the program is terminated via 'sys.exit(1)'.

    Arguments:
      matcher (BarcodeMatcher): barcode matcher instance; its
        'sequences' attribute supplies the barcodes to open outputs
        for and its 'match' method assigns an index sequence
      fastq_pairs (list): list of (R1, R2) Fastq pairs to split
      base_name (str): optional, base name to use for output Fastq
        files (prepended to the file names followed by a '.')
      output_dir (str): optional, path to directory to write output
        Fastqs to (passed to OutputFiles as 'base_dir')

    Raises:
      Exception: if the index sequences of the R1 and R2 reads of a
        pair differ.
    """
    if base_name is None:
        base_name = ''
    else:
        base_name = "%s." % base_name
    # One R1 and one R2 output per barcode, plus a pair of outputs for
    # unassigned reads
    fp = OutputFiles(base_dir=output_dir)
    for barcode in matcher.sequences:
        fp.open((barcode, 'R1'), "%s%s_R1.fastq" % (base_name, barcode))
        fp.open((barcode, 'R2'), "%s%s_R2.fastq" % (base_name, barcode))
    fp.open(('undetermined', 'R1'), "%sundetermined_R1.fastq" % base_name)
    fp.open(('undetermined', 'R2'), "%sundetermined_R2.fastq" % base_name)
    # Lazy pairwise iteration: itertools.izip where it exists (Python 2),
    # otherwise the builtin zip, which is already lazy on Python 3
    lazy_zip = getattr(itertools, 'izip', zip)
    # Filter reads (the counter runs across all input pairs)
    nread = 0
    for fq_r1, fq_r2 in fastq_pairs:
        print("Processing reads from fastq pair %s %s" % (fq_r1, fq_r2))
        # NOTE: pairing stops at the end of the shorter input, so surplus
        # reads in one file of a pair are not reported
        for read1, read2 in lazy_zip(FASTQFile.FastqIterator(fq_r1),
                                     FASTQFile.FastqIterator(fq_r2)):
            nread += 1
            seq = read1.seqid.index_sequence
            if not seq:
                logging.error("No index sequence for read!")
                sys.exit(1)
            if seq != read2.seqid.index_sequence:
                raise Exception("Index sequence mismatch between R1 and "
                                "R2 reads")
            assigned_index = matcher.match(seq)
            # Read not assigned
            if assigned_index is None:
                assigned_index = 'undetermined'
            logging.debug("Assigned read #%d to %s" % (nread,
                                                       assigned_index))
            fp.write((assigned_index, 'R1'), read1)
            fp.write((assigned_index, 'R2'), read2)
    print("Finished (%d read pairs processed)" % nread)
def edit_instrument_name(fastq_file, new_instrument_name):
    """Edit the instrument name for all records in FASTQ file

    Loop over all records in a supplied FASTQ file, update the sequence
    identifier (i.e. first line in each record) by changing the
    instrument name, and write the updated records to stdout.

    Arguments:
      fastq_file: FASTQ file to read the records from
      new_instrument_name: instrument name to set on each record; if
        this is empty or None then the records are echoed with their
        instrument name left as it is

    Returns:
      No return value (the records are printed)
    """
    # Whether to rename is the same for every record, so decide once
    rename = bool(new_instrument_name)
    for read in FASTQFile.FastqIterator(fastq_file):
        if rename:
            # Modify the instrument name
            read.seqid.instrument_name = new_instrument_name
        # Echo (possibly updated) read to stdout; print(read) gives the
        # same output under both Python 2 and Python 3
        print(read)
def load(self, fastq=None, fp=None):
    """Tally the index sequences found in a Fastq

    Iterates over the reads supplied by FASTQFile.FastqIterator and
    updates the 'self._counts' mapping so that each index sequence
    maps to the number of reads it has been seen in. Counts are added
    to whatever is already stored in 'self._counts'.

    Both arguments are handed straight to FastqIterator; per the
    original documentation the input can be a text or gzipped FASTQ
    given by name, or a file-like object opened for line reading.

    Arguments:
      fastq: name of the FASTQ file to iterate through
      fp: file-like object opened for reading
    """
    reads = FASTQFile.FastqIterator(fastq_file=fastq, fp=fp)
    for read in reads:
        index_seq = read.seqid.index_sequence
        if index_seq in self._counts:
            # Seen before: bump the existing tally
            self._counts[index_seq] += 1
        else:
            # First occurrence of this index sequence
            self._counts[index_seq] = 1
def fastqiterator(fastq=None, fp=None):
    """
    Count the reads in a FASTQ file

    The counting is done by exhausting a FASTQFile.FastqIterator
    created from the supplied arguments.

    Arguments:
      fastq: fastq(.gz) file
      fp: open file descriptor for fastq file

    Returns:
      Number of reads (zero if the iterator yields nothing)
    """
    reads = FASTQFile.FastqIterator(fastq_file=fastq, fp=fp)
    # Add one for every read the iterator yields
    return sum(1 for _ in reads)
def split_single_end(matcher, fastqs, base_name=None, output_dir=None):
    """
    Demultiplex single-end reads into per-barcode Fastq files

    Walks through each Fastq in 'fastqs', looks up the index sequence
    of every read using 'matcher', and writes the read to the output
    registered for the matched barcode (or to the 'undetermined'
    output when 'matcher.match' returns None).

    Arguments:
      matcher (BarcodeMatcher): barcode matcher instance; its
        'sequences' attribute supplies the barcodes to open outputs
        for and its 'match' method assigns an index sequence
      fastqs (list): list of Fastqs to split
      base_name (str): optional, base name to use for output Fastq
        files (prepended to the file names followed by a '.')
      output_dir (str): optional, path to directory to write output
        Fastqs to (passed to OutputFiles as 'base_dir')

    Raises:
      Exception: if a read has no index sequence.
    """
    # Leading part of every output file name
    prefix = '' if base_name is None else "%s." % base_name
    # Register one output per barcode, plus one for reads that can't
    # be assigned
    outputs = OutputFiles(base_dir=output_dir)
    for barcode in matcher.sequences:
        outputs.open(barcode, "%s%s.fastq" % (prefix, barcode))
    outputs.open('undetermined', "%sundetermined.fastq" % prefix)
    # Assign reads (the counter runs across all input files)
    total = 0
    for fastq in fastqs:
        print("Processing reads from %s" % fastq)
        for read in FASTQFile.FastqIterator(fastq):
            total += 1
            index_seq = read.seqid.index_sequence
            if not index_seq:
                raise Exception("%s: no index sequence for read %d" %
                                (fastq, total))
            target = matcher.match(index_seq)
            if target is None:
                # Read not assigned to any barcode
                target = 'undetermined'
            logging.debug("Assigned read #%d to %s" % (total, target))
            outputs.write(target, read)
    print("Finished (%d reads processed)" % total)
def reads_per_lane(fastq=None, fp=None):
    """
    Count the reads belonging to each lane in a FASTQ file

    Iterates over the reads using FASTQFile.FastqIterator and tallies
    them according to the 'flowcell_lane' value of each read's
    sequence identifier, converted to an integer.

    Arguments:
      fastq: fastq(.gz) file
      fp: open file descriptor for fastq file

    Returns:
      Dictionary where keys are lane numbers (as integers)
      and values are number of reads in that lane.
    """
    lane_counts = {}
    reads = FASTQFile.FastqIterator(fastq_file=fastq, fp=fp)
    for read in reads:
        lane_number = int(read.seqid.flowcell_lane)
        # Lanes not seen before start from zero
        lane_counts[lane_number] = lane_counts.get(lane_number, 0) + 1
    return lane_counts
# Main program if __name__ == "__main__": # Collect input fastq file name if len(sys.argv) < 2: print("Usage: %s fastq" % os.path.basename(sys.argv[0])) sys.exit() fastq = sys.argv[1] # Output file names fastq_out = fastq + ".paired" singles_header = fastq + ".single.header" pairs_header = fastq + ".pair.header" # Loop over file and collect read names headers = set() pairs = set() n = 1 for read in FASTQFile.FastqIterator(fastq): seqid = str(read.seqid) if seqid in headers: # Part of a pair pairs.add(seqid) else: headers.add(seqid) n += 1 if not (n % 1000000): print("%s" % n) # Loop again outputing only paired reads fp = io.open(fastq_out, 'wt') fp_singles = io.open(singles_header, 'wt') fp_pairs = io.open(pairs_header, 'wt') n = 1 for read in FASTQFile.FastqIterator(fastq): seqid = str(read.seqid)
def demultiplex_fastq(fastq_file, barcodes, nmismatches):
    """Perform demultiplexing of a FASTQ file

    Demultiplex reads in a FASTQ file given information about a set of
    barcode/index sequences.

    Produces a file for each barcode associated with the lane of the
    input FASTQ, plus another for 'unbinned' reads. The output files
    are created in the current working directory; if any of them
    already exists then the program is terminated via 'sys.exit(1)'.
    Nothing is done if no barcodes are associated with the lane.

    Arguments:
      fastq_file: FASTQ file to be demultiplexed (can be gzipped)
      barcodes: list of barcode definitions to use for demultiplexing;
        each one is accessed as a mapping with the keys 'lane',
        'name', 'index' and 'matcher' (an object with a
        'match(sequence, nmismatches)' method)
      nmismatches: maximum number of mismatched bases allowed when
        testing whether barcode sequences match

    Returns:
      No return value
    """
    # Start
    # Each print(...) takes a single pre-formatted string so the output
    # is the same under both Python 2 and Python 3
    print("Processing %s" % fastq_file)
    info = IlluminaData.IlluminaFastq(fastq_file)
    # Set up output files
    output_files = {}
    local_barcodes = []
    nreads = 0
    try:
        # Weed out barcodes that aren't associated with this lane
        for barcode in barcodes:
            if barcode['lane'] != info.lane_number:
                continue
            local_barcodes.append(barcode)
            output_file_name = "%s_%s_L%03d_R%d_%03d.fastq" % (
                barcode['name'],
                barcode['index'],
                info.lane_number,
                info.read_number,
                info.set_number)
            print("\t%s\t%s" % (barcode['index'], output_file_name))
            # Refuse to overwrite an existing output
            if os.path.exists(output_file_name):
                print("\t%s: already exists,exiting" % output_file_name)
                sys.exit(1)
            output_files[barcode['index']] = open(output_file_name, 'w')
        # Check if there's anything to do
        if not local_barcodes:
            return
        # Also make a file for unbinned reads
        unbinned_file_name = "unbinned_L%03d_R%d_%03d.fastq" % (
            info.lane_number,
            info.read_number,
            info.set_number)
        if os.path.exists(unbinned_file_name):
            print("\t%s: already exists,exiting" % unbinned_file_name)
            sys.exit(1)
        output_files['unbinned'] = open(unbinned_file_name, 'w')
        # Process reads
        for read in FASTQFile.FastqIterator(fastq_file):
            nreads += 1
            this_barcode = read.seqid.index_sequence
            # The read goes to the first barcode that matches, or to
            # the unbinned file if none of them do
            destination = 'unbinned'
            for barcode in local_barcodes:
                if barcode['matcher'].match(this_barcode, nmismatches):
                    destination = barcode['index']
                    break
            output_files[destination].write(str(read) + '\n')
    finally:
        # Close every file that was opened (including the unbinned
        # file), also when exiting early or on an error
        for handle in output_files.values():
            handle.close()
    # NOTE: the number reported is the total number of reads processed
    # (i.e. it includes the unbinned reads)
    print("\tMatched %d reads for %s" % (nreads,
                                        os.path.basename(fastq_file)))