def report_basic_stats(self,out_file=None,fp=None): """ Report the 'basic' statistics For each FASTQ file, report the following information: - Project name - Sample name - FASTQ file name (without leading directory) - Size (human-readable) - Nreads (number of reads) - Paired_end ('Y' for paired-end, 'N' for single-end) Arguments: out_file (str): name of file to write report to (used if 'fp' is not supplied) fp (File): File-like object open for writing (defaults to stdout if 'out_file' also not supplied) """ # Determine output stream if fp is None: if out_file is None: fpp = sys.stdout else: fpp = open(out_file,'w') else: fpp = fp # Report stats = TabFile(column_names=('Project', 'Sample', 'Fastq', 'Size', 'Nreads', 'Paired_end')) for line in self._stats: data = [line[c] for c in stats.header()] stats.append(data=data) stats.write(fp=fpp,include_header=True) # Close file if fp is None and out_file is not None: fpp.close()
class FastqStatistics: """ Class for collecting and reporting stats on Illumina FASTQs Given a directory with fastq(.gz) files arranged in the same structure as the output from bcl2fastq or bcl2fastq2, collects statistics for each file and provides methods for reporting different aspects. Example usage: >>> from IlluminaData import IlluminaData >>> data = IlluminaData('120117_BLAH_JSHJHXXX','bcl2fastq') >>> stats = FastqStatistics(data) >>> stats.report_basic_stats('basic_stats.out') """ def __init__(self, illumina_data, n_processors=1, add_to=None): """ Create a new FastqStatistics instance Arguments: illumina_data: populated IlluminaData object describing the run. n_processors: number of processors to use (if >1 then uses the multiprocessing library to run the statistics gathering using multiple cores). add_to: optional, add the data to that from an existing statistics file """ self._illumina_data = illumina_data self._n_processors = n_processors self._stats = None self._lane_names = [] self._get_data(filen=add_to) def _get_data(self, filen=None): """ Collect statistics for FASTQ outputs from an Illumina run """ # Collect FASTQ files fastqstats = [] for project in self._illumina_data.projects: for sample in project.samples: for fastq in sample.fastq: fastqstats.append( FastqStats(os.path.join(sample.dirn, fastq), project.name, sample.name)) # Gather same information for undetermined reads (if present) if self._illumina_data.undetermined is not None: for lane in self._illumina_data.undetermined.samples: for fastq in lane.fastq: fastqstats.append( FastqStats(os.path.join(lane.dirn, fastq), self._illumina_data.undetermined.name, lane.name)) # Collect the data for each file if self._n_processors > 1: # Multiple cores pool = Pool(self._n_processors) results = pool.map(collect_fastq_data, fastqstats) pool.close() pool.join() else: # Single core results = map(collect_fastq_data, fastqstats) # Set up tabfile to hold pre-existing data if filen is not None: existing_stats = TabFile(filen, first_line_is_header=True) else: existing_stats = None # Set up class to hold all collected data self._stats = TabFile(column_names=('Project', 'Sample', 'Fastq', 'Size', 'Nreads', 'Paired_end', 'Read_number')) # Split result sets into R1 and R2 results_r1 = filter(lambda f: f.read_number == 1, results) results_r2 = filter(lambda f: f.read_number == 2, results) # Determine which lanes are present and append # columns for each lanes = set() for fastq in results_r1: logger.debug("-- %s: lanes %s" % (fastq.name, ','.join([str(l) for l in fastq.lanes]))) for lane in fastq.lanes: lanes.add(lane) # Add lane numbers from pre-existing stats file if existing_stats is not None: for c in existing_stats.header(): if c.startswith('L'): lanes.add(int(c[1:])) self._lanes = sorted(list(lanes)) logger.debug("Lanes found: %s" % ','.join([str(l) for l in self._lanes])) for lane in self._lanes: self._stats.appendColumn("L%s" % lane) # Copy pre-existing stats into new tabfile if existing_stats: for line in existing_stats: data = [ line['Project'], line['Sample'], line['Fastq'], line['Size'], line['Nreads'], line['Paired_end'], line['Read_number'] ] for lane in lanes: try: data.append(line["L%s" % lane]) except: data.append('') self._stats.append(data=data) # Copy reads per lane from R1 FASTQs into R2 for r2_fastq in results_r2: # Get corresponding R1 name logger.debug("-- Fastq R2: %s" % r2_fastq.name) r1_fastq_name = IlluminaFastq(r2_fastq.name) r1_fastq_name.read_number = 1 r1_fastq_name = str(r1_fastq_name) logger.debug("-- -> R1: %s" % r1_fastq_name) # Locate corresponding data r1_fastq = filter(lambda f: f.name.startswith(r1_fastq_name), results_r1)[0] r2_fastq.reads_by_lane = dict(r1_fastq.reads_by_lane) # Write the data into the tabfile paired_end = ('Y' if self._illumina_data.paired_end else 'N') for fastq in results: # Check for existing entry existing_entry = False for line in self._stats: if (line['Project'] == fastq.project and line['Sample'] == fastq.sample and line['Fastq'] == fastq.name): # Overwrite the existing entry existing_entry = True break # Write the data if not existing_entry: # Append new entry data = [ fastq.project, fastq.sample, fastq.name, bcf_utils.format_file_size(fastq.fsize), fastq.nreads, paired_end, fastq.read_number ] for lane in lanes: try: data.append(fastq.reads_by_lane[lane]) except: data.append('') self._stats.append(data=data) else: # Overwrite existing entry logging.warning("Overwriting exisiting entry for " "%s/%s/%s" % (fastq.project, fastq.sample, fastq.name)) line['Size'] = bcf_utils.format_file_size(fastq.fsize) line['Nreads'] = fastq.nreads line['Paired_end'] = paired_end line['Read_number'] = fastq.read_number for lane in lanes: lane_name = "L%d" % lane try: line[lane_name] = fastq.reads_by_lane[lane] except: line[lane_name] = '' @property def lane_names(self): """ Return list of lane names (e.g. ['L1','L2',...]) """ return [("L%d" % l) for l in self._lanes] @property def raw(self): """ Return the 'raw' statistics TabFile instance """ return self._stats def report_full_stats(self, out_file=None, fp=None): """ Report all statistics gathered for all FASTQs Essentially a dump of all the data. Arguments: out_file (str): name of file to write report to (used if 'fp' is not supplied) fp (File): File-like object open for writing (defaults to stdout if 'out_file' also not supplied) """ # Determine output stream if fp is None: if out_file is None: fpp = sys.stdout else: fpp = open(out_file, 'w') else: fpp = fp # Report self._stats.write(fp=fpp, include_header=True) # Close file if fp is None and out_file is not None: fpp.close() def report_basic_stats(self, out_file=None, fp=None): """ Report the 'basic' statistics For each FASTQ file, report the following information: - Project name - Sample name - FASTQ file name (without leading directory) - Size (human-readable) - Nreads (number of reads) - Paired_end ('Y' for paired-end, 'N' for single-end) Arguments: out_file (str): name of file to write report to (used if 'fp' is not supplied) fp (File): File-like object open for writing (defaults to stdout if 'out_file' also not supplied) """ # Determine output stream if fp is None: if out_file is None: fpp = sys.stdout else: fpp = open(out_file, 'w') else: fpp = fp # Report stats = TabFile(column_names=('Project', 'Sample', 'Fastq', 'Size', 'Nreads', 'Paired_end')) for line in self._stats: data = [line[c] for c in stats.header()] stats.append(data=data) stats.write(fp=fpp, include_header=True) # Close file if fp is None and out_file is not None: fpp.close() def report_per_lane_sample_stats(self, out_file=None, fp=None): """ Report of reads per sample in each lane Reports the number of reads for each sample in each lane plus the total reads for each lane. Example output: Lane 1 Total reads = 182851745 - KatyDobbs/KD-K1 79888058 43.7% - KatyDobbs/KD-K3 97854292 53.5% - Undetermined_indices/lane1 5109395 2.8% ... Arguments: out_file (str): name of file to write report to (used if 'fp' is not supplied) fp (File): File-like object open for writing (defaults to stdout if 'out_file' also not supplied) """ # Determine output stream if fp is None: if out_file is None: fpp = sys.stdout else: fpp = open(out_file, 'w') else: fpp = fp # Report lanes = self.lane_names for lane in lanes: lane_number = int(lane[1:]) samples = filter(lambda x: x['Read_number'] == 1 and bool(x[lane]), self._stats) try: total_reads = sum([int(s[lane]) for s in samples]) except Exception as ex: for s in samples: try: int(s[lane]) except ValueError: logging.critical("Bad value for read count in " "lane %s sample %s: '%s'" % (lane, s['Sample'], s[lane])) raise ex fpp.write("\nLane %d\n" % lane_number) fpp.write("Total reads = %d\n" % total_reads) for sample in samples: sample_name = "%s/%s" % (sample['Project'], sample['Sample']) nreads = float(sample[lane]) fpp.write("- %s\t%d\t%.1f%%\n" % (sample_name, nreads, nreads / total_reads * 100.0)) # Close file if fp is None and out_file is not None: fpp.close() def report_per_lane_summary_stats(self, out_file=None, fp=None): """ Report summary of total and unassigned reads per-lane Arguments: out_file (str): name of file to write report to (used if 'fp' is not supplied) fp (File): File-like object open for writing (defaults to stdout if 'out_file' also not supplied) """ # Determine output stream if fp is None: if out_file is None: fpp = sys.stdout else: fpp = open(out_file, 'w') else: fpp = fp # Set up TabFile to hold the data collected per_lane_stats = TabFile(column_names=('Lane', 'Total reads', 'Assigned reads', 'Unassigned reads', '%assigned', '%unassigned')) # Initialise counts for each lane assigned = {} unassigned = {} for lane in self.lane_names: assigned[lane] = 0 unassigned[lane] = 0 # Count assigned and unassigned (= undetermined) reads for line in filter( lambda x: x['Read_number'] == 1 and not IlluminaFastq(x[ 'Fastq']).is_index_read, self._stats): if line['Project'] != 'Undetermined_indices': counts = assigned else: counts = unassigned for lane in self.lane_names: if line[lane]: try: counts[lane] += line[lane] except KeyError: counts[lane] = line[lane] # Write out data for each lane for lane in self.lane_names: lane_number = int(lane[1:]) assigned_reads = assigned[lane] try: unassigned_reads = unassigned[lane] except KeyError: # lane doesn't have any unassigned reads unassigned_reads = 0 total_reads = assigned_reads + unassigned_reads if total_reads > 0: percent_assigned = float(assigned_reads)/ \ float(total_reads)*100.0 percent_unassigned = float(unassigned_reads)/ \ float(total_reads)*100.0 else: percent_assigned = 0.0 percent_unassigned = 0.0 per_lane_stats.append(data=("Lane %d" % lane_number, total_reads, assigned_reads, unassigned_reads, "%.2f" % percent_assigned, "%.2f" % percent_unassigned)) # Write to file per_lane_stats.write(fp=fpp, include_header=True) # Close file if fp is None and out_file is not None: fpp.close()
def report_per_lane_summary_stats(self, out_file=None, fp=None): """ Report summary of total and unassigned reads per-lane Arguments: out_file (str): name of file to write report to (used if 'fp' is not supplied) fp (File): File-like object open for writing (defaults to stdout if 'out_file' also not supplied) """ # Determine output stream if fp is None: if out_file is None: fpp = sys.stdout else: fpp = open(out_file, 'w') else: fpp = fp # Set up TabFile to hold the data collected per_lane_stats = TabFile(column_names=('Lane', 'Total reads', 'Assigned reads', 'Unassigned reads', '%assigned', '%unassigned')) # Initialise counts for each lane assigned = {} unassigned = {} for lane in self.lane_names: assigned[lane] = 0 unassigned[lane] = 0 # Count assigned and unassigned (= undetermined) reads for line in filter( lambda x: x['Read_number'] == 1 and not IlluminaFastq(x[ 'Fastq']).is_index_read, self._stats): if line['Project'] != 'Undetermined_indices': counts = assigned else: counts = unassigned for lane in self.lane_names: if line[lane]: try: counts[lane] += line[lane] except KeyError: counts[lane] = line[lane] # Write out data for each lane for lane in self.lane_names: lane_number = int(lane[1:]) assigned_reads = assigned[lane] try: unassigned_reads = unassigned[lane] except KeyError: # lane doesn't have any unassigned reads unassigned_reads = 0 total_reads = assigned_reads + unassigned_reads if total_reads > 0: percent_assigned = float(assigned_reads)/ \ float(total_reads)*100.0 percent_unassigned = float(unassigned_reads)/ \ float(total_reads)*100.0 else: percent_assigned = 0.0 percent_unassigned = 0.0 per_lane_stats.append(data=("Lane %d" % lane_number, total_reads, assigned_reads, unassigned_reads, "%.2f" % percent_assigned, "%.2f" % percent_unassigned)) # Write to file per_lane_stats.write(fp=fpp, include_header=True) # Close file if fp is None and out_file is not None: fpp.close()
if args.unassigned: # Count reads for barcodes not in list unassigned_reads = 0 unassigned_umis = set() if well_list is not None: expected_barcodes = well_list.barcodes() else: expected_barcodes = [l['Barcode'] for l in stats_data] for barcode in stats.barcodes(): if barcode not in expected_barcodes: unassigned_reads += stats.nreads(barcode=barcode) unassigned_umis.update( stats.distinct_umis(barcode=barcode)) # Check if 'unassigned' is already in stats file unassigned = stats_data.lookup('Barcode', 'Unassigned') try: data_line = unassigned[0] except IndexError: # Append the line data_line = stats_data.append() data_line['Barcode'] = 'Unassigned' data_line[nreads_col] = unassigned_reads data_line[umis_col] = len(unassigned_umis) # Write to file stats_data.write(filen=stats_file, include_header=True) # Report summary print "#barcodes : %s" % len(stats.barcodes()) print "#reads : %s" % stats.nreads() print "[%s] ICell8 stats completed" % time.strftime("%Y/%m/%d-%H:%M:%S")