def __init__(self, f, multiline=False):
    """
    Create a new MetricsSummary instance

    The first line of the file supplies the column names and each
    subsequent line supplies one row of data; lines are split into
    fields using the instance's '_tokenise' method.

    Arguments:
      f (str): path to the 'metrics_summary.csv' file
      multiline (bool): if True then any number of data lines
        is accepted (default is to require exactly one header
        line plus one data line)

    Raises:
      Exception: if not in multi-line mode and the file
        doesn't consist of exactly two lines.
    """
    # Record whether multiple data lines are allowed
    self._multiline = multiline
    # Slurp the whole file
    with open(f, 'rt') as handle:
        contents = handle.read()
    self._data = dict()
    rows = contents.strip().split('\n')
    # Single-line mode requires header + exactly one data row
    single_line_mode = not self._multiline
    if single_line_mode and len(rows) != 2:
        raise Exception("%s: MetricsSummary expects 2 lines (or specify "
                        "multi-line mode)" % f)
    # Initialise the underlying tabfile with the header fields
    header_row = rows[0]
    TabFile.__init__(self,
                     column_names=self._tokenise(header_row),
                     delimiter=',')
    # Load the remaining rows as data
    for row in rows[1:]:
        self.append(data=self._tokenise(row))
def add_per_lane_statistics(self):
    """
    Add a section with the per-lane statistics

    Loads the per-lane statistics file and reproduces it as a
    table with an additional 'Assigned/unassigned' column holding
    a stacked bar image for each row. The new section is also
    added to the table of contents.

    Does nothing (other than logging a debug message) if the
    statistics file doesn't exist.
    """
    stats_file = self._per_lane_stats_file
    if not os.path.exists(stats_file):
        logger.debug("No per-lane statistics file found")
        return
    section = self.add_section("Per-lane statistics",
                               name="per_lane_stats")
    stats = TabFile(stats_file, first_line_is_header=True)
    table = Table(columns=stats.header())
    table.append_columns("Assigned/unassigned")
    # Columns holding read counts get pretty-printed
    read_count_columns = ("Total reads",
                          "Assigned reads",
                          "Unassigned reads")
    for entry in stats:
        row = table.add_row()
        for column in stats.header():
            value = entry[column]
            if column in read_count_columns:
                value = pretty_print_reads(value)
            table.set_value(row, column, value)
        # Bar showing assigned vs unassigned reads for this row
        bar = ustackedbar((entry["Assigned reads"],
                           entry["Unassigned reads"]),
                          length=100,
                          height=15,
                          colors=('red', 'white'),
                          inline=True)
        table.set_value(row, "Assigned/unassigned", Img(bar))
    section.add(table)
    self.add_to_toc("Per-lane statistics", section)
def __init__(self, filen=None, fp=None, name=None):
    """Create a new MacsXLS instance

    Arguments:
      filen: name of the file to read the MACS output from.
        If None then fp argument must be supplied instead.
      fp: file-like object opened for reading. If None then
        filen argument must be supplied instead. If both filen
        and fp are supplied then fp will be used preferentially.
      name: optional name to associate with the data. If None
        then the value from the '# name = ' header line is used
        (when that line is present).

    Raises:
      Exception: if no MACS version line is found in the header.
    """
    # Store data
    self.__filen = filen
    self.__name = name
    self.__macs_version = None
    self.__command_line = None
    self.__header = []
    self.__data = None
    # Open file, if necessary (a supplied fp is never closed here)
    opened_here = fp is None
    if opened_here:
        fp = open(filen, 'r')
    try:
        for line in fp:
            line = line.strip()
            if line.startswith('#') or line == '':
                # Header line
                self.__header.append(line)
                # Detect/extract data from header
                if line.startswith(
                        "# This file is generated by MACS version "):
                    # Version is the ninth whitespace-separated token
                    self.__macs_version = line.split()[8]
                elif self.__name is None and \
                     line.startswith("# name = "):
                    # Look for 'name' if none set
                    self.__name = line[len("# name = "):]
                elif line.startswith("# Command line: "):
                    # Look for command line
                    self.__command_line = line[len("# Command line: "):]
            elif self.__data is None:
                # First line of actual data should be the column names
                columns = line.split('\t')
                # Insert an additional column called 'order'
                columns.insert(0, "order")
                # Set up TabFile to handle actual data
                self.__data = TabFile(column_names=columns)
            else:
                # Actual data: leading tab leaves the 'order'
                # column empty for now
                self.__data.append(tabdata="\t%s" % line)
    finally:
        # Close the file handle if we opened it, even on error
        if opened_here:
            fp.close()
    # Check that we actually got a version line
    if self.macs_version is None:
        raise Exception("Failed to extract MACS version, not a MACS "
                        "output file?")
    # Populate the 'order' column
    self.update_order()
def __init__(self, well_list_file):
    """
    Create a new ICell8WellList instance.

    The well list is loaded into a TabFile, with the first
    line of the file treated as the header.

    Arguments:
      well_list_file (str): path to the well list file.
    """
    well_list_data = TabFile(filen=well_list_file,
                             first_line_is_header=True)
    self._data = well_list_data
def __init__(self, f):
    """
    Create a new AtacSummary instance

    The file is loaded as comma-delimited data with the
    first line supplying the column names.

    Arguments:
      f (str): path to the 'summary.csv' file
    """
    tabfile_options = dict(filen=f,
                           first_line_is_header=True,
                           delimiter=',')
    TabFile.__init__(self, **tabfile_options)
def __init__(self, summary_file=None):
    """
    Create a new FastqcSummary instance

    Arguments:
      summary_file (str): optional, path to a file to load;
        each line is stripped and appended as a row with the
        columns 'Status', 'Module' and 'File'. The absolute
        path is stored on the instance.
    """
    column_names = ("Status", "Module", "File")
    TabFile.__init__(self, column_names=column_names)
    if summary_file:
        # Normalise the path before reading
        summary_file = os.path.abspath(summary_file)
        with open(summary_file, "r") as handle:
            for raw_line in handle:
                self.append(tabdata=raw_line.strip())
    # Stored as supplied if it was empty/None
    self._summary_file = summary_file
def __init__(self, screen_file):
    """
    Create a new FastqscreenData instance

    Reads the screen file and copies its data into this object
    using the 'Library'-style column names, regardless of whether
    the file itself uses 'Library' or 'Genome' terminology.

    Arguments:
      screen_file (str): path to the screen file to load

    Raises:
      Exception: if data is encountered before a 'Library' or
        'Genome' header line, if no header line is found at
        all, or if the header isn't one of the recognised forms.
    """
    TabFile.__init__(self, column_names=(
        'Library',
        '%Unmapped',
        '%One_hit_one_library',
        '%Multiple_hits_one_library',
        '%One_hit_multiple_libraries',
        '%Multiple_hits_multiple_libraries',
    ))
    self._screen_file = os.path.abspath(screen_file)
    self._version = None
    self._no_hits = None
    # Intermediate table using the column names from the file;
    # stays as None until the header line has been seen
    tabfile = None
    with open(self._screen_file, 'r') as fp:
        for line in fp:
            line = line.strip()
            if line.startswith('#Fastq_screen version:'):
                self._version = line.split()[2]
            elif line.startswith(('Library', 'Genome')):
                tabfile = TabFile(column_names=line.split())
            elif line.startswith(('%Hit_no_libraries:',
                                  '%Hit_no_genomes:')):
                self._no_hits = float(line.split()[-1])
            elif not line or line.startswith(('#', '%')):
                # Blank line, comment or other summary value
                continue
            elif tabfile is None:
                # Data can't be stored without column names
                raise Exception("Data found before column header "
                                "line in %s" % self._screen_file)
            else:
                tabfile.append(tabdata=line)
    # Check there is some data
    if tabfile is None:
        raise Exception("Unable to extract fastq_screen data from %s"
                        % self._screen_file)
    # Handle different terminology for different versions:
    # source column names, in the order of this object's columns
    source_columns_by_header = {
        'Library': ('Library',
                    '%Unmapped',
                    '%One_hit_one_library',
                    '%Multiple_hits_one_library',
                    '%One_hit_multiple_libraries',
                    '%Multiple_hits_multiple_libraries'),
        'Genome': ('Genome',
                   '%Unmapped',
                   '%One_hit_one_genome',
                   '%Multiple_hits_one_genome',
                   '%One_hit_multiple_genomes',
                   '%Multiple_hits_multiple_genomes'),
    }
    source_columns = source_columns_by_header.get(tabfile.header()[0])
    # Copy data to main object
    for line in tabfile:
        if source_columns is None:
            raise Exception("Unrecognised column header in %s"
                            % self._screen_file)
        self.append(data=[line[c] for c in source_columns])
def __init__(self, fastq_strand_out):
    """
    Create a new Fastqstrand instance

    Reads the fastq_strand output file and, for each genome,
    stores the 'forward' and 'reverse' values along with the
    forward/reverse ratio and a strandedness classification
    ('forward', 'reverse', 'unstranded?' or 'undetermined').

    Arguments:
      fastq_strand_out (str): path to the fastq_strand
        output file

    Raises:
      Exception: if no '#Genome' header line is found.
    """
    self._fastq_strand_out = os.path.abspath(fastq_strand_out)
    self._version = None
    self._genomes = AttributeDictionary()
    # Read in data
    tabfile = None
    with open(self._fastq_strand_out, 'r') as fp:
        for line in fp:
            line = line.strip()
            if line.startswith('#fastq_strand version:'):
                self._version = line.split()[2]
            elif line.startswith('#Genome'):
                # Column names are the header minus the leading '#'
                tabfile = TabFile(column_names=line[1:].split('\t'))
            elif tabfile is not None:
                tabfile.append(tabdata=line)
            # Lines ahead of the '#Genome' header can't be stored
            # (no column names yet) so are skipped; previously
            # they caused an AttributeError before the check
            # below could be reached
    # Check there is some data
    if tabfile is None:
        raise Exception("Unable to extract fastq_strand data from %s" %
                        self._fastq_strand_out)
    # Copy data to main object
    for line in tabfile:
        # Store the data
        data = AttributeDictionary()
        self._genomes[line['Genome']] = data
        data['forward'] = line['1st forward']
        data['reverse'] = line['2nd reverse']
        # Ratio of forward to reverse (infinite if there is no
        # reverse but some forward; None if neither)
        if data.reverse > 0.0:
            ratio = float(data.forward)/float(data.reverse)
        elif data.forward > 0.0:
            ratio = float("+inf")
        else:
            ratio = None
        # Classify strandedness from the ratio
        if ratio is None:
            strandedness = "undetermined"
        elif ratio < 0.2:
            strandedness = "reverse"
        elif ratio > 5:
            # Also covers the infinite ratio case
            strandedness = "forward"
        else:
            strandedness = "unstranded?"
        data['ratio'] = ratio
        data['strandedness'] = strandedness
def __init__(self, summary_file=None):
    """
    Create a new FastqcSummary instance

    Arguments:
      summary_file (str): optional, path to a file to load;
        each line is stripped and appended as a row with the
        columns 'Status', 'Module' and 'File'. The absolute
        path is stored on the instance.
    """
    column_names = ('Status', 'Module', 'File',)
    TabFile.__init__(self, column_names=column_names)
    if summary_file:
        # Normalise the path before reading
        summary_file = os.path.abspath(summary_file)
        with open(summary_file, 'r') as handle:
            for raw_line in handle:
                self.append(tabdata=raw_line.strip())
    # Stored as supplied if it was empty/None
    self._summary_file = summary_file
def __init__(self, filen=None, fp=None, name=None):
    """Create a new MacsXLS instance

    Arguments:
      filen: name of the file to read the MACS output from.
        If None then fp argument must be supplied instead.
      fp: file-like object opened for reading. If None then
        filen argument must be supplied instead. If both filen
        and fp are supplied then fp will be used preferentially.
      name: optional name to associate with the data. If None
        then the value from the '# name = ' header line is used
        (when that line is present).

    Raises:
      Exception: if no MACS version line is found in the header.
    """
    # Store data
    self.__filen = filen
    self.__name = name
    self.__macs_version = None
    self.__command_line = None
    self.__header = []
    self.__data = None
    # Open file, if necessary (a supplied fp is never closed here)
    opened_here = fp is None
    if opened_here:
        fp = open(filen, 'r')
    try:
        for line in fp:
            line = line.strip()
            if line.startswith('#') or line == '':
                # Header line
                self.__header.append(line)
                # Detect/extract data from header
                if line.startswith(
                        "# This file is generated by MACS version "):
                    # Version is the ninth whitespace-separated token
                    self.__macs_version = line.split()[8]
                elif self.__name is None and \
                     line.startswith("# name = "):
                    # Look for 'name' if none set
                    self.__name = line[len("# name = "):]
                elif line.startswith("# Command line: "):
                    # Look for command line
                    self.__command_line = line[len("# Command line: "):]
            elif self.__data is None:
                # First line of actual data should be the column names
                columns = line.split('\t')
                # Insert an additional column called 'order'
                columns.insert(0, "order")
                # Set up TabFile to handle actual data
                self.__data = TabFile(column_names=columns)
            else:
                # Actual data: leading tab leaves the 'order'
                # column empty for now
                self.__data.append(tabdata="\t%s" % line)
    finally:
        # Close the file handle if we opened it, even on error
        if opened_here:
            fp.close()
    # Check that we actually got a version line
    if self.macs_version is None:
        raise Exception("Failed to extract MACS version, not a MACS "
                        "output file?")
    # Populate the 'order' column
    self.update_order()
def __init__(self, screen_file):
    """
    Create a new FastqscreenData instance

    Reads the screen file and copies its data into this object
    using the 'Library'-style column names, regardless of whether
    the file itself uses 'Library' or 'Genome' terminology.

    Arguments:
      screen_file (str): path to the screen file to load

    Raises:
      Exception: if data is encountered before a 'Library' or
        'Genome' header line, if no header line is found at
        all, or if the header isn't one of the recognised forms.
    """
    TabFile.__init__(self,
                     column_names=('Library',
                                   '%Unmapped',
                                   '%One_hit_one_library',
                                   '%Multiple_hits_one_library',
                                   '%One_hit_multiple_libraries',
                                   '%Multiple_hits_multiple_libraries',))
    self._screen_file = os.path.abspath(screen_file)
    self._version = None
    self._no_hits = None
    # Intermediate table using the column names from the file;
    # stays as None until the header line has been seen
    tabfile = None
    with open(self._screen_file, 'r') as fp:
        for line in fp:
            line = line.strip()
            if line.startswith('#Fastq_screen version:'):
                self._version = line.split()[2]
            elif line.startswith(('Library', 'Genome')):
                tabfile = TabFile(column_names=line.split())
            elif line.startswith(('%Hit_no_libraries:',
                                  '%Hit_no_genomes:')):
                self._no_hits = float(line.split()[-1])
            elif not line or line.startswith(('#', '%')):
                # Blank line, comment or other summary value
                continue
            elif tabfile is None:
                # Data can't be stored without column names
                raise Exception("Data found before column header "
                                "line in %s" % self._screen_file)
            else:
                tabfile.append(tabdata=line)
    # Check there is some data
    if tabfile is None:
        raise Exception("Unable to extract fastq_screen data from %s"
                        % self._screen_file)
    # Handle different terminology for different versions:
    # source column names, in the order of this object's columns
    source_columns_by_header = {
        'Library': ('Library',
                    '%Unmapped',
                    '%One_hit_one_library',
                    '%Multiple_hits_one_library',
                    '%One_hit_multiple_libraries',
                    '%Multiple_hits_multiple_libraries'),
        'Genome': ('Genome',
                   '%Unmapped',
                   '%One_hit_one_genome',
                   '%Multiple_hits_one_genome',
                   '%One_hit_multiple_genomes',
                   '%Multiple_hits_multiple_genomes'),
    }
    source_columns = source_columns_by_header.get(tabfile.header()[0])
    # Copy data to main object
    for line in tabfile:
        if source_columns is None:
            raise Exception("Unrecognised column header in %s"
                            % self._screen_file)
        self.append(data=[line[c] for c in source_columns])
def __init__(self, screen_file):
    """
    Create a new FastqscreenData instance

    Reads the screen file into an intermediate table and then
    copies the values for this object's columns into the
    object itself.

    Arguments:
      screen_file (str): path to the screen file to load

    Raises:
      Exception: if data is encountered before the 'Library'
        header line, or if no header line is found at all.
    """
    TabFile.__init__(self,
                     column_names=('Library',
                                   '%Unmapped',
                                   '%One_hit_one_library',
                                   '%Multiple_hits_one_library',
                                   '%One_hit_multiple_libraries',
                                   '%Multiple_hits_multiple_libraries',))
    self._screen_file = os.path.abspath(screen_file)
    self._version = None
    self._no_hits = None
    # Intermediate table using the column names from the file;
    # stays as None until the header line has been seen
    tabfile = None
    with open(self._screen_file, 'r') as fp:
        for line in fp:
            line = line.strip()
            if line.startswith('#Fastq_screen version:'):
                self._version = line.split()[2]
            elif line.startswith('Library'):
                tabfile = TabFile(column_names=line.split())
            elif line.startswith('%Hit_no_libraries:'):
                self._no_hits = float(line.split()[-1])
            elif not line or line.startswith(('#', '%')):
                # Blank line, comment or other summary value
                continue
            elif tabfile is None:
                # Data can't be stored without column names
                raise Exception("Data found before column header "
                                "line in %s" % self._screen_file)
            else:
                tabfile.append(tabdata=line)
    # Check there is some data
    if tabfile is None:
        raise Exception("Unable to extract fastq_screen data from %s"
                        % self._screen_file)
    # Move data to main object
    for line in tabfile:
        self.append(data=[line[col] for col in self.header()])
class ICell8WellList:
    """
    Class representing an ICELL8 well list file

    The file is tab-delimited and consists of an uncommented header
    line which lists the fields ('Row','Col','Candidate',...),
    followed by lines of data.

    The key columns are 'Sample' (gives the cell type) and 'Barcode'
    (the inline barcode sequence).
    """
    def __init__(self, well_list_file):
        """
        Create a new ICell8WellList instance.

        Arguments:
          well_list_file (str): path to the well list file.
        """
        self._data = TabFile(filen=well_list_file,
                             first_line_is_header=True)

    def barcodes(self):
        """
        Return a list of barcodes (one per line of data, in file order)
        """
        barcode_list = []
        for entry in self._data:
            barcode_list.append(entry['Barcode'])
        return barcode_list

    def samples(self):
        """
        Return a sorted list of the distinct sample names
        """
        distinct = set()
        for entry in self._data:
            distinct.add(entry['Sample'])
        return sorted(distinct)

    def sample(self, barcode):
        """
        Return sample (=cell type) corresponding to barcode

        Raises KeyError if no line matches the barcode.
        """
        matches = self._data.lookup('Barcode', barcode)
        try:
            # First matching line wins
            return matches[0]['Sample']
        except IndexError:
            raise KeyError("Failed to locate sample for '%s'" % barcode)
def report_basic_stats(self, out_file=None, fp=None):
    """
    Report the 'basic' statistics

    For each FASTQ file, report the following information:

    - Project name
    - Sample name
    - FASTQ file name (without leading directory)
    - Size (human-readable)
    - Nreads (number of reads)
    - Paired_end ('Y' for paired-end, 'N' for single-end)

    Arguments:
      out_file (str): name of file to write report to (used if
        'fp' is not supplied)
      fp (File): File-like object open for writing (defaults to
        stdout if 'out_file' also not supplied)
    """
    # Determine output stream; only a stream opened here from
    # 'out_file' gets closed afterwards
    opened_here = (fp is None and out_file is not None)
    if fp is not None:
        fpp = fp
    elif out_file is None:
        fpp = sys.stdout
    else:
        fpp = open(out_file, 'w')
    try:
        # Report
        stats = TabFile(column_names=('Project',
                                      'Sample',
                                      'Fastq',
                                      'Size',
                                      'Nreads',
                                      'Paired_end'))
        for line in self._stats:
            stats.append(data=[line[c] for c in stats.header()])
        stats.write(fp=fpp, include_header=True)
    finally:
        # Close file even if building/writing the report failed
        if opened_here:
            fpp.close()
# Clean up: delete the working directory
shutil.rmtree(working_dir)
# Stats are only reported when a stats file was requested
if args.stats_file is not None:
    stats_file = os.path.abspath(args.stats_file)
    # Names for the new columns (with optional suffix appended)
    if args.suffix is None:
        nreads_col = "Nreads%s" % ''
        umis_col = "Distinct_UMIs%s" % ''
    else:
        nreads_col = "Nreads%s" % args.suffix
        umis_col = "Distinct_UMIs%s" % args.suffix
    if os.path.isfile(stats_file) and args.append:
        # Extend the data from the existing stats file
        stats_data = TabFile(filen=stats_file,
                             first_line_is_header=True)
    elif well_list is not None:
        # New stats file: barcodes and sample names come
        # from the well list
        stats_data = TabFile(column_names=('Barcode', 'Sample'))
        for barcode in well_list.barcodes():
            stats_data.append(data=(barcode, well_list.sample(barcode)))
    else:
        # New stats file: barcodes come from the collected data
        stats_data = TabFile(column_names=('Barcode', ))
        for barcode in stats.barcodes():
            stats_data.append(data=(barcode, ))
    # Add the new (empty) columns
    stats_data.appendColumn(nreads_col)
    stats_data.appendColumn(umis_col)
    # Populate columns
class MacsXLS:
    """Class for reading and manipulating XLS output from MACS

    Reads the XLS output file from the MACS peak caller and
    processes and stores the information for subsequent manipulation
    and output.

    To read in data from a MACS output file:

    >>> macs = MacsXLS("macs.xls")

    This reads in the data and prepends an additional 'order'
    column (a list of numbers from one to the number of data
    lines).

    To get the MACS version:

    >>> macs.macs_version
    2.0.10

    To access the 'header' information (as a Python list):

    >>> macs.header

    To see the column names (as a Python list):

    >>> macs.columns

    The data is stored as a TabFile object; to access the data
    use the 'data' property, e.g.

    >>> for line in macs.data:
    ...    print("Chr %s Start %s End %s" %
    ...          (line['chr'],line['start'],line['end']))

    To sort the data on a particular column use the 'sort_on'
    method, e.g.

    >>> macs.sort_on('chr')

    (Note that the order column is always recalculated after
    sorting.)

    """

    def __init__(self, filen=None, fp=None, name=None):
        """Create a new MacsXLS instance

        Arguments:
          filen: name of the file to read the MACS output from.
            If None then fp argument must be supplied instead.
          fp: file-like object opened for reading. If None then
            filen argument must be supplied instead. If both filen
            and fp are supplied then fp will be used preferentially.
          name: optional name to associate with the data. If None
            then the value from the '# name = ' header line is used
            (when that line is present).

        Raises:
          Exception: if no MACS version line is found in the header.

        """
        # Store data
        self.__filen = filen
        self.__name = name
        self.__macs_version = None
        self.__command_line = None
        self.__header = []
        self.__data = None
        # Open file, if necessary (a supplied fp is never closed here)
        opened_here = fp is None
        if opened_here:
            fp = open(filen, 'r')
        try:
            for line in fp:
                line = line.strip()
                if line.startswith('#') or line == '':
                    # Header line
                    self.__header.append(line)
                    # Detect/extract data from header
                    if line.startswith(
                            "# This file is generated by MACS version "):
                        # Version is the ninth whitespace-separated token
                        self.__macs_version = line.split()[8]
                    elif self.__name is None and \
                         line.startswith("# name = "):
                        # Look for 'name' if none set
                        self.__name = line[len("# name = "):]
                    elif line.startswith("# Command line: "):
                        # Look for command line
                        self.__command_line = \
                            line[len("# Command line: "):]
                elif self.__data is None:
                    # First line of actual data should be the
                    # column names
                    columns = line.split('\t')
                    # Insert an additional column called 'order'
                    columns.insert(0, "order")
                    # Set up TabFile to handle actual data
                    self.__data = TabFile(column_names=columns)
                else:
                    # Actual data: leading tab leaves the 'order'
                    # column empty for now
                    self.__data.append(tabdata="\t%s" % line)
        finally:
            # Close the file handle if we opened it, even on error
            if opened_here:
                fp.close()
        # Check that we actually got a version line
        if self.macs_version is None:
            raise Exception("Failed to extract MACS version, not a MACS "
                            "output file?")
        # Populate the 'order' column
        self.update_order()

    @property
    def filen(self):
        """Return the source file name

        """
        return self.__filen

    @property
    def name(self):
        """Return the name property

        """
        return self.__name

    @property
    def macs_version(self):
        """Return the MACS version extracted from the file

        """
        return self.__macs_version

    @property
    def command_line(self):
        """Return the command line string extracted from the header

        This is the value associated with the "# Command line: ..."
        header line.

        Will be 'None' if no matching header line is found, else is
        the string following the ':'.

        """
        return self.__command_line

    @property
    def columns(self):
        """Return the column names for the MACS data

        Returns a list of the column names from the data
        extracted from the file.

        """
        return self.__data.header()

    @property
    def columns_as_xls_header(self):
        """Returns the column name list, with hash prepended

        """
        return ['#'+self.columns[0]] + self.columns[1:]

    @property
    def header(self):
        """Return the header data from the file

        Returns a list of lines comprising the header
        extracted from the file.

        """
        return self.__header

    @property
    def data(self):
        """Return the data from the file

        Returns a TabFile object comprising the data
        extracted from the file.

        """
        return self.__data

    @property
    def with_broad_option(self):
        """Returns True if MACS was run with --broad option

        If --broad wasn't detected then returns False.

        """
        if self.macs_version.startswith('1.'):
            # Not an option in MACS 1.*
            return False
        try:
            # Was --broad specified in the command line?
            return '--broad' in self.command_line.split()
        except AttributeError:
            # No command line? Check for 'abs_summit' column
            return 'abs_summit' not in self.columns

    def sort_on(self, column, reverse=True):
        """Sort data on specified column

        Sorts the data in-place, by the specified column.

        By default data is sorted in descending order; set
        'reverse' argument to False to sort values in ascending
        order instead

        Note that the 'order' column is automatically updated
        after each sorting operation.

        Arguments:
          column: name of the column to sort on
          reverse: if True (default) then sort in descending
            order (i.e. largest to smallest). Otherwise sort in
            ascending order.

        """
        # Sort the data
        self.__data.sort(lambda line: line[column], reverse=reverse)
        # Update the 'order' column
        self.update_order()

    def update_order(self):
        """Set/update the values in the 'order' column

        Each line is assigned its one-based position in the data.

        """
        for i in range(len(self.__data)):
            self.__data[i]['order'] = i+1
def add_per_fastq_statistics(self):
    """
    Add a section with the per-Fastq statistics

    Loads the per-Fastq statistics file and adds one subsection
    per project, each with a table of the Fastqs (size, reads per
    lane, bar plot and total reads). Projects which have a Fastq
    with no read counts, or a zero count, in one of the project's
    lanes get a warning in the subsection and a warning icon in
    the list of projects.

    Does nothing (other than logging a debug message) if the
    statistics file doesn't exist.
    """
    if not os.path.exists(self._stats_file):
        logger.debug("No per-Fastq statistics file found")
        return
    per_file_stats = self.add_section("Per-file statistics by project",
                                      name="per_file_stats")
    project_toc_list = List()
    per_file_stats.add(project_toc_list)
    stats = TabFile(self._stats_file, first_line_is_header=True)
    projects = sorted(set(entry['Project'] for entry in stats))
    # Lane columns are those with names starting with 'L'
    lanes = []
    for column in stats.header():
        if column.startswith('L'):
            lanes.append(column)
    # Last sample name written to a table (used to blank out
    # repeated names on consecutive rows)
    sample = None
    for project in projects:
        # Lines for this project, ordered by sample name
        subset = [entry for entry in stats
                  if entry['Project'] == project]
        subset.sort(key=lambda entry: split_sample_name(entry['Sample']))
        # Lanes with a non-empty value for at least one line
        subset_lanes = [lane for lane in lanes
                        if any(entry[lane] for entry in subset)]
        # New subsection for the project
        s = per_file_stats.add_subsection(
            "%s" % project,
            name="per_file_stats_%s" % project)

        def lane_counts(entry):
            # Non-blank read counts across the project's lanes
            return [entry[lane] for lane in subset_lanes
                    if entry[lane] != '']

        # Check for Fastqs with missing or zero counts
        has_warnings = any(not counts or min(counts) == 0
                           for counts in map(lane_counts, subset))
        if has_warnings:
            s.add(self.warning("One or more Fastqs with zero read "
                               "counts in one or more lanes"))
        # Entry for the project in the list of projects
        link = Link("%s" % project, s)
        if has_warnings:
            project_toc_list.add_item(WarningIcon(), link)
            self.flag_warnings()
        else:
            project_toc_list.add_item(link)
        # Table of Fastqs for the project
        tbl = Table(columns=('Sample', 'Fastq', 'Size'))
        if subset_lanes:
            tbl.append_columns(*subset_lanes)
        tbl.append_columns('Barplot', 'Nreads')
        s.add(tbl)
        for line in subset:
            # Only show the sample name when it changes
            if sample == line['Sample']:
                sname = " "
            else:
                sample = line['Sample']
                sname = sample
            total_reads = line['Nreads']
            if total_reads != '':
                total_reads = pretty_print_reads(total_reads)
            else:
                total_reads = ''
            data = {
                'Sample': sname,
                'Fastq': line['Fastq'],
                'Size': line['Size'],
                'Nreads': total_reads,
            }
            for lane in subset_lanes:
                if line[lane] != '':
                    data[lane] = pretty_print_reads(line[lane])
                else:
                    data[lane] = ''
            nreads = lane_counts(line)
            if not nreads:
                nreads = [0, ]
            if min(nreads) == 0:
                # Add warning icon to Fastq with no reads in
                # at least one lane
                data['Fastq'] = Para(WarningIcon(), data['Fastq'])
            barplot = ustackedbar(nreads,
                                  length=100,
                                  height=10,
                                  colors=('grey', 'lightgrey'),
                                  bbox=True,
                                  inline=True)
            data['Barplot'] = Img(barplot)
            tbl.add_row(**data)
    # Add to table of contents
    self.add_to_toc("Per-file statistics by project",
                    per_file_stats,
                    project_toc_list)
def report_processing_qc(analysis_dir, html_file):
    """
    Generate HTML report for processing statistics

    The report has up to three sections, each only added if
    the corresponding statistics file exists: per-lane statistics,
    per-lane statistics by sample, and per-file statistics by
    project.

    Arguments:
      analysis_dir (AnalysisDir): analysis directory object; its
        'analysis_dir' path (used for the title) and its 'params'
        ('per_lane_stats_file' and 'stats_file') are read
      html_file (str): destination path and file name for
        HTML report
    """
    # Initialise the HTML report
    processing_qc = Document("Processing report for %s" %
                             os.path.basename(analysis_dir.analysis_dir))
    processing_qc.add_css_rule(css_rules.QC_REPORT_CSS_RULES)
    processing_qc.add_css_rule("table { font-size: 80%;\n"
                               "        font-family: sans-serif; }")
    processing_qc.add_css_rule("td { text-align: right; }")
    # Add table of contents
    toc = processing_qc.add_section("Contents", name="toc")
    toc_list = List()
    toc.add(toc_list)
    # Per-lane statistics
    per_lane_stats_file = analysis_dir.params.per_lane_stats_file
    if per_lane_stats_file is None:
        per_lane_stats_file = "per_lane_statistics.info"
    if os.path.exists(per_lane_stats_file):
        per_lane_stats = processing_qc.add_section(
            "Per-lane statistics",
            name="per_lane_stats")
        stats = TabFile(per_lane_stats_file, first_line_is_header=True)
        tbl = Table(columns=stats.header())
        tbl.append_columns("Assigned/unassigned")
        for line in stats:
            n = tbl.add_row()
            for c in stats.header():
                if c in ("Total reads",
                         "Assigned reads",
                         "Unassigned reads"):
                    value = pretty_print_reads(line[c])
                else:
                    value = line[c]
                tbl.set_value(n, c, value)
            tbl.set_value(n, "Assigned/unassigned",
                          Img(ustackedbar((line["Assigned reads"],
                                           line["Unassigned reads"]),
                                          length=100,
                                          height=15,
                                          colors=('red', 'white'),
                                          inline=True)))
        per_lane_stats.add(tbl)
        toc_list.add_item(Link("Per-lane statistics", per_lane_stats))
    # Per lane by sample statistics
    per_lane_sample_stats_file = "per_lane_sample_stats.info"
    if os.path.exists(per_lane_sample_stats_file):
        per_lane_sample_stats = processing_qc.add_section(
            "Per-lane statistics by sample",
            name="per_lane_sample_stats")
        lane_toc_list = List()
        per_lane_sample_stats.add(lane_toc_list)
        # Store the data for each lane
        lane_data = []
        with open(per_lane_sample_stats_file) as stats:
            for line in stats:
                if line.startswith("Lane "):
                    # Start of the data for a new lane
                    lane = int(line.split(' ')[-1])
                    lane_data.append({'lane': lane,
                                      'total_reads': None,
                                      'samples': []})
                elif line.startswith("Total reads = "):
                    total_reads = int(line.split('=')[-1].strip())
                    lane_data[-1]['total_reads'] = total_reads
                elif line.startswith("- "):
                    # Sample line: second field is 'project/sample',
                    # then number of reads, then percentage
                    fields = line.split()
                    names = fields[1].split('/')
                    lane_data[-1]['samples'].append(
                        {'pname': names[0],
                         'sname': names[1],
                         'nreads': int(fields[2]),
                         'percreads': fields[3]})
        # Create a section and table for each lane
        for data in lane_data:
            lane = data['lane']
            # Largest count in the lane sets the scale of the bars
            max_reads = max([d['nreads'] for d in data['samples']])
            total_reads = data['total_reads']
            s = per_lane_sample_stats.add_subsection(
                "Lane %d" % lane,
                name="per_lane_sample_stats_lane%d" % lane)
            lane_toc_list.add_item(Link("Lane %d" % lane, s))
            current_project = None
            tbl = Table(columns=('pname', 'sname',
                                 'nreads', 'percreads',
                                 'barplot'),
                        pname='Project',
                        sname='Sample',
                        nreads='Nreads',
                        percreads='%reads',
                        barplot='',)
            s.add(tbl)
            for sample in data['samples']:
                pname = sample['pname']
                sname = sample['sname']
                nreads = sample['nreads']
                percreads = sample['percreads']
                # Only show the project name when it changes
                if pname == current_project:
                    pname = " "
                else:
                    current_project = pname
                barplot = ustackedbar((nreads, max_reads - nreads),
                                      length=100,
                                      height=5,
                                      colors=('black', 'lightgrey'),
                                      bbox=False,
                                      inline=True)
                tbl.add_row(pname=pname,
                            sname=sname,
                            nreads=pretty_print_reads(nreads),
                            percreads=percreads,
                            barplot=Img(barplot))
            tbl.add_row(pname="Total reads for lane %d" % lane,
                        nreads=pretty_print_reads(total_reads))
        toc_list.add_item(Link("Per-lane statistics by sample",
                               per_lane_sample_stats),
                          lane_toc_list)
    # Per fastq statistics
    stats_file = "statistics_full.info"
    if not os.path.exists(stats_file):
        if analysis_dir.params.stats_file is not None:
            stats_file = analysis_dir.params.stats_file
        else:
            stats_file = "statistics.info"
    if os.path.exists(stats_file):
        per_file_stats = processing_qc.add_section(
            "Per-file statistics by project",
            name="per_file_stats")
        project_toc_list = List()
        per_file_stats.add(project_toc_list)
        stats = TabFile(stats_file, first_line_is_header=True)
        projects = sorted(set(d['Project'] for d in stats))
        # Real lists are needed below: these collections are
        # iterated more than once and tested for emptiness
        lanes = [c for c in stats.header() if c.startswith('L')]
        sample = None
        for project in projects:
            # Lines for this project
            subset = [d for d in stats if d['Project'] == project]
            # Lanes with a non-empty value for at least one line
            subset_lanes = [l for l in lanes
                            if any(d[l] for d in subset)]
            s = per_file_stats.add_subsection(
                "%s" % project,
                name="per_file_stats_%s" % project)
            project_toc_list.add_item(Link("%s" % project, s))
            tbl = Table(columns=('Sample', 'Fastq', 'Size'))
            if subset_lanes:
                tbl.append_columns(*subset_lanes)
            tbl.append_columns('Barplot', 'Nreads')
            s.add(tbl)
            for line in subset:
                # Only show the sample name when it changes
                if sample == line['Sample']:
                    sname = " "
                else:
                    sample = line['Sample']
                    sname = sample
                data = {'Sample': sname,
                        'Fastq': line['Fastq'],
                        'Size': line['Size'],
                        'Nreads': (pretty_print_reads(line['Nreads'])
                                   if line['Nreads'] != '' else '')}
                for l in subset_lanes:
                    data[l] = (pretty_print_reads(line[l])
                               if line[l] != '' else '')
                # Bar plot uses the non-blank lane counts
                lane_counts = [line[l] for l in subset_lanes
                               if line[l] != '']
                barplot = ustackedbar(lane_counts,
                                      length=100,
                                      height=10,
                                      colors=('grey', 'lightgrey'),
                                      bbox=True,
                                      inline=True)
                data['Barplot'] = Img(barplot)
                tbl.add_row(**data)
        toc_list.add_item(Link("Per-file statistics by project",
                               per_file_stats),
                          project_toc_list)
    # Write the processing QC summary file
    processing_qc.write(html_file)
# Build output file name: if not explicitly supplied on the command
# line then use "XLS_<input_name>.xls"
if len(args) == 2:
    xls_out = args[1]
else:
    # MACS output file might already have an .xls extension
    # but we'll add an explicit .xls extension
    xls_out = "XLS_"+os.path.splitext(os.path.basename(macs_in))[0]+".xls"
# Single-argument call form prints the same text whether 'print'
# is a statement or a function
print("Input file: %s" % macs_in)
print("Output XLS: %s" % xls_out)

# Extract the header from the MACS and feed actual data to
# TabFile object
header = []
data = TabFile(column_names=['chr','start','end','length','summit','tags',
                             '-10*log10(pvalue)','fold_enrichment','FDR(%)'])
# Context manager closes the input even if parsing fails
with open(macs_in,'r') as fp:
    for line in fp:
        if line.startswith('#') or line.strip() == '':
            # Header line
            header.append(line.strip())
        else:
            # Data
            data.append(tabdata=line.strip())

# Temporarily remove first line
header_line = str(data[0])
del(data[0])

# Attempt to detect MACS version
# line then use "XLS_<input_name>.xls"
if len(args) == 2:
    xls_out = args[1]
else:
    # MACS output file might already have an .xls extension
    # but we'll add an explicit .xls extension
    xls_out = "XLS_" + os.path.splitext(
        os.path.basename(macs_in))[0] + ".xls"
# Single-argument call form prints the same text whether 'print'
# is a statement or a function
print("Input file: %s" % macs_in)
print("Output XLS: %s" % xls_out)

# Extract the header from the MACS and feed actual data to
# TabFile object
header = []
data = TabFile(column_names=[
    'chr', 'start', 'end', 'length', 'summit', 'tags',
    '-10*log10(pvalue)', 'fold_enrichment', 'FDR(%)'
])
# Context manager closes the input even if parsing fails
with open(macs_in, 'r') as fp:
    for line in fp:
        if line.startswith('#') or line.strip() == '':
            # Header line
            header.append(line.strip())
        else:
            # Data
            data.append(tabdata=line.strip())

# Temporarily remove first line
header_line = str(data[0])
del (data[0])
def _get_data(self, filen=None):
    """
    Collect statistics for FASTQ outputs from an Illumina run

    Gathers data for every Fastq in the projects (and the
    undetermined reads, if present), and stores the results in
    'self._stats' with one 'L<lane>' column per lane; the sorted
    lane numbers are stored in 'self._lanes'.

    Arguments:
      filen (str): optional, path to an existing statistics
        file; its contents are copied into the new data, and
        entries matching a newly collected Fastq (same project,
        sample and Fastq name) are overwritten
    """
    # Collect FASTQ files
    fastqstats = []
    for project in self._illumina_data.projects:
        for sample in project.samples:
            for fastq in sample.fastq:
                fastqstats.append(
                    FastqStats(os.path.join(sample.dirn, fastq),
                               project.name,
                               sample.name))
    # Gather same information for undetermined reads (if present)
    if self._illumina_data.undetermined is not None:
        for lane in self._illumina_data.undetermined.samples:
            for fastq in lane.fastq:
                fastqstats.append(
                    FastqStats(os.path.join(lane.dirn, fastq),
                               self._illumina_data.undetermined.name,
                               lane.name))
    # Collect the data for each file
    if self._n_processors > 1:
        # Multiple cores
        pool = Pool(self._n_processors)
        results = pool.map(collect_fastq_data, fastqstats)
        pool.close()
        pool.join()
    else:
        # Single core: needs to be a real list as the results
        # are traversed several times below
        results = [collect_fastq_data(fq) for fq in fastqstats]
    # Set up tabfile to hold pre-existing data
    if filen is not None:
        existing_stats = TabFile(filen, first_line_is_header=True)
    else:
        existing_stats = None
    # Set up class to hold all collected data
    self._stats = TabFile(column_names=('Project',
                                        'Sample',
                                        'Fastq',
                                        'Size',
                                        'Nreads',
                                        'Paired_end',
                                        'Read_number'))
    # Split result sets into R1 and R2
    results_r1 = [f for f in results if f.read_number == 1]
    results_r2 = [f for f in results if f.read_number == 2]
    # Determine which lanes are present and append
    # columns for each
    lanes = set()
    for fastq in results_r1:
        logger.debug("-- %s: lanes %s" %
                     (fastq.name,
                      ','.join([str(l) for l in fastq.lanes])))
        for lane in fastq.lanes:
            lanes.add(lane)
    # Add lane numbers from pre-existing stats file
    existing_columns = []
    if existing_stats is not None:
        existing_columns = existing_stats.header()
        for c in existing_columns:
            if c.startswith('L'):
                lanes.add(int(c[1:]))
    self._lanes = sorted(lanes)
    logger.debug("Lanes found: %s" %
                 ','.join([str(l) for l in self._lanes]))
    for lane in self._lanes:
        self._stats.appendColumn("L%s" % lane)
    # Copy pre-existing stats into new tabfile
    if existing_stats is not None:
        for line in existing_stats:
            data = [line['Project'],
                    line['Sample'],
                    line['Fastq'],
                    line['Size'],
                    line['Nreads'],
                    line['Paired_end'],
                    line['Read_number']]
            # Values must follow the (sorted) order in which the
            # lane columns were appended; lanes missing from the
            # existing file are left blank
            for lane in self._lanes:
                lane_name = "L%s" % lane
                if lane_name in existing_columns:
                    data.append(line[lane_name])
                else:
                    data.append('')
            self._stats.append(data=data)
    # Copy reads per lane from R1 FASTQs into R2
    for r2_fastq in results_r2:
        # Get corresponding R1 name
        logger.debug("-- Fastq R2: %s" % r2_fastq.name)
        r1_fastq_name = IlluminaFastq(r2_fastq.name)
        r1_fastq_name.read_number = 1
        r1_fastq_name = str(r1_fastq_name)
        logger.debug("--    -> R1: %s" % r1_fastq_name)
        # Locate corresponding data (first R1 whose name starts
        # with the expected R1 name)
        r1_fastq = [f for f in results_r1
                    if f.name.startswith(r1_fastq_name)][0]
        r2_fastq.reads_by_lane = dict(r1_fastq.reads_by_lane)
    # Write the data into the tabfile
    paired_end = ('Y' if self._illumina_data.paired_end else 'N')
    for fastq in results:
        # Check for existing entry
        existing_line = None
        for line in self._stats:
            if (line['Project'] == fastq.project and
                line['Sample'] == fastq.sample and
                line['Fastq'] == fastq.name):
                existing_line = line
                break
        # Write the data
        if existing_line is None:
            # Append new entry
            data = [fastq.project,
                    fastq.sample,
                    fastq.name,
                    bcf_utils.format_file_size(fastq.fsize),
                    fastq.nreads,
                    paired_end,
                    fastq.read_number]
            # Same ordering as the lane columns
            for lane in self._lanes:
                try:
                    data.append(fastq.reads_by_lane[lane])
                except KeyError:
                    # No reads recorded for this lane
                    data.append('')
            self._stats.append(data=data)
        else:
            # Overwrite existing entry
            logging.warning("Overwriting exisiting entry for "
                            "%s/%s/%s" % (fastq.project,
                                          fastq.sample,
                                          fastq.name))
            existing_line['Size'] = bcf_utils.format_file_size(fastq.fsize)
            existing_line['Nreads'] = fastq.nreads
            existing_line['Paired_end'] = paired_end
            existing_line['Read_number'] = fastq.read_number
            for lane in self._lanes:
                lane_name = "L%d" % lane
                try:
                    existing_line[lane_name] = fastq.reads_by_lane[lane]
                except KeyError:
                    # No reads recorded for this lane
                    existing_line[lane_name] = ''
class FastqStatistics:
    """
    Class for collecting and reporting stats on Illumina FASTQs

    Given a directory with fastq(.gz) files arranged in the same
    structure as the output from bcl2fastq or bcl2fastq2,
    collects statistics for each file and provides methods
    for reporting different aspects.

    Example usage:

    >>> from IlluminaData import IlluminaData
    >>> data = IlluminaData('120117_BLAH_JSHJHXXX','bcl2fastq')
    >>> stats = FastqStatistics(data)
    >>> stats.report_basic_stats('basic_stats.out')

    """

    def __init__(self, illumina_data, n_processors=1, add_to=None):
        """
        Create a new FastqStatistics instance

        Arguments:
          illumina_data: populated IlluminaData object
            describing the run.
          n_processors: number of processors to use (if >1 then
            uses the multiprocessing library to run the
            statistics gathering using multiple cores).
          add_to: optional, add the data to that from an
            existing statistics file
        """
        self._illumina_data = illumina_data
        self._n_processors = n_processors
        self._stats = None
        self._lanes = []
        self._lane_names = []
        self._get_data(filen=add_to)

    @staticmethod
    def _get_output_stream(out_file=None, fp=None):
        """
        Select the stream that a report should be written to

        Arguments:
          out_file (str): name of file to write to (used if
            'fp' is not supplied)
          fp (File): File-like object open for writing

        Returns:
          Tuple: (stream, close_when_done), where the stream
            is 'fp' if supplied, otherwise a newly opened
            'out_file', otherwise stdout; 'close_when_done'
            is True only when the stream was opened here.
        """
        if fp is not None:
            return (fp, False)
        if out_file is not None:
            return (open(out_file, 'w'), True)
        return (sys.stdout, False)

    def _get_data(self, filen=None):
        """
        Collect statistics for FASTQ outputs from an Illumina run

        Gathers a 'FastqStats' object for each FASTQ in each
        project/sample (plus any undetermined reads), runs
        'collect_fastq_data' on each (optionally in parallel),
        and stores the results in a TabFile in 'self._stats'
        with one 'L<n>' column for each lane found. The sorted
        list of lane numbers is stored in 'self._lanes'.

        Arguments:
          filen (str): optional, path to an existing statistics
            file; if supplied then its contents are copied into
            the new TabFile first, and entries matching newly
            collected FASTQs (same project, sample and FASTQ
            name) are overwritten
        """
        # Collect FASTQ files
        fastqstats = []
        for project in self._illumina_data.projects:
            for sample in project.samples:
                for fastq in sample.fastq:
                    fastqstats.append(
                        FastqStats(os.path.join(sample.dirn, fastq),
                                   project.name,
                                   sample.name))
        # Gather same information for undetermined reads (if present)
        undetermined = self._illumina_data.undetermined
        if undetermined is not None:
            for lane in undetermined.samples:
                for fastq in lane.fastq:
                    fastqstats.append(
                        FastqStats(os.path.join(lane.dirn, fastq),
                                   undetermined.name,
                                   lane.name))
        # Collect the data for each file
        # NB results must be a real list as it is traversed
        # several times below
        if self._n_processors > 1:
            # Multiple cores
            pool = Pool(self._n_processors)
            try:
                results = list(pool.map(collect_fastq_data, fastqstats))
            finally:
                # Always release the worker processes
                pool.close()
                pool.join()
        else:
            # Single core
            results = [collect_fastq_data(f) for f in fastqstats]
        # Set up tabfile to hold pre-existing data
        if filen is not None:
            existing_stats = TabFile(filen, first_line_is_header=True)
        else:
            existing_stats = None
        # Set up class to hold all collected data
        base_columns = ('Project', 'Sample', 'Fastq', 'Size',
                        'Nreads', 'Paired_end', 'Read_number')
        self._stats = TabFile(column_names=base_columns)
        # Split result sets into R1 and R2
        results_r1 = [f for f in results if f.read_number == 1]
        results_r2 = [f for f in results if f.read_number == 2]
        # Determine which lanes are present and append
        # columns for each
        lanes = set()
        for fastq in results_r1:
            logger.debug("-- %s: lanes %s",
                         fastq.name,
                         ','.join([str(l) for l in fastq.lanes]))
            lanes.update(fastq.lanes)
        # Add lane numbers from pre-existing stats file
        if existing_stats is not None:
            for c in existing_stats.header():
                if c.startswith('L'):
                    lanes.add(int(c[1:]))
        self._lanes = sorted(lanes)
        logger.debug("Lanes found: %s",
                     ','.join([str(l) for l in self._lanes]))
        for lane in self._lanes:
            self._stats.appendColumn("L%s" % lane)
        # Copy pre-existing stats into new tabfile
        # NB lane values must be added in the same (sorted) order
        # as the lane columns were appended above, otherwise the
        # values can end up under the wrong column
        if existing_stats is not None:
            for line in existing_stats:
                data = [line[c] for c in base_columns]
                for lane in self._lanes:
                    try:
                        data.append(line["L%s" % lane])
                    except (KeyError, IndexError):
                        # Lane not present in the pre-existing file
                        data.append('')
                self._stats.append(data=data)
        # Copy reads per lane from R1 FASTQs into R2
        for r2_fastq in results_r2:
            # Get corresponding R1 name
            logger.debug("-- Fastq R2: %s", r2_fastq.name)
            r1_fastq_name = IlluminaFastq(r2_fastq.name)
            r1_fastq_name.read_number = 1
            r1_fastq_name = str(r1_fastq_name)
            logger.debug("--       -> R1: %s", r1_fastq_name)
            # Locate corresponding data (first R1 whose name starts
            # with the expected name; IndexError if there is none)
            r1_fastq = [f for f in results_r1
                        if f.name.startswith(r1_fastq_name)][0]
            r2_fastq.reads_by_lane = dict(r1_fastq.reads_by_lane)
        # Write the data into the tabfile
        paired_end = ('Y' if self._illumina_data.paired_end else 'N')
        for fastq in results:
            # Check for existing entry
            existing_line = None
            for line in self._stats:
                if (line['Project'] == fastq.project and
                    line['Sample'] == fastq.sample and
                    line['Fastq'] == fastq.name):
                    existing_line = line
                    break
            # Write the data
            if existing_line is None:
                # Append new entry
                data = [fastq.project,
                        fastq.sample,
                        fastq.name,
                        bcf_utils.format_file_size(fastq.fsize),
                        fastq.nreads,
                        paired_end,
                        fastq.read_number]
                for lane in self._lanes:
                    try:
                        data.append(fastq.reads_by_lane[lane])
                    except (KeyError, IndexError):
                        # No reads recorded for this lane
                        data.append('')
                self._stats.append(data=data)
            else:
                # Overwrite existing entry
                logger.warning("Overwriting exisiting entry for "
                               "%s/%s/%s",
                               fastq.project,
                               fastq.sample,
                               fastq.name)
                existing_line['Size'] = \
                    bcf_utils.format_file_size(fastq.fsize)
                existing_line['Nreads'] = fastq.nreads
                existing_line['Paired_end'] = paired_end
                existing_line['Read_number'] = fastq.read_number
                for lane in self._lanes:
                    lane_name = "L%d" % lane
                    try:
                        existing_line[lane_name] = \
                            fastq.reads_by_lane[lane]
                    except (KeyError, IndexError):
                        existing_line[lane_name] = ''

    @property
    def lane_names(self):
        """
        Return list of lane names (e.g. ['L1','L2',...])
        """
        return [("L%d" % l) for l in self._lanes]

    @property
    def raw(self):
        """
        Return the 'raw' statistics TabFile instance
        """
        return self._stats

    def report_full_stats(self, out_file=None, fp=None):
        """
        Report all statistics gathered for all FASTQs

        Essentially a dump of all the data.

        Arguments:
          out_file (str): name of file to write report
            to (used if 'fp' is not supplied)
          fp (File): File-like object open for writing
            (defaults to stdout if 'out_file' also not
            supplied)
        """
        # Determine output stream
        fpp, close_fp = self._get_output_stream(out_file, fp)
        try:
            # Report
            self._stats.write(fp=fpp, include_header=True)
        finally:
            # Close file (only if it was opened here)
            if close_fp:
                fpp.close()

    def report_basic_stats(self, out_file=None, fp=None):
        """
        Report the 'basic' statistics

        For each FASTQ file, report the following information:

        - Project name
        - Sample name
        - FASTQ file name (without leading directory)
        - Size (human-readable)
        - Nreads (number of reads)
        - Paired_end ('Y' for paired-end, 'N' for single-end)

        Arguments:
          out_file (str): name of file to write report
            to (used if 'fp' is not supplied)
          fp (File): File-like object open for writing
            (defaults to stdout if 'out_file' also not
            supplied)
        """
        # Determine output stream
        fpp, close_fp = self._get_output_stream(out_file, fp)
        try:
            # Report
            stats = TabFile(column_names=('Project',
                                          'Sample',
                                          'Fastq',
                                          'Size',
                                          'Nreads',
                                          'Paired_end'))
            for line in self._stats:
                stats.append(data=[line[c] for c in stats.header()])
            stats.write(fp=fpp, include_header=True)
        finally:
            # Close file (only if it was opened here)
            if close_fp:
                fpp.close()

    def report_per_lane_sample_stats(self, out_file=None, fp=None):
        """
        Report of reads per sample in each lane

        Reports the number of reads for each sample in each
        lane plus the total reads for each lane.

        Example output:

        Lane 1
        Total reads = 182851745
        - KatyDobbs/KD-K1      79888058        43.7%
        - KatyDobbs/KD-K3      97854292        53.5%
        - Undetermined_indices/lane1       5109395 2.8%
        ...

        Arguments:
          out_file (str): name of file to write report
            to (used if 'fp' is not supplied)
          fp (File): File-like object open for writing
            (defaults to stdout if 'out_file' also not
            supplied)
        """
        # Determine output stream
        fpp, close_fp = self._get_output_stream(out_file, fp)
        try:
            # Report
            for lane in self.lane_names:
                lane_number = int(lane[1:])
                # R1 entries with a non-empty read count for this
                # lane (a list, as it is traversed more than once)
                samples = [x for x in self._stats
                           if x['Read_number'] == 1 and bool(x[lane])]
                try:
                    total_reads = sum([int(s[lane]) for s in samples])
                except Exception as ex:
                    # Report each of the offending values before
                    # propagating the error
                    for s in samples:
                        try:
                            int(s[lane])
                        except ValueError:
                            logger.critical(
                                "Bad value for read count in "
                                "lane %s sample %s: '%s'",
                                lane, s['Sample'], s[lane])
                    raise ex
                fpp.write("\nLane %d\n" % lane_number)
                fpp.write("Total reads = %d\n" % total_reads)
                for sample in samples:
                    sample_name = "%s/%s" % (sample['Project'],
                                             sample['Sample'])
                    nreads = float(sample[lane])
                    if total_reads:
                        percent = nreads / total_reads * 100.0
                    else:
                        # Avoid dividing by a zero total
                        percent = 0.0
                    fpp.write("- %s\t%d\t%.1f%%\n" % (sample_name,
                                                      nreads,
                                                      percent))
        finally:
            # Close file (only if it was opened here)
            if close_fp:
                fpp.close()

    def report_per_lane_summary_stats(self, out_file=None, fp=None):
        """
        Report summary of total and unassigned reads per-lane

        Arguments:
          out_file (str): name of file to write report
            to (used if 'fp' is not supplied)
          fp (File): File-like object open for writing
            (defaults to stdout if 'out_file' also not
            supplied)
        """
        # Determine output stream
        fpp, close_fp = self._get_output_stream(out_file, fp)
        try:
            lane_names = self.lane_names
            # Set up TabFile to hold the data collected
            per_lane_stats = TabFile(column_names=('Lane',
                                                   'Total reads',
                                                   'Assigned reads',
                                                   'Unassigned reads',
                                                   '%assigned',
                                                   '%unassigned'))
            # Initialise counts for each lane
            assigned = dict((lane, 0) for lane in lane_names)
            unassigned = dict((lane, 0) for lane in lane_names)
            # Count assigned and unassigned (= undetermined) reads
            # using R1 entries which aren't index reads
            for line in self._stats:
                if line['Read_number'] != 1 or \
                   IlluminaFastq(line['Fastq']).is_index_read:
                    continue
                if line['Project'] != 'Undetermined_indices':
                    counts = assigned
                else:
                    counts = unassigned
                for lane in lane_names:
                    if line[lane]:
                        counts[lane] += line[lane]
            # Write out data for each lane
            for lane in lane_names:
                lane_number = int(lane[1:])
                assigned_reads = assigned[lane]
                unassigned_reads = unassigned[lane]
                total_reads = assigned_reads + unassigned_reads
                if total_reads > 0:
                    percent_assigned = float(assigned_reads)/ \
                                       float(total_reads)*100.0
                    percent_unassigned = float(unassigned_reads)/ \
                                         float(total_reads)*100.0
                else:
                    percent_assigned = 0.0
                    percent_unassigned = 0.0
                per_lane_stats.append(data=("Lane %d" % lane_number,
                                            total_reads,
                                            assigned_reads,
                                            unassigned_reads,
                                            "%.2f" % percent_assigned,
                                            "%.2f" % percent_unassigned))
            # Write to file
            per_lane_stats.write(fp=fpp, include_header=True)
        finally:
            # Close file (only if it was opened here)
            if close_fp:
                fpp.close()
def report_per_lane_summary_stats(self, out_file=None, fp=None):
    """
    Report summary of total and unassigned reads per-lane

    For each lane writes a line with the total, assigned and
    unassigned read counts plus the percentages of assigned
    and unassigned reads. Reads belonging to the project
    'Undetermined_indices' are counted as unassigned; only
    R1 entries which aren't index reads are counted.

    Arguments:
      out_file (str): name of file to write report
        to (used if 'fp' is not supplied)
      fp (File): File-like object open for writing
        (defaults to stdout if 'out_file' also not
        supplied)
    """
    # Determine output stream
    if fp is not None:
        fpp = fp
    elif out_file is not None:
        fpp = open(out_file, 'w')
    else:
        fpp = sys.stdout
    try:
        lane_names = self.lane_names
        # Set up TabFile to hold the data collected
        per_lane_stats = TabFile(column_names=('Lane',
                                               'Total reads',
                                               'Assigned reads',
                                               'Unassigned reads',
                                               '%assigned',
                                               '%unassigned'))
        # Initialise counts for each lane
        assigned = dict((lane, 0) for lane in lane_names)
        unassigned = dict((lane, 0) for lane in lane_names)
        # Count assigned and unassigned (= undetermined) reads
        for line in self._stats:
            if line['Read_number'] != 1 or \
               IlluminaFastq(line['Fastq']).is_index_read:
                continue
            if line['Project'] != 'Undetermined_indices':
                counts = assigned
            else:
                counts = unassigned
            for lane in lane_names:
                if line[lane]:
                    # Every lane was initialised above so no
                    # fallback for missing keys is needed
                    counts[lane] += line[lane]
        # Write out data for each lane
        for lane in lane_names:
            lane_number = int(lane[1:])
            assigned_reads = assigned[lane]
            unassigned_reads = unassigned[lane]
            total_reads = assigned_reads + unassigned_reads
            if total_reads > 0:
                percent_assigned = float(assigned_reads)/ \
                                   float(total_reads)*100.0
                percent_unassigned = float(unassigned_reads)/ \
                                     float(total_reads)*100.0
            else:
                percent_assigned = 0.0
                percent_unassigned = 0.0
            per_lane_stats.append(data=("Lane %d" % lane_number,
                                        total_reads,
                                        assigned_reads,
                                        unassigned_reads,
                                        "%.2f" % percent_assigned,
                                        "%.2f" % percent_unassigned))
        # Write to file
        per_lane_stats.write(fp=fpp, include_header=True)
    finally:
        # Close file, but only if it was opened here
        if fp is None and out_file is not None:
            fpp.close()
def _read_per_lane_sample_stats(stats_file):
    """
    Parse a per-lane sample statistics file into a list of dicts

    Arguments:
      stats_file (str): path to the file to read; lines
        starting with 'Lane ' begin a new lane, lines starting
        with 'Total reads = ' give the lane total, and lines
        starting with '- ' give 'PROJECT/SAMPLE NREADS PERCENT'
        for a sample

    Returns:
      List: one dictionary per lane with the keys 'lane',
        'total_reads' and 'samples' (a list of dictionaries
        with keys 'pname', 'sname', 'nreads' and 'percreads').
    """
    lane_data = []
    with open(stats_file, 'r') as stats:
        for line in stats:
            if line.startswith("Lane "):
                lane_data.append({
                    'lane': int(line.split(' ')[-1]),
                    'total_reads': None,
                    'samples': []
                })
            elif line.startswith("Total reads = "):
                lane_data[-1]['total_reads'] = \
                    int(line.split('=')[-1].strip())
            elif line.startswith("- "):
                fields = line.split()
                names = fields[1].split('/')
                lane_data[-1]['samples'].append({
                    'pname': names[0],
                    'sname': names[1],
                    'nreads': int(fields[2]),
                    'percreads': fields[3]
                })
    return lane_data


def report_processing_qc(analysis_dir, html_file):
    """
    Generate HTML report for processing statistics

    The report can contain up to three sections, each of which
    is only added if the corresponding statistics file exists
    in the analysis directory: per-lane statistics, per-lane
    statistics by sample, and per-file statistics by project.
    A warnings banner is shown if problems are detected (lanes
    with no samples, or samples/Fastqs with no reads), and a
    hidden 'status' section records 'OK' or 'WARNINGS'.

    Arguments:
      analysis_dir (AutoProcess): AutoProcess instance for
        the directory to report the processing from
      html_file (str): destination path and file name for
        HTML report
    """
    # Initialise the HTML report
    processing_qc = Document("Processing report for %s" %
                             os.path.basename(analysis_dir.analysis_dir))
    processing_qc.add_css_rule(css_rules.QC_REPORT_CSS_RULES)
    processing_qc.add_css_rule("table { font-size: 80%;\n"
                               " font-family: sans-serif; }")
    processing_qc.add_css_rule("td { text-align: right; }")
    # NB hex colours need the leading '#' to be valid CSS
    processing_qc.add_css_rule("p.warning { padding: 5px;\n"
                               " border: solid 1px red;\n"
                               " background-color: #F5BCA9;\n"
                               " color: red;\n"
                               " font-weight: bold;\n"
                               " border-radius: 10px;\n"
                               " display: inline-block; }")
    processing_qc.add_css_rule(".warnings { padding: 2px;\n"
                               " border: solid 3px red;\n"
                               " background-color: #F5BCA9;\n"
                               " color: red;\n"
                               " font-weight: bold;\n"
                               " margin: 10px;\n"
                               " border-radius: 10px;\n"
                               " display: inline-block; }")
    processing_qc.add_css_rule("img { vertical-align: middle; }")
    processing_qc.add_css_rule(".hide { display: none; }")
    # Add table of contents
    toc = processing_qc.add_section("Contents", name="toc")
    toc_list = List()
    toc.add(toc_list)
    # Add warnings section
    # This will be hidden if there are no issues
    status = True
    warnings = processing_qc.add_section(css_classes=("warnings", ))
    warnings.add(
        Para(WarningIcon(size=50),
             "There are issues with one or more lanes or samples"))
    # Per-lane statistics
    per_lane_stats_file = analysis_dir.params.per_lane_stats_file
    if per_lane_stats_file is None:
        per_lane_stats_file = "per_lane_statistics.info"
    per_lane_stats_file = get_absolute_file_path(
        per_lane_stats_file, base=analysis_dir.analysis_dir)
    if os.path.exists(per_lane_stats_file):
        per_lane_stats = processing_qc.add_section(
            "Per-lane statistics", name="per_lane_stats")
        stats = TabFile(per_lane_stats_file, first_line_is_header=True)
        tbl = Table(columns=stats.header())
        tbl.append_columns("Assigned/unassigned")
        for line in stats:
            n = tbl.add_row()
            for c in stats.header():
                if c in ("Total reads",
                         "Assigned reads",
                         "Unassigned reads"):
                    value = pretty_print_reads(line[c])
                else:
                    value = line[c]
                tbl.set_value(n, c, value)
            tbl.set_value(
                n, "Assigned/unassigned",
                Img(ustackedbar((line["Assigned reads"],
                                 line["Unassigned reads"]),
                                length=100,
                                height=15,
                                colors=('red', 'white'),
                                inline=True)))
        per_lane_stats.add(tbl)
        toc_list.add_item(Link("Per-lane statistics", per_lane_stats))
    # Per lane by sample statistics
    per_lane_sample_stats_file = get_absolute_file_path(
        "per_lane_sample_stats.info", base=analysis_dir.analysis_dir)
    if os.path.exists(per_lane_sample_stats_file):
        per_lane_sample_stats = processing_qc.add_section(
            "Per-lane statistics by sample",
            name="per_lane_sample_stats")
        lane_toc_list = List()
        per_lane_sample_stats.add(lane_toc_list)
        # Create a section and table for each lane
        for data in _read_per_lane_sample_stats(
                per_lane_sample_stats_file):
            lane = data['lane']
            lane_section = per_lane_sample_stats.add_subsection(
                "Lane %d" % lane,
                name="per_lane_sample_stats_lane%d" % lane)
            # Check for problems
            has_warnings = False
            if not data['samples']:
                # No samples reported
                lane_section.add(
                    Para(WarningIcon(),
                         "No samples reported for this lane",
                         css_classes=('warning', )))
                has_warnings = True
            elif min([d['nreads'] for d in data['samples']]) == 0:
                # There are samples with no reads
                lane_section.add(
                    Para(WarningIcon(),
                         "One or more samples with no reads",
                         css_classes=('warning', )))
                has_warnings = True
            # Add link to lane for lane ToC
            link = Link("Lane %d" % lane, lane_section)
            if not has_warnings:
                lane_toc_list.add_item(link)
            else:
                lane_toc_list.add_item(WarningIcon(), link)
                status = False
            # Write out the data, if there is any
            if not data['samples']:
                continue
            max_reads = max([d['nreads'] for d in data['samples']])
            total_reads = data['total_reads']
            current_project = None
            tbl = Table(
                columns=('pname', 'sname', 'nreads', 'percreads',
                         'barplot'),
                pname='Project',
                sname='Sample',
                nreads='Nreads',
                percreads='%reads',
                barplot='',
            )
            lane_section.add(tbl)
            # Sort the sample data into order of sample name
            samples = sorted(
                data['samples'],
                key=lambda smpl: split_sample_name(smpl['sname']))
            # Write the table
            for sample in samples:
                pname = sample['pname']
                sname = sample['sname']
                nreads = sample['nreads']
                percreads = sample['percreads']
                # Only show the project name on its first row
                if pname == current_project:
                    pname = " "
                else:
                    current_project = pname
                barplot = ustackedbar((nreads, max_reads - nreads),
                                      length=100,
                                      height=5,
                                      colors=('black', 'lightgrey'),
                                      bbox=False,
                                      inline=True)
                if nreads == 0:
                    sname = Para(WarningIcon(), sname)
                tbl.add_row(pname=pname,
                            sname=sname,
                            nreads=pretty_print_reads(nreads),
                            percreads=percreads,
                            barplot=Img(barplot))
            tbl.add_row(pname="Total reads for lane %d" % lane,
                        nreads=pretty_print_reads(total_reads))
        # Add link to section from main ToC
        toc_list.add_item(
            Link("Per-lane statistics by sample",
                 per_lane_sample_stats),
            lane_toc_list)
    # Per fastq statistics
    stats_file = get_absolute_file_path(
        "statistics_full.info", base=analysis_dir.analysis_dir)
    if not os.path.exists(stats_file):
        if analysis_dir.params.stats_file is not None:
            stats_file = analysis_dir.params.stats_file
        else:
            stats_file = "statistics.info"
        stats_file = get_absolute_file_path(
            stats_file, base=analysis_dir.analysis_dir)
    if os.path.exists(stats_file):
        per_file_stats = processing_qc.add_section(
            "Per-file statistics by project", name="per_file_stats")
        project_toc_list = List()
        per_file_stats.add(project_toc_list)
        stats = TabFile(stats_file, first_line_is_header=True)
        projects = sorted(set([d['Project'] for d in stats]))
        # NB real lists are needed here as these are tested
        # for emptiness and traversed more than once
        lanes = [c for c in stats.header() if c.startswith('L')]
        for project in projects:
            # Get subset of lines for this project
            subset = sorted(
                [d for d in stats if d['Project'] == project],
                key=lambda l: split_sample_name(l['Sample']))
            # Work out which lanes are included i.e. have a
            # non-empty, non-zero value for at least one Fastq
            subset_lanes = [l for l in lanes
                            if any(bool(d[l]) for d in subset)]
            # Add a new section for this project
            project_section = per_file_stats.add_subsection(
                "%s" % project, name="per_file_stats_%s" % project)
            # Check for problems
            has_warnings = False
            for line in subset:
                nreads = [line[l] for l in subset_lanes
                          if line[l] != '']
                if not nreads or min(nreads) == 0:
                    project_section.add(
                        Para(WarningIcon(),
                             "One or more Fastqs with zero "
                             "read counts in one or lanes",
                             css_classes=('warning', )))
                    has_warnings = True
                    break
            # Add link to project from ToC
            link = Link("%s" % project, project_section)
            if not has_warnings:
                project_toc_list.add_item(link)
            else:
                project_toc_list.add_item(WarningIcon(), link)
                status = False
            # Build the data of data
            tbl = Table(columns=('Sample', 'Fastq', 'Size'))
            if subset_lanes:
                tbl.append_columns(*subset_lanes)
            tbl.append_columns('Barplot', 'Nreads')
            project_section.add(tbl)
            # Track the sample name so it is only shown on its
            # first row; reset for each project so that a name
            # shared with the previous project isn't blanked
            sample = None
            for line in subset:
                if sample == line['Sample']:
                    sname = " "
                else:
                    sample = line['Sample']
                    sname = sample
                data = {
                    'Sample': sname,
                    'Fastq': line['Fastq'],
                    'Size': line['Size'],
                    'Nreads': (pretty_print_reads(line['Nreads'])
                               if line['Nreads'] != '' else '')
                }
                for l in subset_lanes:
                    data[l] = (pretty_print_reads(line[l])
                               if line[l] != '' else '')
                nreads = [line[l] for l in subset_lanes
                          if line[l] != '']
                if not nreads:
                    nreads = [0, ]
                if min(nreads) == 0:
                    # Add warning icon to Fastq with no reads in
                    # at least one lane
                    data['Fastq'] = Para(WarningIcon(), data['Fastq'])
                barplot = ustackedbar(nreads,
                                      length=100,
                                      height=10,
                                      colors=('grey', 'lightgrey'),
                                      bbox=True,
                                      inline=True)
                data['Barplot'] = Img(barplot)
                tbl.add_row(**data)
        toc_list.add_item(
            Link("Per-file statistics by project", per_file_stats),
            project_toc_list)
    # Set the visibility of the warning header
    if status:
        warnings.add_css_classes("hide")
    # Add an non-visible section that the publisher can
    # read to determine if there were problems
    status_section = processing_qc.add_section(
        name="status", css_classes=("hide", ))
    status_section.add("Status: %s" % ('OK' if status else 'WARNINGS', ))
    # Write the processing QC summary file
    processing_qc.write(html_file)