def test_report_single_end(self):
    """
    report: single-end data
    """
    # Set up the analysis project (paired_end switched off) and load it
    project_dir = self._make_analysis_project(paired_end=False)
    projects = (AnalysisProject('PJB', project_dir),)
    # Write the report into the top-level test directory
    html_report = os.path.join(self.top_dir, 'report.SE.html')
    report(projects, filename=html_report)
    # The report file should now exist
    self.assertTrue(os.path.exists(html_report))
def test_report_single_end_no_seq_lens(self):
    """
    report: single-end data: no sequence lengths
    """
    # Project is created with the 'standardSE' protocol and with
    # include_seqlens turned off
    settings = dict(protocol='standardSE', include_seqlens=False)
    project_dir = self._make_analysis_project(**settings)
    projects = (AnalysisProject(project_dir),)
    # Write the report into the top-level test directory
    html_report = os.path.join(self.top_dir, 'report.SE.html')
    report(projects, filename=html_report)
    # The report file should now exist
    self.assertTrue(os.path.exists(html_report))
def test_report_paired_end_with_legacy_screens(self):
    """
    report: paired-end data with legacy screen names
    """
    # Paired-end project created with the legacy_screens option on
    project_dir = self._make_analysis_project(paired_end=True,
                                              legacy_screens=True)
    projects = (AnalysisProject('PJB', project_dir),)
    # Write the report into the top-level test directory
    html_report = os.path.join(self.top_dir, 'report.PE.html')
    report(projects, filename=html_report)
    # The report file should now exist
    self.assertTrue(os.path.exists(html_report))
def test_report_paired_end_with_no_fastq_dir(self):
    """
    report: paired-end data with no fastq dir
    """
    # Paired-end project created with fastq_dir set to "."
    project_dir = self._make_analysis_project(paired_end=True,
                                              fastq_dir=".")
    projects = (AnalysisProject('PJB', project_dir),)
    # Write the report into the top-level test directory
    html_report = os.path.join(self.top_dir, 'report.PE.html')
    report(projects, filename=html_report)
    # The report file should now exist
    self.assertTrue(os.path.exists(html_report))
def test_report_paired_end_with_non_default_qc_dir(self):
    """
    report: paired-end data with non-default QC dir
    """
    # The same QC directory name is used both to build the project
    # and to generate the report
    qc_dir_name = "qc.non_default"
    project_dir = self._make_analysis_project(paired_end=True,
                                              qc_dir=qc_dir_name)
    projects = (AnalysisProject('PJB', project_dir),)
    # Write the report into the top-level test directory
    html_report = os.path.join(self.top_dir, 'report.PE.html')
    report(projects, filename=html_report, qc_dir=qc_dir_name)
    # The report file should now exist
    self.assertTrue(os.path.exists(html_report))
def test_report_paired_end_cellranger_multi(self):
    """
    report: paired-end data with cellranger 'multi'
    """
    # Project set up for the '10x_CellPlex' protocol, including
    # cellranger 'multi' outputs for two samples
    settings = dict(protocol='10x_CellPlex',
                    include_cellranger_multi=True,
                    cellranger_multi_samples=('PJB_CML1', 'PJB_CML2'))
    project_dir = self._make_analysis_project(**settings)
    projects = (AnalysisProject(project_dir),)
    # Write the report into the top-level test directory
    html_report = os.path.join(self.top_dir, 'report.PE.html')
    report(projects, filename=html_report)
    # The report file should now exist
    self.assertTrue(os.path.exists(html_report))
def test_report_paired_end_cellranger_count_multiome(self):
    """
    report: paired-end data with cellranger 'count' (multiome)
    """
    # Project set up for the '10x_Multiome_GEX' protocol, including
    # 'cellranger-arc' count outputs for two samples
    settings = dict(protocol='10x_Multiome_GEX',
                    include_cellranger_count=True,
                    cellranger_pipelines=('cellranger-arc',),
                    cellranger_samples=('PJB1', 'PJB2'))
    project_dir = self._make_analysis_project(**settings)
    projects = (AnalysisProject(project_dir),)
    # Write the report into the top-level test directory
    html_report = os.path.join(self.top_dir, 'report.PE.html')
    report(projects, filename=html_report)
    # The report file should now exist
    self.assertTrue(os.path.exists(html_report))
def test_report_single_end_with_data_dir(self):
    """
    report: single-end data: use data directory
    """
    project_dir = self._make_analysis_project(paired_end=False)
    projects = (AnalysisProject('PJB', project_dir),)
    # Locations of the report and the directories checked below
    out_dir = os.path.join(self.top_dir, 'PJB')
    html_report = os.path.join(out_dir, 'report.SE.html')
    data_dir = os.path.join(out_dir, 'report.SE_data')
    project_data_dir = os.path.join(data_dir, 'Test_PJB')
    qc_data_dir = os.path.join(project_data_dir, 'qc')
    # Generate the report with the data directory switched on
    report(projects, filename=html_report, use_data_dir=True)
    # Report and data directories should exist
    self.assertTrue(os.path.exists(html_report))
    self.assertTrue(os.path.isdir(qc_data_dir))
    self.assertTrue(os.path.isdir(data_dir))
    # Check the QC artefacts in the data directory
    contents = os.listdir(qc_data_dir)
    print(contents)
    # Expected artefacts: same set for each Fastq, built in Fastq order
    artefact_suffixes = ('fastqc.html',
                         'screen_model_organisms.png',
                         'screen_model_organisms.txt',
                         'screen_other_organisms.png',
                         'screen_other_organisms.txt',
                         'screen_rRNA.png',
                         'screen_rRNA.txt')
    expected = ["%s_%s" % (fastq, suffix)
                for fastq in ('PJB1_S1_R1_001', 'PJB2_S2_R1_001')
                for suffix in artefact_suffixes]
    for name in expected:
        self.assertTrue(name in contents,
                        "%s is missing from data dir" % name)
    # MultiQC report should sit alongside the 'qc' subdirectory
    self.assertTrue(
        os.path.exists(os.path.join(project_data_dir,
                                    'multiqc_report.html')),
        "Missing multiqc_report.html")
def test_report_paired_end_with_non_canonical_fastq_names(self):
    """
    report: paired-end data with non-canonical fastq names
    """
    # Fastq names carry an extra '_paired' element before the extension
    fastq_names = ("PJB1_S1_R1_001_paired.fastq.gz",
                   "PJB1_S1_R2_001_paired.fastq.gz",
                   "PJB2_S2_R1_001_paired.fastq.gz",
                   "PJB2_S2_R2_001_paired.fastq.gz")
    project_dir = self._make_analysis_project(paired_end=True,
                                              fastq_names=fastq_names)
    projects = (AnalysisProject('PJB', project_dir),)
    # Write the report into the top-level test directory
    html_report = os.path.join(self.top_dir, 'report.non_canonical.html')
    report(projects, filename=html_report)
    # The report file should now exist
    self.assertTrue(os.path.exists(html_report))
def test_report_paired_end_legacy_cellranger_count(self):
    """
    report: paired-end data with cellranger 'count' (legacy)
    """
    # Project set up for the '10x_scRNAseq' protocol, including
    # 'cellranger' count outputs with legacy_cellranger_outs on
    settings = dict(protocol='10x_scRNAseq',
                    paired_end=True,
                    include_cellranger_count=True,
                    cellranger_pipelines=('cellranger',),
                    cellranger_samples=('PJB1', 'PJB2'),
                    legacy_cellranger_outs=True)
    project_dir = self._make_analysis_project(**settings)
    projects = (AnalysisProject(project_dir),)
    # Write the report into the top-level test directory
    html_report = os.path.join(self.top_dir, 'report.PE.html')
    report(projects, filename=html_report)
    # The report file should now exist
    self.assertTrue(os.path.exists(html_report))
def test_report_single_end_multiple_projects(self):
    """
    report: single-end data: two projects in one report
    """
    # Create both project directories first, then load them
    first_dir = self._make_analysis_project(name="PJB",
                                            paired_end=False)
    second_dir = self._make_analysis_project(name="PJB2",
                                             paired_end=False)
    first_project = AnalysisProject('PJB', first_dir)
    second_project = AnalysisProject('PJB2', second_dir)
    # Single report covering both projects
    html_report = os.path.join(self.top_dir,
                               'report.multiple_projects.html')
    report((first_project, second_project),
           title="QC report: PJB & PJB2",
           filename=html_report)
    # The report file should now exist
    self.assertTrue(os.path.exists(html_report))
def test_report_single_end_make_zip_file_with_data_dir(self):
    """
    report: single-end data: make ZIP file with data directory
    """
    analysis_dir = self._make_analysis_project(paired_end=False)
    project = AnalysisProject('PJB', analysis_dir)
    # Report, data directory and ZIP archive all go into a 'PJB'
    # subdirectory of the test area
    out_dir = os.path.join(self.top_dir, 'PJB')
    html_report = os.path.join(out_dir, 'report.SE.html')
    zip_file = os.path.join(out_dir, 'report.SE.PJB.zip')
    report((project,),
           filename=html_report,
           use_data_dir=True,
           make_zip=True)
    # Check the outputs exist
    self.assertTrue(os.path.exists(html_report))
    self.assertTrue(os.path.isdir(os.path.join(out_dir,
                                               'report.SE_data')))
    self.assertTrue(os.path.exists(zip_file))
    # Read the archive listing; the context manager makes sure the
    # ZIP file handle is closed again (previously it was left open)
    with zipfile.ZipFile(zip_file) as archive:
        contents = archive.namelist()
    print(contents)
    # Expected archive members: the report, then for the project its
    # MultiQC report followed by the QC artefacts for each Fastq
    data_prefix = 'report.SE.PJB/report.SE_data/Test_PJB'
    artefact_suffixes = ('fastqc.html',
                         'screen_model_organisms.png',
                         'screen_model_organisms.txt',
                         'screen_other_organisms.png',
                         'screen_other_organisms.txt',
                         'screen_rRNA.png',
                         'screen_rRNA.txt')
    expected = ['report.SE.PJB/report.SE.html',
                '%s/multiqc_report.html' % data_prefix]
    expected.extend('%s/qc/%s_%s' % (data_prefix, fastq, suffix)
                    for fastq in ('PJB1_S1_R1_001', 'PJB2_S2_R1_001')
                    for suffix in artefact_suffixes)
    for f in expected:
        self.assertIn(f, contents, "%s is missing from ZIP file" % f)
def test_report_single_end_multiple_projects_with_zip_file_duplicated_names_with_data_dir(
        self):
    """
    report: single-end data: two projects with duplicated names in one report, with ZIP file, with data directory
    """
    analysis_dir = self._make_analysis_project(name="PJB",
                                               paired_end=False)
    analysis_dir2 = self._make_analysis_project(name="PJB2",
                                                paired_end=False)
    project = AnalysisProject('PJB', analysis_dir)
    project2 = AnalysisProject('PJB2', analysis_dir2)
    # Report and ZIP archive go into a 'PJB' subdirectory of the
    # test area
    out_dir = os.path.join(self.top_dir, 'PJB')
    html_report = os.path.join(out_dir, 'report.multiple_projects.html')
    zip_file = os.path.join(out_dir, 'report.multiple_projects.PJB.zip')
    report((project, project2),
           title="QC report: PJB & PJB2",
           filename=html_report,
           use_data_dir=True,
           make_zip=True)
    # Check the outputs exist
    self.assertTrue(os.path.exists(html_report))
    self.assertTrue(os.path.exists(zip_file))
    # Read the archive listing; the context manager makes sure the
    # ZIP file handle is closed again (previously it was left open)
    with zipfile.ZipFile(zip_file) as archive:
        contents = archive.namelist()
    print(contents)
    # Expected archive members: the report, then for each project
    # its MultiQC report followed by the QC artefacts for each Fastq
    zip_top = 'report.multiple_projects.PJB'
    artefact_suffixes = ('fastqc.html',
                         'screen_model_organisms.png',
                         'screen_model_organisms.txt',
                         'screen_other_organisms.png',
                         'screen_other_organisms.txt',
                         'screen_rRNA.png',
                         'screen_rRNA.txt')
    expected = ['%s/report.multiple_projects.html' % zip_top]
    for project_subdir in ('Test_PJB', 'Test_PJB2'):
        data_prefix = '%s/report.multiple_projects_data/%s' % (
            zip_top, project_subdir)
        expected.append('%s/multiqc_report.html' % data_prefix)
        expected.extend('%s/qc/%s_%s' % (data_prefix, fastq, suffix)
                        for fastq in ('PJB1_S1_R1_001',
                                      'PJB2_S2_R1_001')
                        for suffix in artefact_suffixes)
    for f in expected:
        self.assertIn(f, contents, "%s is missing from ZIP file" % f)
def main():
    """
    Command line driver: verify QC outputs and generate QC reports

    Parses the command line, resolves each supplied directory to an
    analysis project plus a QC directory, verifies the QC outputs
    for each project and then (unless '--verify' was specified)
    writes a single HTML QC report covering the projects that passed
    verification (or all projects with Fastqs issues ignored, when
    '--force' is specified). Optionally runs MultiQC as well.

    The function always terminates via 'sys.exit':

    - exits immediately with code 1 if MultiQC was requested but the
      'multiqc' program can't be found, or if a parent project can't
      be located for a QC directory
    - otherwise the exit code is 0 on success, and non-zero if any
      project failed verification (without '--force') or if running
      MultiQC failed
    """
    # Deal with command line
    parser = argparse.ArgumentParser(
        description="Generate QC report for each directory DIR")
    parser.add_argument('-v', '--version', action='version',
                        version="%(prog)s " + __version__)
    parser.add_argument('--protocol', action='store', dest='qc_protocol',
                        default=None,
                        help="explicitly specify QC protocol (must be "
                        "one of %s). Default is to determine the "
                        "protocol automatically (recommended)" %
                        str(','.join(["'%s'" % pr for pr in PROTOCOLS])))
    # NB default must be None so that an explicitly supplied --qc_dir
    # can be distinguished from the option not being given at all
    parser.add_argument('--qc_dir', action='store', dest='qc_dir',
                        default=None,
                        help="explicitly specify QC output directory "
                        "(nb if supplied then the same QC_DIR will be "
                        "used for each DIR. Non-absolute paths are "
                        "assumed to be relative to DIR). Default: 'qc'")
    parser.add_argument('--fastq_dir', action='store', dest='fastq_dir',
                        default=None,
                        help="explicitly specify subdirectory of DIRs "
                        "with Fastq files to run the QC on")
    reporting = parser.add_argument_group('Reporting options')
    reporting.add_argument('-t', '--title', action='store', dest='title',
                           default=None,
                           help="title for output QC reports")
    reporting.add_argument('-f', '--filename', action='store',
                           dest='filename', default=None,
                           help="file name for output HTML QC report "
                           "(default: <DIR>/<QC_DIR>_report.html)")
    reporting.add_argument('--zip', action='store_true', dest='zip',
                           default=False,
                           help="make ZIP archive for the QC report")
    reporting.add_argument('--multiqc', action='store_true',
                           dest='multiqc', default=False,
                           help="generate MultiQC report")
    reporting.add_argument('--force', action='store_true', dest='force',
                           default=False,
                           help="force generation of reports even if "
                           "verification fails")
    data_dir_group = reporting.add_mutually_exclusive_group()
    data_dir_group.add_argument('--data-dir', action='store_true',
                                dest='use_data_dir',
                                help="create a data directory with "
                                "copies of QC artefacts needed for the "
                                "HTML report (NB data directory will "
                                "always be created for multi-project "
                                "reports, unless --no-data-dir is "
                                "specified)")
    data_dir_group.add_argument('--no-data-dir', action='store_true',
                                dest='no_data_dir',
                                help="don't create a data directory "
                                "with copies of QC artefacts (this is "
                                "the default except for multi-project "
                                "reports)")
    verification = parser.add_argument_group('Verification options')
    verification.add_argument('--verify', action='store_true',
                              dest='verify',
                              help="verify the QC products only (don't "
                              "write the report); returns exit code 0 "
                              "if QC is verified, 1 if not")
    deprecated = parser.add_argument_group('Deprecated options')
    deprecated.add_argument('-l', '--list-unverified',
                            action='store_true', dest='list_unverified',
                            default=False,
                            help="deprecated: does nothing (Fastqs with "
                            "missing QC outputs can no longer be listed)")
    deprecated.add_argument('--strand_stats', action='store_true',
                            dest='fastq_strand', default=False,
                            help="deprecated: does nothing (strand stats "
                            "are automatically included if present)")
    parser.add_argument('dirs', metavar="DIR", nargs='+',
                        help="directory to report QC for; can be a "
                        "project directory (in which case the default QC "
                        "directory will be reported), or a QC directory "
                        "within a project")
    args = parser.parse_args()

    # Report name and version
    print("%s version %s" % (os.path.basename(sys.argv[0]), __version__))

    # Report arguments (quoting any that contain spaces)
    if sys.argv[1:]:
        print("\n%s" % ' '.join(
            ['"%s"' % arg if ' ' in arg else arg
             for arg in sys.argv[1:]]))

    # Report working directory
    print("\nCWD %s" % os.getcwd())

    # Check for MultiQC if required
    if args.multiqc:
        if find_program("multiqc") is None:
            logging.critical("MultiQC report requested but 'multiqc' "
                             "not available")
            sys.exit(1)

    # Get projects and QC dirs from supplied directories
    projects = []
    for d in args.dirs:
        print("\n**** Examining directory %s ****" % d)
        # Check if directory is a QC dir
        qc_dir = None
        # Look for 'qc.info' in current directory
        if os.path.exists(os.path.join(os.path.abspath(d), 'qc.info')):
            print("...located 'qc.info', assuming this is QC dir")
            qc_dir = os.path.abspath(d)
            # Locate parent project dir
            metadata_file = locate_project_info_file(qc_dir)
            if metadata_file is not None:
                project = AnalysisProject(os.path.dirname(metadata_file))
                print("...located parent project: %s" % project.dirn)
            else:
                # Unable to locate project directory
                print("...failed to locate parent project metadata file")
                # Fall back to location of Fastq files
                qc_info = AnalysisProjectQCDirInfo(
                    os.path.join(qc_dir, 'qc.info'))
                if qc_info.fastq_dir is not None:
                    project_dir = os.path.abspath(qc_info.fastq_dir)
                    if os.path.basename(project_dir).startswith('fastqs'):
                        # Use the next level up
                        project_dir = os.path.dirname(project_dir)
                    print("...putative parent project dir: %s (from "
                          " Fastq dir)" % project_dir)
                    project = AnalysisProject(project_dir)
                else:
                    # Failed to locate Fastqs
                    logging.critical("Unable to locate parent project")
                    # Exit with an error
                    sys.exit(1)
            # Issue a warning if a QC dir was explicitly
            # specified on the command line
            if args.qc_dir is not None:
                logging.warning("--qc_dir has been ignored for this "
                                "directory")
        else:
            # Assume directory is a project
            project = AnalysisProject(os.path.abspath(d))
            print("...assuming this is a project dir")
            # Identify the QC directory: explicit --qc_dir takes
            # precedence, then the project's own QC directory, then
            # the documented default name
            if args.qc_dir is None:
                qc_dir = project.qc_dir
                if qc_dir is None:
                    qc_dir = 'qc'
            else:
                qc_dir = args.qc_dir
            if not os.path.isabs(qc_dir):
                qc_dir = os.path.join(project.dirn, qc_dir)
        print("...QC directory: %s" % qc_dir)
        # Explicitly set the QC directory location
        project.use_qc_dir(qc_dir)
        # Locate the Fastq dir
        qc_info = project.qc_info(qc_dir)
        if args.fastq_dir is None:
            fastq_dir = qc_info.fastq_dir
            if fastq_dir is None:
                fastq_dir = project.fastq_dir
        else:
            fastq_dir = args.fastq_dir
            if qc_info.fastq_dir is not None:
                if os.path.join(project.dirn,
                                qc_info.fastq_dir) != fastq_dir:
                    logging.warning("Stored fastq dir mismatch "
                                    "(%s != %s)" % (fastq_dir,
                                                    qc_info.fastq_dir))
        # Switch to the selected Fastq dir before reporting it, so
        # that the message shows the directory actually in use
        project.use_fastq_dir(fastq_dir, strict=False)
        print("...using Fastqs dir: %s" % project.fastq_dir)
        projects.append(project)

    # Verify QC for projects
    print("\n**** Verifying QC ****")
    retval = 0
    report_projects = []
    for project in projects:
        print("\nProject: %s" % project.name)
        print("-" * (len('Project: ') + len(project.name)))
        print("%d sample%s | %d fastq%s" % (
            len(project.samples),
            's' if len(project.samples) != 1 else '',
            len(project.fastqs),
            's' if len(project.fastqs) != 1 else '',
        ))
        # QC metadata
        qc_dir = project.qc_dir
        qc_info = project.qc_info(qc_dir)
        # Set QC protocol for verification: command line first, then
        # the stored protocol, then work it out from the project
        if args.qc_protocol is None:
            protocol = qc_info.protocol
            if protocol is None:
                protocol = determine_qc_protocol(project)
        else:
            protocol = args.qc_protocol
        print("Verifying against QC protocol '%s'" % protocol)
        # Verification step
        if len(project.fastqs) == 0:
            logging.critical("No Fastqs!")
            verified = False
        else:
            try:
                verified = verify_project(project, qc_dir, protocol)
            except Exception as ex:
                # Any failure inside verification is treated the same
                # as the QC not being verified
                logging.critical("Error: %s" % ex)
                verified = False
        if not verified:
            print("Verification: FAILED")
            if not args.force:
                retval = 1
                continue
            print("--force specified, ignoring previous errors")
        else:
            print("Verification: OK")
        if args.verify:
            # Verification only: don't include in reporting
            continue
        report_projects.append(project)

    # Generate QC report
    if report_projects:
        # Set defaults from primary project
        primary = report_projects[0]
        qc_base = os.path.basename(primary.qc_dir)
        # Filename and location for report
        if args.filename is None:
            out_file = '%s_report.html' % qc_base
        else:
            out_file = args.filename
        if not os.path.isabs(out_file):
            out_file = os.path.join(primary.dirn, out_file)
        out_dir = os.path.dirname(out_file)
        # MultiQC report
        if args.multiqc:
            multiqc_report = os.path.join(out_dir,
                                          "multi%s_report.html" % qc_base)
            # Check if we need to rerun MultiQC
            if os.path.exists(multiqc_report) and not args.force:
                # Rerun only if some input is newer than the report
                multiqc_mtime = os.path.getmtime(multiqc_report)
                run_multiqc = any(
                    os.path.getmtime(
                        os.path.join(project.qc_dir, f)) > multiqc_mtime
                    for project in report_projects
                    for f in os.listdir(project.qc_dir))
            else:
                run_multiqc = True
            # (Re)run MultiQC
            if run_multiqc:
                multiqc_args = ['multiqc']
                if args.title is not None:
                    # Only pass a title if one was supplied (otherwise
                    # the literal string 'None' would be used)
                    multiqc_args.extend(['--title', '%s' % args.title])
                multiqc_args.extend(['--filename', '%s' % multiqc_report,
                                     '--force'])
                multiqc_cmd = Command(*multiqc_args)
                for project in report_projects:
                    multiqc_cmd.add_args(project.qc_dir)
                print("\nRunning %s" % multiqc_cmd)
                multiqc_retval = multiqc_cmd.run_subprocess()
                if multiqc_retval == 0 and os.path.exists(multiqc_report):
                    print("MultiQC: %s\n" % multiqc_report)
                else:
                    print("MultiQC: FAILED")
                    retval += 1
            else:
                print("MultiQC: %s (already exists)\n" % multiqc_report)
        # Create data directory? On by default when more than one
        # directory was supplied; explicit options override this
        use_data_dir = (len(projects) > 1)
        if args.use_data_dir:
            use_data_dir = True
        elif args.no_data_dir:
            use_data_dir = False
        # Generate report
        report(report_projects,
               title=args.title,
               filename=out_file,
               relative_links=True,
               use_data_dir=use_data_dir,
               make_zip=args.zip)
        print("Wrote QC report to %s" % out_file)

    # Finish with appropriate exit code
    print("%s completed: exit code %s (%s)" %
          (os.path.basename(sys.argv[0]),
           retval,
           ('ok' if retval == 0 else 'error')))
    sys.exit(retval)