def test_analyse_barcodes_with_bcl2fastq_dir_and_bad_samplesheet(self):
    """
    AnalyseBarcodes: raise exception for bcl2fastq directory as input using 'bad' samplesheet

    The sample sheet mixes samples with and without index
    sequences, and running the pipeline with it is expected
    to raise an exception.
    """
    # Make a mock bcl2fastq output directory
    run_dir = os.path.join(self.wd,
                           "200428_M00879_0087_000000000-AGEW9")
    datadir = MockIlluminaData(run_dir,
                               "bcl2fastq2",
                               unaligned_dir="bcl2fastq",
                               paired_end=True)
    datadir.add_fastq_batch("AB", "AB1", "AB1_S1")
    datadir.add_fastq_batch("AB", "AB2", "AB2_S2")
    datadir.add_fastq_batch("CDE", "CDE3", "CDE3_S3")
    datadir.add_fastq_batch("CDE", "CDE4", "CDE4_S4")
    datadir.add_fastq_batch("", "Undetermined", "Undetermined_S0")
    datadir.create()
    # Add data to Fastq files
    self._insert_fastq_reads(run_dir)
    # Create "bad" sample sheet with mixture of empty and
    # non-empty indices in a lane
    sample_sheet = os.path.join(self.wd, "custom_SampleSheet.csv")
    with open(sample_sheet, 'w') as fp:
        fp.write("""[Data]
Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,I5_Index_ID,index2,Sample_Project,Description
AB1,AB1,,,D701,CGTGTAGG,D501,GACCTGAA,AB,
AB2,AB2,,,D702,CGTGTAGG,D501,ATGTAACT,AB,
CDE3,CDE3,,,,,,,CDE,
CDE4,CDE4,,,,,,,CDE,
""")
    # Set up and run pipeline
    p = AnalyseBarcodes(bcl2fastq_dir=os.path.join(run_dir, "bcl2fastq"))
    # The bound method of the pipeline instance must be invoked:
    # passing the unbound 'AnalyseBarcodes.run' would bind the
    # output path as 'self' and fail for reasons unrelated to
    # the sample sheet, making the test pass vacuously
    self.assertRaises(Exception,
                      p.run,
                      os.path.join(self.wd, "barcode_analysis"),
                      sample_sheet=sample_sheet,
                      working_dir=self.wd,
                      poll_interval=POLL_INTERVAL)
def main(self, args):
    """
    Internal: provides mock bcl2fastq2 functionality

    Prints a bcl2fastq-style banner and diagnostic messages,
    then builds a mock output directory whose Fastq names are
    predicted from the sample sheet (if one is supplied or
    found in the run folder) plus 'Undetermined' Fastqs.

    Arguments:
      args (list): command line arguments as a list of
        strings (e.g. ['--runfolder-dir', '/path/to/run'])

    Returns:
      Integer: the instance's '_exit_code' after handling a
        '--version' request or completing the mock run; 1 if
        no run folder was specified or if the run folder has
        no 'RunInfo.xml' file.

    Raises:
      AssertionError: if '_assert_bases_mask' is set on the
        instance and differs from the supplied
        '--use-bases-mask' value.
    """
    # Build generic header
    header = """BCL to FASTQ file converter
bcl2fastq v2.17.1.14
Copyright (c) 2007-2015 Illumina, Inc.
2015-12-17 14:08:00 [7fa113f3f780] Command-line invocation: bcl2fastq %s""" \
        % ' '.join(args)
    # Handle version request
    # NB single-argument 'print(...)' calls are used throughout
    # as they behave identically under Python 2 and Python 3
    if "--version" in args:
        print(header)
        return self._exit_code
    # Deal with arguments
    p = argparse.ArgumentParser()
    p.add_argument("--runfolder-dir", action="store")
    p.add_argument("--output-dir", action="store")
    p.add_argument("--sample-sheet", action="store")
    p.add_argument("--use-bases-mask", action="store")
    p.add_argument("--barcode-mismatches", action="store")
    p.add_argument("--minimum-trimmed-read-length", action="store")
    p.add_argument("--mask-short-adapter-reads", action="store")
    p.add_argument("--ignore-missing-bcls", action="store_true")
    p.add_argument("--no-lane-splitting", action="store_true")
    p.add_argument("-r", action="store")
    p.add_argument("-d", action="store")
    p.add_argument("-p", action="store")
    p.add_argument("-w", action="store")
    opts = p.parse_args(args)
    # Check bases mask
    if self._assert_bases_mask:
        print("Checking bases mask: %s" % opts.use_bases_mask)
        assert (opts.use_bases_mask == self._assert_bases_mask)
    # Platform
    print("Platform (default): %s" % self._platform)
    # Run folder (input data)
    runfolder = opts.runfolder_dir
    print("Runfolder dir: %s" % runfolder)
    if runfolder is None:
        return 1
    run_info_xml = os.path.join(runfolder, "RunInfo.xml")
    if not os.path.exists(run_info_xml):
        return 1
    # Determine if run is paired end (i.e. has exactly two
    # non-index reads)
    nreads = 0
    for r in IlluminaRunInfo(run_info_xml).reads:
        if r['is_indexed_read'] == 'N':
            nreads += 1
    paired_end = (nreads == 2)
    print("Paired-end: %s" % paired_end)
    # Lanes
    lanes = IlluminaRun(runfolder, platform=self._platform).lanes
    # Wrap in a 1-tuple so formatting can't fail if 'lanes'
    # is itself a tuple
    print("Lanes: %s" % (lanes,))
    # Output folder
    output_dir = opts.output_dir
    if output_dir is None:
        output_dir = "bcl2fastq"
    print("Output dir: %s" % output_dir)
    # Sample sheet: if not given explicitly then look in the
    # run folder and then in its BaseCalls subdirectory
    sample_sheet = opts.sample_sheet
    if sample_sheet is None:
        for d in (runfolder,
                  os.path.join(runfolder,
                               "Data",
                               "Intensities",
                               "BaseCalls")):
            candidate = os.path.join(d, "SampleSheet.csv")
            if os.path.exists(candidate):
                sample_sheet = candidate
                break
    print("Sample sheet: %s" % sample_sheet)
    # Modifiers
    no_lane_splitting = bool(opts.no_lane_splitting)
    print("No lane splitting: %s" % no_lane_splitting)
    # Generate mock output based on inputs
    tmpname = "tmp.%s" % uuid.uuid4()
    output = MockIlluminaData(name=tmpname,
                              package="bcl2fastq2",
                              unaligned_dir="bcl2fastq")
    missing_fastqs = self._missing_fastqs
    # Add outputs from sample sheet (if supplied)
    if sample_sheet is not None:
        s = SampleSheetPredictor(sample_sheet_file=sample_sheet)
        s.set(paired_end=paired_end,
              no_lane_splitting=no_lane_splitting,
              lanes=lanes)
        for project in s.projects:
            print("Adding project: %s" % project.name)
            for sample in project.samples:
                # Fall back to the sample ID when no sample
                # name is set
                if sample.sample_name is None:
                    sample_name = sample.sample_id
                else:
                    sample_name = sample.sample_name
                for fq in sample.fastqs():
                    # Skip Fastqs flagged as 'missing'
                    if missing_fastqs and (fq in missing_fastqs):
                        continue
                    output.add_fastq(project.name, sample_name, fq)
    # Add undetermined fastqs
    # NB Would like to use the 'add_undetermined'
    # method but this doesn't play well with using
    # the predictor-based approach above
    if paired_end:
        reads = (1, 2)
    else:
        reads = (1, )
    if no_lane_splitting:
        lanes = None
    for r in reads:
        if lanes is None:
            output.add_fastq(
                "Undetermined_indices",
                "undetermined",
                "Undetermined_S0_R%d_001.fastq.gz" % r)
        else:
            for lane in lanes:
                output.add_fastq(
                    "Undetermined_indices",
                    "undetermined",
                    "Undetermined_S0_L%03d_R%d_001.fastq.gz"
                    % (lane, r))
    # Build the output directory
    output.create()
    # Move to final location, always removing the temporary
    # directory even if the move fails
    try:
        os.rename(os.path.join(tmpname, "bcl2fastq"), output_dir)
    finally:
        shutil.rmtree(tmpname)
    return self._exit_code
def test_analyse_barcodes_with_bcl2fastq_dir_no_samplesheet(self):
    """
    AnalyseBarcodes: bcl2fastq directory as input (no samplesheet)

    Runs the pipeline against a mock bcl2fastq output directory
    without supplying a sample sheet, then checks the exit code,
    the output files and the report contents.
    """
    run_dir = os.path.join(self.wd,
                           "200428_M00879_0087_000000000-AGEW9")
    analysis_dir = os.path.join(self.wd, "barcode_analysis")
    # Build the mock bcl2fastq outputs
    mock_data = MockIlluminaData(run_dir,
                                 "bcl2fastq2",
                                 unaligned_dir="bcl2fastq",
                                 paired_end=True)
    for project, sample, prefix in (
            ("AB", "AB1", "AB1_S1"),
            ("AB", "AB2", "AB2_S2"),
            ("CDE", "CDE3", "CDE3_S3"),
            ("CDE", "CDE4", "CDE4_S4"),
            ("", "Undetermined", "Undetermined_S0")):
        mock_data.add_fastq_batch(project, sample, prefix)
    mock_data.create()
    # Populate the Fastq files with reads
    self._insert_fastq_reads(run_dir)
    # Configure the pipeline and execute it
    pipeline = AnalyseBarcodes(
        bcl2fastq_dir=os.path.join(run_dir, "bcl2fastq"))
    status = pipeline.run(analysis_dir,
                          working_dir=self.wd,
                          poll_interval=POLL_INTERVAL)
    # Verify exit status and output directories
    self.assertEqual(status, 0)
    self.assertTrue(os.path.isdir(analysis_dir),
                    "Missing dir: barcode_analysis")
    self.assertTrue(os.path.isdir(os.path.join(analysis_dir, "counts")),
                    "Missing dir: barcode_analysis/counts")
    # Verify the per-Fastq counts files
    for f in ("AB.AB1_S1_L001_R1_001.fastq.gz.counts",
              "AB.AB2_S2_L001_R1_001.fastq.gz.counts",
              "CDE.CDE3_S3_L001_R1_001.fastq.gz.counts",
              "CDE.CDE4_S4_L001_R1_001.fastq.gz.counts",
              "__undetermined__.Undetermined_S0_L001_R1_001.fastq.gz.counts"):
        counts_file = os.path.join(analysis_dir, "counts", f)
        self.assertTrue(os.path.isfile(counts_file),
                        "Missing file: %s" % f)
    # Verify the report files
    for f in ("barcodes.report", "barcodes.xls", "barcodes.html"):
        self.assertTrue(os.path.isfile(os.path.join(analysis_dir, f)),
                        "Missing file: %s" % f)
    # Check that the report content is non-trivial
    barcodes_report = os.path.join(analysis_dir, "barcodes.report")
    with open(barcodes_report, 'rt') as fp:
        contents = fp.read()
    for expected in (
            "Barcode analysis for lane #1",
            "#Rank\tIndex\tSample\tN_seqs\tN_reads\t%reads\t(%Total_reads)"):
        self.assertTrue(expected in contents)
    # Expect 12 lines of content in total
    self.assertEqual(contents.count('\n'), 12)
def test_analyse_barcodes_with_samplesheet_and_10x_indices(self):
    """
    AnalyseBarcodes: sample sheet with 10xGenomics indices

    The pipeline is instantiated with the sample sheet before the
    bcl2fastq directory exists; the directory is then created and
    supplied when the pipeline is run.
    """
    run_dir = os.path.join(self.wd,
                           "200428_M00879_0087_000000000-AGEW9")
    analysis_dir = os.path.join(self.wd, "barcode_analysis")
    # Write the sample sheet
    sample_sheet = os.path.join(self.wd, "custom_SampleSheet.csv")
    with open(sample_sheet, 'w') as fp:
        fp.write("""[Data]
Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,Sample_Project,Description
AB1,AB1,,,D501,SI-GA-A2,AB,
AB2,AB2,,,D501,SI-GA-B2,AB,
CDE3,CDE3,,,D501,SI-GA-C2,CDE,
CDE4,CDE4,,,D501,SI-GA-D2,CDE,
""")
    # Instantiate the pipeline while the bcl2fastq directory
    # is still absent
    pipeline = AnalyseBarcodes(sample_sheet=sample_sheet)
    # Now build the mock bcl2fastq outputs
    mock_data = MockIlluminaData(run_dir,
                                 "bcl2fastq2",
                                 unaligned_dir="bcl2fastq",
                                 paired_end=True)
    for project, sample, prefix in (
            ("AB", "AB1", "AB1_S1"),
            ("AB", "AB2", "AB2_S2"),
            ("CDE", "CDE3", "CDE3_S3"),
            ("CDE", "CDE4", "CDE4_S4"),
            ("", "Undetermined", "Undetermined_S0")):
        mock_data.add_fastq_batch(project, sample, prefix)
    mock_data.create()
    # Populate the Fastq files with reads
    self._insert_fastq_reads(run_dir)
    # Execute the pipeline
    status = pipeline.run(analysis_dir,
                          bcl2fastq_dir=os.path.join(run_dir, "bcl2fastq"),
                          working_dir=self.wd,
                          poll_interval=POLL_INTERVAL)
    # Verify exit status and output directories
    self.assertEqual(status, 0)
    self.assertTrue(os.path.isdir(analysis_dir),
                    "Missing dir: barcode_analysis")
    self.assertTrue(os.path.isdir(os.path.join(analysis_dir, "counts")),
                    "Missing dir: barcode_analysis/counts")
    # Verify the per-Fastq counts files
    for f in ("AB.AB1_S1_L001_R1_001.fastq.gz.counts",
              "AB.AB2_S2_L001_R1_001.fastq.gz.counts",
              "CDE.CDE3_S3_L001_R1_001.fastq.gz.counts",
              "CDE.CDE4_S4_L001_R1_001.fastq.gz.counts",
              "__undetermined__.Undetermined_S0_L001_R1_001.fastq.gz.counts"):
        counts_file = os.path.join(analysis_dir, "counts", f)
        self.assertTrue(os.path.isfile(counts_file),
                        "Missing file: %s" % f)
    # Verify the report files
    for f in ("barcodes.report", "barcodes.xls", "barcodes.html"):
        self.assertTrue(os.path.isfile(os.path.join(analysis_dir, f)),
                        "Missing file: %s" % f)
    # Check that the report content is non-trivial
    barcodes_report = os.path.join(analysis_dir, "barcodes.report")
    with open(barcodes_report, 'rt') as fp:
        contents = fp.read()
    for expected in (
            "Barcode analysis for lane #1",
            "#Rank\tIndex\tSample\tN_seqs\tN_reads\t%reads\t(%Total_reads)"):
        self.assertTrue(expected in contents)
    # Expect 12 lines of content in total
    self.assertEqual(contents.count('\n'), 12)
def test_analyse_barcodes_with_multi_lane_samplesheet(self):
    """
    AnalyseBarcodes: multi-lane sample sheet as input

    The pipeline is instantiated with a two-lane sample sheet
    before the bcl2fastq directory exists; the directory is then
    created and supplied when the pipeline is run.
    """
    run_dir = os.path.join(self.wd,
                           "200428_M00879_0087_000000000-AGEW9")
    analysis_dir = os.path.join(self.wd, "barcode_analysis")
    # Write the sample sheet
    sample_sheet = os.path.join(self.wd, "custom_SampleSheet.csv")
    with open(sample_sheet, 'w') as fp:
        fp.write("""[Data]
Lane,Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,I5_Index_ID,index2,Sample_Project,Description
1,AB1,AB1,,,D701,CGTGTAGG,D501,GACCTGAA,AB,
1,AB2,AB2,,,D702,CGTGTAGG,D501,ATGTAACT,AB,
2,CDE3,CDE3,,,D701,GACCTGAA,D501,CGTGTAGG,CDE,
2,CDE4,CDE4,,,D702,ATGTAACT,D501,CGTGTAGG,CDE,
""")
    # Instantiate the pipeline while the bcl2fastq directory
    # is still absent
    pipeline = AnalyseBarcodes(sample_sheet=sample_sheet)
    # Now build the mock bcl2fastq outputs, assigning each
    # sample to its lane(s)
    mock_data = MockIlluminaData(run_dir,
                                 "bcl2fastq2",
                                 unaligned_dir="bcl2fastq",
                                 paired_end=True)
    for project, sample, prefix, sample_lanes in (
            ("AB", "AB1", "AB1_S1", (1, )),
            ("AB", "AB2", "AB2_S2", (1, )),
            ("CDE", "CDE3", "CDE3_S3", (2, )),
            ("CDE", "CDE4", "CDE4_S4", (2, )),
            ("", "Undetermined", "Undetermined_S0", (1, 2))):
        mock_data.add_fastq_batch(project, sample, prefix,
                                  lanes=sample_lanes)
    mock_data.create()
    # Populate the Fastq files with reads
    self._insert_fastq_reads(run_dir)
    # Execute the pipeline
    status = pipeline.run(analysis_dir,
                          bcl2fastq_dir=os.path.join(run_dir, "bcl2fastq"),
                          working_dir=self.wd,
                          poll_interval=POLL_INTERVAL)
    # Verify exit status and output directories
    self.assertEqual(status, 0)
    self.assertTrue(os.path.isdir(analysis_dir),
                    "Missing dir: barcode_analysis")
    self.assertTrue(os.path.isdir(os.path.join(analysis_dir, "counts")),
                    "Missing dir: barcode_analysis/counts")
    # Verify the per-Fastq counts files
    for f in ("AB.AB1_S1_L001_R1_001.fastq.gz.counts",
              "AB.AB2_S2_L001_R1_001.fastq.gz.counts",
              "CDE.CDE3_S3_L002_R1_001.fastq.gz.counts",
              "CDE.CDE4_S4_L002_R1_001.fastq.gz.counts",
              "__undetermined__.Undetermined_S0_L001_R1_001.fastq.gz.counts"):
        counts_file = os.path.join(analysis_dir, "counts", f)
        self.assertTrue(os.path.isfile(counts_file),
                        "Missing file: %s" % f)
    # Verify the report files
    for f in ("barcodes.report", "barcodes.xls", "barcodes.html"):
        self.assertTrue(os.path.isfile(os.path.join(analysis_dir, f)),
                        "Missing file: %s" % f)
    # Check that the report content is non-trivial
    barcodes_report = os.path.join(analysis_dir, "barcodes.report")
    with open(barcodes_report, 'rt') as fp:
        contents = fp.read()
    for expected in (
            "Barcode analysis for lane #1",
            "Barcode analysis for lane #2",
            "#Rank\tIndex\tSample\tN_seqs\tN_reads\t%reads\t(%Total_reads)",
            "Problems detected:\n * Underrepresented samples",
            " 1\tTCCTGA\t\t1\t2\t100.0%\t(100.0%)",
            "The following samples are underrepresented:"):
        self.assertTrue(expected in contents)
    # Each sample should be listed as underrepresented
    for line in ("AB1\tCGTGTAGG+GACCTGAA\t\t<0.1%",
                 "AB2\tCGTGTAGG+ATGTAACT\t\t<0.1%",
                 "CDE3\tGACCTGAA+CGTGTAGG\t\t<0.1%",
                 "CDE4\tATGTAACT+CGTGTAGG\t\t<0.1%"):
        self.assertTrue(line in contents)
    # Expect at least 12 lines of content in total
    self.assertTrue(contents.count('\n') >= 12)
def test_analyse_barcodes_with_bcl2fastq_dir_and_samplesheet_empty_index(
        self):
    """
    AnalyseBarcodes: bcl2fastq directory as input (with samplesheet, empty index)

    Uses a sample sheet holding a single sample which has no
    index sequences, and checks the exit code, the output files
    and the report contents.
    """
    run_dir = os.path.join(self.wd,
                           "200428_M00879_0087_000000000-AGEW9")
    analysis_dir = os.path.join(self.wd, "barcode_analysis")
    # Build the mock bcl2fastq outputs
    mock_data = MockIlluminaData(run_dir,
                                 "bcl2fastq2",
                                 unaligned_dir="bcl2fastq",
                                 paired_end=True)
    mock_data.add_fastq_batch("AB", "AB1", "AB1_S1")
    mock_data.create()
    # Populate the Fastq files with reads
    self._insert_fastq_reads(run_dir)
    # Write a sample sheet with a single empty index
    sample_sheet = os.path.join(self.wd, "custom_SampleSheet.csv")
    with open(sample_sheet, 'w') as fp:
        fp.write("""[Data]
Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,I5_Index_ID,index2,Sample_Project,Description
AB1,AB1,,,,,,,AB,
""")
    # Configure the pipeline and execute it
    pipeline = AnalyseBarcodes(
        bcl2fastq_dir=os.path.join(run_dir, "bcl2fastq"))
    status = pipeline.run(analysis_dir,
                          sample_sheet=sample_sheet,
                          working_dir=self.wd,
                          poll_interval=POLL_INTERVAL)
    # Verify exit status and output directories
    self.assertEqual(status, 0)
    self.assertTrue(os.path.isdir(analysis_dir),
                    "Missing dir: barcode_analysis")
    self.assertTrue(os.path.isdir(os.path.join(analysis_dir, "counts")),
                    "Missing dir: barcode_analysis/counts")
    # Verify the counts file for the single sample
    counts_file = os.path.join(analysis_dir,
                               "counts",
                               "AB.AB1_S1_L001_R1_001.fastq.gz.counts")
    self.assertTrue(os.path.isfile(counts_file),
                    "Missing file: AB.AB1_S1_L001_R1_001.fastq.gz.counts")
    # Verify the report files
    for f in ("barcodes.report", "barcodes.xls", "barcodes.html"):
        self.assertTrue(os.path.isfile(os.path.join(analysis_dir, f)),
                        "Missing file: %s" % f)
    # Check that the report content is non-trivial
    barcodes_report = os.path.join(analysis_dir, "barcodes.report")
    with open(barcodes_report, 'rt') as fp:
        contents = fp.read()
    for expected in (
            "Barcode analysis for lane #1",
            "#Rank\tIndex\tSample\tN_seqs\tN_reads\t%reads\t(%Total_reads)",
            "Problems detected:\n * Underrepresented samples",
            " 1\tTCCTGA\t\t1\t1\t100.0%\t(100.0%)",
            "The following samples are underrepresented:",
            "AB1\t\t\t<0.1%"):
        self.assertTrue(expected in contents)
    # Expect at least 12 lines of content in total
    self.assertTrue(contents.count('\n') >= 12)