예제 #1
0
    def test_analyse_barcodes_with_bcl2fastq_dir_and_bad_samplesheet(self):
        """
        AnalyseBarcodes: raise exception for bcl2fastq directory as input using 'bad' samplesheet
        """
        # Make a mock bcl2fastq output directory
        datadir = MockIlluminaData(os.path.join(
            self.wd, "200428_M00879_0087_000000000-AGEW9"),
                                   "bcl2fastq2",
                                   unaligned_dir="bcl2fastq",
                                   paired_end=True)
        datadir.add_fastq_batch("AB", "AB1", "AB1_S1")
        datadir.add_fastq_batch("AB", "AB2", "AB2_S2")
        datadir.add_fastq_batch("CDE", "CDE3", "CDE3_S3")
        datadir.add_fastq_batch("CDE", "CDE4", "CDE4_S4")
        datadir.add_fastq_batch("", "Undetermined", "Undetermined_S0")
        datadir.create()
        # Add data to Fastq files
        self._insert_fastq_reads(
            os.path.join(self.wd, "200428_M00879_0087_000000000-AGEW9"))
        # Create "bad" sample sheet with mixture of empty and
        # non-empty indices in a lane
        sample_sheet = os.path.join(self.wd, "custom_SampleSheet.csv")
        with open(sample_sheet, 'w') as fp:
            fp.write("""[Data]
Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,I5_Index_ID,index2,Sample_Project,Description
AB1,AB1,,,D701,CGTGTAGG,D501,GACCTGAA,AB,
AB2,AB2,,,D702,CGTGTAGG,D501,ATGTAACT,AB,
CDE3,CDE3,,,,,,,CDE,
CDE4,CDE4,,,,,,,CDE,
""")
        # Set up and run pipeline
        p = AnalyseBarcodes(bcl2fastq_dir=os.path.join(
            self.wd, "200428_M00879_0087_000000000-AGEW9", "bcl2fastq"))
        self.assertRaises(Exception,
                          AnalyseBarcodes.run,
                          os.path.join(self.wd, "barcode_analysis"),
                          sample_sheet=sample_sheet,
                          working_dir=self.wd,
                          poll_interval=POLL_INTERVAL)
예제 #2
0
    def main(self, args):
        """
        Internal: provides mock bcl2fastq2 functionality
        """
        # Build generic header
        header = """BCL to FASTQ file converter
bcl2fastq v2.17.1.14
Copyright (c) 2007-2015 Illumina, Inc.

2015-12-17 14:08:00 [7fa113f3f780] Command-line invocation: bcl2fastq %s""" \
    % ' '.join(args)
        # Handle version request
        if "--version" in args:
            print header
            return self._exit_code
        # Deal with arguments
        p = argparse.ArgumentParser()
        p.add_argument("--runfolder-dir", action="store")
        p.add_argument("--output-dir", action="store")
        p.add_argument("--sample-sheet", action="store")
        p.add_argument("--use-bases-mask", action="store")
        p.add_argument("--barcode-mismatches", action="store")
        p.add_argument("--minimum-trimmed-read-length", action="store")
        p.add_argument("--mask-short-adapter-reads", action="store")
        p.add_argument("--ignore-missing-bcls", action="store_true")
        p.add_argument("--no-lane-splitting", action="store_true")
        p.add_argument("-r", action="store")
        p.add_argument("-d", action="store")
        p.add_argument("-p", action="store")
        p.add_argument("-w", action="store")
        args = p.parse_args(args)
        # Check bases mask
        if self._assert_bases_mask:
            print "Checking bases mask: %s" % args.use_bases_mask
            assert (args.use_bases_mask == self._assert_bases_mask)
        # Platform
        print "Platform (default): %s" % self._platform
        # Run folder (input data)
        runfolder = args.runfolder_dir
        print "Runfolder dir: %s" % runfolder
        if runfolder is None:
            return 1
        run_info_xml = os.path.join(runfolder, "RunInfo.xml")
        if not os.path.exists(run_info_xml):
            return 1
        # Determine if run is paired end
        nreads = 0
        for r in IlluminaRunInfo(run_info_xml).reads:
            if r['is_indexed_read'] == 'N':
                nreads += 1
        if nreads == 2:
            paired_end = True
        else:
            paired_end = False
        print "Paired-end: %s" % paired_end
        # Lanes
        lanes = IlluminaRun(runfolder, platform=self._platform).lanes
        print "Lanes: %s" % lanes
        # Output folder
        output_dir = args.output_dir
        if output_dir is None:
            output_dir = "bcl2fastq"
        print "Output dir: %s" % output_dir
        # Sample sheet
        sample_sheet = args.sample_sheet
        if sample_sheet is None:
            for d in (runfolder,
                      os.path.join(runfolder, "Data", "Intensities",
                                   "BaseCalls")):
                sample_sheet = os.path.join(d, "SampleSheet.csv")
                if os.path.exists(sample_sheet):
                    break
                sample_sheet = None
        print "Sample sheet: %s" % sample_sheet
        # Modifiers
        no_lane_splitting = bool(args.no_lane_splitting)
        print "No lane splitting: %s" % no_lane_splitting
        # Generate mock output based on inputs
        tmpname = "tmp.%s" % uuid.uuid4()
        output = MockIlluminaData(name=tmpname,
                                  package="bcl2fastq2",
                                  unaligned_dir="bcl2fastq")
        missing_fastqs = self._missing_fastqs
        # Add outputs from sample sheet (if supplied)
        if sample_sheet is not None:
            s = SampleSheetPredictor(sample_sheet_file=sample_sheet)
            s.set(paired_end=paired_end,
                  no_lane_splitting=no_lane_splitting,
                  lanes=lanes)
            for project in s.projects:
                print "Adding project: %s" % project.name
                for sample in project.samples:
                    for fq in sample.fastqs():
                        if missing_fastqs and (fq in missing_fastqs):
                            continue
                        if sample.sample_name is None:
                            sample_name = sample.sample_id
                        else:
                            sample_name = sample.sample_name
                        output.add_fastq(project.name, sample_name, fq)
        # Add undetermined fastqs
        # NB Would like to use the 'add_undetermined'
        # method but this doesn't play well with using
        # the predictor-based approach above
        if paired_end:
            reads = (1, 2)
        else:
            reads = (1, )
        if no_lane_splitting:
            lanes = None
        for r in reads:
            if lanes is None:
                output.add_fastq("Undetermined_indices", "undetermined",
                                 "Undetermined_S0_R%d_001.fastq.gz" % r)
            else:
                for lane in lanes:
                    output.add_fastq(
                        "Undetermined_indices", "undetermined",
                        "Undetermined_S0_L%03d_R%d_001.fastq.gz" % (lane, r))
        # Build the output directory
        output.create()
        # Move to final location
        os.rename(os.path.join(tmpname, "bcl2fastq"), output_dir)
        shutil.rmtree(tmpname)
        return self._exit_code
예제 #3
0
 def test_analyse_barcodes_with_bcl2fastq_dir_no_samplesheet(self):
     """
     AnalyseBarcodes: bcl2fastq directory as input (no samplesheet)

     Runs the pipeline against a mock bcl2fastq2 output directory
     without supplying a sample sheet, then checks the exit code,
     the output files and the content of the text report.
     """
     # Locations used throughout the test
     run_dir = os.path.join(self.wd, "200428_M00879_0087_000000000-AGEW9")
     analysis_dir = os.path.join(self.wd, "barcode_analysis")
     counts_dir = os.path.join(analysis_dir, "counts")
     # Build the mock bcl2fastq output directory
     datadir = MockIlluminaData(run_dir,
                                "bcl2fastq2",
                                unaligned_dir="bcl2fastq",
                                paired_end=True)
     for project, sample, prefix in (
             ("AB", "AB1", "AB1_S1"),
             ("AB", "AB2", "AB2_S2"),
             ("CDE", "CDE3", "CDE3_S3"),
             ("CDE", "CDE4", "CDE4_S4"),
             ("", "Undetermined", "Undetermined_S0"),
     ):
         datadir.add_fastq_batch(project, sample, prefix)
     datadir.create()
     # Populate the Fastq files with reads
     self._insert_fastq_reads(run_dir)
     # Create and run the pipeline
     p = AnalyseBarcodes(bcl2fastq_dir=os.path.join(run_dir, "bcl2fastq"))
     exit_code = p.run(analysis_dir,
                       working_dir=self.wd,
                       poll_interval=POLL_INTERVAL)
     # Pipeline should have finished without error
     self.assertEqual(exit_code, 0)
     # Output directories
     self.assertTrue(os.path.isdir(analysis_dir),
                     "Missing dir: barcode_analysis")
     self.assertTrue(os.path.isdir(counts_dir),
                     "Missing dir: barcode_analysis/counts")
     # Counts files
     expected_counts = (
         "AB.AB1_S1_L001_R1_001.fastq.gz.counts",
         "AB.AB2_S2_L001_R1_001.fastq.gz.counts",
         "CDE.CDE3_S3_L001_R1_001.fastq.gz.counts",
         "CDE.CDE4_S4_L001_R1_001.fastq.gz.counts",
         "__undetermined__.Undetermined_S0_L001_R1_001.fastq.gz.counts",
     )
     for name in expected_counts:
         self.assertTrue(os.path.isfile(os.path.join(counts_dir, name)),
                         "Missing file: %s" % name)
     # Report files
     for name in ("barcodes.report", "barcodes.xls", "barcodes.html"):
         self.assertTrue(os.path.isfile(os.path.join(analysis_dir, name)),
                         "Missing file: %s" % name)
     # Text report should have non-trivial content
     with open(os.path.join(analysis_dir, "barcodes.report"), 'rt') as fp:
         contents = fp.read()
     for text in (
             "Barcode analysis for lane #1",
             "#Rank\tIndex\tSample\tN_seqs\tN_reads\t%reads\t(%Total_reads)",
     ):
         self.assertTrue(text in contents)
     # Report is expected to be exactly 12 lines long
     self.assertEqual(contents.count('\n'), 12)
예제 #4
0
    def test_analyse_barcodes_with_samplesheet_and_10x_indices(self):
        """
        AnalyseBarcodes: sample sheet with 10xGenomics indices
        """
        # Create sample sheet
        sample_sheet = os.path.join(self.wd, "custom_SampleSheet.csv")
        with open(sample_sheet, 'w') as fp:
            fp.write("""[Data]
Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,Sample_Project,Description
AB1,AB1,,,D501,SI-GA-A2,AB,
AB2,AB2,,,D501,SI-GA-B2,AB,
CDE3,CDE3,,,D501,SI-GA-C2,CDE,
CDE4,CDE4,,,D501,SI-GA-D2,CDE,
""")
        # Set up pipeline before bcl2fastq directory exists
        p = AnalyseBarcodes(sample_sheet=sample_sheet)
        # Create the bcl2fastq directory before running pipeline
        datadir = MockIlluminaData(os.path.join(
            self.wd, "200428_M00879_0087_000000000-AGEW9"),
                                   "bcl2fastq2",
                                   unaligned_dir="bcl2fastq",
                                   paired_end=True)
        datadir.add_fastq_batch("AB", "AB1", "AB1_S1")
        datadir.add_fastq_batch("AB", "AB2", "AB2_S2")
        datadir.add_fastq_batch("CDE", "CDE3", "CDE3_S3")
        datadir.add_fastq_batch("CDE", "CDE4", "CDE4_S4")
        datadir.add_fastq_batch("", "Undetermined", "Undetermined_S0")
        datadir.create()
        # Add data to Fastq files
        self._insert_fastq_reads(
            os.path.join(self.wd, "200428_M00879_0087_000000000-AGEW9"))
        # Run the pipeline
        exit_code = p.run(os.path.join(self.wd, "barcode_analysis"),
                          bcl2fastq_dir=os.path.join(
                              self.wd, "200428_M00879_0087_000000000-AGEW9",
                              "bcl2fastq"),
                          working_dir=self.wd,
                          poll_interval=POLL_INTERVAL)
        # Check outputs
        self.assertEqual(exit_code, 0)
        self.assertTrue(
            os.path.isdir(os.path.join(self.wd, "barcode_analysis")),
            "Missing dir: barcode_analysis")
        self.assertTrue(
            os.path.isdir(os.path.join(self.wd, "barcode_analysis", "counts")),
            "Missing dir: barcode_analysis/counts")
        for f in (
                "AB.AB1_S1_L001_R1_001.fastq.gz.counts",
                "AB.AB2_S2_L001_R1_001.fastq.gz.counts",
                "CDE.CDE3_S3_L001_R1_001.fastq.gz.counts",
                "CDE.CDE4_S4_L001_R1_001.fastq.gz.counts",
                "__undetermined__.Undetermined_S0_L001_R1_001.fastq.gz.counts"
        ):
            self.assertTrue(
                os.path.isfile(
                    os.path.join(self.wd, "barcode_analysis", "counts", f)),
                "Missing file: %s" % f)
        self.assertTrue(
            os.path.isfile(
                os.path.join(self.wd, "barcode_analysis", "barcodes.report")),
            "Missing file: barcodes.report")
        self.assertTrue(
            os.path.isfile(
                os.path.join(self.wd, "barcode_analysis", "barcodes.xls")),
            "Missing file: barcodes.xls")
        self.assertTrue(
            os.path.isfile(
                os.path.join(self.wd, "barcode_analysis", "barcodes.html")),
            "Missing file: barcodes.html")
        # Check that the report content is non-trivial
        barcodes_report = os.path.join(self.wd, "barcode_analysis",
                                       "barcodes.report")
        with open(barcodes_report, 'rt') as fp:
            contents = fp.read()
            self.assertTrue("Barcode analysis for lane #1" in contents)
            self.assertTrue(
                "#Rank\tIndex\tSample\tN_seqs\tN_reads\t%reads\t(%Total_reads)"
                in contents)
            # Expect 12 lines of content in total
            self.assertEqual(contents.count('\n'), 12)
예제 #5
0
    def test_analyse_barcodes_with_multi_lane_samplesheet(self):
        """
        AnalyseBarcodes: multi-lane sample sheet as input
        """
        # Create sample sheet
        sample_sheet = os.path.join(self.wd, "custom_SampleSheet.csv")
        with open(sample_sheet, 'w') as fp:
            fp.write("""[Data]
Lane,Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,I5_Index_ID,index2,Sample_Project,Description
1,AB1,AB1,,,D701,CGTGTAGG,D501,GACCTGAA,AB,
1,AB2,AB2,,,D702,CGTGTAGG,D501,ATGTAACT,AB,
2,CDE3,CDE3,,,D701,GACCTGAA,D501,CGTGTAGG,CDE,
2,CDE4,CDE4,,,D702,ATGTAACT,D501,CGTGTAGG,CDE,
""")
        # Set up pipeline before bcl2fastq directory exists
        p = AnalyseBarcodes(sample_sheet=sample_sheet)
        # Create the bcl2fastq directory before running pipeline
        datadir = MockIlluminaData(os.path.join(
            self.wd, "200428_M00879_0087_000000000-AGEW9"),
                                   "bcl2fastq2",
                                   unaligned_dir="bcl2fastq",
                                   paired_end=True)
        datadir.add_fastq_batch("AB", "AB1", "AB1_S1", lanes=(1, ))
        datadir.add_fastq_batch("AB", "AB2", "AB2_S2", lanes=(1, ))
        datadir.add_fastq_batch("CDE", "CDE3", "CDE3_S3", lanes=(2, ))
        datadir.add_fastq_batch("CDE", "CDE4", "CDE4_S4", lanes=(2, ))
        datadir.add_fastq_batch("",
                                "Undetermined",
                                "Undetermined_S0",
                                lanes=(1, 2))
        datadir.create()
        # Add data to Fastq files
        self._insert_fastq_reads(
            os.path.join(self.wd, "200428_M00879_0087_000000000-AGEW9"))
        # Run the pipeline
        exit_code = p.run(os.path.join(self.wd, "barcode_analysis"),
                          bcl2fastq_dir=os.path.join(
                              self.wd, "200428_M00879_0087_000000000-AGEW9",
                              "bcl2fastq"),
                          working_dir=self.wd,
                          poll_interval=POLL_INTERVAL)
        # Check outputs
        self.assertEqual(exit_code, 0)
        self.assertTrue(
            os.path.isdir(os.path.join(self.wd, "barcode_analysis")),
            "Missing dir: barcode_analysis")
        self.assertTrue(
            os.path.isdir(os.path.join(self.wd, "barcode_analysis", "counts")),
            "Missing dir: barcode_analysis/counts")
        for f in (
                "AB.AB1_S1_L001_R1_001.fastq.gz.counts",
                "AB.AB2_S2_L001_R1_001.fastq.gz.counts",
                "CDE.CDE3_S3_L002_R1_001.fastq.gz.counts",
                "CDE.CDE4_S4_L002_R1_001.fastq.gz.counts",
                "__undetermined__.Undetermined_S0_L001_R1_001.fastq.gz.counts"
        ):
            self.assertTrue(
                os.path.isfile(
                    os.path.join(self.wd, "barcode_analysis", "counts", f)),
                "Missing file: %s" % f)
        self.assertTrue(
            os.path.isfile(
                os.path.join(self.wd, "barcode_analysis", "barcodes.report")),
            "Missing file: barcodes.report")
        self.assertTrue(
            os.path.isfile(
                os.path.join(self.wd, "barcode_analysis", "barcodes.xls")),
            "Missing file: barcodes.xls")
        self.assertTrue(
            os.path.isfile(
                os.path.join(self.wd, "barcode_analysis", "barcodes.html")),
            "Missing file: barcodes.html")
        # Check that the report content is non-trivial
        barcodes_report = os.path.join(self.wd, "barcode_analysis",
                                       "barcodes.report")
        with open(barcodes_report, 'rt') as fp:
            contents = fp.read()
            self.assertTrue("Barcode analysis for lane #1" in contents)
            self.assertTrue("Barcode analysis for lane #2" in contents)
            self.assertTrue(
                "#Rank\tIndex\tSample\tN_seqs\tN_reads\t%reads\t(%Total_reads)"
                in contents)
            self.assertTrue(
                "Problems detected:\n * Underrepresented samples" in contents)
            self.assertTrue(
                "   1\tTCCTGA\t\t1\t2\t100.0%\t(100.0%)" in contents)
            self.assertTrue(
                "The following samples are underrepresented:" in contents)
            for line in (
                    "AB1\tCGTGTAGG+GACCTGAA\t\t<0.1%",
                    "AB2\tCGTGTAGG+ATGTAACT\t\t<0.1%",
                    "CDE3\tGACCTGAA+CGTGTAGG\t\t<0.1%",
                    "CDE4\tATGTAACT+CGTGTAGG\t\t<0.1%",
            ):
                self.assertTrue(line in contents)
            # Expect at least 12 lines of content in total
            self.assertTrue(contents.count('\n') >= 12)
예제 #6
0
    def test_analyse_barcodes_with_bcl2fastq_dir_and_samplesheet_empty_index(
            self):
        """
        AnalyseBarcodes: bcl2fastq directory as input (with samplesheet, empty index)
        """
        # Make a mock bcl2fastq output directory
        datadir = MockIlluminaData(os.path.join(
            self.wd, "200428_M00879_0087_000000000-AGEW9"),
                                   "bcl2fastq2",
                                   unaligned_dir="bcl2fastq",
                                   paired_end=True)
        datadir.add_fastq_batch("AB", "AB1", "AB1_S1")
        datadir.create()
        # Add data to Fastq files
        self._insert_fastq_reads(
            os.path.join(self.wd, "200428_M00879_0087_000000000-AGEW9"))
        # Create sample sheet with single empty index
        sample_sheet = os.path.join(self.wd, "custom_SampleSheet.csv")
        with open(sample_sheet, 'w') as fp:
            fp.write("""[Data]
Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,I5_Index_ID,index2,Sample_Project,Description
AB1,AB1,,,,,,,AB,
""")
        # Set up and run pipeline
        p = AnalyseBarcodes(bcl2fastq_dir=os.path.join(
            self.wd, "200428_M00879_0087_000000000-AGEW9", "bcl2fastq"))
        exit_code = p.run(os.path.join(self.wd, "barcode_analysis"),
                          sample_sheet=sample_sheet,
                          working_dir=self.wd,
                          poll_interval=POLL_INTERVAL)
        # Check outputs
        self.assertEqual(exit_code, 0)
        self.assertTrue(
            os.path.isdir(os.path.join(self.wd, "barcode_analysis")),
            "Missing dir: barcode_analysis")
        self.assertTrue(
            os.path.isdir(os.path.join(self.wd, "barcode_analysis", "counts")),
            "Missing dir: barcode_analysis/counts")
        self.assertTrue(
            os.path.isfile(
                os.path.join(self.wd, "barcode_analysis", "counts",
                             "AB.AB1_S1_L001_R1_001.fastq.gz.counts")),
            "Missing file: AB.AB1_S1_L001_R1_001.fastq.gz.counts")
        self.assertTrue(
            os.path.isfile(
                os.path.join(self.wd, "barcode_analysis", "barcodes.report")),
            "Missing file: barcodes.report")
        self.assertTrue(
            os.path.isfile(
                os.path.join(self.wd, "barcode_analysis", "barcodes.xls")),
            "Missing file: barcodes.xls")
        self.assertTrue(
            os.path.isfile(
                os.path.join(self.wd, "barcode_analysis", "barcodes.html")),
            "Missing file: barcodes.html")
        # Check that the report content is non-trivial
        barcodes_report = os.path.join(self.wd, "barcode_analysis",
                                       "barcodes.report")
        with open(barcodes_report, 'rt') as fp:
            contents = fp.read()
            self.assertTrue("Barcode analysis for lane #1" in contents)
            self.assertTrue(
                "#Rank\tIndex\tSample\tN_seqs\tN_reads\t%reads\t(%Total_reads)"
                in contents)
            self.assertTrue(
                "Problems detected:\n * Underrepresented samples" in contents)
            self.assertTrue(
                "   1\tTCCTGA\t\t1\t1\t100.0%\t(100.0%)" in contents)
            self.assertTrue(
                "The following samples are underrepresented:" in contents)
            self.assertTrue("AB1\t\t\t<0.1%" in contents)
            # Expect at least 12 lines of content in total
            self.assertTrue(contents.count('\n') >= 12)