def __init__(self,
             run_name,
             platform,
             unaligned_dir='bcl2fastq',
             fmt='bcl2fastq2',
             paired_end=True,
             no_lane_splitting=False,
             no_undetermined=False,
             top_dir=None,
             metadata=None,
             readme=None):
    """
    Set up a mock-up of an analysis directory

    Arguments:
      run_name (str): name for the run; the basename is
        stored as 'run_name' and the full value is used
        to build the '<run_name>_analysis' directory name
      platform (str): platform name (stored lowercased)
      unaligned_dir (str): passed to MockIlluminaData
        (default: 'bcl2fastq')
      fmt (str): passed to MockIlluminaData as the output
        format (default: 'bcl2fastq2')
      paired_end (bool): passed to MockIlluminaData
        (default: True)
      no_lane_splitting (bool): passed to MockIlluminaData
        (default: False)
      no_undetermined (bool): if True then the
        'add_undetermined' method is not invoked
        (default: False, i.e. it is invoked)
      top_dir (str): parent directory (default: current
        working directory)
      metadata (dict): optional items to merge into the
        default 'run_name' and 'platform' metadata
      readme (str): optional text stored as 'readme'
    """
    # Basic properties of the run
    self.readme = readme
    self.run_name = os.path.basename(str(run_name))
    self.platform = str(platform).lower()
    # Default metadata items, then any supplied by the caller
    # (caller-supplied values take precedence)
    self.metadata = dict(run_name=self.run_name,
                         platform=self.platform)
    if metadata is not None:
        self.metadata.update({key: metadata[key] for key in metadata})
    # Name and location of the mock analysis directory
    analysis_name = "%s_analysis" % run_name
    parent_dir = os.getcwd() if top_dir is None else top_dir
    # Initialise the base class
    MockIlluminaData.__init__(self,
                              analysis_name,
                              fmt,
                              unaligned_dir=unaligned_dir,
                              paired_end=paired_end,
                              no_lane_splitting=no_lane_splitting,
                              top_dir=parent_dir)
    # Include undetermined outputs unless explicitly turned off
    if no_undetermined:
        return
    self.add_undetermined()
Exemple #2
0
    def test_analyse_barcodes_with_bcl2fastq_dir_and_bad_samplesheet(self):
        """
        AnalyseBarcodes: raise exception for bcl2fastq directory as input using 'bad' samplesheet
        """
        # Make a mock bcl2fastq output directory
        datadir = MockIlluminaData(os.path.join(
            self.wd, "200428_M00879_0087_000000000-AGEW9"),
                                   "bcl2fastq2",
                                   unaligned_dir="bcl2fastq",
                                   paired_end=True)
        datadir.add_fastq_batch("AB", "AB1", "AB1_S1")
        datadir.add_fastq_batch("AB", "AB2", "AB2_S2")
        datadir.add_fastq_batch("CDE", "CDE3", "CDE3_S3")
        datadir.add_fastq_batch("CDE", "CDE4", "CDE4_S4")
        datadir.add_fastq_batch("", "Undetermined", "Undetermined_S0")
        datadir.create()
        # Add data to Fastq files
        self._insert_fastq_reads(
            os.path.join(self.wd, "200428_M00879_0087_000000000-AGEW9"))
        # Create "bad" sample sheet with mixture of empty and
        # non-empty indices in a lane
        sample_sheet = os.path.join(self.wd, "custom_SampleSheet.csv")
        with open(sample_sheet, 'w') as fp:
            fp.write("""[Data]
Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,I5_Index_ID,index2,Sample_Project,Description
AB1,AB1,,,D701,CGTGTAGG,D501,GACCTGAA,AB,
AB2,AB2,,,D702,CGTGTAGG,D501,ATGTAACT,AB,
CDE3,CDE3,,,,,,,CDE,
CDE4,CDE4,,,,,,,CDE,
""")
        # Set up and run pipeline
        p = AnalyseBarcodes(bcl2fastq_dir=os.path.join(
            self.wd, "200428_M00879_0087_000000000-AGEW9", "bcl2fastq"))
        self.assertRaises(Exception,
                          AnalyseBarcodes.run,
                          os.path.join(self.wd, "barcode_analysis"),
                          sample_sheet=sample_sheet,
                          working_dir=self.wd,
                          poll_interval=POLL_INTERVAL)
Exemple #3
0
 def __init__(self,
              run_name,
              platform,
              unaligned_dir='bcl2fastq',
              fmt='bcl2fastq2',
              paired_end=True,
              lanes=None,
              no_lane_splitting=False,
              no_undetermined=False,
              top_dir=None,
              metadata=None,
              readme=None):
     """
     Set up a mock-up of an analysis directory

     Arguments:
       run_name (str): name for the run; the basename is
         stored as 'run_name' and the full value is used
         to build the '<run_name>_analysis' directory name
       platform (str): platform name (stored lowercased)
       unaligned_dir (str): passed to MockIlluminaData
         (default: 'bcl2fastq')
       fmt (str): passed to MockIlluminaData as the output
         format (default: 'bcl2fastq2')
       paired_end (bool): passed to MockIlluminaData
         (default: True)
       lanes (list): passed to 'add_undetermined' when
         undetermined outputs are added (default: None)
       no_lane_splitting (bool): passed to MockIlluminaData
         (default: False)
       no_undetermined (bool): if True then the
         'add_undetermined' method is not invoked
         (default: False, i.e. it is invoked)
       top_dir (str): parent directory (default: current
         working directory)
       metadata (dict): optional items to merge into the
         default 'run_name' and 'platform' metadata
       readme (str): optional text stored as 'readme'
     """
     # Basic properties of the run
     self.readme = readme
     self.run_name = os.path.basename(str(run_name))
     self.platform = str(platform).lower()
     # Default metadata items, then any supplied by the caller
     # (caller-supplied values take precedence)
     self.metadata = dict(run_name=self.run_name,
                          platform=self.platform)
     if metadata is not None:
         self.metadata.update({key: metadata[key] for key in metadata})
     # Name and location of the mock analysis directory
     analysis_name = "%s_analysis" % run_name
     parent_dir = os.getcwd() if top_dir is None else top_dir
     # Initialise the base class
     MockIlluminaData.__init__(self,
                               analysis_name,
                               fmt,
                               unaligned_dir=unaligned_dir,
                               paired_end=paired_end,
                               no_lane_splitting=no_lane_splitting,
                               top_dir=parent_dir)
     # Include undetermined outputs unless explicitly turned off
     if no_undetermined:
         return
     self.add_undetermined(lanes=lanes)
    def create(self):
        """
        Build and populate the directory structure

        Creates the directory structure on disk which has been defined
        within the MockAnalysisDir object.

        In addition to the content created by 'MockIlluminaData.create',
        the following are written to the top-level directory:

        - metadata.info (one tab-delimited line per metadata item)
        - auto_process.info
        - README (only if 'readme' was set)
        - SampleSheet.orig.csv (empty)
        - custom_SampleSheet.csv (one line per sample, excluding
          'undetermined')
        - projects.info (one line per project, excluding
          'undetermined')
        - ScriptCode and logs subdirectories
        - a MockAnalysisProject directory for each project

        Invoke the 'remove' method to delete the directory structure.

        The contents of the MockAnalysisDir object can be modified
        after the directory structure has been created, but changes will
        not be reflected on disk. Instead it is necessary to first
        remove the directory structure, and then re-invoke the create
        method.

        create raises an OSError exception if any part of the directory
        structure already exists.

        Returns:
          String: the value of the 'dirn' attribute.

        """
        MockIlluminaData.create(self)
        # Add metadata file (left empty if there is no metadata)
        with open(os.path.join(self.dirn,'metadata.info'),'w') as fp:
            if self.metadata is not None:
                for item in self.metadata:
                    fp.write("%s\t%s\n" % (item,self.metadata[item]))
        # Add auto_process.info file
        with open(os.path.join(self.dirn,'auto_process.info'),'w') as fp:
            fp.write("analysis_dir\t%s\n" % os.path.basename(self.dirn))
            fp.write("bases_mask\ty76,I8,I8,y76\n")
            fp.write("data_dir\t/mnt/data/%s\n" % self.run_name)
            fp.write("per_lane_stats_file\tper_lane_statistics.info\n")
            fp.write("primary_data_dir\t%s/primary_data/%s\n" % (self.dirn,
                                                                 self.run_name))
            fp.write("project_metadata\tprojects.info\n")
            fp.write("sample_sheet\t%s/custom_SampleSheet.csv\n" % self.dirn)
            fp.write("stats_file\tstatistics.info\n")
            fp.write("unaligned_dir\tbcl2fastq\n")
        # Add top-level README file
        if self.readme is not None:
            with open(os.path.join(self.dirn,'README'),'w') as fp:
                fp.write(self.readme)
        # Add empty original sample sheet
        with open(os.path.join(self.dirn,'SampleSheet.orig.csv'),'w'):
            pass
        # Initialise a custom_SampleSheet.csv
        custom_sample_sheet = os.path.join(self.dirn,'custom_SampleSheet.csv')
        with open(custom_sample_sheet,'w') as fp:
            fp.write('[Data]\n')
            fp.write('Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,Sample_Project,Description\n')
        # Add top-level ScriptCode directory
        os.mkdir(os.path.join(self.dirn,'ScriptCode'))
        # Add top-level logs directory
        os.mkdir(os.path.join(self.dirn,'logs'))
        # Add project dirs
        # (projects.info is opened in a 'with' block so that it is
        # flushed and closed even if creating a project fails)
        with open(os.path.join(self.dirn,'projects.info'),'w') \
             as projects_info:
            projects_info.write('#%s\n' % '\t'.join(('Project',
                                                     'Samples',
                                                     'User',
                                                     'Library',
                                                     'Organism',
                                                     'PI',
                                                     'Comments')))
            for project in self.projects:
                if project.startswith("Undetermined"):
                    project_name = 'undetermined'
                else:
                    project_name = project
                project_dir = MockAnalysisProject(project_name)
                sample_names = []
                for sample in self.samples_in_project(project):
                    sample_names.append(sample)
                    for fq in self.fastqs_in_sample(project,sample):
                        project_dir.add_fastq(fq)
                # Undetermined reads are not listed in projects.info
                # or in the sample sheet
                if project_name != 'undetermined':
                    # Add line to projects.info
                    projects_info.write(
                        '%s\n' % '\t'.join((project,
                                            ','.join(sample_names),
                                            '.',
                                            '.',
                                            '.',
                                            '.',
                                            '.')))
                    # Add lines to custom_SampleSheet
                    with open(custom_sample_sheet,'a') as fp:
                        for sample in sample_names:
                            fp.write('%s,,,,,,%s,\n' % (sample,
                                                        project_name))
                # Write the project directory to disk
                project_dir.create(top_dir=self.dirn)
        # Finished
        return self.dirn
Exemple #5
0
    def main(self, args):
        """
        Internal: provides mock bcl2fastq2 functionality
        """
        # Build generic header
        header = """BCL to FASTQ file converter
bcl2fastq v2.17.1.14
Copyright (c) 2007-2015 Illumina, Inc.

2015-12-17 14:08:00 [7fa113f3f780] Command-line invocation: bcl2fastq %s""" \
    % ' '.join(args)
        # Handle version request
        if "--version" in args:
            print header
            return self._exit_code
        # Deal with arguments
        p = argparse.ArgumentParser()
        p.add_argument("--runfolder-dir", action="store")
        p.add_argument("--output-dir", action="store")
        p.add_argument("--sample-sheet", action="store")
        p.add_argument("--use-bases-mask", action="store")
        p.add_argument("--barcode-mismatches", action="store")
        p.add_argument("--minimum-trimmed-read-length", action="store")
        p.add_argument("--mask-short-adapter-reads", action="store")
        p.add_argument("--ignore-missing-bcls", action="store_true")
        p.add_argument("--no-lane-splitting", action="store_true")
        p.add_argument("-r", action="store")
        p.add_argument("-d", action="store")
        p.add_argument("-p", action="store")
        p.add_argument("-w", action="store")
        args = p.parse_args(args)
        # Check bases mask
        if self._assert_bases_mask:
            print "Checking bases mask: %s" % args.use_bases_mask
            assert (args.use_bases_mask == self._assert_bases_mask)
        # Platform
        print "Platform (default): %s" % self._platform
        # Run folder (input data)
        runfolder = args.runfolder_dir
        print "Runfolder dir: %s" % runfolder
        if runfolder is None:
            return 1
        run_info_xml = os.path.join(runfolder, "RunInfo.xml")
        if not os.path.exists(run_info_xml):
            return 1
        # Determine if run is paired end
        nreads = 0
        for r in IlluminaRunInfo(run_info_xml).reads:
            if r['is_indexed_read'] == 'N':
                nreads += 1
        if nreads == 2:
            paired_end = True
        else:
            paired_end = False
        print "Paired-end: %s" % paired_end
        # Lanes
        lanes = IlluminaRun(runfolder, platform=self._platform).lanes
        print "Lanes: %s" % lanes
        # Output folder
        output_dir = args.output_dir
        if output_dir is None:
            output_dir = "bcl2fastq"
        print "Output dir: %s" % output_dir
        # Sample sheet
        sample_sheet = args.sample_sheet
        if sample_sheet is None:
            for d in (runfolder,
                      os.path.join(runfolder, "Data", "Intensities",
                                   "BaseCalls")):
                sample_sheet = os.path.join(d, "SampleSheet.csv")
                if os.path.exists(sample_sheet):
                    break
                sample_sheet = None
        print "Sample sheet: %s" % sample_sheet
        # Modifiers
        no_lane_splitting = bool(args.no_lane_splitting)
        print "No lane splitting: %s" % no_lane_splitting
        # Generate mock output based on inputs
        tmpname = "tmp.%s" % uuid.uuid4()
        output = MockIlluminaData(name=tmpname,
                                  package="bcl2fastq2",
                                  unaligned_dir="bcl2fastq")
        missing_fastqs = self._missing_fastqs
        # Add outputs from sample sheet (if supplied)
        if sample_sheet is not None:
            s = SampleSheetPredictor(sample_sheet_file=sample_sheet)
            s.set(paired_end=paired_end,
                  no_lane_splitting=no_lane_splitting,
                  lanes=lanes)
            for project in s.projects:
                print "Adding project: %s" % project.name
                for sample in project.samples:
                    for fq in sample.fastqs():
                        if missing_fastqs and (fq in missing_fastqs):
                            continue
                        if sample.sample_name is None:
                            sample_name = sample.sample_id
                        else:
                            sample_name = sample.sample_name
                        output.add_fastq(project.name, sample_name, fq)
        # Add undetermined fastqs
        # NB Would like to use the 'add_undetermined'
        # method but this doesn't play well with using
        # the predictor-based approach above
        if paired_end:
            reads = (1, 2)
        else:
            reads = (1, )
        if no_lane_splitting:
            lanes = None
        for r in reads:
            if lanes is None:
                output.add_fastq("Undetermined_indices", "undetermined",
                                 "Undetermined_S0_R%d_001.fastq.gz" % r)
            else:
                for lane in lanes:
                    output.add_fastq(
                        "Undetermined_indices", "undetermined",
                        "Undetermined_S0_L%03d_R%d_001.fastq.gz" % (lane, r))
        # Build the output directory
        output.create()
        # Move to final location
        os.rename(os.path.join(tmpname, "bcl2fastq"), output_dir)
        shutil.rmtree(tmpname)
        return self._exit_code
Exemple #6
0
    def create(self, no_project_dirs=False):
        """
        Build and populate the directory structure

        Creates the directory structure on disk which has been defined
        within the MockAnalysisDir object.

        In addition to the content created by 'MockIlluminaData.create',
        the following are written to the top-level directory:

        - metadata.info (one tab-delimited line per metadata item)
        - auto_process.info
        - README (only if 'readme' was set)
        - SampleSheet.orig.csv (empty)
        - custom_SampleSheet.csv (one line per sample, excluding
          'undetermined')
        - projects.info (one line per project, excluding
          'undetermined')
        - ScriptCode and logs subdirectories
        - a MockAnalysisProject directory for each project (unless
          'no_project_dirs' is set)

        Invoke the 'remove' method to delete the directory structure.

        The contents of the MockAnalysisDir object can be modified
        after the directory structure has been created, but changes will
        not be reflected on disk. Instead it is necessary to first
        remove the directory structure, and then re-invoke the create
        method.

        'create' raises an OSError exception if any part of the
        directory structure already exists.

        Arguments:
          no_project_dirs (bool): if True then don't create
            analysis project subdirectories (default: False,
            i.e. the project subdirectories are created)

        Returns:
          String: the value of the 'dirn' attribute.
        """
        MockIlluminaData.create(self)
        # Add metadata file (left empty if there is no metadata)
        with open(os.path.join(self.dirn, 'metadata.info'), 'w') as fp:
            if self.metadata is not None:
                for item in self.metadata:
                    fp.write("%s\t%s\n" % (item, self.metadata[item]))
        # Add auto_process.info file
        with open(os.path.join(self.dirn, 'auto_process.info'), 'w') as fp:
            fp.write("analysis_dir\t%s\n" % os.path.basename(self.dirn))
            fp.write("bases_mask\ty76,I8,I8,y76\n")
            fp.write("data_dir\t/mnt/data/%s\n" % self.run_name)
            fp.write("per_lane_stats_file\tper_lane_statistics.info\n")
            fp.write("primary_data_dir\t%s/primary_data/%s\n" %
                     (self.dirn, self.run_name))
            fp.write("project_metadata\tprojects.info\n")
            fp.write("sample_sheet\t%s/custom_SampleSheet.csv\n" % self.dirn)
            fp.write("stats_file\tstatistics.info\n")
            fp.write("unaligned_dir\tbcl2fastq\n")
        # Add top-level README file
        if self.readme is not None:
            with open(os.path.join(self.dirn, 'README'), 'w') as fp:
                fp.write(self.readme)
        # Add empty original sample sheet
        with open(os.path.join(self.dirn, 'SampleSheet.orig.csv'), 'w'):
            pass
        # Initialise a custom_SampleSheet.csv
        custom_sample_sheet = os.path.join(self.dirn,
                                           'custom_SampleSheet.csv')
        with open(custom_sample_sheet, 'w') as fp:
            fp.write('[Data]\n')
            fp.write(
                'Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,Sample_Project,Description\n'
            )
        # Add top-level ScriptCode directory
        os.mkdir(os.path.join(self.dirn, 'ScriptCode'))
        # Add top-level logs directory
        os.mkdir(os.path.join(self.dirn, 'logs'))
        # Add project dirs
        # (projects.info is opened in a 'with' block so that it is
        # flushed and closed even if creating a project fails)
        with open(os.path.join(self.dirn, 'projects.info'),
                  'w') as projects_info:
            projects_info.write('#%s\n' % '\t'.join(
                ('Project', 'Samples', 'User', 'Library', 'Organism', 'PI',
                 'Comments')))
            for project in self.projects:
                if project.startswith("Undetermined"):
                    project_name = 'undetermined'
                else:
                    project_name = project
                # Use project-specific metadata if any was supplied
                try:
                    project_metadata = self.project_metadata[project_name]
                except KeyError:
                    project_metadata = dict()
                project_dir = MockAnalysisProject(project_name,
                                                  metadata=project_metadata)
                sample_names = []
                for sample in self.samples_in_project(project):
                    sample_names.append(sample)
                    for fq in self.fastqs_in_sample(project, sample):
                        project_dir.add_fastq(fq)
                # Undetermined reads are not listed in projects.info
                # or in the sample sheet
                if project_name != 'undetermined':
                    # Add line to projects.info
                    projects_info.write('%s\n' % '\t'.join(
                        (project, ','.join(sample_names), '.', '.', '.',
                         '.', '.')))
                    # Add lines to custom_SampleSheet
                    with open(custom_sample_sheet, 'a') as fp:
                        for sample in sample_names:
                            fp.write('%s,,,,,,%s,\n' %
                                     (sample, project_name))
                # Write the project directory to disk
                if not no_project_dirs:
                    project_dir.create(top_dir=self.dirn)
        # Finished
        return self.dirn
Exemple #7
0
    def __init__(self,
                 run_name,
                 platform,
                 unaligned_dir='bcl2fastq',
                 fmt='bcl2fastq2',
                 paired_end=True,
                 lanes=None,
                 no_lane_splitting=False,
                 no_undetermined=False,
                 top_dir=None,
                 metadata=None,
                 readme=None,
                 project_metadata=None):
        """
        Create a mock-up of an analysis directory

        Arguments:
          run_name (str): name for the run e.g.
            '1130904_PJB_XXXXX'; the basename is
            stored as 'run_name' and the full value
            is used to build the name of the
            '<run_name>_analysis' directory
          platform (str): name for the platform
            e.g. 'nextseq' (stored lowercased)
          unaligned_dir (str): name for the bcl2fastq
            output directory (default: 'bcl2fastq')
          fmt (str): format of the outputs, passed
            to MockIlluminaData (default is
            'bcl2fastq2')
          paired_end (bool): whether run should be
            paired end (set True, default) or single
            end (set False)
          lanes (list): if not None then specify a
            list of lane numbers, passed to the
            'add_undetermined' method
          no_lane_splitting (bool): whether to
            mimic the '--no-lane-splitting' option
            of bcl2fastq2 in generating Fastq names
            (default: False)
          no_undetermined (bool): if True then don't
            add 'undetermined' outputs (default:
            False, i.e. 'undetermined' outputs are
            added)
          top_dir (str): set parent directory to
            make the mock analysis directory in
            (default: current working directory)
          metadata (dict): if set then should be
            a dictionary of metadata items with
            corresponding values; these are merged
            into the default 'run_name' and
            'platform' items
          readme (str): if set then stored as the
            'readme' attribute
          project_metadata (dict): if set then should
            be a dictionary where keys are names of
            projects and values are dictionaries of
            metadata items for that project; stored
            as the 'project_metadata' attribute
        """
        # Basic properties of the run
        self.readme = readme
        self.run_name = os.path.basename(str(run_name))
        self.platform = str(platform).lower()
        # Default metadata items, then any supplied by the caller
        # (caller-supplied values take precedence)
        self.metadata = dict(run_name=self.run_name,
                             platform=self.platform)
        if metadata is not None:
            self.metadata.update({key: metadata[key] for key in metadata})
        # Per-project metadata (shallow copy of the supplied mapping)
        self.project_metadata = dict()
        if project_metadata is not None:
            self.project_metadata.update(
                {name: project_metadata[name] for name in project_metadata})
        # Name and location of the mock analysis directory
        analysis_name = "%s_analysis" % run_name
        parent_dir = os.getcwd() if top_dir is None else top_dir
        # Initialise the base class
        MockIlluminaData.__init__(self,
                                  analysis_name,
                                  fmt,
                                  unaligned_dir=unaligned_dir,
                                  paired_end=paired_end,
                                  no_lane_splitting=no_lane_splitting,
                                  top_dir=parent_dir)
        # Include undetermined outputs unless explicitly turned off
        if no_undetermined:
            return
        self.add_undetermined(lanes=lanes)
Exemple #8
0
 def test_analyse_barcodes_with_bcl2fastq_dir_no_samplesheet(self):
     """
     AnalyseBarcodes: bcl2fastq directory as input (no samplesheet)
     """
     # Locations used throughout the test
     run_dir = os.path.join(self.wd, "200428_M00879_0087_000000000-AGEW9")
     analysis_dir = os.path.join(self.wd, "barcode_analysis")
     counts_dir = os.path.join(analysis_dir, "counts")
     # Build the mock bcl2fastq output directory
     mock_run = MockIlluminaData(run_dir,
                                 "bcl2fastq2",
                                 unaligned_dir="bcl2fastq",
                                 paired_end=True)
     for project, sample, prefix in (
             ("AB", "AB1", "AB1_S1"),
             ("AB", "AB2", "AB2_S2"),
             ("CDE", "CDE3", "CDE3_S3"),
             ("CDE", "CDE4", "CDE4_S4"),
             ("", "Undetermined", "Undetermined_S0"),
     ):
         mock_run.add_fastq_batch(project, sample, prefix)
     mock_run.create()
     # Populate the Fastq files with reads
     self._insert_fastq_reads(run_dir)
     # Configure the pipeline with the bcl2fastq directory and run it
     pipeline = AnalyseBarcodes(
         bcl2fastq_dir=os.path.join(run_dir, "bcl2fastq"))
     exit_code = pipeline.run(analysis_dir,
                              working_dir=self.wd,
                              poll_interval=POLL_INTERVAL)
     # Pipeline should have completed successfully
     self.assertEqual(exit_code, 0)
     # Output directories should exist
     self.assertTrue(os.path.isdir(analysis_dir),
                     "Missing dir: barcode_analysis")
     self.assertTrue(os.path.isdir(counts_dir),
                     "Missing dir: barcode_analysis/counts")
     # One counts file expected for each R1 Fastq
     for f in (
             "AB.AB1_S1_L001_R1_001.fastq.gz.counts",
             "AB.AB2_S2_L001_R1_001.fastq.gz.counts",
             "CDE.CDE3_S3_L001_R1_001.fastq.gz.counts",
             "CDE.CDE4_S4_L001_R1_001.fastq.gz.counts",
             "__undetermined__.Undetermined_S0_L001_R1_001.fastq.gz.counts"
     ):
         self.assertTrue(os.path.isfile(os.path.join(counts_dir, f)),
                         "Missing file: %s" % f)
     # Report files expected in each of the output formats
     for report in ("barcodes.report", "barcodes.xls", "barcodes.html"):
         self.assertTrue(
             os.path.isfile(os.path.join(analysis_dir, report)),
             "Missing file: %s" % report)
     # Text report should have non-trivial content
     with open(os.path.join(analysis_dir, "barcodes.report"), 'rt') as fp:
         contents = fp.read()
     self.assertTrue("Barcode analysis for lane #1" in contents)
     self.assertTrue(
         "#Rank\tIndex\tSample\tN_seqs\tN_reads\t%reads\t(%Total_reads)"
         in contents)
     # Expect 12 lines of content in total
     self.assertEqual(contents.count('\n'), 12)
Exemple #9
0
    def test_analyse_barcodes_with_samplesheet_and_10x_indices(self):
        """
        AnalyseBarcodes: sample sheet with 10xGenomics indices
        """
        # Create sample sheet
        sample_sheet = os.path.join(self.wd, "custom_SampleSheet.csv")
        with open(sample_sheet, 'w') as fp:
            fp.write("""[Data]
Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,Sample_Project,Description
AB1,AB1,,,D501,SI-GA-A2,AB,
AB2,AB2,,,D501,SI-GA-B2,AB,
CDE3,CDE3,,,D501,SI-GA-C2,CDE,
CDE4,CDE4,,,D501,SI-GA-D2,CDE,
""")
        # Set up pipeline before bcl2fastq directory exists
        p = AnalyseBarcodes(sample_sheet=sample_sheet)
        # Create the bcl2fastq directory before running pipeline
        datadir = MockIlluminaData(os.path.join(
            self.wd, "200428_M00879_0087_000000000-AGEW9"),
                                   "bcl2fastq2",
                                   unaligned_dir="bcl2fastq",
                                   paired_end=True)
        datadir.add_fastq_batch("AB", "AB1", "AB1_S1")
        datadir.add_fastq_batch("AB", "AB2", "AB2_S2")
        datadir.add_fastq_batch("CDE", "CDE3", "CDE3_S3")
        datadir.add_fastq_batch("CDE", "CDE4", "CDE4_S4")
        datadir.add_fastq_batch("", "Undetermined", "Undetermined_S0")
        datadir.create()
        # Add data to Fastq files
        self._insert_fastq_reads(
            os.path.join(self.wd, "200428_M00879_0087_000000000-AGEW9"))
        # Run the pipeline
        exit_code = p.run(os.path.join(self.wd, "barcode_analysis"),
                          bcl2fastq_dir=os.path.join(
                              self.wd, "200428_M00879_0087_000000000-AGEW9",
                              "bcl2fastq"),
                          working_dir=self.wd,
                          poll_interval=POLL_INTERVAL)
        # Check outputs
        self.assertEqual(exit_code, 0)
        self.assertTrue(
            os.path.isdir(os.path.join(self.wd, "barcode_analysis")),
            "Missing dir: barcode_analysis")
        self.assertTrue(
            os.path.isdir(os.path.join(self.wd, "barcode_analysis", "counts")),
            "Missing dir: barcode_analysis/counts")
        for f in (
                "AB.AB1_S1_L001_R1_001.fastq.gz.counts",
                "AB.AB2_S2_L001_R1_001.fastq.gz.counts",
                "CDE.CDE3_S3_L001_R1_001.fastq.gz.counts",
                "CDE.CDE4_S4_L001_R1_001.fastq.gz.counts",
                "__undetermined__.Undetermined_S0_L001_R1_001.fastq.gz.counts"
        ):
            self.assertTrue(
                os.path.isfile(
                    os.path.join(self.wd, "barcode_analysis", "counts", f)),
                "Missing file: %s" % f)
        self.assertTrue(
            os.path.isfile(
                os.path.join(self.wd, "barcode_analysis", "barcodes.report")),
            "Missing file: barcodes.report")
        self.assertTrue(
            os.path.isfile(
                os.path.join(self.wd, "barcode_analysis", "barcodes.xls")),
            "Missing file: barcodes.xls")
        self.assertTrue(
            os.path.isfile(
                os.path.join(self.wd, "barcode_analysis", "barcodes.html")),
            "Missing file: barcodes.html")
        # Check that the report content is non-trivial
        barcodes_report = os.path.join(self.wd, "barcode_analysis",
                                       "barcodes.report")
        with open(barcodes_report, 'rt') as fp:
            contents = fp.read()
            self.assertTrue("Barcode analysis for lane #1" in contents)
            self.assertTrue(
                "#Rank\tIndex\tSample\tN_seqs\tN_reads\t%reads\t(%Total_reads)"
                in contents)
            # Expect 12 lines of content in total
            self.assertEqual(contents.count('\n'), 12)
Exemple #10
0
    def test_analyse_barcodes_with_multi_lane_samplesheet(self):
        """
        AnalyseBarcodes: multi-lane sample sheet as input
        """
        # Create sample sheet
        sample_sheet = os.path.join(self.wd, "custom_SampleSheet.csv")
        with open(sample_sheet, 'w') as fp:
            fp.write("""[Data]
Lane,Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,I5_Index_ID,index2,Sample_Project,Description
1,AB1,AB1,,,D701,CGTGTAGG,D501,GACCTGAA,AB,
1,AB2,AB2,,,D702,CGTGTAGG,D501,ATGTAACT,AB,
2,CDE3,CDE3,,,D701,GACCTGAA,D501,CGTGTAGG,CDE,
2,CDE4,CDE4,,,D702,ATGTAACT,D501,CGTGTAGG,CDE,
""")
        # Set up pipeline before bcl2fastq directory exists
        p = AnalyseBarcodes(sample_sheet=sample_sheet)
        # Create the bcl2fastq directory before running pipeline
        datadir = MockIlluminaData(os.path.join(
            self.wd, "200428_M00879_0087_000000000-AGEW9"),
                                   "bcl2fastq2",
                                   unaligned_dir="bcl2fastq",
                                   paired_end=True)
        datadir.add_fastq_batch("AB", "AB1", "AB1_S1", lanes=(1, ))
        datadir.add_fastq_batch("AB", "AB2", "AB2_S2", lanes=(1, ))
        datadir.add_fastq_batch("CDE", "CDE3", "CDE3_S3", lanes=(2, ))
        datadir.add_fastq_batch("CDE", "CDE4", "CDE4_S4", lanes=(2, ))
        datadir.add_fastq_batch("",
                                "Undetermined",
                                "Undetermined_S0",
                                lanes=(1, 2))
        datadir.create()
        # Add data to Fastq files
        self._insert_fastq_reads(
            os.path.join(self.wd, "200428_M00879_0087_000000000-AGEW9"))
        # Run the pipeline
        exit_code = p.run(os.path.join(self.wd, "barcode_analysis"),
                          bcl2fastq_dir=os.path.join(
                              self.wd, "200428_M00879_0087_000000000-AGEW9",
                              "bcl2fastq"),
                          working_dir=self.wd,
                          poll_interval=POLL_INTERVAL)
        # Check outputs
        self.assertEqual(exit_code, 0)
        self.assertTrue(
            os.path.isdir(os.path.join(self.wd, "barcode_analysis")),
            "Missing dir: barcode_analysis")
        self.assertTrue(
            os.path.isdir(os.path.join(self.wd, "barcode_analysis", "counts")),
            "Missing dir: barcode_analysis/counts")
        for f in (
                "AB.AB1_S1_L001_R1_001.fastq.gz.counts",
                "AB.AB2_S2_L001_R1_001.fastq.gz.counts",
                "CDE.CDE3_S3_L002_R1_001.fastq.gz.counts",
                "CDE.CDE4_S4_L002_R1_001.fastq.gz.counts",
                "__undetermined__.Undetermined_S0_L001_R1_001.fastq.gz.counts"
        ):
            self.assertTrue(
                os.path.isfile(
                    os.path.join(self.wd, "barcode_analysis", "counts", f)),
                "Missing file: %s" % f)
        self.assertTrue(
            os.path.isfile(
                os.path.join(self.wd, "barcode_analysis", "barcodes.report")),
            "Missing file: barcodes.report")
        self.assertTrue(
            os.path.isfile(
                os.path.join(self.wd, "barcode_analysis", "barcodes.xls")),
            "Missing file: barcodes.xls")
        self.assertTrue(
            os.path.isfile(
                os.path.join(self.wd, "barcode_analysis", "barcodes.html")),
            "Missing file: barcodes.html")
        # Check that the report content is non-trivial
        barcodes_report = os.path.join(self.wd, "barcode_analysis",
                                       "barcodes.report")
        with open(barcodes_report, 'rt') as fp:
            contents = fp.read()
            self.assertTrue("Barcode analysis for lane #1" in contents)
            self.assertTrue("Barcode analysis for lane #2" in contents)
            self.assertTrue(
                "#Rank\tIndex\tSample\tN_seqs\tN_reads\t%reads\t(%Total_reads)"
                in contents)
            self.assertTrue(
                "Problems detected:\n * Underrepresented samples" in contents)
            self.assertTrue(
                "   1\tTCCTGA\t\t1\t2\t100.0%\t(100.0%)" in contents)
            self.assertTrue(
                "The following samples are underrepresented:" in contents)
            for line in (
                    "AB1\tCGTGTAGG+GACCTGAA\t\t<0.1%",
                    "AB2\tCGTGTAGG+ATGTAACT\t\t<0.1%",
                    "CDE3\tGACCTGAA+CGTGTAGG\t\t<0.1%",
                    "CDE4\tATGTAACT+CGTGTAGG\t\t<0.1%",
            ):
                self.assertTrue(line in contents)
            # Expect at least 12 lines of content in total
            self.assertTrue(contents.count('\n') >= 12)
Exemple #11
0
    def test_analyse_barcodes_with_bcl2fastq_dir_and_samplesheet_empty_index(
            self):
        """
        AnalyseBarcodes: bcl2fastq directory as input (with samplesheet, empty index)

        Builds a mock bcl2fastq2 output directory holding one Fastq
        batch (project "AB", sample "AB1"), writes a sample sheet whose
        single data row leaves both index columns blank, runs the
        pipeline against them, and then checks the exit code, the
        presence of the output files and selected lines of the text
        report.
        """
        # Locations shared by the set-up and the checks below
        run_dir = os.path.join(self.wd, "200428_M00879_0087_000000000-AGEW9")
        analysis_dir = os.path.join(self.wd, "barcode_analysis")
        counts_dir = os.path.join(analysis_dir, "counts")
        # Mock bcl2fastq output directory with a single sample
        mock_run = MockIlluminaData(run_dir,
                                    "bcl2fastq2",
                                    unaligned_dir="bcl2fastq",
                                    paired_end=True)
        mock_run.add_fastq_batch("AB", "AB1", "AB1_S1")
        mock_run.create()
        # Populate the mock Fastq files with reads
        self._insert_fastq_reads(run_dir)
        # Sample sheet with one sample and no index sequences
        sample_sheet = os.path.join(self.wd, "custom_SampleSheet.csv")
        with open(sample_sheet, 'w') as sheet:
            sheet.write(
                "[Data]\n"
                "Sample_ID,Sample_Name,Sample_Plate,Sample_Well,"
                "I7_Index_ID,index,I5_Index_ID,index2,"
                "Sample_Project,Description\n"
                "AB1,AB1,,,,,,,AB,\n")
        # Build the pipeline and run it
        pipeline = AnalyseBarcodes(
            bcl2fastq_dir=os.path.join(run_dir, "bcl2fastq"))
        status = pipeline.run(analysis_dir,
                              sample_sheet=sample_sheet,
                              working_dir=self.wd,
                              poll_interval=POLL_INTERVAL)
        # Pipeline should finish successfully
        self.assertEqual(status, 0)
        # Output directories should exist
        self.assertTrue(os.path.isdir(analysis_dir),
                        "Missing dir: barcode_analysis")
        self.assertTrue(os.path.isdir(counts_dir),
                        "Missing dir: barcode_analysis/counts")
        # Output files should exist (checked in this order)
        expected_files = (
            (counts_dir, "AB.AB1_S1_L001_R1_001.fastq.gz.counts"),
            (analysis_dir, "barcodes.report"),
            (analysis_dir, "barcodes.xls"),
            (analysis_dir, "barcodes.html"),
        )
        for parent, filename in expected_files:
            self.assertTrue(os.path.isfile(os.path.join(parent, filename)),
                            "Missing file: %s" % filename)
        # Report should contain real content, not just a stub
        report_path = os.path.join(analysis_dir, "barcodes.report")
        with open(report_path, 'rt') as report:
            text = report.read()
        expected_fragments = (
            "Barcode analysis for lane #1",
            "#Rank\tIndex\tSample\tN_seqs\tN_reads\t%reads\t(%Total_reads)",
            "Problems detected:\n * Underrepresented samples",
            "   1\tTCCTGA\t\t1\t1\t100.0%\t(100.0%)",
            "The following samples are underrepresented:",
            "AB1\t\t\t<0.1%",
        )
        for fragment in expected_fragments:
            self.assertTrue(fragment in text)
        # Expect at least 12 lines of content in total
        self.assertTrue(text.count('\n') >= 12)