Example #1
def get_bases_mask(run_info_xml, sample_sheet_file=None):
    """
    Get bases mask string

    Generates initial bases mask based on data in RunInfo.xml (which
    says how many reads there are, how many cycles in each read, and
    which are index reads), and optionally updates this using the
    barcode information in the sample sheet file.

    Arguments:
      run_info_xml: name and path of RunInfo.xml file from the
        sequencing run
      sample_sheet_file: (optional) path to sample sheet file

    Returns:
      Bases mask string, e.g. 'y101,I6'.
    """
    # Get initial bases mask
    bases_mask = IlluminaData.IlluminaRunInfo(run_info_xml).bases_mask
    print "Bases mask: %s (from RunInfo.xml)" % bases_mask
    if sample_sheet_file is not None:
        # Update bases mask from sample sheet
        example_barcode = IlluminaData.samplesheet_index_sequence(
            IlluminaData.SampleSheet(sample_sheet_file).data[0])
        if example_barcode is None:
            example_barcode = ""
        if barcode_is_10xgenomics(example_barcode):
            print "Bases mask: barcode is 10xGenomics sample set ID"
        else:
            bases_mask = IlluminaData.fix_bases_mask(bases_mask,
                                                     example_barcode)
        print "Bases mask: %s (updated for barcode sequence '%s')" % \
            (bases_mask,example_barcode)
    return bases_mask
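A minimal usage sketch for get_bases_mask; the run directory paths and the example mask values are illustrative, and the IlluminaData module used above is assumed to be importable:

# Usage sketch for get_bases_mask (paths and example masks are illustrative)
run_info = "/data/runs/180112_M00123_0001_000000000-ABC12/RunInfo.xml"
sample_sheet = "/data/runs/180112_M00123_0001_000000000-ABC12/SampleSheet.csv"

# Bases mask derived from RunInfo.xml only, e.g. 'y101,I6,y101'
print(get_bases_mask(run_info))

# Bases mask adjusted for the barcode lengths in the sample sheet,
# e.g. 'y101,I8,y101' if the sample sheet uses 8-base indices
print(get_bases_mask(run_info, sample_sheet_file=sample_sheet))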
Example #2
 def setup(self):
     # Make output filenames
     report_file = os.path.join(self.args.barcode_analysis_dir,
                                'barcodes.report')
     xls_file = os.path.join(self.args.barcode_analysis_dir, 'barcodes.xls')
     html_file = os.path.join(self.args.barcode_analysis_dir,
                              'barcodes.html')
     # Remove existing copies, if found
     for filen in (report_file, xls_file, html_file):
         if os.path.exists(filen):
             os.remove(filen)
     # Build command to run the barcode analysis
     cmd = PipelineCommandWrapper(
         "Run analyse_barcodes.py to report barcodes",
         'analyse_barcodes.py', '--report', report_file, '--xls', xls_file,
         '--html', html_file)
     if self.args.sample_sheet:
         cmd.add_args('--sample-sheet', self.args.sample_sheet)
     if self.args.lanes:
         lanes = self.args.lanes
     elif self.args.sample_sheet:
         # Implicitly get lanes from sample sheet
         try:
             lanes = sorted(
                 set([
                     line['Lane'] for line in IlluminaData.SampleSheet(
                         self.args.sample_sheet).data
                 ]))
         except KeyError:
             # No lanes
             lanes = None
     else:
         lanes = None
     if lanes:
         cmd.add_args('--lanes', ','.join([str(l) for l in lanes]))
     if self.args.cutoff:
         cmd.add_args('--cutoff', self.args.cutoff)
     if self.args.mismatches:
         cmd.add_args('--mismatches', self.args.mismatches)
     if self.args.title:
         cmd.add_args('--title', self.args.title)
     cmd.add_args('-c')
     cmd.add_args(*self.args.counts_files)
     self.add_cmd(cmd)
     # Update the output parameters
     self.output.report_file.set(report_file)
     self.output.xls_file.set(xls_file)
     self.output.html_file.set(html_file)
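The implicit lane lookup in setup() can be exercised on its own; this sketch (the SampleSheet.csv path is hypothetical) shows how the sorted, de-duplicated lane list is derived, falling back to None when the sheet has no 'Lane' column:

# Standalone sketch of the implicit lane derivation used in setup()
# (the sample sheet path is hypothetical)
try:
    lanes = sorted(
        set([line['Lane']
             for line in IlluminaData.SampleSheet("SampleSheet.csv").data]))
except KeyError:
    # Sample sheet has no 'Lane' column
    lanes = None
if lanes:
    # This is the value passed to analyse_barcodes.py via --lanes
    print("--lanes %s" % ','.join([str(l) for l in lanes]))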
Example #3
def check_barcode_collisions(sample_sheet_file, nmismatches):
    """
    Check sample sheet for barcode collisions

    Check barcode index sequences within each lane (or across
    all samples, if no lane information is present) and find
    any pairs which differ in fewer bases than a threshold
    number, which is calculated as:

    2 times the number of mismatches, plus 1

    (as stated in the output from bcl2fastq v2).

    Pairs of barcodes which are too similar (i.e. which collide)
    are reported as a list of tuples, e.g.

    [('ATTCCT','ATTCCG'),...]

    Arguments:
      sample_sheet_file (str): path to a SampleSheet.csv file
        to analyse for barcode collisions
      nmismatches (int): maximum number of mismatches to allow

    Returns:
      List: list of pairs of colliding barcodes (with each pair
        wrapped in a tuple), or an empty list if no collisions
        were detected.

    """
    # Load the sample sheet data
    sample_sheet = IlluminaData.SampleSheet(sample_sheet_file)
    # List of index sequences (barcodes)
    barcodes = {}
    has_lanes = sample_sheet.has_lanes
    for line in sample_sheet:
        # Lane
        if has_lanes:
            lane = line['Lane']
        else:
            lane = 1
        # Index sequence
        try:
            # Try dual-indexed IEM4 format
            indx = "%s%s" % (line['index'].strip(), line['index2'].strip())
        except KeyError:
            # Try single indexed IEM4 (no index2)
            try:
                indx = line['index'].strip()
            except KeyError:
                # Try CASAVA format
                try:
                    indx = line['Index'].strip()
                except KeyError:
                    # No index columns
                    indx = ""
        # Explicitly set empty index to None
        if not indx:
            indx = None
        try:
            barcodes[lane].append(indx)
        except KeyError:
            barcodes[lane] = [
                indx,
            ]
    # Mismatch threshold
    mismatch_threshold = 2 * nmismatches + 1
    # Check for collisions
    collisions = []
    for lane in barcodes:
        for i, seq1 in enumerate(barcodes[lane][:-1]):
            for seq2 in barcodes[lane][i + 1:]:
                ndiff = 0
                for c1, c2 in zip(seq1, seq2):
                    if c1 != c2:
                        ndiff += 1
                if ndiff < mismatch_threshold:
                    collisions.append((seq1, seq2))
    return collisions
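A small worked example of the collision criterion used above: with nmismatches=1 the threshold is 2*1+1 = 3, so the docstring's example pair 'ATTCCT'/'ATTCCG', which differ at a single base, is reported as a collision.

# Worked example of the collision criterion (barcodes from the docstring)
nmismatches = 1
mismatch_threshold = 2 * nmismatches + 1   # = 3
seq1, seq2 = "ATTCCT", "ATTCCG"
ndiff = sum(1 for c1, c2 in zip(seq1, seq2) if c1 != c2)   # = 1
print(ndiff < mismatch_threshold)   # True: this pair collides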
Example #4
def make_custom_sample_sheet(input_sample_sheet,
                             output_sample_sheet=None,
                             lanes=None,
                             fmt=None):
    """
    Creates a corrected copy of a sample sheet file

    Creates and returns a SampleSheet object with a copy of the
    input sample sheet, with any illegal or duplicated names fixed.
    Optionally it can also: write the updated sample sheet data to a
    new file, switch the format, and include only a subset of lanes
    from the original file.

    Arguments:
      input_sample_sheet (str): name and path of the original sample
        sheet file
      output_sample_sheet (str): (optional) name and path to write
        updated sample sheet to, or `None`
      lanes (list): (optional) list of lane numbers to keep in the
        output sample sheet; if `None` then all lanes will be kept
        (the default), otherwise lanes will be dropped if they don't
        appear in the supplied list
      fmt (str): (optional) format for the output sample sheet,
        either 'CASAVA' or 'IEM'; if this is `None` then the format
        of the original file will be used

    Returns:
      SampleSheet object with the data for the corrected sample
      sheet.

    """
    # Load the sample sheet data
    sample_sheet = IlluminaData.SampleSheet(input_sample_sheet)
    # Determine the column names for this format
    if sample_sheet.format == 'CASAVA':
        sample_col = 'SampleID'
        project_col = 'SampleProject'
    elif sample_sheet.format == 'IEM':
        sample_col = 'Sample_ID'
        project_col = 'Sample_Project'
    else:
        raise Exception("Unknown sample sheet format: %s" %
                        sample_sheet.format)
    # Add project names if not supplied
    for line in sample_sheet:
        if not line[project_col]:
            line[project_col] = line[sample_col]
    # Fix other problems
    sample_sheet.fix_illegal_names()
    sample_sheet.fix_duplicated_names()
    # Select subset of lanes if requested
    if lanes is not None:
        logging.debug("Updating to include only specified lanes: %s" %
                      ','.join([str(l) for l in lanes]))
        i = 0
        while i < len(sample_sheet):
            line = sample_sheet[i]
            if line['Lane'] in lanes:
                logging.debug("Keeping %s" % line)
                i += 1
            else:
                del sample_sheet[i]
    # Write out new sample sheet
    if output_sample_sheet is not None:
        sample_sheet.write(output_sample_sheet, fmt=fmt)
    return sample_sheet
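A usage sketch for make_custom_sample_sheet (the file names are illustrative): write an IEM-format copy of a sample sheet keeping only lanes 1 and 2.

# Usage sketch for make_custom_sample_sheet (file names are illustrative)
fixed = make_custom_sample_sheet("SampleSheet.csv",
                                 output_sample_sheet="SampleSheet.lanes12.csv",
                                 lanes=[1, 2],
                                 fmt='IEM')
print("Kept %d lines" % len(fixed))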
                                 "IEM sample sheet to older format)")
 p.add_argument('sample_sheet',metavar="SAMPLE_SHEET",
                help="input sample sheet file")
 # Process command line
 args = p.parse_args()
 if args.miseq:
     logging.warning("--miseq option no longer necessary; "
                     "MiSEQ-style sample sheets are now converted "
                     "automatically")
 # Get input sample sheet file
 samplesheet = args.sample_sheet
 if not os.path.isfile(samplesheet):
     logging.error("sample sheet '%s': not found" % samplesheet)
     sys.exit(1)
 # Read in the sample sheet
 data = IlluminaData.SampleSheet(samplesheet)
 if data.format is None:
     logging.error("Unable to determine samplesheet format")
     sys.exit(1)
 print "Sample sheet format: %s" % data.format
 # Remove lanes
 if args.lanes is not None:
     if not data.has_lanes:
         logging.error("sample sheet doesn't define any lanes")
         sys.exit(1)
     lanes = parse_lanes(args.lanes)
     print "Keeping lanes %s, removing the rest" % \
         ','.join([str(x) for x in lanes])
     i = 0
     while i < len(data):
         line = data[i]
Example #6
def check_barcode_collisions(sample_sheet_file, nmismatches, use_index='all'):
    """
    Check sample sheet for barcode collisions

    Check barcode index sequences within each lane (or across
    all samples, if no lane information is present) and find
    any pairs which differ in fewer bases than a threshold
    number, which is calculated as:

    2 times the number of mismatches, plus 1

    (as stated in the output from bcl2fastq v2).

    Pairs of barcodes which are too similar (i.e. which collide)
    are reported as a list of tuples, e.g.

    [('ATTCCT','ATTCCG'),...]

    Arguments:
      sample_sheet_file (str): path to a SampleSheet.csv file
        to analyse for barcode collisions
      nmismatches (int): maximum number of mismatches to allow
      use_index (str): flag indicating how to treat index
        sequences: 'all' (the default) combines indexes into a
        single sequence before checking for collisions, '1' only
        checks index 1 (i7), and '2' only checks index 2 (i5)

    Returns:
      List: list of pairs of colliding barcodes (with each pair
        wrapped in a tuple), or an empty list if no collisions
        were detected.

    """
    # Load the sample sheet data
    sample_sheet = IlluminaData.SampleSheet(sample_sheet_file)
    # Convert index flag to string
    use_index = str(use_index)
    # List of index sequences (barcodes)
    barcodes = {}
    has_lanes = sample_sheet.has_lanes
    for line in sample_sheet:
        # Lane
        if has_lanes:
            lane = line['Lane']
        else:
            lane = 1
        # Extract i7 index sequence
        indx_i7 = None
        try:
            # IEM4 format
            indx_i7 = line['index'].strip()
        except KeyError:
            # CASAVA format
            try:
                indx_i7 = line['Index'].strip()
            except KeyError:
                pass
        # Extract i5 index sequence
        indx_i5 = None
        try:
            # IEM4 format
            indx_i5 = line['index2'].strip()
        except KeyError:
            # No i5 for CASAVA
            pass
        # Assemble index sequence to check for mismatches
        if use_index == "all":
            # Combine i5 and i7 into a single sequence
            indx = "%s%s" % (indx_i7 if indx_i7 else '',
                             indx_i5 if indx_i5 else '')
        elif use_index == "1":
            # Only use i7
            indx = indx_i7
        elif use_index == "2":
            # Only use i5
            indx = indx_i5
        else:
            # Undefined index type
            raise Exception("Unrecognised index: '%s'" % use_index)
        # Explicitly set empty index to None
        if not indx:
            indx = None
        try:
            barcodes[lane].append(indx)
        except KeyError:
            barcodes[lane] = [
                indx,
            ]
    # Mismatch threshold
    mismatch_threshold = 2 * nmismatches + 1
    # Check for collisions
    collisions = []
    for lane in barcodes:
        for i, seq1 in enumerate(barcodes[lane][:-1]):
            for seq2 in barcodes[lane][i + 1:]:
                ndiff = 0
                for c1, c2 in zip(seq1, seq2):
                    if c1 != c2:
                        ndiff += 1
                if ndiff < mismatch_threshold:
                    collisions.append((seq1, seq2))
    return collisions
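A usage sketch showing the use_index flag (the sample sheet path is illustrative): 'all' checks the combined i7+i5 sequence, '1' restricts the check to the i7 index and '2' to the i5 index.

# Usage sketch for check_barcode_collisions (sample sheet path is illustrative)
for use_index in ('all', '1', '2'):
    collisions = check_barcode_collisions("SampleSheet.csv", 1,
                                          use_index=use_index)
    for seq1, seq2 in collisions:
        print("Collision (index=%s): %s vs %s" % (use_index, seq1, seq2))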
Example #7
def make_fastqs(ap,
                protocol='standard',
                platform=None,
                unaligned_dir=None,
                sample_sheet=None,
                lanes=None,
                ignore_missing_bcl=False,
                ignore_missing_stats=False,
                skip_rsync=False,
                remove_primary_data=False,
                nprocessors=None,
                require_bcl2fastq_version=None,
                bases_mask=None,
                no_lane_splitting=None,
                minimum_trimmed_read_length=None,
                mask_short_adapter_reads=None,
                generate_stats=True,
                stats_file=None,
                per_lane_stats_file=None,
                analyse_barcodes=True,
                barcode_analysis_dir=None,
                skip_fastq_generation=False,
                only_fetch_primary_data=False,
                create_empty_fastqs=None,
                runner=None,
                cellranger_jobmode=None,
                cellranger_mempercore=None,
                cellranger_maxjobs=None,
                cellranger_jobinterval=None,
                cellranger_localcores=None,
                cellranger_localmem=None,
                cellranger_ignore_dual_index=False):
    """Create and summarise FASTQ files

    Wrapper for operations related to FASTQ file generation and analysis.
    The operations are typically:
 
    - get primary data (BCL files)
    - run bcl-to-fastq conversion
    - generate statistics

    If the number of processors and the job runner are not explicitly
    specified then these are taken from the settings for the bcl2fastq
    and the statistics generation steps, which may differ from each other.
    However, if either of these values is set explicitly then the same
    value will be used for both steps.

    Arguments:
      ap (AutoProcessor): autoprocessor pointing to the analysis
        directory to create Fastqs for
      protocol (str): if set then specifies the protocol to use
        for fastq generation, otherwise use the 'standard' bcl2fastq
        protocol
      platform (str): if set then specifies the sequencing platform
        (otherwise platform will be determined from the primary data)
      unaligned_dir (str): if set then use this as the output directory
        for bcl-to-fastq conversion. Default is 'bcl2fastq' (unless
        an alternative is already specified in the config file)
      sample_sheet (str): if set then use this as the input samplesheet
      lanes (list): (optional) specify a list of lane numbers to
        use in the processing; lanes not in the list will be excluded
        (default is to include all lanes)
      nprocessors (int): number of processors to run bclToFastq.py with
      ignore_missing_bcl (bool): if True then run bcl2fastq with
        --ignore-missing-bcl
      ignore_missing_stats (bool): if True then run bcl2fastq with
        --ignore-missing-stats
      skip_rsync (bool): if True then don't rsync primary data at the
        start of bcl2fastq conversion
      remove_primary_data (bool): if True then remove primary data at
        the end of bcl2fastq conversion (default is to keep it)
      generate_stats (bool): if True then (re)generate statistics file
        for fastqs
      analyse_barcodes (bool): if True then (re)analyse barcodes for
        fastqs
      require_bcl2fastq_version (str): (optional) specify bcl2fastq
        version to use. Should be a string of the form '1.8.4' or
        '>2.0'. Set to None to automatically determine required
        bcl2fastq version.
      bases_mask (str): if set then use this as an alternative bases
        mask setting
      no_lane_splitting (bool): if True then run bcl2fastq with
        --no-lane-splitting
      minimum_trimmed_read_length (int): if set then specify minimum
        length for reads after adapter trimming (shorter reads will
        be padded with Ns to make them long enough)
      mask_short_adapter_reads (int): if set then specify the minimum
        length of ACGT bases that must be present in a read after
        adapter trimming for it not to be masked completely
        with Ns.
      stats_file (str): if set then use this as the name of the output
        per-fastq stats file.
      per_lane_stats_file (str): if set then use this as the name of
        the output per-lane stats file.
      barcode_analysis_dir (str): if set then specifies path to the
        output directory for barcode analysis
      skip_fastq_generation (bool): if True then don't perform fastq
        generation
      only_fetch_primary_data (bool): if True then fetch primary data,
        don't do anything else
      create_empty_fastqs (bool): if True then create empty 'placeholder'
        fastq files for any fastqs which are missing after bcl2fastq
        has completed with zero exit status
      runner (JobRunner): (optional) specify a non-default job runner
        to use for fastq generation
      cellranger_jobmode (str): (optional) job mode to run cellranger in
        (10xGenomics Chromium SC data only)
      cellranger_mempercore (int): (optional) memory assumed per core
        (in Gb) (10xGenomics Chromium SC data only)
      cellranger_maxjobs (int): (optional) maximum number of concurrent
         jobs to run (10xGenomics Chromium SC data only)
      cellranger_jobinterval (int): (optional) how often jobs are
         submitted (in ms) (10xGenomics Chromium SC data only)
      cellranger_localcores (int): (optional) maximum number of cores
         cellranger can request in jobmode 'local' (10xGenomics Chromium
         SC data only)
      cellranger_localmem (int): (optional) maximum memory cellranger
         can request in jobmode 'local' (10xGenomics Chromium SC data
         only)
      cellranger_ignore_dual_index (bool): (optional) on a dual-indexed
         flowcell where the second index was not used for the 10x
         sample, ignore it (10xGenomics Chromium SC data only)
    """
    # Report protocol
    print "Protocol              : %s" % protocol
    if protocol not in MAKE_FASTQS_PROTOCOLS:
        raise Exception("Unknown protocol: '%s' (must be one of "
                        "%s)" % (protocol, ','.join([MAKE_FASTQS_PROTOCOLS])))
    # Unaligned dir
    if unaligned_dir is not None:
        ap.params['unaligned_dir'] = unaligned_dir
    elif ap.params['unaligned_dir'] is None:
        ap.params['unaligned_dir'] = 'bcl2fastq'
    print "Output dir            : %s" % ap.params.unaligned_dir
    # Sample sheet
    if sample_sheet is None:
        sample_sheet = ap.params.sample_sheet
    if not os.path.isabs(sample_sheet):
        sample_sheet = os.path.join(ap.analysis_dir, sample_sheet)
    if not os.path.isfile(sample_sheet):
        raise Exception("Missing sample sheet '%s'" % sample_sheet)
    ap.params['sample_sheet'] = sample_sheet
    print "Source sample sheet   : %s" % ap.params.sample_sheet
    # Check requested lanes are actually present
    print "Lanes                 : %s" % ('all' if lanes is None else ','.join(
        [str(l) for l in lanes]))
    if lanes is not None:
        s = IlluminaData.SampleSheet(ap.params.sample_sheet)
        if not s.has_lanes:
            raise Exception("Requested subset of lanes but "
                            "samplesheet doesn't contain any "
                            "lane information")
        samplesheet_lanes = list(set([l['Lane'] for l in s]))
        for l in lanes:
            if l not in samplesheet_lanes:
                raise Exception("Requested lane '%d' not present "
                                "in samplesheet" % l)
    # Make a temporary sample sheet
    if lanes:
        lanes_id = ".L%s" % ''.join([str(l) for l in lanes])
    else:
        lanes_id = ""
    sample_sheet = os.path.join(
        ap.tmp_dir,
        "SampleSheet%s.%s.csv" % (lanes_id, time.strftime("%Y%m%d%H%M%S")))
    make_custom_sample_sheet(ap.params.sample_sheet, sample_sheet, lanes=lanes)
    # Check the temporary sample sheet
    print "Checking temporary sample sheet"
    invalid_barcodes = SampleSheetLinter(
        sample_sheet_file=sample_sheet).has_invalid_barcodes()
    if invalid_barcodes:
        logger.error("Invalid barcodes detected")
        for line in invalid_barcodes:
            logger.critical("%s" % line)
    invalid_characters = SampleSheetLinter(
        sample_sheet_file=sample_sheet).has_invalid_characters()
    if invalid_characters:
        logger.critical("Invalid non-printing/non-ASCII characters "
                        "detected")
    if invalid_barcodes or invalid_characters:
        raise Exception("Errors detected in generated sample sheet")
    # Adjust verification settings for 10xGenomics Chromium SC
    # data if necessary
    verify_include_sample_dir = False
    if has_chromium_sc_indices(sample_sheet):
        if protocol in (
                '10x_chromium_sc',
                '10x_chromium_sc_atac',
        ):
            # Force inclusion of sample-name subdirectories
            # when verifying Chromium SC data
            print "Sample sheet includes Chromium SC indices"
            verify_include_sample_dir = True
        else:
            # Chromium SC indices detected but not using
            # 10x_chromium_sc protocol
            raise Exception("Detected 10xGenomics Chromium SC indices "
                            "in generated sample sheet but protocol "
                            "'%s' has been specified; use an "
                            "appropriate '10x_...' protocol for these "
                            "indices" % protocol)
    # Check for pre-existing Fastq outputs
    if verify_fastq_generation(ap,
                               unaligned_dir=ap.params.unaligned_dir,
                               lanes=lanes,
                               include_sample_dir=verify_include_sample_dir):
        print "Expected Fastq outputs already present"
        skip_rsync = True
        skip_fastq_generation = True
    # Check if there's anything to do
    if (skip_rsync and skip_fastq_generation) and \
       not (generate_stats or analyse_barcodes):
        print "Nothing to do"
        return
    # Log dir
    log_dir = 'make_fastqs'
    if protocol != 'standard':
        log_dir += "_%s" % protocol
    if lanes:
        log_dir += "_L%s" % ''.join([str(l) for l in sorted(lanes)])
    ap.set_log_dir(ap.get_log_subdir(log_dir))
    # Fetch primary data
    if not skip_rsync and not ap.params.acquired_primary_data:
        if get_primary_data(ap) != 0:
            logger.error("Failed to acquire primary data")
            raise Exception("Failed to acquire primary data")
        else:
            ap.params['acquired_primary_data'] = True
    if only_fetch_primary_data:
        return
    # Deal with platform information
    if not platform:
        platform = ap.metadata.platform
    # Do fastq generation using the specified protocol
    if not skip_fastq_generation:
        # Set primary data location and report info
        primary_data_dir = os.path.join(ap.params.primary_data_dir,
                                        os.path.basename(ap.params.data_dir))
        print "Primary data dir      : %s" % primary_data_dir
        try:
            illumina_run = IlluminaData.IlluminaRun(primary_data_dir,
                                                    platform=platform)
        except IlluminaData.IlluminaDataPlatformError as ex:
            logger.critical("Error loading primary data: %s" % ex)
            if platform is None:
                logger.critical("Try specifying platform using --platform?")
            else:
                logger.critical("Check specified platform is valid (or "
                                "omit --platform")
            raise Exception("Error determining sequencer platform")
        print "Platform              : %s" % illumina_run.platform
        print "Bcl format            : %s" % illumina_run.bcl_extension
        # Set platform in metadata
        ap.metadata['platform'] = illumina_run.platform
        # Bases mask
        if bases_mask is not None:
            ap.params['bases_mask'] = bases_mask
        bases_mask = ap.params.bases_mask
        print "Bases mask setting    : %s" % bases_mask
        if protocol not in (
                '10x_chromium_sc',
                '10x_chromium_sc_atac',
        ):
            if bases_mask == "auto":
                print "Determining bases mask from RunInfo.xml"
                bases_mask = get_bases_mask(illumina_run.runinfo_xml,
                                            sample_sheet)
                if not bases_mask_is_valid(bases_mask):
                    raise Exception("Invalid bases mask: '%s'" % bases_mask)
        # Do fastq generation according to protocol
        if protocol == 'icell8':
            # ICell8 data
            # Update bcl2fastq settings appropriately
            print "Updating read trimming and masking for ICell8"
            minimum_trimmed_read_length = 21
            mask_short_adapter_reads = 0
            # Reset the default bases mask
            bases_mask = IlluminaData.IlluminaRunInfo(
                illumina_run.runinfo_xml).bases_mask
            bases_mask = get_icell8_bases_mask(bases_mask,
                                               sample_sheet=sample_sheet)
            if not bases_mask_is_valid(bases_mask):
                raise Exception("Invalid bases mask: '%s'" % bases_mask)
            # Switch to standard protocol
            protocol = 'standard'
        if protocol == 'standard':
            # Standard protocol
            try:
                exit_code = bcl_to_fastq(
                    ap,
                    unaligned_dir=ap.params.unaligned_dir,
                    sample_sheet=sample_sheet,
                    primary_data_dir=primary_data_dir,
                    require_bcl2fastq=require_bcl2fastq_version,
                    bases_mask=bases_mask,
                    ignore_missing_bcl=ignore_missing_bcl,
                    ignore_missing_stats=ignore_missing_stats,
                    no_lane_splitting=no_lane_splitting,
                    minimum_trimmed_read_length=minimum_trimmed_read_length,
                    mask_short_adapter_reads=mask_short_adapter_reads,
                    nprocessors=nprocessors,
                    runner=runner)
            except Exception as ex:
                raise Exception("Bcl2fastq stage failed: '%s'" % ex)
        elif protocol == '10x_chromium_sc':
            # 10xGenomics Chromium SC
            if bases_mask == 'auto':
                bases_mask = None
            try:
                # Check we have cellranger
                cellranger = find_program('cellranger')
                if not cellranger:
                    raise Exception("No cellranger package found")
                cellranger_software_info = cellranger_info(cellranger)
                print "Using cellranger %s: %s" % \
                    (cellranger_software_info[-1],
                     cellranger)
                # Check we have bcl2fastq
                bcl2fastq = find_program('bcl2fastq')
                if not bcl2fastq:
                    raise Exception("No bcl2fastq package found")
                bcl2fastq = available_bcl2fastq_versions(
                    paths=(os.path.dirname(bcl2fastq), ), reqs='>=2.17')
                if not bcl2fastq:
                    raise Exception("No appropriate bcl2fastq software "
                                    "located")
                bcl2fastq = bcl2fastq[0]
                bcl2fastq_info = bcl_to_fastq_info(bcl2fastq)
                print "Using bcl2fastq %s: %s" % (bcl2fastq_info[-1],
                                                  bcl2fastq)
                # Store info on bcl2fastq package
                ap.metadata['bcl2fastq_software'] = bcl2fastq_info
                # Store info on cellranger package
                ap.metadata['cellranger_software'] = cellranger_software_info
                # Put a copy of sample sheet in the log directory
                shutil.copy(sample_sheet, ap.log_dir)
                # Determine output directory absolute path
                output_dir = ap.params.unaligned_dir
                if not os.path.isabs(output_dir):
                    output_dir = os.path.join(ap.analysis_dir, output_dir)
                # Run cellranger mkfastq
                exit_code = run_cellranger_mkfastq(
                    sample_sheet=sample_sheet,
                    primary_data_dir=primary_data_dir,
                    output_dir=output_dir,
                    lanes=(None if lanes is None else ','.join(
                        [str(l) for l in lanes])),
                    bases_mask=bases_mask,
                    cellranger_exe=cellranger,
                    cellranger_jobmode=cellranger_jobmode,
                    cellranger_maxjobs=cellranger_maxjobs,
                    cellranger_mempercore=cellranger_mempercore,
                    cellranger_jobinterval=cellranger_jobinterval,
                    cellranger_localcores=cellranger_localcores,
                    cellranger_localmem=cellranger_localmem,
                    working_dir=ap.analysis_dir,
                    log_dir=ap.log_dir)
            except Exception as ex:
                raise Exception("'cellranger mkfastq' stage failed: "
                                "'%s'" % ex)
            # Turn off barcode analysis
            analyse_barcodes = False
        elif protocol == '10x_chromium_sc_atac':
            # 10xGenomics Chromium scATAC-seq
            exit_code = bcl_to_fastq_10x_chromium_sc_atac(
                ap,
                output_dir=ap.params.unaligned_dir,
                sample_sheet=sample_sheet,
                primary_data_dir=primary_data_dir,
                lanes=lanes,
                bases_mask=bases_mask,
                cellranger_jobmode=cellranger_jobmode,
                cellranger_maxjobs=cellranger_maxjobs,
                cellranger_mempercore=cellranger_mempercore,
                cellranger_jobinterval=cellranger_jobinterval,
                cellranger_localcores=cellranger_localcores,
                cellranger_localmem=cellranger_localmem,
                log_dir=ap.log_dir)
            # Turn off barcode analysis
            analyse_barcodes = False
        else:
            # Unknown protocol
            raise Exception("Unknown protocol '%s'" % protocol)
        # Check the outputs
        if exit_code != 0:
            raise Exception("Fastq generation finished with error: "
                            "exit code %d" % exit_code)
        if not verify_fastq_generation(
                ap, lanes=lanes, include_sample_dir=verify_include_sample_dir):
            # Check failed
            logger.error("Failed to verify output Fastqs against "
                         "sample sheet")
            # Try to load the data from unaligned dir
            try:
                illumina_data = IlluminaData.IlluminaData(
                    ap.analysis_dir, unaligned_dir=ap.params.unaligned_dir)
            except IlluminaData.IlluminaDataError as ex:
                raise Exception("Unable to load data from %s: %s" %
                                (ap.params.unaligned_dir, ex))
            # Generate a list of missing Fastqs
            missing_fastqs = IlluminaData.list_missing_fastqs(
                illumina_data,
                sample_sheet,
                include_sample_dir=verify_include_sample_dir)
            assert (len(missing_fastqs) > 0)
            missing_fastqs_file = os.path.join(ap.log_dir,
                                               "missing_fastqs.log")
            print "Writing list of missing Fastq files to %s" % \
                missing_fastqs_file
            with open(missing_fastqs_file, 'w') as fp:
                for fq in missing_fastqs:
                    fp.write("%s\n" % fq)
            # Create empty FASTQs
            if create_empty_fastqs is None:
                try:
                    create_empty_fastqs = \
                        ap.settings.platform[ap.metadata.platform].\
                        create_empty_fastqs
                except (KeyError, AttributeError):
                    pass
            if create_empty_fastqs is None:
                create_empty_fastqs = \
                    ap.settings.bcl2fastq.create_empty_fastqs
            if create_empty_fastqs:
                logger.warning("Making 'empty' placeholder Fastqs")
                for fq in missing_fastqs:
                    fastq = os.path.join(ap.analysis_dir,
                                         ap.params.unaligned_dir, fq)
                    print "-- %s" % fastq
                    if not os.path.exists(os.path.dirname(fastq)):
                        mkdirs(os.path.dirname(fastq))
                    with gzip.GzipFile(filename=fastq, mode='wb') as fp:
                        fp.write('')
            else:
                raise Exception("Fastq generation failed to produce "
                                "expected outputs")
    # Generate statistics
    if generate_stats:
        fastq_statistics(ap,
                         stats_file=stats_file,
                         per_lane_stats_file=per_lane_stats_file,
                         unaligned_dir=ap.params.unaligned_dir,
                         nprocessors=nprocessors,
                         runner=runner)
    # Run barcode analysis
    if analyse_barcodes:
        # Determine output directory
        if barcode_analysis_dir is not None:
            ap.params['barcode_analysis_dir'] = barcode_analysis_dir
        elif ap.params.barcode_analysis_dir is None:
            ap.params['barcode_analysis_dir'] = 'barcode_analysis'
        barcode_analysis_dir = ap.params.barcode_analysis_dir
        if not os.path.isabs(barcode_analysis_dir):
            barcode_analysis_dir = os.path.join(ap.params.analysis_dir,
                                                barcode_analysis_dir)
        # Report title
        title = "Barcode analysis for %s" % ap.metadata.run_name
        # Log file
        log_file = os.path.join(ap.log_dir, "analyse_barcodes.log")
        # Set up runner
        if runner is None:
            runner = ap.settings.general.default_runner
        runner.set_log_dir(ap.log_dir)
        # Get scheduler parameters
        max_jobs = ap.settings.general.max_concurrent_jobs
        poll_interval = ap.settings.general.poll_interval
        # Create and run barcode analysis pipeline
        barcode_analysis = AnalyseBarcodes(
            os.path.join(ap.params.analysis_dir, ap.params.unaligned_dir))
        barcode_analysis.run(barcode_analysis_dir,
                             title=title,
                             lanes=lanes,
                             sample_sheet=sample_sheet,
                             log_file=log_file,
                             runner=runner,
                             max_jobs=max_jobs,
                             poll_interval=poll_interval,
                             verbose=False)
    # Make a 'projects.info' metadata file
    if lanes:
        ap.update_project_metadata_file()
    else:
        ap.make_project_metadata_file()
    # Remove primary data
    if remove_primary_data:
        remove_primary_data(ap)
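A sketch of a typical call to make_fastqs; the AutoProcessor construction and the argument values shown here are assumptions for illustration only, and in practice depend on the surrounding package and local configuration.

# Illustrative driver for make_fastqs; the AutoProcessor construction is
# hypothetical (its real signature lives in the surrounding package)
ap = AutoProcessor("/data/analysis/180112_M00123_0001_000000000-ABC12_analysis")
make_fastqs(ap,
            protocol='standard',
            lanes=[1, 2],
            bases_mask='auto',
            generate_stats=True,
            analyse_barcodes=True)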