Example #1
def get_bases_mask(run_info_xml, sample_sheet_file=None):
    """
    Get bases mask string

    Generates initial bases mask based on data in RunInfo.xml (which
    says how many reads there are, how many cycles in each read, and
    which are index reads), and optionally updates this using the
    barcode information in the sample sheet file.

    Arguments:
      run_info_xml: name and path of RunInfo.xml file from the
        sequencing run
      sample_sheet_file: (optional) path to sample sheet file

    Returns:
      Bases mask string e.g. 'y101,I6'. 
    """
    # Get initial bases mask
    bases_mask = IlluminaData.IlluminaRunInfo(run_info_xml).bases_mask
    print "Bases mask: %s (from RunInfo.xml)" % bases_mask
    if sample_sheet_file is not None:
        # Update bases mask from sample sheet
        example_barcode = IlluminaData.samplesheet_index_sequence(
            IlluminaData.SampleSheet(sample_sheet_file).data[0])
        if example_barcode is None:
            example_barcode = ""
        if barcode_is_10xgenomics(example_barcode):
            print "Bases mask: barcode is 10xGenomics sample set ID"
        else:
            bases_mask = IlluminaData.fix_bases_mask(bases_mask,
                                                     example_barcode)
        print "Bases mask: %s (updated for barcode sequence '%s')" % \
            (bases_mask,example_barcode)
    return bases_mask
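The function above needs only the path to RunInfo.xml, plus an optional sample sheet. A minimal usage sketch follows; the run directory path is hypothetical, and the IlluminaData module is assumed to come from the bcftbx package (as in the surrounding code, which also assumes a barcode_is_10xgenomics helper is in scope):

import bcftbx.IlluminaData as IlluminaData  # assumed source of IlluminaData

# Hypothetical paths for a sequencing run
run_dir = "/data/200924_NB501234_0012_AHXXXXXXX"
run_info_xml = run_dir + "/RunInfo.xml"
sample_sheet = run_dir + "/SampleSheet.csv"

# Initial mask derived from RunInfo.xml only
mask = get_bases_mask(run_info_xml)

# Mask adjusted to the barcode lengths in the sample sheet
mask = get_bases_mask(run_info_xml, sample_sheet_file=sample_sheet)
print("Using bases mask: %s" % mask)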
Example #2
def get_bases_mask(run_info_xml, sample_sheet_file):
    """
    Get bases mask string

    Generates initial bases mask based on data in RunInfo.xml (which
    says how many reads there are, how many cycles in each read, and
    which are index reads). Then updates this using the barcode
    information in the sample sheet file.

    Arguments:
      run_info_xml: name and path of RunInfo.xml file from the
        sequencing run
      sample_sheet_file: name and path of sample sheet file.

    Returns:
      Bases mask string e.g. 'y101,I6'. 

    """
    # Get initial bases mask
    bases_mask = IlluminaData.IlluminaRunInfo(run_info_xml).bases_mask
    print "Bases mask: %s (from RunInfo.xml)" % bases_mask
    # Update bases mask from sample sheet
    example_barcode = IlluminaData.get_casava_sample_sheet(
        sample_sheet_file)[0]['Index']
    bases_mask = IlluminaData.fix_bases_mask(bases_mask, example_barcode)
    print "Bases mask: %s (updated for barcode sequence '%s')" % (
        bases_mask, example_barcode)
    return bases_mask
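Both examples delegate the actual adjustment to IlluminaData.fix_bases_mask, which trims the index reads in the mask down to the barcode length actually in use. A simplified, illustrative re-implementation of that idea (not the library code, and ignoring edge cases such as missing or over-long barcodes) might look like this:

def shrink_index_reads(bases_mask, barcode):
    # Keep data reads ('y...') unchanged; trim each index read
    # ('I...') to the corresponding barcode length and mask any
    # leftover index cycles with 'n'
    index_lengths = [len(seq) for seq in barcode.split('-')]
    reads = []
    for read in bases_mask.split(','):
        if read.startswith('I') and index_lengths:
            ncycles = int(read[1:])
            nkeep = min(index_lengths.pop(0), ncycles)
            nmask = ncycles - nkeep
            reads.append("I%d%s" % (nkeep, "n%d" % nmask if nmask else ""))
        else:
            reads.append(read)
    return ','.join(reads)

print(shrink_index_reads("y101,I8,y101", "CGTACT"))               # y101,I6n2,y101
print(shrink_index_reads("y101,I8,I8,y101", "CGTACTAG-TCTTACGC")) # unchanged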
Example #3
def main():
    p = optparse.OptionParser(
        usage="%prog [OPTIONS] ILLUMINA_RUN_DIR OUTPUT_DIR [ SAMPLE_SHEET ]",
        version="%prog "+__version__,
        description="Wrapper to automate the Illumina bcl to fastq "
        "conversion process. It will either run the CASAVA/bcl2fastq v1.8 "
        "configureBclToFastq.pl/make pipeline or bcl2fastq v2 directly, "
        "depending on which software package is detected. ILLUMINA_RUN_DIR "
        "is the top-level directory of the Illumina run to be processed; "
        "output will be written to OUTPUT_DIR. Optionally a SAMPLE_SHEET "
        "file can also be specified, otherwise the SampleSheet.csv file in "
        "the BaseCalls directory will be used (if present).")
    # Options common to both CASAVA/bcl2fastq v1.8 and bcl2fastq v2
    p.add_option('--nmismatches',action="store",dest="nmismatches",
                 default=None,
                 help="set number of mismatches to allow; recommended "
                 "values are 0 for samples without multiplexing, 1 for "
                 "multiplexed samples with tags of length 6 or longer "
                 "(CASAVA/bcl2fastq v1.8 --mismatches option, bcl2fastq "
                 "v2 --barcode-mismatches option)")
    p.add_option('--use-bases-mask',action="store",dest="bases_mask",
                 default=None,
                 help="specify a bases-mask string to tell CASAVA how "
                 "to use each cycle (the supplied value is passed "
                "to the --use-bases-mask option)")
    p.add_option('--nprocessors',action="store",dest="nprocessors",
                 default=None,
                 help="set the number of processors to use (defaults to "
                 "1; for CASAVA/bcl2fastq v1.8 this is passed to the "
                 "-j option of the 'make' step after running "
                 "configureBcltoFastq.pl, for bcl2fastq v2 this is "
                 "the maximum number of CPUs that should be used by "
                 "the -r, -d, -p and -w options)")
    p.add_option('--ignore-missing-bcl',action="store_true",
                 dest="ignore_missing_bcl",default=False,
                 help="interpret missing bcl files as no call "
                 "(CASAVA/bcl2fastq v1.8 --ignore-missing-bcl option, "
                 "bcl2fastq v2 --ignore-missing-bcls option)")
    p.add_option('--bcl2fastq_path',action="store",
                 dest="bcl2fastq_path",default=None,
                 help="explicitly specify the path to the CASAVA or "
                 "bcl2fastq software to use.")
    # CASAVA/bcl2fastq 1.8.* only
    casava = optparse.OptionGroup(p,'CASAVA/bcl2fastq v1.8 only')
    casava.add_option('--ignore-missing-stats',action="store_true",
                      dest="ignore_missing_stats",default=False,
                      help="fill in with zeroes when *.stats files are missing "
                      "(see the CASAVA user guide for details of how "
                      "--ignore-missing-stats works)")
    casava.add_option('--ignore-missing-control',action="store_true",
                      dest="ignore_missing_control",default=False,
                 help="interpret missing control files as not-set control "
                      "bits (see the CASAVA user guide for details of how "
                      "--ignore-missing-control works)")
    p.add_option_group(casava)
    # bcl2fastq 2 only
    bcl2fastq2 = optparse.OptionGroup(p,'bcl2fastq v2 only')
    bcl2fastq2.add_option('--no-lane-splitting',action="store_true",
                          dest="no_lane_splitting",default=False,
                          help="Don't split output FASTQ files by lane")
    # Adapter trimming (bcl2fastq 2 only)
    adapter_trimming = optparse.OptionGroup(p,'Adapter trimming (bcl2fastq v2 only)')
    adapter_trimming.add_option('--minimum-trimmed-read-length',action="store",
                                dest="minimum_trimmed_read_length",default=35,
                                help="Minimum read length after adapter "
                                "trimming. bcl2fastq trims the adapter from "
                                "the read down to this value; if there is more "
                                "adapter match below this length then those "
                                "bases are masked not trimmed (i.e. replaced "
                                "by N rather than removed) (default: 35)")
    adapter_trimming.add_option('--mask-short-adapter-reads',action="store",
                                dest="mask_short_adapter_reads",default=22,
                                help="minimum length of unmasked bases that "
                                "a read can be after adapter trimming; reads "
                                "with fewer ACGT bases will be completely "
                                "masked with Ns (default: 22)")
    p.add_option_group(adapter_trimming)
    # Advanced options
    advanced = optparse.OptionGroup(p,'Advanced options')
    advanced.add_option('--platform',action="store",
                        dest="platform",default=None,
                        help="Explicitly specify platform; only use this if "
                        "the platform can't be read from the instrument name")
    p.add_option_group(advanced)

    options,args = p.parse_args()
    if not (2 <= len(args) <= 3):
        p.error("expects an input directory, an output directory and an "
                "optional sample sheet")
    # Acquire bcl2fastq software
    bcl2fastq = available_bcl2fastq_versions(paths=(options.bcl2fastq_path,))
    if not bcl2fastq:
        logging.error("No bcl2fastq software found")
        return 1
    else:
        bcl2fastq_exe = bcl2fastq[0]
    # Determine bcl2fastq version
    bcl2fastq_info = bcl_to_fastq_info(bcl2fastq_exe)
    if bcl2fastq_info[0] is None:
        logging.error("No bcl2fastq software found")
        return 1
    print "Using conversion software from %s" % os.path.dirname(
        bcl2fastq_info[0])
    # Return with error code if no version detected
    bcl2fastq_package = bcl2fastq_info[1]
    bcl2fastq_version = bcl2fastq_info[2]
    if bcl2fastq_version is None:
        logging.error("Cannot determine bcl2fastq software version")
        return 1
    print "Package: %s" % bcl2fastq_package
    print "Version: %s" % bcl2fastq_version
    known_version = None
    for version in BCL2FASTQ_VERSIONS:
        if bcl2fastq_version.startswith("%s." % version):
            known_version = version
            break
    if known_version is None:
        # Unimplemented version
        logging.error("Don't know how to run bcl2fastq version %s" %
                      bcl2fastq_version)
        return 1
    # Locate run directory (and strip any trailing slash)
    illumina_run_dir = os.path.abspath(args[0].rstrip(os.sep))
    if not os.path.isdir(illumina_run_dir):
        logging.error("%s: doesn't exist or is not a directory" %
                      illumina_run_dir)
        sys.exit(1)
    illumina_run = IlluminaData.IlluminaRun(illumina_run_dir,
                                            options.platform)
    # Output directory
    output_dir = os.path.abspath(args[1].rstrip(os.sep))
    # Sample sheet
    if len(args) == 3:
        sample_sheet = os.path.abspath(args[2])
    else:
        sample_sheet = illumina_run.sample_sheet_csv
    # Bases mask
    if options.bases_mask is not None:
        bases_mask = options.bases_mask
    else:
        bases_mask = IlluminaData.IlluminaRunInfo(
            illumina_run.runinfo_xml).bases_mask
    # Report settings
    print "Illumina run directory  : %s" % illumina_run.run_dir
    print "Basecalls directory     : %s" % illumina_run.basecalls_dir
    print "Platform                : %s" % illumina_run.platform
    print "Bcl file extension      : %s" % illumina_run.bcl_extension
    print "SampleSheet.csv file    : %s" % sample_sheet
    print "Output dir              : %s" % output_dir
    print "Nmismatches             : %s" % options.nmismatches
    print "Bases mask              : %s" % bases_mask
    print "Nprocessors             : %s" % options.nprocessors
    print "Ignore missing bcl      : %s" % options.ignore_missing_bcl
    if known_version == '1.8':
        print "Ignore missing stats    : %s" % options.ignore_missing_stats
        print "Ignore missing control  : %s" % options.ignore_missing_control
    elif known_version in ('2.17','2.20',):
        print "No lane splitting       : %s" % options.no_lane_splitting
        print "Min trimmed read length : %s" % \
            options.minimum_trimmed_read_length
        print "Mask short adapter reads: %s" % \
            options.mask_short_adapter_reads
    # Run bclToFastq conversion based on the version
    if known_version in ('1.8',):
        # 1.8.* pipeline
        status = run_bcl2fastq_1_8(
            illumina_run.basecalls_dir,
            sample_sheet,
            output_dir=output_dir,
            mismatches=options.nmismatches,
            bases_mask=options.bases_mask,
            nprocessors=options.nprocessors,
            force=True,
            ignore_missing_bcl=options.ignore_missing_bcl,
            ignore_missing_stats=options.ignore_missing_stats,
            ignore_missing_control=options.ignore_missing_control
        )
    elif known_version in ('2.17',):
        # bcl2fastq 2.17.*
        if options.nprocessors is not None:
            # Explicitly set number of threads for each stage
            nprocessors=int(options.nprocessors)
            loading_threads=min(4,nprocessors)
            writing_threads=min(4,nprocessors)
            # Demultiplexing gets ~20% of the pool (minimum one thread)
            demultiplexing_threads=max(int(float(nprocessors)*0.2),1)
            processing_threads=nprocessors
            print "Explicitly setting number of threads for each stage:"
            print "Loading (-r)       : %d" % loading_threads
            print "Demultiplexing (-d): %d" % demultiplexing_threads
            print "Processing (-p)    : %d" % processing_threads
            print "Writing (-w)       : %d" % writing_threads
        else:
            # Use the defaults
            loading_threads = None
            demultiplexing_threads = None
            processing_threads = None
            writing_threads = None
        # Run the bcl to fastq conversion
        status = run_bcl2fastq_2_17(
            illumina_run.run_dir,
            sample_sheet,
            output_dir=output_dir,
            mismatches=options.nmismatches,
            bases_mask=options.bases_mask,
            ignore_missing_bcl=options.ignore_missing_bcl,
            no_lane_splitting=options.no_lane_splitting,
            minimum_trimmed_read_length=options.minimum_trimmed_read_length,
            mask_short_adapter_reads=options.mask_short_adapter_reads,
            loading_threads=loading_threads,
            demultiplexing_threads=demultiplexing_threads,
            processing_threads=processing_threads,
            writing_threads=writing_threads
        )
    elif known_version in ('2.20',):
        # bcl2fastq 2.20.*
        if options.nprocessors is not None:
            # Explicitly set number of threads for each stage
            nprocessors=int(options.nprocessors)
            loading_threads=min(4,nprocessors)
            writing_threads=min(4,nprocessors)
            processing_threads=nprocessors
            print "Explicitly setting number of threads for each stage:"
            print "Loading (-r)       : %d" % loading_threads
            print "Processing (-p)    : %d" % processing_threads
            print "Writing (-w)       : %d" % writing_threads
        else:
            # Use the defaults
            loading_threads = None
            processing_threads = None
            writing_threads = None
        # Run the bcl to fastq conversion
        status = run_bcl2fastq_2_20(
            illumina_run.run_dir,
            sample_sheet,
            output_dir=output_dir,
            mismatches=options.nmismatches,
            bases_mask=options.bases_mask,
            ignore_missing_bcl=options.ignore_missing_bcl,
            no_lane_splitting=options.no_lane_splitting,
            minimum_trimmed_read_length=options.minimum_trimmed_read_length,
            mask_short_adapter_reads=options.mask_short_adapter_reads,
            loading_threads=loading_threads,
            processing_threads=processing_threads,
            writing_threads=writing_threads
        )
    print "bclToFastq returncode: %s" % status
    if status != 0:
        logging.error("bclToFastq failure")
    return status
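The only non-trivial arithmetic in main() is how --nprocessors is spread across bcl2fastq v2's -r/-d/-p/-w options: the I/O stages (loading and writing) are capped at four threads, processing gets the whole pool, and for v2.17 (which still accepts -d) demultiplexing gets roughly 20% of it. A standalone sketch of that allocation scheme (illustrative names only, not part of the script):

def allocate_threads(nprocessors, version='2.17'):
    # Cap the I/O stages at four threads, give processing the
    # full pool, and (v2.17 only) give demultiplexing ~20% of it
    n = int(nprocessors)
    threads = {
        'loading (-r)': min(4, n),
        'writing (-w)': min(4, n),
        'processing (-p)': n,
    }
    if version == '2.17':
        threads['demultiplexing (-d)'] = max(int(n*0.2), 1)
    return threads

for stage, nthreads in sorted(allocate_threads(16).items()):
    print("%-20s: %d" % (stage, nthreads))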
Example #4
def make_fastqs(ap,
                protocol='standard',
                platform=None,
                unaligned_dir=None,
                sample_sheet=None,
                lanes=None,
                ignore_missing_bcl=False,
                ignore_missing_stats=False,
                skip_rsync=False,
                remove_primary_data=False,
                nprocessors=None,
                require_bcl2fastq_version=None,
                bases_mask=None,
                no_lane_splitting=None,
                minimum_trimmed_read_length=None,
                mask_short_adapter_reads=None,
                generate_stats=True,
                stats_file=None,
                per_lane_stats_file=None,
                analyse_barcodes=True,
                barcode_analysis_dir=None,
                skip_fastq_generation=False,
                only_fetch_primary_data=False,
                create_empty_fastqs=None,
                runner=None,
                cellranger_jobmode=None,
                cellranger_mempercore=None,
                cellranger_maxjobs=None,
                cellranger_jobinterval=None,
                cellranger_localcores=None,
                cellranger_localmem=None,
                cellranger_ignore_dual_index=False):
    """Create and summarise FASTQ files

    Wrapper for operations related to FASTQ file generation and analysis.
    The operations are typically:
 
    - get primary data (BCL files)
    - run bcl-to-fastq conversion
    - generate statistics

    If the number of processors and the job runner are not explicitly
    specified then these are taken from the settings for the bcl2fastq
    and the statistics generation steps, which may differ from each other.
    However, if either of these values is set explicitly then the same
    value will be used for both steps.

    Arguments:
      ap (AutoProcessor): autoprocessor pointing to the analysis
        directory to create Fastqs for
      protocol (str): if set then specifies the protocol to use
        for fastq generation, otherwise use the 'standard' bcl2fastq
        protocol
      platform (str): if set then specifies the sequencing platform
        (otherwise platform will be determined from the primary data)
      unaligned_dir (str): if set then use this as the output directory
        for bcl-to-fastq conversion. Default is 'bcl2fastq' (unless
        an alternative is already specified in the config file)
      sample_sheet (str): if set then use this as the input samplesheet
      lanes (list): (optional) specify a list of lane numbers to
        use in the processing; lanes not in the list will be excluded
        (default is to include all lanes)
      nprocessors (int) : number of processors to run bclToFastq.py with
      ignore_missing_bcl (bool): if True then run bcl2fastq with
        --ignore-missing-bcl
      ignore_missing_stats (bool): if True then run bcl2fastq with
        --ignore-missing-stats
      skip_rsync (bool): if True then don't rsync primary data at the
        start of bcl2fastq conversion
      remove_primary_data (bool): if True then remove primary data at
        the end of bcl2fastq conversion (default is to keep it)
      generate_stats (bool): if True then (re)generate statistics file
        for fastqs
      analyse_barcodes (bool): if True then (re)analyse barcodes for
        fastqs
      require_bcl2fastq_version (str): (optional) specify bcl2fastq
        version to use. Should be a string of the form '1.8.4' or
        '>2.0'. Set to None to automatically determine required
        bcl2fastq version.
      bases_mask (str): if set then use this as an alternative bases
        mask setting
      no_lane_splitting (bool): if True then run bcl2fastq with
        --no-lane-splitting
      minimum_trimmed_read_length (int): if set then specify minimum
        length for reads after adapter trimming (shorter reads will
        be padded with Ns to make them long enough)
      mask_short_adapter_reads (int): if set then specify the minimum
        length of ACGT bases that must be present in a read after
        adapter trimming for it not to be masked completely
        with Ns.
      stats_file (str): if set then use this as the name of the output
        per-fastq stats file.
      per_lane_stats_file (str): if set then use this as the name of
        the output per-lane stats file.
      barcode_analysis_dir (str): if set then specifies path to the
        output directory for barcode analysis
      skip_fastq_generation (bool): if True then don't perform fastq
        generation
      only_fetch_primary_data (bool): if True then fetch primary data,
        don't do anything else
      create_empty_fastqs (bool): if True then create empty 'placeholder'
        fastq files for any missing fastqs after bcl2fastq
        (must have completed with zero exit status)
      runner (JobRunner): (optional) specify a non-default job runner
        to use for fastq generation
      cellranger_jobmode (str): (optional) job mode to run cellranger in
        (10xGenomics Chromium SC data only)
      cellranger_mempercore (int): (optional) memory assumed per core
        (in Gbs) (10xGenomics Chromium SC data only)
      cellranger_maxjobs (int): (optional) maximum number of concurrent
         jobs to run (10xGenomics Chromium SC data only)
      cellranger_jobinterval (int): (optional) how often jobs are
         submitted (in ms) (10xGenomics Chromium SC data only)
      cellranger_localcores (int): (optional) maximum number of cores
         cellranger can request in jobmode 'local' (10xGenomics Chromium
         SC data only)
      cellranger_localmem (int): (optional) maximum memory cellranger
         can request in jobmode 'local' (10xGenomics Chromium SC data
         only)
      cellranger_ignore_dual_index (bool): (optional) on a dual-indexed
         flowcell where the second index was not used for the 10x
         sample, ignore it (10xGenomics Chromium SC data only)
    """
    # Report protocol
    print "Protocol              : %s" % protocol
    if protocol not in MAKE_FASTQS_PROTOCOLS:
        raise Exception("Unknown protocol: '%s' (must be one of "
                        "%s)" % (protocol, ','.join([MAKE_FASTQS_PROTOCOLS])))
    # Unaligned dir
    if unaligned_dir is not None:
        ap.params['unaligned_dir'] = unaligned_dir
    elif ap.params['unaligned_dir'] is None:
        ap.params['unaligned_dir'] = 'bcl2fastq'
    print "Output dir            : %s" % ap.params.unaligned_dir
    # Sample sheet
    if sample_sheet is None:
        sample_sheet = ap.params.sample_sheet
    if not os.path.isabs(sample_sheet):
        sample_sheet = os.path.join(ap.analysis_dir, sample_sheet)
    if not os.path.isfile(sample_sheet):
        raise Exception("Missing sample sheet '%s'" % sample_sheet)
    ap.params['sample_sheet'] = sample_sheet
    print "Source sample sheet   : %s" % ap.params.sample_sheet
    # Check requested lanes are actually present
    print "Lanes                 : %s" % ('all' if lanes is None else ','.join(
        [str(l) for l in lanes]))
    if lanes is not None:
        s = IlluminaData.SampleSheet(ap.params.sample_sheet)
        if not s.has_lanes:
            raise Exception("Requested subset of lanes but "
                            "samplesheet doesn't contain any "
                            "lane information")
        samplesheet_lanes = list(set([l['Lane'] for l in s]))
        for l in lanes:
            if l not in samplesheet_lanes:
                raise Exception("Requested lane '%d' not present "
                                "in samplesheet" % l)
    # Make a temporary sample sheet
    if lanes:
        lanes_id = ".L%s" % ''.join([str(l) for l in lanes])
    else:
        lanes_id = ""
    sample_sheet = os.path.join(
        ap.tmp_dir,
        "SampleSheet%s.%s.csv" % (lanes_id, time.strftime("%Y%m%d%H%M%S")))
    make_custom_sample_sheet(ap.params.sample_sheet, sample_sheet, lanes=lanes)
    # Check the temporary sample sheet
    print "Checking temporary sample sheet"
    invalid_barcodes = SampleSheetLinter(
        sample_sheet_file=sample_sheet).has_invalid_barcodes()
    if invalid_barcodes:
        logger.error("Invalid barcodes detected")
        for line in invalid_barcodes:
            logger.critical("%s" % line)
    invalid_characters = SampleSheetLinter(
        sample_sheet_file=sample_sheet).has_invalid_characters()
    if invalid_characters:
        logger.critical("Invalid non-printing/non-ASCII characters "
                        "detected")
    if invalid_barcodes or invalid_characters:
        raise Exception("Errors detected in generated sample sheet")
    # Adjust verification settings for 10xGenomics Chromium SC
    # data if necessary
    verify_include_sample_dir = False
    if has_chromium_sc_indices(sample_sheet):
        if protocol in (
                '10x_chromium_sc',
                '10x_chromium_sc_atac',
        ):
            # Force inclusion of sample-name subdirectories
            # when verifying Chromium SC data
            print "Sample sheet includes Chromium SC indices"
            verify_include_sample_dir = True
        else:
            # Chromium SC indices detected but not using
            # 10x_chromium_sc protocol
            raise Exception("Detected 10xGenomics Chromium SC indices "
                            "in generated sample sheet but protocol "
                            "'%s' has been specified; use an "
                            "appropriate '10x_...' protocol for these "
                            "indices" % protocol)
    # Check for pre-existing Fastq outputs
    if verify_fastq_generation(ap,
                               unaligned_dir=ap.params.unaligned_dir,
                               lanes=lanes,
                               include_sample_dir=verify_include_sample_dir):
        print "Expected Fastq outputs already present"
        skip_rsync = True
        skip_fastq_generation = True
    # Check if there's anything to do
    if (skip_rsync and skip_fastq_generation) and \
       not (generate_stats or analyse_barcodes):
        print "Nothing to do"
        return
    # Log dir
    log_dir = 'make_fastqs'
    if protocol != 'standard':
        log_dir += "_%s" % protocol
    if lanes:
        log_dir += "_L%s" % ''.join([str(l) for l in sorted(lanes)])
    ap.set_log_dir(ap.get_log_subdir(log_dir))
    # Fetch primary data
    if not skip_rsync and not ap.params.acquired_primary_data:
        if get_primary_data(ap) != 0:
            logger.error("Failed to acquire primary data")
            raise Exception("Failed to acquire primary data")
        else:
            ap.params['acquired_primary_data'] = True
    if only_fetch_primary_data:
        return
    # Deal with platform information
    if not platform:
        platform = ap.metadata.platform
    # Do fastq generation using the specified protocol
    if not skip_fastq_generation:
        # Set primary data location and report info
        primary_data_dir = os.path.join(ap.params.primary_data_dir,
                                        os.path.basename(ap.params.data_dir))
        print "Primary data dir      : %s" % primary_data_dir
        try:
            illumina_run = IlluminaData.IlluminaRun(primary_data_dir,
                                                    platform=platform)
        except IlluminaData.IlluminaDataPlatformError as ex:
            logger.critical("Error loading primary data: %s" % ex)
            if platform is None:
                logger.critical("Try specifying platform using --platform?")
            else:
                logger.critical("Check specified platform is valid (or "
                                "omit --platform")
            raise Exception("Error determining sequencer platform")
        print "Platform              : %s" % illumina_run.platform
        print "Bcl format            : %s" % illumina_run.bcl_extension
        # Set platform in metadata
        ap.metadata['platform'] = illumina_run.platform
        # Bases mask
        if bases_mask is not None:
            ap.params['bases_mask'] = bases_mask
        bases_mask = ap.params.bases_mask
        print "Bases mask setting    : %s" % bases_mask
        if protocol not in (
                '10x_chromium_sc',
                '10x_chromium_sc_atac',
        ):
            if bases_mask == "auto":
                print "Determining bases mask from RunInfo.xml"
                bases_mask = get_bases_mask(illumina_run.runinfo_xml,
                                            sample_sheet)
                if not bases_mask_is_valid(bases_mask):
                    raise Exception("Invalid bases mask: '%s'" % bases_mask)
        # Do fastq generation according to protocol
        if protocol == 'icell8':
            # ICell8 data
            # Update bcl2fastq settings appropriately
            print "Updating read trimming and masking for ICell8"
            minimum_trimmed_read_length = 21
            mask_short_adapter_reads = 0
            # Reset the default bases mask
            bases_mask = IlluminaData.IlluminaRunInfo(
                illumina_run.runinfo_xml).bases_mask
            bases_mask = get_icell8_bases_mask(bases_mask,
                                               sample_sheet=sample_sheet)
            if not bases_mask_is_valid(bases_mask):
                raise Exception("Invalid bases mask: '%s'" % bases_mask)
            # Switch to standard protocol
            protocol = 'standard'
        if protocol == 'standard':
            # Standard protocol
            try:
                exit_code = bcl_to_fastq(
                    ap,
                    unaligned_dir=ap.params.unaligned_dir,
                    sample_sheet=sample_sheet,
                    primary_data_dir=primary_data_dir,
                    require_bcl2fastq=require_bcl2fastq_version,
                    bases_mask=bases_mask,
                    ignore_missing_bcl=ignore_missing_bcl,
                    ignore_missing_stats=ignore_missing_stats,
                    no_lane_splitting=no_lane_splitting,
                    minimum_trimmed_read_length=minimum_trimmed_read_length,
                    mask_short_adapter_reads=mask_short_adapter_reads,
                    nprocessors=nprocessors,
                    runner=runner)
            except Exception as ex:
                raise Exception("Bcl2fastq stage failed: '%s'" % ex)
        elif protocol == '10x_chromium_sc':
            # 10xGenomics Chromium SC
            if bases_mask == 'auto':
                bases_mask = None
            try:
                # Check we have cellranger
                cellranger = find_program('cellranger')
                if not cellranger:
                    raise Exception("No cellranger package found")
                cellranger_software_info = cellranger_info(cellranger)
                print "Using cellranger %s: %s" % \
                    (cellranger_software_info[-1],
                     cellranger)
                # Check we have bcl2fastq
                bcl2fastq = find_program('bcl2fastq')
                if not bcl2fastq:
                    raise Exception("No bcl2fastq package found")
                bcl2fastq = available_bcl2fastq_versions(
                    paths=(os.path.dirname(bcl2fastq), ), reqs='>=2.17')
                if not bcl2fastq:
                    raise Exception("No appropriate bcl2fastq software "
                                    "located")
                bcl2fastq = bcl2fastq[0]
                bcl2fastq_info = bcl_to_fastq_info(bcl2fastq)
                print "Using bcl2fastq %s: %s" % (bcl2fastq_info[-1],
                                                  bcl2fastq)
                # Store info on bcl2fastq package
                ap.metadata['bcl2fastq_software'] = bcl2fastq_info
                # Store info on cellranger package
                ap.metadata['cellranger_software'] = cellranger_software_info
                # Put a copy of sample sheet in the log directory
                shutil.copy(sample_sheet, ap.log_dir)
                # Determine output directory absolute path
                output_dir = ap.params.unaligned_dir
                if not os.path.isabs(output_dir):
                    output_dir = os.path.join(ap.analysis_dir, output_dir)
                # Run cellranger mkfastq
                exit_code = run_cellranger_mkfastq(
                    sample_sheet=sample_sheet,
                    primary_data_dir=primary_data_dir,
                    output_dir=output_dir,
                    lanes=(None if lanes is None else ','.join(
                        [str(l) for l in lanes])),
                    bases_mask=bases_mask,
                    cellranger_exe=cellranger,
                    cellranger_jobmode=cellranger_jobmode,
                    cellranger_maxjobs=cellranger_maxjobs,
                    cellranger_mempercore=cellranger_mempercore,
                    cellranger_jobinterval=cellranger_jobinterval,
                    cellranger_localcores=cellranger_localcores,
                    cellranger_localmem=cellranger_localmem,
                    working_dir=ap.analysis_dir,
                    log_dir=ap.log_dir)
            except Exception as ex:
                raise Exception("'cellranger mkfastq' stage failed: "
                                "'%s'" % ex)
            # Turn off barcode analysis
            analyse_barcodes = False
        elif protocol == '10x_chromium_sc_atac':
            # 10xGenomics Chromium scATAC-seq
            exit_code = bcl_to_fastq_10x_chromium_sc_atac(
                ap,
                output_dir=ap.params.unaligned_dir,
                sample_sheet=sample_sheet,
                primary_data_dir=primary_data_dir,
                lanes=lanes,
                bases_mask=bases_mask,
                cellranger_jobmode=cellranger_jobmode,
                cellranger_maxjobs=cellranger_maxjobs,
                cellranger_mempercore=cellranger_mempercore,
                cellranger_jobinterval=cellranger_jobinterval,
                cellranger_localcores=cellranger_localcores,
                cellranger_localmem=cellranger_localmem,
                log_dir=ap.log_dir)
            # Turn off barcode analysis
            analyse_barcodes = False
        else:
            # Unknown protocol
            raise Exception("Unknown protocol '%s'" % protocol)
        # Check the outputs
        if exit_code != 0:
            raise Exception("Fastq generation finished with error: "
                            "exit code %d" % exit_code)
        if not verify_fastq_generation(
                ap, lanes=lanes, include_sample_dir=verify_include_sample_dir):
            # Check failed
            logger.error("Failed to verify output Fastqs against "
                         "sample sheet")
            # Try to load the data from unaligned dir
            try:
                illumina_data = IlluminaData.IlluminaData(
                    ap.analysis_dir, unaligned_dir=ap.params.unaligned_dir)
            except IlluminaData.IlluminaDataError as ex:
                raise Exception("Unable to load data from %s: %s" %
                                (ap.params.unaligned_dir, ex))
            # Generate a list of missing Fastqs
            missing_fastqs = IlluminaData.list_missing_fastqs(
                illumina_data,
                sample_sheet,
                include_sample_dir=verify_include_sample_dir)
            assert (len(missing_fastqs) > 0)
            missing_fastqs_file = os.path.join(ap.log_dir,
                                               "missing_fastqs.log")
            print "Writing list of missing Fastq files to %s" % \
                missing_fastqs_file
            with open(missing_fastqs_file, 'w') as fp:
                for fq in missing_fastqs:
                    fp.write("%s\n" % fq)
            # Create empty FASTQs
            if create_empty_fastqs is None:
                try:
                    create_empty_fastqs = \
                        ap.settings.platform[ap.metadata.platform].\
                        create_empty_fastqs
                except (KeyError, AttributeError):
                    pass
            if create_empty_fastqs is None:
                create_empty_fastqs = \
                    ap.settings.bcl2fastq.create_empty_fastqs
            if create_empty_fastqs:
                logger.warning("Making 'empty' placeholder Fastqs")
                for fq in missing_fastqs:
                    fastq = os.path.join(ap.analysis_dir,
                                         ap.params.unaligned_dir, fq)
                    print "-- %s" % fastq
                    if not os.path.exists(os.path.dirname(fastq)):
                        mkdirs(os.path.dirname(fastq))
                    with gzip.GzipFile(filename=fastq, mode='wb') as fp:
                        fp.write('')
            else:
                raise Exception("Fastq generation failed to produce "
                                "expected outputs")
    # Generate statistics
    if generate_stats:
        fastq_statistics(ap,
                         stats_file=stats_file,
                         per_lane_stats_file=per_lane_stats_file,
                         unaligned_dir=ap.params.unaligned_dir,
                         nprocessors=nprocessors,
                         runner=runner)
    # Run barcode analysis
    if analyse_barcodes:
        # Determine output directory
        if barcode_analysis_dir is not None:
            ap.params['barcode_analysis_dir'] = barcode_analysis_dir
        elif ap.params.barcode_analysis_dir is None:
            ap.params['barcode_analysis_dir'] = 'barcode_analysis'
        barcode_analysis_dir = ap.params.barcode_analysis_dir
        if not os.path.isabs(barcode_analysis_dir):
            barcode_analysis_dir = os.path.join(ap.params.analysis_dir,
                                                barcode_analysis_dir)
        # Report title
        title = "Barcode analysis for %s" % ap.metadata.run_name
        # Log file
        log_file = os.path.join(ap.log_dir, "analyse_barcodes.log")
        # Set up runner
        if runner is None:
            runner = ap.settings.general.default_runner
        runner.set_log_dir(ap.log_dir)
        # Get scheduler parameters
        max_jobs = ap.settings.general.max_concurrent_jobs
        poll_interval = ap.settings.general.poll_interval
        # Create and run barcode analysis pipeline
        barcode_analysis = AnalyseBarcodes(
            os.path.join(ap.params.analysis_dir, ap.params.unaligned_dir))
        barcode_analysis.run(barcode_analysis_dir,
                             title=title,
                             lanes=lanes,
                             sample_sheet=sample_sheet,
                             log_file=log_file,
                             runner=runner,
                             max_jobs=max_jobs,
                             poll_interval=poll_interval,
                             verbose=False)
    # Make a 'projects.info' metadata file
    if lanes:
        ap.update_project_metadata_file()
    else:
        ap.make_project_metadata_file()
    # Remove primary data
    if remove_primary_data:
        remove_primary_data(ap)
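make_fastqs is driven by an AutoProcessor instance pointing at an existing analysis directory. A minimal usage sketch follows, assuming the AutoProcessor class is importable from auto_process_ngs.auto_processor (an assumption about the package layout), that the analysis directory has already been set up, and using a hypothetical path:

from auto_process_ngs.auto_processor import AutoProcessor  # assumed import path

# Hypothetical analysis directory created by an earlier setup step
ap = AutoProcessor(analysis_dir="/analysis/200924_NB501234_0012_AHXXXXXXX_analysis")

# Standard bcl2fastq protocol with an explicit bases mask and
# per-lane FASTQs merged into a single set of files
make_fastqs(ap,
            protocol='standard',
            bases_mask='y101,I8,I8,y101',
            no_lane_splitting=True,
            nprocessors=8)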