def get_bases_mask(run_info_xml, sample_sheet_file=None):
    """
    Get bases mask string

    Reads the initial bases mask from RunInfo.xml and, when a sample
    sheet is supplied, adjusts it using the index sequence taken from
    the first data line of that sample sheet. If that index sequence
    is recognised as a 10xGenomics sample set ID then the initial
    bases mask is left unchanged.

    Arguments:
      run_info_xml: name and path of RunInfo.xml file from the
        sequencing run
      sample_sheet_file: (optional) path to sample sheet file

    Returns:
      Bases mask string e.g. 'y101,I6'.

    """
    # Initial bases mask comes straight from RunInfo.xml
    run_info = IlluminaData.IlluminaRunInfo(run_info_xml)
    bases_mask = run_info.bases_mask
    print("Bases mask: %s (from RunInfo.xml)" % bases_mask)
    # Nothing more to do without a sample sheet
    if sample_sheet_file is None:
        return bases_mask
    # Use the first data line of the sample sheet as the example
    first_line = IlluminaData.SampleSheet(sample_sheet_file).data[0]
    example_barcode = IlluminaData.samplesheet_index_sequence(first_line)
    if example_barcode is None:
        # No index sequence: treat as an empty barcode
        example_barcode = ""
    if barcode_is_10xgenomics(example_barcode):
        print("Bases mask: barcode is 10xGenomics sample set ID")
    else:
        bases_mask = IlluminaData.fix_bases_mask(bases_mask,
                                                 example_barcode)
    # NOTE(review): the original indentation was lost; this report is
    # assumed to be issued for both branches above - confirm
    print("Bases mask: %s (updated for barcode sequence '%s')" %
          (bases_mask, example_barcode))
    return bases_mask
def get_bases_mask(run_info_xml, sample_sheet_file):
    """
    Get bases mask string

    Reads the initial bases mask from RunInfo.xml, then adjusts it
    using the 'Index' value from the first line of the sample sheet.

    Arguments:
      run_info_xml: name and path of RunInfo.xml file from the
        sequencing run
      sample_sheet_file: name and path of sample sheet file.

    Returns:
      Bases mask string e.g. 'y101,I6'.

    """
    # Initial bases mask comes straight from RunInfo.xml
    run_info = IlluminaData.IlluminaRunInfo(run_info_xml)
    initial_mask = run_info.bases_mask
    print("Bases mask: %s (from RunInfo.xml)" % initial_mask)
    # Take the barcode from the first line of the sample sheet
    sheet = IlluminaData.get_casava_sample_sheet(sample_sheet_file)
    example_barcode = sheet[0]['Index']
    # Adjust the mask to match the example barcode
    updated_mask = IlluminaData.fix_bases_mask(initial_mask,
                                               example_barcode)
    print("Bases mask: %s (updated for barcode sequence '%s')" %
          (updated_mask, example_barcode))
    return updated_mask
def main():
    """
    Command line driver for the bcl to fastq conversion

    Parses the command line, locates the bcl2fastq software and works
    out its version, reports the settings, and then runs the matching
    conversion function (run_bcl2fastq_1_8, run_bcl2fastq_2_17 or
    run_bcl2fastq_2_20).

    Returns:
      Integer status: the value returned by the conversion function,
      or 1 if the software couldn't be found, its version couldn't be
      determined, or the version isn't one that can be run here.
      (A missing run directory terminates via sys.exit(1) instead.)

    """
    p = optparse.OptionParser(
        usage="%prog [OPTIONS] ILLUMINA_RUN_DIR OUTPUT_DIR [ SAMPLE_SHEET ]",
        version="%prog "+__version__,
        description="Wrapper to automate the Illumina bcl to fastq "
        "conversion process. It will either run the CASAVA/bcl2fastq v1.8 "
        "configureBclToFastq.pl/make pipeline or bcl2fastq v2 directly, "
        "depending on which software package is detected. ILLUMINA_RUN_DIR "
        "is the top-level directory of the Illumina run to be processed; "
        "output will be written to OUTPUT_DIR. Optionally a SAMPLE_SHEET "
        "file can also be specified, otherwise the SampleSheet.csv file in "
        "the BaseCalls directory will be used (if present).")
    # Options common to both bcl2fastq/bcl2fastq v2
    p.add_option('--nmismatches', action="store", dest="nmismatches",
                 default=None,
                 help="set number of mismatches to allow; recommended "
                 "values are 0 for samples without multiplexing, 1 for "
                 "multiplexed samples with tags of length 6 or longer "
                 "(CASAVA/bcl2fastq v1.8 --mismatches option, bcl2fastq "
                 "v2 --barcode-mismatches option)")
    p.add_option('--use-bases-mask', action="store", dest="bases_mask",
                 default=None,
                 help="specify a bases-mask string to tell CASAVA how "
                 "to use each cycle (the supplied value is passed "
                 "to the --use-bases-mask option)")
    p.add_option('--nprocessors', action="store", dest="nprocessors",
                 default=None,
                 help="set the number of processors to use (defaults to "
                 "1; for CASAVA/bcl2fastq v1.8 this is passed to the "
                 "-j option of the 'make' step after running "
                 "configureBcltoFastq.pl, for bcl2fastq v2 this is "
                 "the maximum number of CPUs that should be used by "
                 "the -r, -d, -p and -w options)")
    p.add_option('--ignore-missing-bcl', action="store_true",
                 dest="ignore_missing_bcl", default=False,
                 help="interpret missing bcl files as no call "
                 "(CASAVA/bcl2fastq v1.8 --ignore-missing-bcl option, "
                 "bcl2fastq v2 --ignore-missing-bcls option)")
    p.add_option('--bcl2fastq_path', action="store",
                 dest="bcl2fastq_path", default=None,
                 help="explicitly specify the path to the CASAVA or "
                 "bcl2fastq software to use.")
    # CASAVA/bcl2fastq 1.8.* only
    casava = optparse.OptionGroup(p, 'CASAVA/bcl2fastq v1.8 only')
    casava.add_option('--ignore-missing-stats', action="store_true",
                      dest="ignore_missing_stats", default=False,
                      help="fill in with zeroes when *.stats files are missing "
                      "(see the CASAVA user guide for details of how "
                      "--ignore-missing-stats works)")
    casava.add_option('--ignore-missing-control', action="store_true",
                      dest="ignore_missing_control", default=False,
                      help="interpret missing control files as not-set control "
                      "bits (see the CASAVA user guide for details of how "
                      "--ignore-missing-control works)")
    p.add_option_group(casava)
    # bcl2fastq 2 only
    bcl2fastq2 = optparse.OptionGroup(p, 'bcl2fastq v2 only')
    bcl2fastq2.add_option('--no-lane-splitting', action="store_true",
                          dest="no_lane_splitting", default=False,
                          help="Don't split output FASTQ files by lane")
    # The group must be attached to the parser, otherwise its options
    # are still parsed but are left out of the --help output
    p.add_option_group(bcl2fastq2)
    # Adapter trimming (bcl2fastq 2 only)
    adapter_trimming = optparse.OptionGroup(
        p, 'Adapter trimming (bcl2fastq v2 only)')
    adapter_trimming.add_option('--minimum-trimmed-read-length',
                                action="store",
                                dest="minimum_trimmed_read_length",
                                default=35,
                                help="Minimum read length after adapter "
                                "trimming. bcl2fastq trims the adapter from "
                                "the read down to this value; if there is more "
                                "adapter match below this length then those "
                                "bases are masked not trimmed (i.e. replaced "
                                "by N rather than removed) (default: 35)")
    adapter_trimming.add_option('--mask-short-adapter-reads',
                                action="store",
                                dest="mask_short_adapter_reads",
                                default=22,
                                help="minimum length of unmasked bases that "
                                "a read can be after adapter trimming; reads "
                                "with fewer ACGT bases will be completely "
                                "masked with Ns (default: 22)")
    p.add_option_group(adapter_trimming)
    # Advanced options
    advanced = optparse.OptionGroup(p, 'Advanced options')
    advanced.add_option('--platform', action="store",
                        dest="platform", default=None,
                        help="Explicitly specify platform; only use this if "
                        "the platform can't be read from the instrument name")
    p.add_option_group(advanced)
    options, args = p.parse_args()
    if not (2 <= len(args) <= 3):
        p.error("input is an input directory, output directory and an "
                "optional sample sheet")
    # Acquire bcl2fastq software
    bcl2fastq = available_bcl2fastq_versions(paths=(options.bcl2fastq_path,))
    if not bcl2fastq:
        logging.error("No bcl2fastq software found")
        return 1
    bcl2fastq_exe = bcl2fastq[0]
    # Determine bcl2fastq version
    bcl2fastq_info = bcl_to_fastq_info(bcl2fastq_exe)
    if bcl2fastq_info[0] is None:
        logging.error("No bcl2fastq software found")
        return 1
    print("Using conversion software from %s" %
          os.path.dirname(bcl2fastq_info[0]))
    # Return with error code if no version detected
    bcl2fastq_package = bcl2fastq_info[1]
    bcl2fastq_version = bcl2fastq_info[2]
    if bcl2fastq_version is None:
        logging.error("Cannot determine bcl2fastq software version")
        return 1
    print("Package: %s" % bcl2fastq_package)
    print("Version: %s" % bcl2fastq_version)
    # Match the detected version against the known major.minor versions
    known_version = None
    for version in BCL2FASTQ_VERSIONS:
        if bcl2fastq_version.startswith("%s." % version):
            known_version = version
            break
    if known_version is None:
        # Unimplemented version
        logging.error("Don't know how to run bcl2fastq version %s" %
                      bcl2fastq_version)
        return 1
    # Locate run directory (and strip any trailing slash)
    illumina_run_dir = os.path.abspath(args[0].rstrip(os.sep))
    if not os.path.isdir(illumina_run_dir):
        logging.error("%s: doesn't exist or is not a directory" %
                      illumina_run_dir)
        sys.exit(1)
    illumina_run = IlluminaData.IlluminaRun(illumina_run_dir,
                                            options.platform)
    # Output directory
    output_dir = os.path.abspath(args[1].rstrip(os.sep))
    # Sample sheet
    if len(args) == 3:
        sample_sheet = os.path.abspath(args[2])
    else:
        sample_sheet = illumina_run.sample_sheet_csv
    # Bases mask (for reporting only: the conversion functions below
    # are passed options.bases_mask, which is None when no mask was
    # given on the command line)
    if options.bases_mask is not None:
        bases_mask = options.bases_mask
    else:
        bases_mask = IlluminaData.IlluminaRunInfo(
            illumina_run.runinfo_xml).bases_mask
    # Report settings
    print("Illumina run directory : %s" % illumina_run.run_dir)
    print("Basecalls directory : %s" % illumina_run.basecalls_dir)
    print("Platform : %s" % illumina_run.platform)
    print("Bcl file extension : %s" % illumina_run.bcl_extension)
    print("SampleSheet.csv file : %s" % sample_sheet)
    print("Output dir : %s" % output_dir)
    print("Nmismatches : %s" % options.nmismatches)
    print("Bases mask : %s" % bases_mask)
    print("Nprocessors : %s" % options.nprocessors)
    print("Ignore missing bcl : %s" % options.ignore_missing_bcl)
    if known_version == '1.8':
        print("Ignore missing stats : %s" % options.ignore_missing_stats)
        print("Ignore missing control : %s" %
              options.ignore_missing_control)
    elif known_version in ('2.17', '2.20',):
        print("No lane splitting : %s" % options.no_lane_splitting)
        print("Min trimmed read length : %s" %
              options.minimum_trimmed_read_length)
        print("Mask short adapter reads: %s" %
              options.mask_short_adapter_reads)
    # Run bclToFastq conversion based on the version
    if known_version in ('1.8',):
        # 1.8.* pipeline
        status = run_bcl2fastq_1_8(
            illumina_run.basecalls_dir,
            sample_sheet,
            output_dir=output_dir,
            mismatches=options.nmismatches,
            bases_mask=options.bases_mask,
            nprocessors=options.nprocessors,
            force=True,
            ignore_missing_bcl=options.ignore_missing_bcl,
            ignore_missing_stats=options.ignore_missing_stats,
            ignore_missing_control=options.ignore_missing_control
        )
    elif known_version in ('2.17',):
        # bcl2fastq 2.17.*
        if options.nprocessors is not None:
            # Explicitly set number of threads for each stage
            nprocessors = int(options.nprocessors)
            loading_threads = min(4, nprocessors)
            writing_threads = min(4, nprocessors)
            # NOTE(review): this max() always evaluates to nprocessors
            # because 20% of a value never exceeds the value itself;
            # confirm whether a lower bound of 1 was intended instead
            demultiplexing_threads = max(int(float(nprocessors)*0.2),
                                         nprocessors)
            processing_threads = nprocessors
            print("Explicitly setting number of threads for each stage:")
            print("Loading (-r) : %d" % loading_threads)
            print("Demultiplexing (-d): %d" % demultiplexing_threads)
            print("Processing (-p) : %d" % processing_threads)
            print("Writing (-w) : %d" % writing_threads)
        else:
            # Use the defaults
            loading_threads = None
            demultiplexing_threads = None
            processing_threads = None
            writing_threads = None
        # Run the bcl to fastq conversion
        status = run_bcl2fastq_2_17(
            illumina_run.run_dir,
            sample_sheet,
            output_dir=output_dir,
            mismatches=options.nmismatches,
            bases_mask=options.bases_mask,
            ignore_missing_bcl=options.ignore_missing_bcl,
            no_lane_splitting=options.no_lane_splitting,
            minimum_trimmed_read_length=options.minimum_trimmed_read_length,
            mask_short_adapter_reads=options.mask_short_adapter_reads,
            loading_threads=loading_threads,
            demultiplexing_threads=demultiplexing_threads,
            processing_threads=processing_threads,
            writing_threads=writing_threads
        )
    elif known_version in ('2.20',):
        # bcl2fastq 2.20.*
        if options.nprocessors is not None:
            # Explicitly set number of threads for each stage
            nprocessors = int(options.nprocessors)
            loading_threads = min(4, nprocessors)
            writing_threads = min(4, nprocessors)
            processing_threads = nprocessors
            print("Explicitly setting number of threads for each stage:")
            print("Loading (-r) : %d" % loading_threads)
            print("Processing (-p) : %d" % processing_threads)
            print("Writing (-w) : %d" % writing_threads)
        else:
            # Use the defaults
            loading_threads = None
            processing_threads = None
            writing_threads = None
        # Run the bcl to fastq conversion
        status = run_bcl2fastq_2_20(
            illumina_run.run_dir,
            sample_sheet,
            output_dir=output_dir,
            mismatches=options.nmismatches,
            bases_mask=options.bases_mask,
            ignore_missing_bcl=options.ignore_missing_bcl,
            no_lane_splitting=options.no_lane_splitting,
            minimum_trimmed_read_length=options.minimum_trimmed_read_length,
            mask_short_adapter_reads=options.mask_short_adapter_reads,
            loading_threads=loading_threads,
            processing_threads=processing_threads,
            writing_threads=writing_threads
        )
    else:
        # Version is listed in BCL2FASTQ_VERSIONS but there is no
        # matching branch above; fail cleanly rather than hitting an
        # unbound 'status' below
        logging.error("Don't know how to run bcl2fastq version %s" %
                      bcl2fastq_version)
        return 1
    print("bclToFastq returncode: %s" % status)
    if status != 0:
        logging.error("bclToFastq failure")
    return status
def make_fastqs(ap, protocol='standard', platform=None,
                unaligned_dir=None, sample_sheet=None, lanes=None,
                ignore_missing_bcl=False, ignore_missing_stats=False,
                skip_rsync=False, remove_primary_data=False,
                nprocessors=None, require_bcl2fastq_version=None,
                bases_mask=None, no_lane_splitting=None,
                minimum_trimmed_read_length=None,
                mask_short_adapter_reads=None,
                generate_stats=True, stats_file=None,
                per_lane_stats_file=None,
                analyse_barcodes=True, barcode_analysis_dir=None,
                skip_fastq_generation=False,
                only_fetch_primary_data=False,
                create_empty_fastqs=None, runner=None,
                cellranger_jobmode=None,
                cellranger_mempercore=None,
                cellranger_maxjobs=None,
                cellranger_jobinterval=None,
                cellranger_localcores=None,
                cellranger_localmem=None,
                cellranger_ignore_dual_index=False):
    """Create and summarise FASTQ files

    Wrapper for operations related to FASTQ file generation and
    analysis. The operations are typically:

    - get primary data (BCL files)
    - run bcl-to-fastq conversion
    - generate statistics

    If the number of processors and the job runner are not explicitly
    specified then these are taken from the settings for the bcl2fastq
    and the statistics generation steps, which may differ from each
    other. However if either of these values are set explicitly then
    the same values will be used for both steps.

    Arguments:
      ap (AutoProcessor): autoprocessor pointing to the analysis
        directory to create Fastqs for
      protocol (str): if set then specifies the protocol to use
        for fastq generation, otherwise use the 'standard' bcl2fastq
        protocol
      platform (str): if set then specifies the sequencing platform
        (otherwise platform will be determined from the primary data)
      unaligned_dir (str): if set then use this as the output
        directory for bcl-to-fastq conversion. Default is 'bcl2fastq'
        (unless an alternative is already specified in the config
        file)
      sample_sheet (str): if set then use this as the input
        samplesheet
      lanes (list): (optional) specify a list of lane numbers to
        use in the processing; lanes not in the list will be excluded
        (default is to include all lanes)
      nprocessors (int) : number of processors to run bclToFastq.py
        with
      ignore_missing_bcl (bool): if True then run bcl2fastq with
        --ignore-missing-bcl
      ignore_missing_stats (bool): if True then run bcl2fastq with
        --ignore-missing-stats
      skip_rsync (bool): if True then don't rsync primary data at
        the start of bcl2fastq conversion
      remove_primary_data (bool): if True then remove primary data
        at the end of bcl2fastq conversion (default is to keep it)
      generate_stats (bool): if True then (re)generate statistics
        file for fastqs
      analyse_barcodes (bool): if True then (re)analyse barcodes
        for fastqs
      require_bcl2fastq_version (str): (optional) specify bcl2fastq
        version to use. Should be a string of the form '1.8.4' or
        '>2.0'. Set to None to automatically determine required
        bcl2fastq version.
      bases_mask (str): if set then use this as an alternative
        bases mask setting
      no_lane_splitting (bool): if True then run bcl2fastq with
        --no-lane-splitting
      minimum_trimmed_read_length (int): if set then specify
        minimum length for reads after adapter trimming (shorter
        reads will be padded with Ns to make them long enough)
      mask_short_adapter_reads (int): if set then specify the
        minimum length of ACGT bases that must be present in a read
        after adapter trimming for it not to be masked completely
        with Ns.
      stats_file (str): if set then use this as the name of the
        output per-fastq stats file.
      per_lane_stats_file (str): if set then use this as the name
        of the output per-lane stats file.
      barcode_analysis_dir (str): if set then specifies path to
        the output directory for barcode analysis
      skip_fastq_generation (bool): if True then don't perform
        fastq generation
      only_fetch_primary_data (bool): if True then fetch primary
        data, don't do anything else
      create_empty_fastqs (bool): if True then create empty
        'placeholder' fastq files for any missing fastqs after
        bcl2fastq (must have completed with zero exit status)
      runner (JobRunner): (optional) specify a non-default job
        runner to use for fastq generation
      cellranger_jobmode (str): (optional) job mode to run
        cellranger in (10xGenomics Chromium SC data only)
      cellranger_mempercore (int): (optional) memory assumed per
        core (in Gbs) (10xGenomics Chromium SC data only)
      cellranger_maxjobs (int): (optional) maximum number of
        concurrent jobs to run (10xGenomics Chromium SC data only)
      cellranger_jobinterval (int): (optional) how often jobs are
        submitted (in ms) (10xGenomics Chromium SC data only)
      cellranger_localcores (int): (optional) maximum number of
        cores cellranger can request in jobmode 'local'
        (10xGenomics Chromium SC data only)
      cellranger_localmem (int): (optional) maximum memory
        cellranger can request in jobmode 'local' (10xGenomics
        Chromium SC data only)
      cellranger_ignore_dual_index (bool): (optional) on a
        dual-indexed flowcell where the second index was not used
        for the 10x sample, ignore it (10xGenomics Chromium SC data
        only)

    Raises:
      Exception: if the protocol is unknown, the sample sheet is
        missing or invalid, primary data can't be acquired, or fastq
        generation fails or doesn't produce the expected outputs.

    """
    # Report protocol
    print("Protocol : %s" % protocol)
    if protocol not in MAKE_FASTQS_PROTOCOLS:
        # Join the protocol names themselves (wrapping the collection
        # in a list made join() fail before the message was built)
        raise Exception("Unknown protocol: '%s' (must be one of "
                        "%s)" % (protocol,
                                 ','.join(MAKE_FASTQS_PROTOCOLS)))
    # Unaligned dir
    if unaligned_dir is not None:
        ap.params['unaligned_dir'] = unaligned_dir
    elif ap.params['unaligned_dir'] is None:
        ap.params['unaligned_dir'] = 'bcl2fastq'
    print("Output dir : %s" % ap.params.unaligned_dir)
    # Sample sheet
    if sample_sheet is None:
        sample_sheet = ap.params.sample_sheet
    if not os.path.isabs(sample_sheet):
        sample_sheet = os.path.join(ap.analysis_dir, sample_sheet)
    if not os.path.isfile(sample_sheet):
        raise Exception("Missing sample sheet '%s'" % sample_sheet)
    ap.params['sample_sheet'] = sample_sheet
    print("Source sample sheet : %s" % ap.params.sample_sheet)
    # Check requested lanes are actually present
    print("Lanes : %s" % ('all' if lanes is None
                          else ','.join([str(l) for l in lanes])))
    if lanes is not None:
        s = IlluminaData.SampleSheet(ap.params.sample_sheet)
        if not s.has_lanes:
            raise Exception("Requested subset of lanes but "
                            "samplesheet doesn't contain any "
                            "lane information")
        samplesheet_lanes = list(set([l['Lane'] for l in s]))
        for l in lanes:
            if l not in samplesheet_lanes:
                raise Exception("Requested lane '%d' not present "
                                "in samplesheet" % l)
    # Make a temporary sample sheet
    if lanes:
        lanes_id = ".L%s" % ''.join([str(l) for l in lanes])
    else:
        lanes_id = ""
    sample_sheet = os.path.join(
        ap.tmp_dir,
        "SampleSheet%s.%s.csv" % (lanes_id,
                                  time.strftime("%Y%m%d%H%M%S")))
    make_custom_sample_sheet(ap.params.sample_sheet,
                             sample_sheet,
                             lanes=lanes)
    # Check the temporary sample sheet
    print("Checking temporary sample sheet")
    invalid_barcodes = SampleSheetLinter(
        sample_sheet_file=sample_sheet).has_invalid_barcodes()
    if invalid_barcodes:
        logger.error("Invalid barcodes detected")
        for line in invalid_barcodes:
            logger.critical("%s" % line)
    invalid_characters = SampleSheetLinter(
        sample_sheet_file=sample_sheet).has_invalid_characters()
    if invalid_characters:
        logger.critical("Invalid non-printing/non-ASCII characters "
                        "detected")
    if invalid_barcodes or invalid_characters:
        raise Exception("Errors detected in generated sample sheet")
    # Adjust verification settings for 10xGenomics Chromium SC
    # data if necessary
    verify_include_sample_dir = False
    if has_chromium_sc_indices(sample_sheet):
        if protocol in ('10x_chromium_sc', '10x_chromium_sc_atac',):
            # Force inclusion of sample-name subdirectories
            # when verifying Chromium SC data
            print("Sample sheet includes Chromium SC indices")
            verify_include_sample_dir = True
        else:
            # Chromium SC indices detected but not using
            # 10x_chromium_sc protocol
            raise Exception("Detected 10xGenomics Chromium SC indices "
                            "in generated sample sheet but protocol "
                            "'%s' has been specified; use an "
                            "appropriate '10x_...' protocol for these "
                            "indices" % protocol)
    # Check for pre-existing Fastq outputs
    if verify_fastq_generation(
            ap,
            unaligned_dir=ap.params.unaligned_dir,
            lanes=lanes,
            include_sample_dir=verify_include_sample_dir):
        print("Expected Fastq outputs already present")
        skip_rsync = True
        skip_fastq_generation = True
    # Check if there's anything to do
    if (skip_rsync and skip_fastq_generation) and \
       not (generate_stats or analyse_barcodes):
        print("Nothing to do")
        return
    # Log dir
    log_dir = 'make_fastqs'
    if protocol != 'standard':
        log_dir += "_%s" % protocol
    if lanes:
        log_dir += "_L%s" % ''.join([str(l) for l in sorted(lanes)])
    ap.set_log_dir(ap.get_log_subdir(log_dir))
    # Fetch primary data
    if not skip_rsync and not ap.params.acquired_primary_data:
        if get_primary_data(ap) != 0:
            logger.error("Failed to acquire primary data")
            raise Exception("Failed to acquire primary data")
        else:
            ap.params['acquired_primary_data'] = True
    if only_fetch_primary_data:
        return
    # Deal with platform information
    if not platform:
        platform = ap.metadata.platform
    # Do fastq generation using the specified protocol
    if not skip_fastq_generation:
        # Set primary data location and report info
        primary_data_dir = os.path.join(
            ap.params.primary_data_dir,
            os.path.basename(ap.params.data_dir))
        print("Primary data dir : %s" % primary_data_dir)
        try:
            illumina_run = IlluminaData.IlluminaRun(primary_data_dir,
                                                    platform=platform)
        except IlluminaData.IlluminaDataPlatformError as ex:
            logger.critical("Error loading primary data: %s" % ex)
            if platform is None:
                logger.critical("Try specifying platform using "
                                "--platform?")
            else:
                logger.critical("Check specified platform is valid (or "
                                "omit --platform)")
            raise Exception("Error determining sequencer platform")
        print("Platform : %s" % illumina_run.platform)
        print("Bcl format : %s" % illumina_run.bcl_extension)
        # Set platform in metadata
        ap.metadata['platform'] = illumina_run.platform
        # Bases mask
        if bases_mask is not None:
            ap.params['bases_mask'] = bases_mask
        bases_mask = ap.params.bases_mask
        print("Bases mask setting : %s" % bases_mask)
        if protocol not in ('10x_chromium_sc', '10x_chromium_sc_atac',):
            if bases_mask == "auto":
                print("Determining bases mask from RunInfo.xml")
                bases_mask = get_bases_mask(illumina_run.runinfo_xml,
                                            sample_sheet)
                # NOTE(review): original indentation was lost; the
                # validity check is assumed to apply only to the
                # automatically determined mask - confirm
                if not bases_mask_is_valid(bases_mask):
                    raise Exception("Invalid bases mask: '%s'" %
                                    bases_mask)
        # Do fastq generation according to protocol
        if protocol == 'icell8':
            # ICell8 data
            # Update bcl2fastq settings appropriately
            print("Updating read trimming and masking for ICell8")
            minimum_trimmed_read_length = 21
            mask_short_adapter_reads = 0
            # Reset the default bases mask
            bases_mask = IlluminaData.IlluminaRunInfo(
                illumina_run.runinfo_xml).bases_mask
            bases_mask = get_icell8_bases_mask(bases_mask,
                                               sample_sheet=sample_sheet)
            if not bases_mask_is_valid(bases_mask):
                raise Exception("Invalid bases mask: '%s'" %
                                bases_mask)
            # Switch to standard protocol
            protocol = 'standard'
        if protocol == 'standard':
            # Standard protocol
            try:
                exit_code = bcl_to_fastq(
                    ap,
                    unaligned_dir=ap.params.unaligned_dir,
                    sample_sheet=sample_sheet,
                    primary_data_dir=primary_data_dir,
                    require_bcl2fastq=require_bcl2fastq_version,
                    bases_mask=bases_mask,
                    ignore_missing_bcl=ignore_missing_bcl,
                    ignore_missing_stats=ignore_missing_stats,
                    no_lane_splitting=no_lane_splitting,
                    minimum_trimmed_read_length=minimum_trimmed_read_length,
                    mask_short_adapter_reads=mask_short_adapter_reads,
                    nprocessors=nprocessors,
                    runner=runner)
            except Exception as ex:
                raise Exception("Bcl2fastq stage failed: '%s'" % ex)
        elif protocol == '10x_chromium_sc':
            # 10xGenomics Chromium SC
            if bases_mask == 'auto':
                bases_mask = None
            try:
                # Check we have cellranger
                cellranger = find_program('cellranger')
                if not cellranger:
                    raise Exception("No cellranger package found")
                cellranger_software_info = cellranger_info(cellranger)
                print("Using cellranger %s: %s" %
                      (cellranger_software_info[-1], cellranger))
                # Check we have bcl2fastq
                bcl2fastq = find_program('bcl2fastq')
                if not bcl2fastq:
                    raise Exception("No bcl2fastq package found")
                bcl2fastq = available_bcl2fastq_versions(
                    paths=(os.path.dirname(bcl2fastq),),
                    reqs='>=2.17')
                if not bcl2fastq:
                    raise Exception("No appropriate bcl2fastq software "
                                    "located")
                bcl2fastq = bcl2fastq[0]
                bcl2fastq_info = bcl_to_fastq_info(bcl2fastq)
                print("Using bcl2fastq %s: %s" % (bcl2fastq_info[-1],
                                                  bcl2fastq))
                # Store info on bcl2fastq package
                ap.metadata['bcl2fastq_software'] = bcl2fastq_info
                # Store info on cellranger package
                ap.metadata['cellranger_software'] = \
                    cellranger_software_info
                # Put a copy of sample sheet in the log directory
                shutil.copy(sample_sheet, ap.log_dir)
                # Determine output directory absolute path
                output_dir = ap.params.unaligned_dir
                if not os.path.isabs(output_dir):
                    output_dir = os.path.join(ap.analysis_dir,
                                              output_dir)
                # Run cellranger mkfastq
                exit_code = run_cellranger_mkfastq(
                    sample_sheet=sample_sheet,
                    primary_data_dir=primary_data_dir,
                    output_dir=output_dir,
                    lanes=(None if lanes is None
                           else ','.join([str(l) for l in lanes])),
                    bases_mask=bases_mask,
                    cellranger_exe=cellranger,
                    cellranger_jobmode=cellranger_jobmode,
                    cellranger_maxjobs=cellranger_maxjobs,
                    cellranger_mempercore=cellranger_mempercore,
                    cellranger_jobinterval=cellranger_jobinterval,
                    cellranger_localcores=cellranger_localcores,
                    cellranger_localmem=cellranger_localmem,
                    working_dir=ap.analysis_dir,
                    log_dir=ap.log_dir)
            except Exception as ex:
                raise Exception("'cellranger mkfastq' stage failed: "
                                "'%s'" % ex)
            # Turn off barcode analysis
            analyse_barcodes = False
        elif protocol == '10x_chromium_sc_atac':
            # 10xGenomics Chromium scATAC-seq
            exit_code = bcl_to_fastq_10x_chromium_sc_atac(
                ap,
                output_dir=ap.params.unaligned_dir,
                sample_sheet=sample_sheet,
                primary_data_dir=primary_data_dir,
                lanes=lanes,
                bases_mask=bases_mask,
                cellranger_jobmode=cellranger_jobmode,
                cellranger_maxjobs=cellranger_maxjobs,
                cellranger_mempercore=cellranger_mempercore,
                cellranger_jobinterval=cellranger_jobinterval,
                cellranger_localcores=cellranger_localcores,
                cellranger_localmem=cellranger_localmem,
                log_dir=ap.log_dir)
            # Turn off barcode analysis
            analyse_barcodes = False
        else:
            # Unknown protocol
            raise Exception("Unknown protocol '%s'" % protocol)
        # Check the outputs
        if exit_code != 0:
            raise Exception("Fastq generation finished with error: "
                            "exit code %d" % exit_code)
        if not verify_fastq_generation(
                ap,
                lanes=lanes,
                include_sample_dir=verify_include_sample_dir):
            # Check failed
            logger.error("Failed to verify output Fastqs against "
                         "sample sheet")
            # Try to load the data from unaligned dir
            try:
                illumina_data = IlluminaData.IlluminaData(
                    ap.analysis_dir,
                    unaligned_dir=ap.params.unaligned_dir)
            except IlluminaData.IlluminaDataError as ex:
                raise Exception("Unable to load data from %s: %s"
                                % (ap.params.unaligned_dir, ex))
            # Generate a list of missing Fastqs
            missing_fastqs = IlluminaData.list_missing_fastqs(
                illumina_data,
                sample_sheet,
                include_sample_dir=verify_include_sample_dir)
            assert (len(missing_fastqs) > 0)
            missing_fastqs_file = os.path.join(ap.log_dir,
                                               "missing_fastqs.log")
            print("Writing list of missing Fastq files to %s" %
                  missing_fastqs_file)
            with open(missing_fastqs_file, 'w') as fp:
                for fq in missing_fastqs:
                    fp.write("%s\n" % fq)
            # Create empty FASTQs: explicit argument first, then the
            # platform-specific setting, then the bcl2fastq default
            if create_empty_fastqs is None:
                try:
                    create_empty_fastqs = \
                        ap.settings.platform[ap.metadata.platform].\
                        create_empty_fastqs
                except (KeyError, AttributeError):
                    pass
            if create_empty_fastqs is None:
                create_empty_fastqs = \
                    ap.settings.bcl2fastq.create_empty_fastqs
            if create_empty_fastqs:
                logger.warning("Making 'empty' placeholder Fastqs")
                for fq in missing_fastqs:
                    fastq = os.path.join(ap.analysis_dir,
                                         ap.params.unaligned_dir, fq)
                    print("-- %s" % fastq)
                    if not os.path.exists(os.path.dirname(fastq)):
                        mkdirs(os.path.dirname(fastq))
                    with gzip.GzipFile(filename=fastq, mode='wb') as fp:
                        fp.write('')
            else:
                raise Exception("Fastq generation failed to produce "
                                "expected outputs")
    # Generate statistics
    if generate_stats:
        fastq_statistics(ap,
                         stats_file=stats_file,
                         per_lane_stats_file=per_lane_stats_file,
                         unaligned_dir=ap.params.unaligned_dir,
                         nprocessors=nprocessors,
                         runner=runner)
    # Run barcode analysis
    if analyse_barcodes:
        # Determine output directory
        if barcode_analysis_dir is not None:
            ap.params['barcode_analysis_dir'] = barcode_analysis_dir
        elif ap.params.barcode_analysis_dir is None:
            ap.params['barcode_analysis_dir'] = 'barcode_analysis'
        barcode_analysis_dir = ap.params.barcode_analysis_dir
        if not os.path.isabs(barcode_analysis_dir):
            barcode_analysis_dir = os.path.join(ap.params.analysis_dir,
                                                barcode_analysis_dir)
        # Report title
        title = "Barcode analysis for %s" % ap.metadata.run_name
        # Log file
        log_file = os.path.join(ap.log_dir, "analyse_barcodes.log")
        # Set up runner
        if runner is None:
            runner = ap.settings.general.default_runner
        runner.set_log_dir(ap.log_dir)
        # Get scheduler parameters
        max_jobs = ap.settings.general.max_concurrent_jobs
        poll_interval = ap.settings.general.poll_interval
        # Create and run barcode analysis pipeline
        barcode_analysis = AnalyseBarcodes(
            os.path.join(ap.params.analysis_dir,
                         ap.params.unaligned_dir))
        barcode_analysis.run(barcode_analysis_dir,
                             title=title,
                             lanes=lanes,
                             sample_sheet=sample_sheet,
                             log_file=log_file,
                             runner=runner,
                             max_jobs=max_jobs,
                             poll_interval=poll_interval,
                             verbose=False)
    # Make a 'projects.info' metadata file
    if lanes:
        ap.update_project_metadata_file()
    else:
        ap.make_project_metadata_file()
    # Remove primary data
    if remove_primary_data:
        # The boolean argument shadows the function of the same name
        # inside this scope, so calling 'remove_primary_data(ap)'
        # directly would try to call a bool; fetch the function from
        # the module namespace instead.
        # NOTE(review): presumes 'remove_primary_data' is defined at
        # module level alongside 'get_primary_data' - confirm
        globals()['remove_primary_data'](ap)