def test_set_cell_count_fails_for_project_with_no_metadata(self):
    """
    set_cell_count_for_project: raises exception for project with no metadata
    """
    # Mock project with neither single cell platform nor library type
    project_dir = self._make_mock_analysis_project(None, None)

    def current_cell_count():
        # Build a fresh AnalysisProject for every lookup
        return AnalysisProject("PJB1", project_dir).info.number_of_cells

    # Populate the 'cellranger count' outputs with a metrics summary
    outs_dir = os.path.join(project_dir,
                            "qc",
                            "cellranger_count",
                            "5.0.1",
                            "refdata-gex-GRCh38-2020-A",
                            "PJB1",
                            "outs")
    mkdirs(outs_dir)
    with open(os.path.join(outs_dir, "metrics_summary.csv"), 'wt') as fp:
        fp.write(METRICS_SUMMARY)
    # Write the QC info file
    qc_info = ("Cellranger reference datasets"
               "\t/data/refdata-gex-GRCh38-2020-A\n"
               "Cellranger version\t5.0.1\n")
    with open(os.path.join(project_dir, "qc", "qc.info"), 'wt') as fp:
        fp.write(qc_info)
    # No cell count should be recorded to begin with
    print("Checking number of cells")
    self.assertEqual(current_cell_count(), None)
    # Updating must be refused with NotImplementedError
    with self.assertRaises(NotImplementedError):
        set_cell_count_for_project(project_dir)
    # ...and the stored cell count must be untouched
    self.assertEqual(current_cell_count(), None)
def test_set_cell_count_project_missing_library_type(self):
    """
    set_cell_count_for_project: test for scRNA-seq when library not set
    """
    # Mock project: single cell platform is set, library type is not
    project_dir = self._make_mock_analysis_project(
        "10xGenomics Chromium 3'v3",
        None)

    def current_cell_count():
        # Build a fresh AnalysisProject for every lookup
        return AnalysisProject("PJB1", project_dir).info.number_of_cells

    # Populate the 'cellranger count' outputs with a metrics summary
    outs_dir = os.path.join(project_dir,
                            "qc",
                            "cellranger_count",
                            "5.0.1",
                            "refdata-gex-GRCh38-2020-A",
                            "PJB1",
                            "outs")
    mkdirs(outs_dir)
    with open(os.path.join(outs_dir, "metrics_summary.csv"), 'w') as fp:
        fp.write(METRICS_SUMMARY)
    # Write the QC info file
    qc_info = ("Cellranger reference datasets"
               "\t/data/refdata-gex-GRCh38-2020-A\n"
               "Cellranger version\t5.0.1\n")
    with open(os.path.join(project_dir, "qc", "qc.info"), 'wt') as fp:
        fp.write(qc_info)
    # No cell count should be recorded to begin with
    print("Checking number of cells")
    self.assertEqual(current_cell_count(), None)
    # Run the update
    print("Updating number of cells")
    set_cell_count_for_project(project_dir)
    # The count from the metrics summary should now be stored
    self.assertEqual(current_cell_count(), 2272)
def test_set_cell_count_for_multiome_gex_project(self):
    """
    set_cell_count_for_project: test for single cell multiome GEX
    """
    # Mock single cell multiome project with GEX library
    project_dir = self._make_mock_analysis_project(
        "10xGenomics Single Cell Multiome",
        "GEX")

    def current_cell_count():
        # Build a fresh AnalysisProject for every lookup
        return AnalysisProject("PJB1", project_dir).info.number_of_cells

    # Populate the count outputs with a 'summary.csv' file
    outs_dir = os.path.join(project_dir,
                            "qc",
                            "cellranger_count",
                            "1.0.0",
                            "refdata-cellranger-arc-GRCh38-2020-A",
                            "PJB1",
                            "outs")
    mkdirs(outs_dir)
    with open(os.path.join(outs_dir, "summary.csv"), 'w') as fp:
        fp.write(MULTIOME_SUMMARY)
    # Write the QC info file
    qc_info = ("Cellranger reference datasets"
               "\t/data/refdata-cellranger-arc-GRCh38-2020-A\n"
               "Cellranger version\t1.0.0\n")
    with open(os.path.join(project_dir, "qc", "qc.info"), 'wt') as fp:
        fp.write(qc_info)
    # No cell count should be recorded to begin with
    print("Checking number of cells")
    self.assertEqual(current_cell_count(), None)
    # Run the update
    print("Updating number of cells")
    set_cell_count_for_project(project_dir)
    # The count from the summary should now be stored
    self.assertEqual(current_cell_count(), 744)
def test_set_cell_count_for_atac_project_2_0_0(self):
    """
    set_cell_count_for_project: test for scATAC-seq (Cellranger ATAC 2.0.0)
    """
    # Mock single cell ATAC project
    project_dir = self._make_mock_analysis_project(
        "10xGenomics Single Cell ATAC",
        "scATAC-seq")

    def current_cell_count():
        # Build a fresh AnalysisProject for every lookup
        return AnalysisProject("PJB1", project_dir).info.number_of_cells

    # Populate the count outputs with a 'summary.csv' file
    outs_dir = os.path.join(project_dir,
                            "qc",
                            "cellranger_count",
                            "2.0.0",
                            "refdata-cellranger-atac-GRCh38-2020-A-2.0.0",
                            "PJB1",
                            "outs")
    mkdirs(outs_dir)
    with open(os.path.join(outs_dir, "summary.csv"), 'w') as fp:
        fp.write(ATAC_SUMMARY_2_0_0)
    # Write the QC info file
    qc_info = ("Cellranger reference datasets"
               "\t/data/refdata-cellranger-atac-GRCh38-2020-A-2.0.0\n"
               "Cellranger version\t2.0.0\n")
    with open(os.path.join(project_dir, "qc", "qc.info"), 'wt') as fp:
        fp.write(qc_info)
    # No cell count should be recorded to begin with
    print("Checking number of cells")
    self.assertEqual(current_cell_count(), None)
    # Run the update
    print("Updating number of cells")
    set_cell_count_for_project(project_dir)
    # The count from the summary should now be stored
    self.assertEqual(current_cell_count(), 3582)
def test_set_cell_count_for_cellplex_project(self):
    """
    set_cell_count_for_project: test for multiplexed data (CellPlex)
    """
    # Mock CellPlex project
    project_dir = self._make_mock_analysis_project(
        "10xGenomics Chromium 3'v3",
        "CellPlex")

    def current_cell_count():
        # Build a fresh AnalysisProject for every lookup
        return AnalysisProject("PJB1", project_dir).info.number_of_cells

    # Mock 'cellranger multi' outputs with one subdirectory per sample
    multi_outs_dir = os.path.join(project_dir,
                                  "qc",
                                  "cellranger_multi",
                                  "6.0.0",
                                  "refdata-cellranger-gex-GRCh38-2020-A",
                                  "outs")
    mkdirs(multi_outs_dir)
    for sample_name in ("PBA", "PBB"):
        per_sample_dir = os.path.join(multi_outs_dir,
                                      "per_sample_outs",
                                      sample_name)
        mkdirs(per_sample_dir)
        metrics_csv = os.path.join(per_sample_dir, "metrics_summary.csv")
        with open(metrics_csv, 'wt') as fp:
            fp.write(CELLPLEX_METRICS_SUMMARY)
        summary_html = os.path.join(per_sample_dir, "web_summary.html")
        with open(summary_html, 'wt') as fp:
            fp.write("Placeholder for web_summary.html\n")
    # Write the QC info file
    qc_info = ("Cellranger reference datasets"
               "\t/data/refdata-cellranger-gex-GRCh38-2020-A\n"
               "Cellranger version\t6.0.0\n")
    with open(os.path.join(project_dir, "qc", "qc.info"), 'wt') as fp:
        fp.write(qc_info)
    # No cell count should be recorded to begin with
    print("Checking number of cells")
    self.assertEqual(current_cell_count(), None)
    # Run the update
    print("Updating number of cells")
    set_cell_count_for_project(project_dir)
    # The stored count should reflect the per-sample metrics
    self.assertEqual(current_cell_count(), 10350)
def test_set_cell_count_project_missing_library_type_no_subdirs(self):
    """
    set_cell_count_for_project: test for scRNA-seq when library not set (old-style output)
    """
    # Mock project: single cell platform is set, library type is not
    project_dir = self._make_mock_analysis_project(
        "10xGenomics Chromium 3'v3",
        None)

    def current_cell_count():
        # Build a fresh AnalysisProject for every lookup
        return AnalysisProject("PJB1", project_dir).info.number_of_cells

    # Old-style layout: no version/reference subdirectories and no
    # QC info file
    outs_dir = os.path.join(project_dir,
                            "qc",
                            "cellranger_count",
                            "PJB1",
                            "outs")
    mkdirs(outs_dir)
    with open(os.path.join(outs_dir, "metrics_summary.csv"), 'w') as fp:
        fp.write(METRICS_SUMMARY)
    # No cell count should be recorded to begin with
    print("Checking number of cells")
    self.assertEqual(current_cell_count(), None)
    # Run the update
    print("Updating number of cells")
    set_cell_count_for_project(project_dir)
    # The count from the metrics summary should now be stored
    self.assertEqual(current_cell_count(), 2272)
def build_fastq_path_dir(project_dir):
    """
    Create directory mimicking output from cellranger mkfastq

    This function creates and populates a 'cellranger mkfastq'
    style 'fastq_path' directory from an autoprocess analysis
    project, which can then be used as input to 'cellranger
    count'.

    The new directory will be called 'cellranger_fastq_path'
    and will be created in the project directory, and will be
    populated by (relative) symbolic links to the Fastq files
    in the project.

    Links which already exist are left untouched and a warning
    is logged for each one.

    Arguments:
      project_dir (str): path to the project directory in
        which to create the 'fastq_path' directory

    Returns:
      String: path to the 'cellranger_fastq_path' directory.

    """
    project = AnalysisProject(os.path.basename(project_dir.rstrip(os.sep)),
                              os.path.abspath(project_dir))
    fastq_path_dir = os.path.join(project.dirn,
                                  "cellranger_fastq_path")
    mkdirs(fastq_path_dir)
    mkdirs(os.path.join(fastq_path_dir, "Reports"))
    mkdirs(os.path.join(fastq_path_dir, "Stats"))
    fq_dir = os.path.join(fastq_path_dir, project.name)
    mkdirs(fq_dir)
    for fastq in project.fastqs:
        print(fastq)
        link_name = os.path.join(fq_dir, os.path.basename(fastq))
        # Use 'lexists' rather than 'exists': a dangling symlink left
        # over from a previous run is reported as missing by 'exists'
        # and would make os.symlink fail with "file exists"
        if os.path.lexists(link_name):
            logger.warning("%s: already exists" % link_name)
            continue
        # Link relative to the directory holding the link
        target = os.path.relpath(fastq, fq_dir)
        logger.debug("Linking: %s -> %s" % (link_name, target))
        os.symlink(target, link_name)
    return fastq_path_dir
def cellranger_mkfastq(samplesheet,
                       primary_data_dir,
                       output_dir,
                       lanes=None,
                       cellranger_jobmode='local',
                       cellranger_maxjobs=None,
                       cellranger_mempercore=None,
                       cellranger_jobinterval=None,
                       cellranger_localcores=None,
                       cellranger_localmem=None,
                       log_dir=None,
                       dry_run=False,
                       project_metadata_file='projects.info'):
    """
    Wrapper for running 'cellranger mkfastq'

    Runs the 10xGenomics 'cellranger mkfastq' command to
    generate Fastqs from bcl files for Chromium single-cell
    data.

    Unless this is a dry run, a new numbered
    'cellranger_mkfastq' subdirectory is created under
    'log_dir' for the logs, and the project metadata file is
    updated once 'cellranger mkfastq' has been run.

    Arguments:
      samplesheet (str): path to input samplesheet with
        10xGenomics barcode indices
      primary_data_dir (str): path to the top-level
        directory holding the sequencing data
      output_dir (str): path to the output directory
      lanes (str): optional, specify the subset of lanes
        to process (default is to process all lanes
        in the run)
      cellranger_jobmode (str): specify the job mode to
        pass to cellranger (default: "local")
      cellranger_maxjobs (int): specify the maximum
        number of jobs to pass to cellranger (default:
        None)
      cellranger_mempercore (int): specify the memory
        per core (in Gb) to pass to cellranger (default:
        None)
      cellranger_jobinterval (int): specify the interval
        between launching jobs (in ms) to pass to
        cellranger (default: None)
      cellranger_localcores (int): maximum number of cores
        cellranger can request in jobmode 'local'
      cellranger_localmem (int): maximum memory cellranger
        can request in jobmode 'local'
      log_dir (str): path to a directory to write logs
        (default: current working directory)
      dry_run (bool): if True then only report actions
        that would be performed but don't run anything
      project_metadata_file (str): name of project
        metadata file to create/update with information
        on projects generated by cellranger (default:
        projects.info)

    Returns:
      Integer: exit code from the cellranger command.

    """
    # Make a log directory
    if not dry_run:
        if log_dir is None:
            log_dir = os.getcwd()
        log_dir = get_numbered_subdir("cellranger_mkfastq",
                                      parent_dir=log_dir,
                                      full_path=True)
        mkdirs(log_dir)
    # Run cellranger mkfastq
    # The 'local' mode resource limits are forwarded along with the
    # other cellranger settings (previously they were accepted by
    # this wrapper but silently dropped)
    retval = run_cellranger_mkfastq(
        samplesheet,
        primary_data_dir,
        output_dir,
        lanes=lanes,
        cellranger_jobmode=cellranger_jobmode,
        cellranger_maxjobs=cellranger_maxjobs,
        cellranger_mempercore=cellranger_mempercore,
        cellranger_jobinterval=cellranger_jobinterval,
        cellranger_localcores=cellranger_localcores,
        cellranger_localmem=cellranger_localmem,
        log_dir=log_dir,
        dry_run=dry_run)
    if not dry_run:
        # Update the project metadata file
        update_project_metadata(output_dir, project_metadata_file)
    return retval
def run_cellranger_count(fastq_dir,
                         transcriptome,
                         cellranger_jobmode='sge',
                         cellranger_maxjobs=None,
                         cellranger_mempercore=None,
                         cellranger_jobinterval=None,
                         max_jobs=4,
                         log_dir=None,
                         dry_run=False,
                         summary_only=True):
    """
    Wrapper for running 'cellranger count'

    Runs the 10xGenomics 'cellranger count' command to
    perform single library analysis on Fastqs from
    Chromium single-cell samples.

    If the supplied 'fastq_dir' is a 'cellranger mkfastq'
    or 'bcl2fastq' output directory then the analysis
    will be run for each of the projects.

    Outputs are collected under
    '<project>/cellranger_count/<sample>/outs' relative to
    the current working directory; samples for which this
    directory already exists are not rerun. A report and
    zip archive are then created in each project directory.

    Arguments:
      fastq_dir (str): path of the 'fastq_path' folder
        from 'cellranger mkfastq', or the output folder from
        'bcl2fastq' (or with a similar structure), or any
        folder containing Fastq files
      transcriptome (str): path to the cellranger
        compatible transcriptome reference data directory
      cellranger_jobmode (str): specify the job mode to
        pass to cellranger (default: 'sge')
      cellranger_maxjobs (int): specify the maximum
        number of jobs to pass to cellranger (default:
        None)
      cellranger_mempercore (int): specify the memory
        per core (in Gb) to pass to cellranger (default:
        None)
      cellranger_jobinterval (int): specify the interval
        between launching jobs (in ms) to pass to
        cellranger (default: None)
      max_jobs (int): maximum number of 'cellranger count'
        jobs the scheduler will run concurrently (default: 4)
      log_dir (str): path to a directory to write logs
        (default: current working directory)
      dry_run (bool): if True then only report actions
        that would be performed but don't run anything
      summary_only (bool): if True then only collect
        the output 'web_summary.html' and 'metrics_summary.csv'
        files, otherwise copy all outputs (warning: this can
        be very large)

    Returns:
      Integer: zero on success; 1 if the input data couldn't
        be loaded or if expected outputs are missing; otherwise
        the sum of the exit codes from the cellranger jobs.

    """
    # Input data
    sample_names = {}
    try:
        illumina_data = IlluminaData(os.getcwd(),
                                     unaligned_dir=fastq_dir)
        for project in illumina_data.projects:
            sample_names[project.name] = [sample.name
                                          for sample in project.samples]
    except IlluminaDataError:
        logger.critical("Couldn't load data from '%s'" % fastq_dir)
        return 1
    print("Samples: %s" % sample_names)
    projects = list(sample_names)
    # Set up a scheduler
    # NOTE(review): earlier code also built a SchedulerReporter with
    # custom 'job_start'/'job_end' templates but immediately replaced
    # it with this default one, so the default has always been what
    # was actually used - confirm whether custom templates are wanted
    sched_reporter = SchedulerReporter()
    sched = SimpleScheduler(max_concurrent=max_jobs,
                            reporter=sched_reporter)
    sched.start()
    # Jobs and the (project,sample) pairs that were submitted
    jobs = []
    submitted = []
    try:
        # Make a log directory
        if not dry_run:
            if log_dir is None:
                log_dir = os.getcwd()
            log_dir = get_numbered_subdir("cellranger_count",
                                          parent_dir=log_dir,
                                          full_path=True)
            mkdirs(log_dir)
        # Submit the cellranger count jobs
        for project in projects:
            print("Project: %s" % project)
            for sample in sample_names[project]:
                print("Sample: %s" % sample)
                # Check if outputs already exist
                count_dir = os.path.abspath(
                    os.path.join(project,
                                 "cellranger_count",
                                 sample,
                                 "outs"))
                if os.path.isdir(count_dir):
                    print("-- %s: outputs exist, nothing to do" % sample)
                    continue
                print("-- %s: setting up cellranger count" % sample)
                # Set up job for this sample
                work_dir = os.path.abspath("tmp.cellranger_count.%s.%s" %
                                           (project, sample))
                cmd = Command("cellranger", "count",
                              "--id", sample,
                              "--fastqs", os.path.abspath(fastq_dir),
                              "--sample", sample,
                              "--transcriptome", transcriptome)
                add_cellranger_args(cmd,
                                    jobmode=cellranger_jobmode,
                                    mempercore=cellranger_mempercore,
                                    maxjobs=cellranger_maxjobs,
                                    jobinterval=cellranger_jobinterval)
                print("Running: %s" % cmd)
                if not dry_run:
                    # Only touch the filesystem for a real run
                    mkdirs(work_dir)
                    job = sched.submit(cmd,
                                       name="cellranger_count.%s.%s" %
                                       (project, sample),
                                       log_dir=log_dir,
                                       wd=work_dir)
                    jobs.append(job)
                    submitted.append((project, sample))
        sched.wait()
    finally:
        # Always shut the scheduler down, even on error
        sched.stop()
    # If dry run then stop here
    if dry_run:
        return 0
    # Finished, check the exit status
    retval = 0
    for job in jobs:
        retval += job.exit_code
    if retval != 0:
        logger.critical("One or more jobs finished with non-zero "
                        "exit code")
        return retval
    # Handle outputs
    for project in projects:
        print("Project: %s" % project)
        for sample in sample_names[project]:
            print("Sample: %s" % sample)
            if (project, sample) not in submitted:
                # Outputs were already in place so no job was run
                # and there is nothing new to collect
                print("-- %s: outputs already collected" % sample)
                continue
            # Destination for count output
            count_dir = os.path.abspath(
                os.path.join(project,
                             "cellranger_count",
                             sample))
            mkdirs(count_dir)
            dest_outs_dir = os.path.join(count_dir, "outs")
            # Copy the cellranger count outputs
            outs_dir = os.path.join("tmp.cellranger_count.%s.%s"
                                    % (project, sample),
                                    sample,
                                    "outs")
            if not summary_only:
                # Collect all outputs
                # copytree needs a destination which doesn't exist
                # yet, so copy into the 'outs' subdirectory (which
                # is also where the summaries are expected later)
                print("Copying contents of %s to %s" %
                      (outs_dir, dest_outs_dir))
                shutil.copytree(outs_dir, dest_outs_dir)
            else:
                # Only collect the web and csv summaries
                mkdirs(dest_outs_dir)
                for f in ("web_summary.html", "metrics_summary.csv"):
                    path = os.path.join(outs_dir, f)
                    if not os.path.exists(path):
                        logger.warning("%s: not found in %s" %
                                       (f, outs_dir))
                        retval = 1
                    else:
                        print("Copying %s from %s to %s" %
                              (f, outs_dir, dest_outs_dir))
                        shutil.copy(path, dest_outs_dir)
    # Stop if there was an error
    if retval != 0:
        logger.critical("Some cellranger count outputs are "
                        "missing")
        return retval
    # Create a report and zip archive for each project
    pwd = os.getcwd()
    analysis_dir = os.path.basename(pwd)
    for project in projects:
        # Descend into project dir
        os.chdir(project)
        try:
            # Set up zip file
            report_name = "cellranger_count_report.%s.%s" % (project,
                                                             analysis_dir)
            zip_file = ZipArchive("%s.zip" % report_name,
                                  prefix=report_name)
            # Construct index page
            print("Making report for project %s" % project)
            count_report = Document("%s: cellranger count" % project)
            count_report.add_css_rule(css_rules.QC_REPORT_CSS_RULES)
            summaries = count_report.add_section()
            summaries.add("Reports from cellranger count for each sample:")
            summary_links = List()
            for sample in sample_names[project]:
                # Link to summary for sample
                web_summary = os.path.join("cellranger_count",
                                           sample,
                                           "outs",
                                           "web_summary.html")
                print("Adding web summary (%s) for %s" % (web_summary,
                                                          sample))
                summary_links.add_item(Link("%s" % sample,
                                            web_summary))
                # Add to the zip file
                zip_file.add_file(web_summary)
            summaries.add(summary_links)
            # Write the report and add to the zip file
            html_file = "cellranger_count_report.html"
            count_report.write(html_file)
            zip_file.add_file(html_file)
            # Finish
            zip_file.close()
        finally:
            # Always return to the starting directory
            os.chdir(pwd)
    # Done
    return retval
def make_fastqs(ap, protocol='standard', platform=None,
                unaligned_dir=None, sample_sheet=None, lanes=None,
                ignore_missing_bcl=False, ignore_missing_stats=False,
                skip_rsync=False, remove_primary_data=False,
                nprocessors=None, require_bcl2fastq_version=None,
                bases_mask=None, no_lane_splitting=None,
                minimum_trimmed_read_length=None,
                mask_short_adapter_reads=None,
                generate_stats=True, stats_file=None,
                per_lane_stats_file=None,
                analyse_barcodes=True, barcode_analysis_dir=None,
                skip_fastq_generation=False,
                only_fetch_primary_data=False,
                create_empty_fastqs=None, runner=None,
                cellranger_jobmode=None,
                cellranger_mempercore=None,
                cellranger_maxjobs=None,
                cellranger_jobinterval=None,
                cellranger_localcores=None,
                cellranger_localmem=None,
                cellranger_ignore_dual_index=False):
    """Create and summarise FASTQ files

    Wrapper for operations related to FASTQ file generation and analysis.
    The operations are typically:

    - get primary data (BCL files)
    - run bcl-to-fastq conversion
    - generate statistics

    If the number of processors and the job runner are not explicitly
    specified then these are taken from the settings for the bcl2fastq
    and the statistics generation steps, which may differ from each other.
    However if either of these values are set explicitly then the same
    values will be used for both steps.

    Arguments:
      ap (AutoProcessor): autoprocessor pointing to the analysis
        directory to create Fastqs for
      protocol (str): if set then specifies the protocol to use
        for fastq generation, otherwise use the 'standard' bcl2fastq
        protocol
      platform (str): if set then specifies the sequencing platform
        (otherwise platform will be determined from the primary data)
      unaligned_dir (str): if set then use this as the output directory
        for bcl-to-fastq conversion. Default is 'bcl2fastq' (unless
        an alternative is already specified in the config file)
      sample_sheet (str): if set then use this as the input samplesheet
      lanes (list): (optional) specify a list of lane numbers to
        use in the processing; lanes not in the list will be excluded
        (default is to include all lanes)
      nprocessors (int) : number of processors to run bclToFastq.py with
      ignore_missing_bcl (bool): if True then run bcl2fastq with
        --ignore-missing-bcl
      ignore_missing_stats (bool): if True then run bcl2fastq with
        --ignore-missing-stats
      skip_rsync (bool): if True then don't rsync primary data at the
        start of bcl2fastq conversion
      remove_primary_data (bool): if True then remove primary data at
        the end of bcl2fastq conversion (default is to keep it)
      generate_stats (bool): if True then (re)generate statistics file
        for fastqs
      analyse_barcodes (bool): if True then (re)analyse barcodes for
        fastqs
      require_bcl2fastq_version (str): (optional) specify bcl2fastq
        version to use. Should be a string of the form '1.8.4' or
        '>2.0'. Set to None to automatically determine required
        bcl2fastq version.
      bases_mask (str): if set then use this as an alternative bases
        mask setting
      no_lane_splitting (bool): if True then run bcl2fastq with
        --no-lane-splitting
      minimum_trimmed_read_length (int): if set then specify minimum
        length for reads after adapter trimming (shorter reads will
        be padded with Ns to make them long enough)
      mask_short_adapter_reads (int): if set then specify the minimum
        length of ACGT bases that must be present in a read after
        adapter trimming for it not to be masked completely
        with Ns.
      stats_file (str): if set then use this as the name of the output
        per-fastq stats file.
      per_lane_stats_file (str): if set then use this as the name of
        the output per-lane stats file.
      barcode_analysis_dir (str): if set then specifies path to the
        output directory for barcode analysis
      skip_fastq_generation (bool): if True then don't perform fastq
        generation
      only_fetch_primary_data (bool): if True then fetch primary data,
        don't do anything else
      create_empty_fastqs (bool): if True then create empty 'placeholder'
        fastq files for any missing fastqs after bcl2fastq
        (must have completed with zero exit status)
      runner (JobRunner): (optional) specify a non-default job runner
        to use for fastq generation
      cellranger_jobmode (str): (optional) job mode to run cellranger in
        (10xGenomics Chromium SC data only)
      cellranger_mempercore (int): (optional) memory assumed per core
        (in Gbs) (10xGenomics Chromium SC data only)
      cellranger_maxjobs (int): (optional) maximum number of concurrent
        jobs to run (10xGenomics Chromium SC data only)
      cellranger_jobinterval (int): (optional) how often jobs are
        submitted (in ms) (10xGenomics Chromium SC data only)
      cellranger_localcores (int): (optional) maximum number of cores
        cellranger can request in jobmode 'local' (10xGenomics Chromium
        SC data only)
      cellranger_localmem (int): (optional) maximum memory cellranger
        can request in jobmode 'local' (10xGenomics Chromium SC data
        only)
      cellranger_ignore_dual_index (bool): (optional) on a dual-indexed
        flowcell where the second index was not used for the 10x
        sample, ignore it (10xGenomics Chromium SC data only)

    Raises:
      Exception: if the protocol is unknown, the sample sheet is
        missing or invalid, requested lanes are not in the sample
        sheet, primary data cannot be acquired, the platform cannot
        be determined, or Fastq generation fails or does not produce
        the expected outputs.

    """
    # Report protocol
    print("Protocol : %s" % protocol)
    if protocol not in MAKE_FASTQS_PROTOCOLS:
        # Join the protocol names themselves (wrapping the collection
        # in a list made the join fail with a TypeError)
        raise Exception("Unknown protocol: '%s' (must be one of "
                        "%s)" % (protocol,
                                 ','.join(MAKE_FASTQS_PROTOCOLS)))
    # Unaligned dir
    if unaligned_dir is not None:
        ap.params['unaligned_dir'] = unaligned_dir
    elif ap.params['unaligned_dir'] is None:
        ap.params['unaligned_dir'] = 'bcl2fastq'
    print("Output dir : %s" % ap.params.unaligned_dir)
    # Sample sheet
    if sample_sheet is None:
        sample_sheet = ap.params.sample_sheet
    if not os.path.isabs(sample_sheet):
        sample_sheet = os.path.join(ap.analysis_dir, sample_sheet)
    if not os.path.isfile(sample_sheet):
        raise Exception("Missing sample sheet '%s'" % sample_sheet)
    ap.params['sample_sheet'] = sample_sheet
    print("Source sample sheet : %s" % ap.params.sample_sheet)
    # Check requested lanes are actually present
    print("Lanes : %s" % ('all' if lanes is None
                          else ','.join([str(l) for l in lanes])))
    if lanes is not None:
        s = IlluminaData.SampleSheet(ap.params.sample_sheet)
        if not s.has_lanes:
            raise Exception("Requested subset of lanes but "
                            "samplesheet doesn't contain any "
                            "lane information")
        samplesheet_lanes = list(set([l['Lane'] for l in s]))
        for l in lanes:
            if l not in samplesheet_lanes:
                # '%s' so that a non-integer lane still produces
                # this error rather than a formatting failure
                raise Exception("Requested lane '%s' not present "
                                "in samplesheet" % l)
    # Make a temporary sample sheet
    if lanes:
        lanes_id = ".L%s" % ''.join([str(l) for l in lanes])
    else:
        lanes_id = ""
    sample_sheet = os.path.join(ap.tmp_dir,
                                "SampleSheet%s.%s.csv" %
                                (lanes_id,
                                 time.strftime("%Y%m%d%H%M%S")))
    make_custom_sample_sheet(ap.params.sample_sheet,
                             sample_sheet,
                             lanes=lanes)
    # Check the temporary sample sheet
    print("Checking temporary sample sheet")
    invalid_barcodes = SampleSheetLinter(
        sample_sheet_file=sample_sheet).has_invalid_barcodes()
    if invalid_barcodes:
        logger.error("Invalid barcodes detected")
        for line in invalid_barcodes:
            logger.critical("%s" % line)
    invalid_characters = SampleSheetLinter(
        sample_sheet_file=sample_sheet).has_invalid_characters()
    if invalid_characters:
        logger.critical("Invalid non-printing/non-ASCII characters "
                        "detected")
    if invalid_barcodes or invalid_characters:
        raise Exception("Errors detected in generated sample sheet")
    # Adjust verification settings for 10xGenomics Chromium SC
    # data if necessary
    verify_include_sample_dir = False
    if has_chromium_sc_indices(sample_sheet):
        if protocol in ('10x_chromium_sc', '10x_chromium_sc_atac',):
            # Force inclusion of sample-name subdirectories
            # when verifying Chromium SC data
            print("Sample sheet includes Chromium SC indices")
            verify_include_sample_dir = True
        else:
            # Chromium SC indices detected but not using
            # 10x_chromium_sc protocol
            raise Exception("Detected 10xGenomics Chromium SC indices "
                            "in generated sample sheet but protocol "
                            "'%s' has been specified; use an "
                            "appropriate '10x_...' protocol for these "
                            "indices" % protocol)
    # Check for pre-existing Fastq outputs
    if verify_fastq_generation(
            ap,
            unaligned_dir=ap.params.unaligned_dir,
            lanes=lanes,
            include_sample_dir=verify_include_sample_dir):
        print("Expected Fastq outputs already present")
        skip_rsync = True
        skip_fastq_generation = True
    # Check if there's anything to do
    if (skip_rsync and skip_fastq_generation) and \
       not (generate_stats or analyse_barcodes):
        print("Nothing to do")
        return
    # Log dir
    log_dir = 'make_fastqs'
    if protocol != 'standard':
        log_dir += "_%s" % protocol
    if lanes:
        log_dir += "_L%s" % ''.join([str(l) for l in sorted(lanes)])
    ap.set_log_dir(ap.get_log_subdir(log_dir))
    # Fetch primary data
    if not skip_rsync and not ap.params.acquired_primary_data:
        if get_primary_data(ap) != 0:
            logger.error("Failed to acquire primary data")
            raise Exception("Failed to acquire primary data")
        else:
            ap.params['acquired_primary_data'] = True
    if only_fetch_primary_data:
        return
    # Deal with platform information
    if not platform:
        platform = ap.metadata.platform
    # Do fastq generation using the specified protocol
    if not skip_fastq_generation:
        # Set primary data location and report info
        primary_data_dir = os.path.join(
            ap.params.primary_data_dir,
            os.path.basename(ap.params.data_dir))
        print("Primary data dir : %s" % primary_data_dir)
        try:
            illumina_run = IlluminaData.IlluminaRun(primary_data_dir,
                                                    platform=platform)
        except IlluminaData.IlluminaDataPlatformError as ex:
            logger.critical("Error loading primary data: %s" % ex)
            if platform is None:
                logger.critical("Try specifying platform using --platform?")
            else:
                logger.critical("Check specified platform is valid (or "
                                "omit --platform)")
            raise Exception("Error determining sequencer platform")
        print("Platform : %s" % illumina_run.platform)
        print("Bcl format : %s" % illumina_run.bcl_extension)
        # Set platform in metadata
        ap.metadata['platform'] = illumina_run.platform
        # Bases mask
        if bases_mask is not None:
            ap.params['bases_mask'] = bases_mask
        bases_mask = ap.params.bases_mask
        print("Bases mask setting : %s" % bases_mask)
        if protocol not in ('10x_chromium_sc', '10x_chromium_sc_atac',):
            if bases_mask == "auto":
                print("Determining bases mask from RunInfo.xml")
                bases_mask = get_bases_mask(illumina_run.runinfo_xml,
                                            sample_sheet)
                if not bases_mask_is_valid(bases_mask):
                    raise Exception("Invalid bases mask: '%s'" %
                                    bases_mask)
        # Do fastq generation according to protocol
        if protocol == 'icell8':
            # ICell8 data
            # Update bcl2fastq settings appropriately
            print("Updating read trimming and masking for ICell8")
            minimum_trimmed_read_length = 21
            mask_short_adapter_reads = 0
            # Reset the default bases mask
            bases_mask = IlluminaData.IlluminaRunInfo(
                illumina_run.runinfo_xml).bases_mask
            bases_mask = get_icell8_bases_mask(bases_mask,
                                               sample_sheet=sample_sheet)
            if not bases_mask_is_valid(bases_mask):
                raise Exception("Invalid bases mask: '%s'" %
                                bases_mask)
            # Switch to standard protocol
            protocol = 'standard'
        if protocol == 'standard':
            # Standard protocol
            try:
                exit_code = bcl_to_fastq(
                    ap,
                    unaligned_dir=ap.params.unaligned_dir,
                    sample_sheet=sample_sheet,
                    primary_data_dir=primary_data_dir,
                    require_bcl2fastq=require_bcl2fastq_version,
                    bases_mask=bases_mask,
                    ignore_missing_bcl=ignore_missing_bcl,
                    ignore_missing_stats=ignore_missing_stats,
                    no_lane_splitting=no_lane_splitting,
                    minimum_trimmed_read_length=minimum_trimmed_read_length,
                    mask_short_adapter_reads=mask_short_adapter_reads,
                    nprocessors=nprocessors,
                    runner=runner)
            except Exception as ex:
                raise Exception("Bcl2fastq stage failed: '%s'" % ex)
        elif protocol == '10x_chromium_sc':
            # 10xGenomics Chromium SC
            if bases_mask == 'auto':
                bases_mask = None
            try:
                # Check we have cellranger
                cellranger = find_program('cellranger')
                if not cellranger:
                    raise Exception("No cellranger package found")
                cellranger_software_info = cellranger_info(cellranger)
                print("Using cellranger %s: %s" %
                      (cellranger_software_info[-1], cellranger))
                # Check we have bcl2fastq
                bcl2fastq = find_program('bcl2fastq')
                if not bcl2fastq:
                    raise Exception("No bcl2fastq package found")
                bcl2fastq = available_bcl2fastq_versions(
                    paths=(os.path.dirname(bcl2fastq),),
                    reqs='>=2.17')
                if not bcl2fastq:
                    raise Exception("No appropriate bcl2fastq software "
                                    "located")
                bcl2fastq = bcl2fastq[0]
                bcl2fastq_info = bcl_to_fastq_info(bcl2fastq)
                print("Using bcl2fastq %s: %s" % (bcl2fastq_info[-1],
                                                  bcl2fastq))
                # Store info on bcl2fastq package
                ap.metadata['bcl2fastq_software'] = bcl2fastq_info
                # Store info on cellranger package
                ap.metadata['cellranger_software'] = \
                    cellranger_software_info
                # Put a copy of sample sheet in the log directory
                shutil.copy(sample_sheet, ap.log_dir)
                # Determine output directory absolute path
                output_dir = ap.params.unaligned_dir
                if not os.path.isabs(output_dir):
                    output_dir = os.path.join(ap.analysis_dir,
                                              output_dir)
                # Run cellranger mkfastq
                exit_code = run_cellranger_mkfastq(
                    sample_sheet=sample_sheet,
                    primary_data_dir=primary_data_dir,
                    output_dir=output_dir,
                    lanes=(None if lanes is None
                           else ','.join([str(l) for l in lanes])),
                    bases_mask=bases_mask,
                    cellranger_exe=cellranger,
                    cellranger_jobmode=cellranger_jobmode,
                    cellranger_maxjobs=cellranger_maxjobs,
                    cellranger_mempercore=cellranger_mempercore,
                    cellranger_jobinterval=cellranger_jobinterval,
                    cellranger_localcores=cellranger_localcores,
                    cellranger_localmem=cellranger_localmem,
                    working_dir=ap.analysis_dir,
                    log_dir=ap.log_dir)
            except Exception as ex:
                raise Exception("'cellranger mkfastq' stage failed: "
                                "'%s'" % ex)
            # Turn off barcode analysis
            analyse_barcodes = False
        elif protocol == '10x_chromium_sc_atac':
            # 10xGenomics Chromium scATAC-seq
            exit_code = bcl_to_fastq_10x_chromium_sc_atac(
                ap,
                output_dir=ap.params.unaligned_dir,
                sample_sheet=sample_sheet,
                primary_data_dir=primary_data_dir,
                lanes=lanes,
                bases_mask=bases_mask,
                cellranger_jobmode=cellranger_jobmode,
                cellranger_maxjobs=cellranger_maxjobs,
                cellranger_mempercore=cellranger_mempercore,
                cellranger_jobinterval=cellranger_jobinterval,
                cellranger_localcores=cellranger_localcores,
                cellranger_localmem=cellranger_localmem,
                log_dir=ap.log_dir)
            # Turn off barcode analysis
            analyse_barcodes = False
        else:
            # Unknown protocol
            raise Exception("Unknown protocol '%s'" % protocol)
        # Check the outputs
        if exit_code != 0:
            raise Exception("Fastq generation finished with error: "
                            "exit code %d" % exit_code)
        if not verify_fastq_generation(
                ap,
                lanes=lanes,
                include_sample_dir=verify_include_sample_dir):
            # Check failed
            logger.error("Failed to verify output Fastqs against "
                         "sample sheet")
            # Try to load the data from unaligned dir
            try:
                illumina_data = IlluminaData.IlluminaData(
                    ap.analysis_dir,
                    unaligned_dir=ap.params.unaligned_dir)
            except IlluminaData.IlluminaDataError as ex:
                raise Exception("Unable to load data from %s: %s"
                                % (ap.params.unaligned_dir, ex))
            # Generate a list of missing Fastqs
            missing_fastqs = IlluminaData.list_missing_fastqs(
                illumina_data,
                sample_sheet,
                include_sample_dir=verify_include_sample_dir)
            assert (len(missing_fastqs) > 0)
            missing_fastqs_file = os.path.join(ap.log_dir,
                                               "missing_fastqs.log")
            print("Writing list of missing Fastq files to %s" %
                  missing_fastqs_file)
            with open(missing_fastqs_file, 'w') as fp:
                for fq in missing_fastqs:
                    fp.write("%s\n" % fq)
            # Create empty FASTQs
            # An explicit argument takes priority, then the platform
            # specific setting, then the general bcl2fastq setting
            if create_empty_fastqs is None:
                try:
                    create_empty_fastqs = \
                        ap.settings.platform[ap.metadata.platform].\
                        create_empty_fastqs
                except (KeyError, AttributeError):
                    pass
            if create_empty_fastqs is None:
                create_empty_fastqs = \
                    ap.settings.bcl2fastq.create_empty_fastqs
            if create_empty_fastqs:
                logger.warning("Making 'empty' placeholder Fastqs")
                for fq in missing_fastqs:
                    fastq = os.path.join(ap.analysis_dir,
                                         ap.params.unaligned_dir, fq)
                    print("-- %s" % fastq)
                    if not os.path.exists(os.path.dirname(fastq)):
                        mkdirs(os.path.dirname(fastq))
                    with gzip.GzipFile(filename=fastq, mode='wb') as fp:
                        # Binary mode needs bytes (a str literal
                        # fails under Python 3)
                        fp.write(b'')
            else:
                raise Exception("Fastq generation failed to produce "
                                "expected outputs")
    # Generate statistics
    if generate_stats:
        fastq_statistics(ap,
                         stats_file=stats_file,
                         per_lane_stats_file=per_lane_stats_file,
                         unaligned_dir=ap.params.unaligned_dir,
                         nprocessors=nprocessors,
                         runner=runner)
    # Run barcode analysis
    if analyse_barcodes:
        # Determine output directory
        if barcode_analysis_dir is not None:
            ap.params['barcode_analysis_dir'] = barcode_analysis_dir
        elif ap.params.barcode_analysis_dir is None:
            ap.params['barcode_analysis_dir'] = 'barcode_analysis'
        barcode_analysis_dir = ap.params.barcode_analysis_dir
        if not os.path.isabs(barcode_analysis_dir):
            barcode_analysis_dir = os.path.join(ap.params.analysis_dir,
                                                barcode_analysis_dir)
        # Report title
        title = "Barcode analysis for %s" % ap.metadata.run_name
        # Log file
        log_file = os.path.join(ap.log_dir, "analyse_barcodes.log")
        # Set up runner
        if runner is None:
            runner = ap.settings.general.default_runner
        runner.set_log_dir(ap.log_dir)
        # Get scheduler parameters
        max_jobs = ap.settings.general.max_concurrent_jobs
        poll_interval = ap.settings.general.poll_interval
        # Create and run barcode analysis pipeline
        barcode_analysis = AnalyseBarcodes(
            os.path.join(ap.params.analysis_dir,
                         ap.params.unaligned_dir))
        barcode_analysis.run(barcode_analysis_dir,
                             title=title,
                             lanes=lanes,
                             sample_sheet=sample_sheet,
                             log_file=log_file,
                             runner=runner,
                             max_jobs=max_jobs,
                             poll_interval=poll_interval,
                             verbose=False)
    # Make a 'projects.info' metadata file
    if lanes:
        ap.update_project_metadata_file()
    else:
        ap.make_project_metadata_file()
    # Remove primary data
    if remove_primary_data:
        # The boolean argument shadows the module-level function of
        # the same name, so calling 'remove_primary_data(ap)' here
        # would try to call a bool; look the function up explicitly
        # NOTE(review): assumes 'remove_primary_data' is defined or
        # imported at module level - confirm
        globals()['remove_primary_data'](ap)