def run_cellranger_count(fastq_dir,
                         reference_data_path,
                         chemistry='auto',
                         cellranger_exe='cellranger',
                         cellranger_jobmode='local',
                         cellranger_maxjobs=None,
                         cellranger_mempercore=None,
                         cellranger_jobinterval=None,
                         cellranger_localcores=None,
                         cellranger_localmem=None,
                         max_jobs=4,
                         log_dir=None,
                         dry_run=False,
                         summary_only=True):
    """
    Wrapper for running 'cellranger count'

    Runs the 10xGenomics 'cellranger count' command to
    perform single library analysis on Fastqs from
    Chromium single-cell samples.

    If the supplied 'fastq_dir' is a 'cellranger mkfastq'
    or 'bcl2fastq' output directory then the analysis
    will be run for each of the projects.

    Samples whose 'cellranger_count/<sample>/outs'
    directory already exists under the project directory
    are skipped: no job is submitted and no outputs are
    collected for them (they are still linked from the
    project report).

    Arguments:
      fastq_dir (str): path of the 'fastq_path' folder
        from 'cellranger mkfastq', or the output folder
        from 'bcl2fastq' (or with a similar structure),
        or any folder containing Fastq files
      reference_data_path (str): path to the cellranger
        compatible transcriptome reference data
        directory (for scRNA-seq) or ATAC reference
        genome data (for scATAC-seq)
      chemistry (str): assay configuration (set to
        'auto' to let cellranger determine this
        automatically; ignored if not scRNA-seq)
      cellranger_exe (str): optional, name or path to
        cellranger executable (default: "cellranger");
        the basename is used as the 'cellranger mode'
      cellranger_jobmode (str): specify the job mode to
        pass to cellranger (default: "local")
      cellranger_maxjobs (int): specify the maximum
        number of jobs to pass to cellranger (default:
        None)
      cellranger_mempercore (int): specify the memory
        per core (in Gb) to pass to cellranger (default:
        None)
      cellranger_jobinterval (int): specify the interval
        between launching jobs (in ms) to pass to
        cellranger (default: None)
      cellranger_localcores (int): maximum number of cores
        cellranger can request in jobmode 'local'
        (default: None)
      cellranger_localmem (int): maximum memory cellranger
        can request in jobmode 'local' (default: None)
      max_jobs (int): maximum number of concurrent
        count jobs to run; also used for maximum number
        of jobs each count pipeline can run at once
        (default: 4)
      log_dir (str): path to a directory to write logs
        (default: current working directory)
      dry_run (bool): if True then only report actions
        that would be performed but don't run anything
        (no log or working directories are created)
      summary_only (bool): if True then only collect
        the output 'web_summary.html' and the metrics
        summary CSV files, otherwise copy all outputs
        (warning: this can be very large)

    Returns:
      Integer: 0 on success (or for a dry run); 1 if
        the input data can't be loaded, if expected
        outputs are missing or if the cellranger mode
        isn't recognised when collecting summaries;
        otherwise the sum of the exit codes from the
        cellranger jobs.
    """
    # Summary files to collect for each known cellranger mode
    summary_files = {
        'cellranger': ("web_summary.html", "metrics_summary.csv"),
        'cellranger-atac': ("web_summary.html", "summary.csv"),
    }
    # Cellranger mode
    cellranger_mode = os.path.basename(cellranger_exe)
    print("Cellranger mode: %s" % cellranger_mode)
    # Input data
    sample_names = {}
    try:
        illumina_data = IlluminaData(os.getcwd(),
                                     unaligned_dir=fastq_dir)
        for project in illumina_data.projects:
            sample_names[project.name] = [sample.name
                                          for sample in project.samples]
    except IlluminaDataError:
        logger.critical("Couldn't load data from '%s'" % fastq_dir)
        return 1
    print("Samples: %s" % sample_names)
    projects = list(sample_names)
    # Set up a scheduler
    # NB the original code built a reporter with custom start/end
    # templates and then immediately replaced it with a default
    # one; only the effective (default) reporter is kept here
    sched_reporter = SchedulerReporter()
    sched = SimpleScheduler(max_concurrent=max_jobs,
                            reporter=sched_reporter)
    sched.start()
    # Make a log directory
    if not dry_run:
        if log_dir is None:
            log_dir = os.getcwd()
        log_dir = get_numbered_subdir("%s_count" % cellranger_mode,
                                      parent_dir=log_dir,
                                      full_path=True)
        print("Log directory: %s" % log_dir)
        mkdirs(log_dir)
    # Submit the cellranger count jobs
    jobs = []
    # Keep track of the (project, sample) pairs that jobs were
    # submitted for, so that only their outputs are collected
    submitted = set()
    for project in projects:
        print("Project: %s" % project)
        for sample in sample_names[project]:
            print("Sample: %s" % sample)
            # Check if outputs already exist
            count_dir = os.path.abspath(
                os.path.join(project,
                             "cellranger_count",
                             sample,
                             "outs"))
            if os.path.isdir(count_dir):
                print("-- %s: outputs exist, nothing to do" % sample)
                continue
            print("-- %s: setting up %s count" % (sample,
                                                  cellranger_mode))
            # Set up job for this sample
            work_dir = os.path.abspath("tmp.%s_count.%s.%s" %
                                       (cellranger_mode,
                                        project, sample))
            print("Working directory: %s" % work_dir)
            cmd = Command(cellranger_exe,
                          "count",
                          "--id", sample,
                          "--fastqs", os.path.abspath(fastq_dir),
                          "--sample", sample)
            if cellranger_mode == "cellranger":
                cmd.add_args("--transcriptome", reference_data_path,
                             "--chemistry", chemistry)
            elif cellranger_mode == "cellranger-atac":
                cmd.add_args("--reference", reference_data_path)
            add_cellranger_args(cmd,
                                jobmode=cellranger_jobmode,
                                mempercore=cellranger_mempercore,
                                maxjobs=cellranger_maxjobs,
                                jobinterval=cellranger_jobinterval,
                                localcores=cellranger_localcores,
                                localmem=cellranger_localmem)
            print("Running: %s" % cmd)
            if not dry_run:
                # Only touch the filesystem for a real run
                mkdirs(work_dir)
                job = sched.submit(cmd,
                                   name="%s_count.%s.%s" %
                                   (cellranger_mode,
                                    project,
                                    sample),
                                   log_dir=log_dir,
                                   wd=work_dir)
                jobs.append(job)
                submitted.add((project, sample))
    sched.wait()
    sched.stop()
    # If dry run then stop here
    if dry_run:
        return 0
    # Finished, check the exit status
    retval = 0
    for job in jobs:
        retval += job.exit_code
    if retval != 0:
        logger.critical("One or more jobs finished with non-zero "
                        "exit code")
        return retval
    # Handle outputs
    for project in projects:
        print("Project: %s" % project)
        for sample in sample_names[project]:
            print("Sample: %s" % sample)
            if (project, sample) not in submitted:
                # Outputs were already in place before this run
                print("-- %s: no new outputs to collect" % sample)
                continue
            # Destination for count output
            count_dir = os.path.abspath(
                os.path.join(project,
                             "cellranger_count",
                             sample))
            mkdirs(count_dir)
            # Location of the cellranger count outputs
            outs_dir = os.path.join("tmp.%s_count.%s.%s" %
                                    (cellranger_mode,
                                     project, sample),
                                    sample,
                                    "outs")
            # Final outputs always live in the 'outs' subdirectory
            count_dir = os.path.join(count_dir, "outs")
            if not summary_only:
                # Collect all outputs
                # NB 'copytree' needs a destination which doesn't
                # exist yet, so copy to the 'outs' subdirectory
                # rather than the directory that was just created
                print("Copying contents of %s to %s" % (outs_dir,
                                                        count_dir))
                shutil.copytree(outs_dir, count_dir)
            else:
                # Only collect the web and csv summaries
                files = summary_files.get(cellranger_mode)
                if files is None:
                    logger.critical("Don't know which summary files to "
                                    "collect for '%s'" % cellranger_mode)
                    return 1
                mkdirs(count_dir)
                for f in files:
                    path = os.path.join(outs_dir, f)
                    if not os.path.exists(path):
                        logger.warning("%s: not found in %s" % (f,
                                                                outs_dir))
                        retval = 1
                    else:
                        print("Copying %s from %s to %s" % (f,
                                                            outs_dir,
                                                            count_dir))
                        shutil.copy(path, count_dir)
    # Stop if there was an error
    if retval != 0:
        logger.critical("Some cellranger count outputs are "
                        "missing")
        return retval
    # Create a report and zip archive for each project
    pwd = os.getcwd()
    analysis_dir = os.path.basename(pwd)
    for project in projects:
        # Descend into project dir
        os.chdir(project)
        try:
            # Set up zip file
            report_name = "cellranger_count_report.%s.%s" % (project,
                                                             analysis_dir)
            zip_file = ZipArchive("%s.zip" % report_name,
                                  prefix=report_name)
            # Construct index page
            print("Making report for project %s" % project)
            count_report = Document("%s: cellranger count" % project)
            count_report.add_css_rule(css_rules.QC_REPORT_CSS_RULES)
            summaries = count_report.add_section()
            summaries.add("Reports from cellranger count for each sample:")
            summary_links = List()
            for sample in sample_names[project]:
                # Link to summary for sample
                web_summary = os.path.join("cellranger_count",
                                           sample,
                                           "outs",
                                           "web_summary.html")
                print("Adding web summary (%s) for %s" % (web_summary,
                                                          sample))
                summary_links.add_item(Link("%s" % sample,
                                            web_summary))
                # Add to the zip file
                zip_file.add_file(web_summary)
            summaries.add(summary_links)
            # Write the report and add to the zip file
            html_file = "cellranger_count_report.html"
            count_report.write(html_file)
            zip_file.add_file(html_file)
            # Finish
            zip_file.close()
        finally:
            # Always return to the starting directory, even if
            # making the report failed
            os.chdir(pwd)
    # Done
    return retval
def __init__(self,bcl2fastq_dir=None,sample_sheet=None):
    """
    Create a new AnalyseBarcodes pipeline instance

    At least one of the bcl2fastq output directory or
    sample sheet must be supplied when the pipeline is
    instantiated.

    If the bcl2fastq output directory is supplied on
    initialisation then it must exist and already
    contain output Fastq files. It is possible to set
    the pipeline up before the bcl2fastq outputs have
    been generated, as long as the sample sheet is
    supplied. The bcl2fastq output directory must then
    be supplied as an input when the pipeline is
    executed via the 'run' method.

    The list of expected projects is taken from the
    bcl2fastq output directory when that is supplied,
    and from the sample sheet otherwise (using the
    sample ID for lines with no project name).

    Arguments:
      bcl2fastq_dir (str): path to the directory with
        outputs from bcl2fastq
      sample_sheet (str): path to the sample sheet file

    Raises:
      Exception: if neither input is supplied, or if
        the list of projects can't be obtained from the
        supplied input.
    """
    # Initialise the pipeline superclass
    Pipeline.__init__(self,name="Analyse Barcodes")

    # Internal parameters
    self._bcl2fastq_dir = bcl2fastq_dir
    self._sample_sheet = sample_sheet

    # Define parameters
    self.add_param('bcl2fastq_dir',value=self._bcl2fastq_dir,type=str)
    self.add_param('sample_sheet',value=self._sample_sheet,type=str)
    self.add_param('barcode_analysis_dir',type=str)
    self.add_param('counts_dir',type=str)
    self.add_param('title',type=str)
    self.add_param('lanes',type=list)
    self.add_param('bases_mask',type=str)
    self.add_param('mismatches',type=int)
    self.add_param('cutoff',type=float,value=0.001)
    self.add_param('force',type=bool,value=False)

    # Get a list of projects
    if self._bcl2fastq_dir is not None:
        # Load data from bcl2fastq output
        try:
            # Normalise the path before splitting it, otherwise
            # a trailing separator gives an empty directory name
            # and the wrong parent directory
            unaligned_path = os.path.abspath(self._bcl2fastq_dir)
            analysis_dir = os.path.dirname(unaligned_path)
            bcl2fastq_dir = os.path.basename(unaligned_path)
            illumina_data = IlluminaData(analysis_dir,
                                         unaligned_dir=bcl2fastq_dir)
        except Exception as ex:
            raise Exception("Unaligned dir '%s' supplied but can't "
                            "load data: %s" % (self._bcl2fastq_dir,ex))
        # Get a list of projects
        projects = [p.name for p in illumina_data.projects]
    elif self._sample_sheet is not None:
        # Load data from sample sheet
        try:
            s = SampleSheet(self._sample_sheet)
            # Sorted list of unique project names (falling back
            # to the sample ID when the project is blank); sorting
            # makes the order of the tasks reproducible
            projects = sorted(set(
                [d[s.sample_project_column]
                 if d[s.sample_project_column]
                 else d[s.sample_id_column]
                 for d in s]))
        except Exception as ex:
            raise Exception("Sample sheet '%s' supplied but can't "
                            "get a list of project names: %s" %
                            (self._sample_sheet,ex))
        # Check any empty barcode sequences
        self._check_sample_sheet_indexes(self._sample_sheet)
    else:
        raise Exception("Need to supply either unaligned (bcl2fastq "
                        "output) dir or sample sheet")
    self.report("Expecting projects:")
    for p in projects:
        self.report("- %s" % p)

    ####################
    # Build the pipeline
    ####################

    # Setup barcode analysis and counts directories
    setup_barcode_analysis_dir = SetupBarcodeAnalysisDirs(
        "Setup barcode analysis directory",
        self.params.barcode_analysis_dir,
        self.params.counts_dir,
        force=self.params.force)
    self.add_task(setup_barcode_analysis_dir)

    # Load the data from the unaligned/bcl2fastq output dir
    load_illumina_data = LoadIlluminaData(
        "Load Fastq data for barcode analysis",
        self.params.bcl2fastq_dir)
    self.add_task(load_illumina_data)

    # Generate counts for each project
    count_tasks = []
    for project in projects:
        count_barcodes = CountBarcodes(
            "Count barcodes in '%s'" % project,
            load_illumina_data.output.illumina_data,
            project,
            self.params.counts_dir,
            lanes=self.params.lanes)
        self.add_task(count_barcodes,
                      requires=(setup_barcode_analysis_dir,
                                load_illumina_data))
        count_tasks.append(count_barcodes)

    # Get counts for 'undetermined'
    count_barcodes = CountBarcodes(
        "Count barcodes in 'undetermined'",
        load_illumina_data.output.illumina_data,
        "__undetermined__",
        self.params.counts_dir,
        lanes=self.params.lanes,
        use_project_name="undetermined")
    self.add_task(count_barcodes,
                  requires=(setup_barcode_analysis_dir,
                            load_illumina_data))
    count_tasks.append(count_barcodes)

    # List the counts files (once all the counting is done)
    list_counts_files = ListBarcodeCountFiles(
        "Gather the barcode counts files",
        self.params.counts_dir)
    self.add_task(list_counts_files,
                  requires=count_tasks)

    # Analyse counts and report the results
    report_barcodes = ReportBarcodeAnalysis(
        "Report barcode analysis",
        list_counts_files.output.counts_files,
        self.params.barcode_analysis_dir,
        sample_sheet=self.params.sample_sheet,
        lanes=self.params.lanes,
        mismatches=self.params.mismatches,
        cutoff=self.params.cutoff,
        title=self.params.title
    )
    self.add_task(report_barcodes,
                  requires=(list_counts_files,))

    # Add final outputs to the pipeline
    self.add_output('report_file',report_barcodes.output.report_file)
    self.add_output('xls_file',report_barcodes.output.xls_file)
    self.add_output('html_file',report_barcodes.output.html_file)
def setup(ap, data_dir, analysis_dir=None, sample_sheet=None,
          extra_files=None, unaligned_dir=None):
    """
    Set up the initial analysis directory

    This does all the initialisation of the analysis
    directory and processing parameters

    If the analysis directory doesn't already exist then
    it is built in a hidden temporary directory alongside
    the final location, and only renamed to the final
    name once the setup has succeeded.

    Arguments:
      ap (AutoProcess): autoprocessor pointing to the
        analysis directory to create Fastqs for
      data_dir (str): source data directory
      analysis_dir (str): corresponding analysis directory
        (defaults to '<run_name>_analysis' in the current
        directory)
      sample_sheet (str): name and location of non-default
        sample sheet file; can be a local or remote file,
        or a URL (optional, will use sample sheet from the
        source data directory if present)
      extra_files (list): arbitrary additional files to copy
        into the new analysis directory; each file can be
        a local or remote file or a URL
      unaligned_dir (str): directory with existing Fastqs
        output from CASAVA or bcl2fastq2; if specified then
        Fastqs will be taken from this directory (optional)

    Raises:
      Exception: if the data directory isn't found, if a
        sample sheet can't be acquired (and no
        'unaligned_dir' was supplied), if an extra file
        can't be fetched, or if data can't be loaded from
        the supplied 'unaligned_dir'.
    """
    data_dir = data_dir.rstrip(os.sep)
    if not exists(data_dir):
        raise Exception("Data directory '%s' not found" %
                        data_dir)
    if not Location(data_dir).is_remote:
        data_dir = os.path.abspath(data_dir)
    run_name = os.path.basename(data_dir)
    if analysis_dir is None:
        analysis_dir = os.path.join(
            os.getcwd(), run_name) + '_analysis'
    else:
        analysis_dir = os.path.abspath(analysis_dir)
    # Temporary analysis dir is only set for a fresh setup
    tmp_analysis_dir = None

    def _discard_tmp_analysis_dir():
        # Best-effort removal of the temporary analysis directory
        # (does nothing when working in a pre-existing directory)
        if tmp_analysis_dir is None:
            return
        try:
            shutil.rmtree(tmp_analysis_dir)
            ap.analysis_dir = None
        except Exception as ex:
            logger.warning("Failed to remove temporary directory "
                           "'%s': %s" % (tmp_analysis_dir, ex))

    # Create the analysis directory structure
    if not os.path.exists(analysis_dir):
        # Make a temporary analysis dir
        tmp_analysis_dir = os.path.join(
            os.path.dirname(analysis_dir),
            ".%s.%s" % (os.path.basename(analysis_dir),
                        uuid.uuid4()))
        ap.analysis_dir = tmp_analysis_dir
        logger.debug("Creating temp directory '%s'" %
                     ap.analysis_dir)
        # Create directory structure
        ap.create_directory(ap.analysis_dir)
        # NOTE(review): bare attribute accesses, presumably these
        # create the subdirectories as a side effect -- confirm
        ap.log_dir
        ap.script_code_dir
    else:
        # Directory already exists
        logger.warning("Analysis directory '%s' already exists" %
                       analysis_dir)
        ap.analysis_dir = analysis_dir
        # check for parameter file
        if ap.has_parameter_file:
            ap.load_parameters()
        else:
            logger.warning("No parameter file found in %s" %
                           ap.analysis_dir)
    # Run datestamp, instrument name and instrument run number
    try:
        datestamp, instrument, run_number, \
            flow_cell_prefix, flow_cell_id = \
            split_run_name_full(run_name)
        run_number = run_number.lstrip('0')
        flow_cell = flow_cell_prefix + flow_cell_id
    except Exception as ex:
        logger.warning("Unable to extract information from run name '%s'"
                       % run_name)
        logger.warning("Exception: %s" % ex)
        datestamp = None
        instrument = None
        run_number = None
        flow_cell = None
    # Identify missing data and attempt to acquire
    # Sequencing platform
    platform = ap.metadata.platform
    if platform is None:
        platform = get_sequencer_platform(data_dir,
                                          instrument=instrument,
                                          settings=ap.settings)
    print("Platform identified as '%s'" % platform)
    # Sequencer model
    model = ap.metadata.sequencer_model
    if model is None:
        try:
            model = ap.settings.sequencers[instrument]['model']
        except KeyError:
            pass
    if model:
        print("Sequencer model identified as '%s'" % model)
    # Log dir
    ap.set_log_dir(ap.get_log_subdir('setup'))
    # Attempt to acquire sample sheet
    try:
        # Custom SampleSheet.csv file
        custom_sample_sheet = ap.params.sample_sheet
        if custom_sample_sheet is not None:
            # Sample sheet already stored
            original_sample_sheet = os.path.join(ap.analysis_dir,
                                                 'SampleSheet.orig.csv')
            print("Sample sheet '%s'" % custom_sample_sheet)
        else:
            # Look for sample sheet
            print("Acquiring sample sheet...")
            if sample_sheet is None:
                targets = (
                    'Data/Intensities/BaseCalls/SampleSheet.csv',
                    'SampleSheet.csv',
                )
            else:
                targets = (sample_sheet, )
            # Try each possibility until one sticks
            for target in targets:
                target = Location(target)
                tmp_sample_sheet = os.path.join(
                    ap.tmp_dir,
                    os.path.basename(target.path))
                if target.is_url:
                    # Try fetching samplesheet from URL
                    print("Trying '%s'" % target.url)
                    try:
                        urlfp = urlopen(target.url)
                        try:
                            with open(tmp_sample_sheet, 'w') as fp:
                                fp.write(urlfp.read().decode())
                        finally:
                            # Don't leak the connection
                            urlfp.close()
                    except URLError as ex:
                        # Failed to download from URL
                        raise Exception("Error fetching sample sheet data "
                                        "from '%s': %s" % (target.url, ex))
                else:
                    # Assume target samplesheet is a file on a local
                    # or remote server
                    if target.is_remote:
                        target_sample_sheet = str(target)
                    else:
                        if os.path.isabs(target.path):
                            target_sample_sheet = target.path
                        else:
                            target_sample_sheet = os.path.join(
                                data_dir,
                                target.path)
                    print("Trying '%s'" % target_sample_sheet)
                    rsync = general_applications.rsync(target_sample_sheet,
                                                       ap.tmp_dir)
                    print("%s" % rsync)
                    status = rsync.run_subprocess(
                        log=ap.log_path('rsync.sample_sheet.log'))
                    if status != 0:
                        logger.warning("Failed to fetch sample sheet '%s'"
                                       % target_sample_sheet)
                        tmp_sample_sheet = None
                    else:
                        break
            # Bail out if no sample sheet was acquired
            if tmp_sample_sheet is None:
                raise Exception("Unable to acquire sample sheet")
            # Keep a copy of the original sample sheet
            original_sample_sheet = os.path.join(ap.analysis_dir,
                                                 'SampleSheet.orig.csv')
            print("Copying original sample sheet to %s" %
                  original_sample_sheet)
            shutil.copyfile(tmp_sample_sheet, original_sample_sheet)
            # Set the permissions for the original SampleSheet
            os.chmod(original_sample_sheet, 0o664)
            # Process acquired sample sheet
            custom_sample_sheet = os.path.join(ap.analysis_dir,
                                               'custom_SampleSheet.csv')
            make_custom_sample_sheet(tmp_sample_sheet,
                                     custom_sample_sheet)
    except Exception as ex:
        # Failed to acquire sample sheet
        if not unaligned_dir:
            # Fatal error: remove temporary directory
            _discard_tmp_analysis_dir()
            raise Exception("Failed to acquire sample sheet: %s" % ex)
        else:
            # Don't need sample sheet if Fastqs already exist
            original_sample_sheet = None
            custom_sample_sheet = None
    # Bases mask
    print("Bases mask set to 'auto' (will be determined at run time)")
    bases_mask = "auto"
    # Data source metadata
    data_source = ap.settings.metadata.default_data_source
    # Generate and print predicted outputs and warnings
    if custom_sample_sheet is not None:
        sample_sheet_data = SampleSheet(custom_sample_sheet)
        print(predict_outputs(sample_sheet=sample_sheet_data))
        check_and_warn(sample_sheet=sample_sheet_data)
    # Import additional files
    if extra_files:
        for extra_file in extra_files:
            print("Importing '%s'" % extra_file)
            extra_file = Location(extra_file)
            if extra_file.is_url:
                # Try fetching file from URL
                try:
                    urlfp = urlopen(extra_file.url)
                    try:
                        with open(
                                os.path.join(
                                    ap.analysis_dir,
                                    os.path.basename(extra_file.path)),
                                'w') as fp:
                            fp.write(urlfp.read().decode())
                    finally:
                        # Don't leak the connection
                        urlfp.close()
                except URLError as ex:
                    # Failed to download from URL
                    raise Exception("Error fetching '%s': %s" %
                                    (extra_file.url, ex))
            else:
                # File is on a local or remote server
                if extra_file.is_remote:
                    extra_file_path = str(extra_file)
                else:
                    extra_file_path = os.path.abspath(extra_file.path)
                rsync = general_applications.rsync(extra_file_path,
                                                   ap.analysis_dir)
                status = rsync.run_subprocess(
                    log=ap.log_path('rsync.extra_file.log'))
                if status != 0:
                    raise Exception("Failed to fetch '%s'" %
                                    extra_file_path)
    # Check supplied unaligned Fastq dir
    if unaligned_dir is not None:
        try:
            illumina_data = IlluminaData(data_dir,
                                         unaligned_dir=unaligned_dir)
            unaligned_dir = illumina_data.unaligned_dir
        except IlluminaDataError:
            # Fatal error: remove temporary directory
            _discard_tmp_analysis_dir()
            raise Exception("Can't get data from Fastq dir '%s'"
                            % unaligned_dir)
    else:
        # No unaligned dir supplied
        unaligned_dir = ap.params.unaligned_dir
    # Move analysis dir to final location if necessary
    if ap.analysis_dir != analysis_dir:
        logger.debug("Moving %s to final directory" % ap.analysis_dir)
        os.rename(ap.analysis_dir, analysis_dir)
        ap.analysis_dir = analysis_dir
        # Update the custom sample sheet path
        if custom_sample_sheet is not None:
            custom_sample_sheet = os.path.join(
                analysis_dir,
                os.path.basename(custom_sample_sheet))
    print("Created analysis directory '%s'" % ap.analysis_dir)
    # Store the parameters
    ap.params['data_dir'] = data_dir
    ap.params['analysis_dir'] = ap.analysis_dir
    ap.params['sample_sheet'] = custom_sample_sheet
    ap.params['bases_mask'] = bases_mask
    ap.params['unaligned_dir'] = unaligned_dir
    ap.params['acquired_primary_data'] = False
    # Store the metadata
    ap.metadata['run_name'] = ap.run_name
    ap.metadata['platform'] = platform
    ap.metadata['instrument_name'] = instrument
    ap.metadata['instrument_datestamp'] = datestamp
    ap.metadata['instrument_run_number'] = run_number
    ap.metadata['instrument_flow_cell_id'] = flow_cell
    ap.metadata['sequencer_model'] = model
    ap.metadata['source'] = data_source
    # Make a 'projects.info' metadata file
    if not ap.params.project_metadata:
        if unaligned_dir is not None:
            ap.make_project_metadata_file()
    # Set flags to allow parameters etc to be saved back
    ap._save_params = True
    ap._save_metadata = True
    ap.save_data()
def setup(self):
    """
    Load the data from the bcl2fastq output directory

    Creates an IlluminaData object for the directory in
    'self.args.bcl2fastq_dir' and stores it in the
    'illumina_data' output.
    """
    # Load the data from bcl2fastq
    print("Loading data from %s" % self.args.bcl2fastq_dir)
    # Normalise the path before splitting it, otherwise a
    # trailing separator gives an empty directory name and
    # the wrong parent directory
    bcl2fastq_dir = os.path.abspath(self.args.bcl2fastq_dir)
    illumina_data = IlluminaData(os.path.dirname(bcl2fastq_dir),
                                 os.path.basename(bcl2fastq_dir))
    self.output.illumina_data.set(illumina_data)