def test_simple_scheduler_wait_for(self):
    """Wait for named jobs to complete
    """
    self.log_dir = tempfile.mkdtemp()
    # Scheduler with a fast poll interval so the test runs quickly
    scheduler = SimpleScheduler(
        runner=SimpleJobRunner(log_dir=self.log_dir),
        poll_interval=0.01)
    scheduler.start()
    # Submit three sleeps of different durations, named after
    # their duration
    jobs = [scheduler.submit(['sleep', duration],
                             name='sleep_%s' % duration)
            for duration in ('10', '30', '5')]
    for job in jobs:
        self.assertFalse(job.completed)
    # Block until the two shorter jobs have finished; on timeout
    # clean everything up before failing the test
    try:
        scheduler.wait_for(('sleep_5', 'sleep_10'), timeout=10)
    except SchedulerTimeout:
        scheduler.stop()
        for job in jobs:
            job.terminate()
        self.fail("'wait_for' timed out")
    # The 10s and 5s jobs should be done; the 30s job should not
    self.assertTrue(jobs[0].completed)
    self.assertFalse(jobs[1].completed)
    self.assertTrue(jobs[2].completed)
    scheduler.stop()
def test_qcpipeline_with_strandedness(self):
    """QCPipeline: standard QC run with strandedness determination
    """
    # Install mock QC executables (including fastq_strand.py) on PATH
    MockIlluminaQcSh.create(os.path.join(self.bin, "illumina_qc.sh"))
    MockMultiQC.create(os.path.join(self.bin, "multiqc"))
    MockFastqStrandPy.create(os.path.join(self.bin, "fastq_strand.py"))
    os.environ['PATH'] = "%s:%s" % (self.bin, os.environ['PATH'])
    # Build a mock analysis project with paired-end Fastqs and
    # an organism set (needed for strandedness)
    fastqs = ("PJB1_S1_R1_001.fastq.gz",
              "PJB1_S1_R2_001.fastq.gz",
              "PJB2_S2_R1_001.fastq.gz",
              "PJB2_S2_R2_001.fastq.gz")
    mock_project = MockAnalysisProject("PJB", fastqs,
                                       metadata={'Organism': 'Human'})
    mock_project.create(top_dir=self.wd)
    # Run the QC pipeline with a STAR index for strandedness
    pipeline = QCPipeline()
    pipeline.add_project(
        AnalysisProject("PJB", os.path.join(self.wd, "PJB")),
        multiqc=True)
    exit_status = pipeline.run(
        fastq_strand_indexes={'human': '/data/hg38/star_index'},
        poll_interval=0.5,
        max_jobs=1,
        runners={'default': SimpleJobRunner(), })
    # Pipeline should succeed and all outputs should be present
    self.assertEqual(exit_status, 0)
    for output in ("qc",
                   "qc_report.html",
                   "qc_report.PJB.%s.zip" % os.path.basename(self.wd),
                   "multiqc_report.html"):
        self.assertTrue(
            os.path.exists(os.path.join(self.wd, "PJB", output)),
            "Missing %s" % output)
def test_failing_job_with_simplejobrunner(self):
    """Test Job using SimpleJobRunner to run failing shell command
    """
    # Build a job running 'ls' against a glob that matches nothing
    script = "ls"
    script_args = ("*.whereisit", )
    job = Job(SimpleJobRunner(), "failing_cmd", self.working_dir,
              script, script_args)
    # Launch it and give it time to fail
    job_id = job.start()
    time.sleep(1)
    job.update()
    # Static properties are unchanged by running
    self.assertEqual(job.name, "failing_cmd")
    self.assertEqual(job.working_dir, self.working_dir)
    self.assertEqual(job.script, script)
    self.assertEqual(job.args, script_args)
    self.assertEqual(job.label, None)
    self.assertEqual(job.group_label, None)
    self.assertEqual(job.job_id, job_id)
    # Run-time properties reflect a completed (failed) job:
    # finished with non-zero exit status but not in an error state
    self.assertNotEqual(job.log, None)
    self.assertNotEqual(job.start_time, None)
    self.assertNotEqual(job.end_time, None)
    self.assertFalse(job.isRunning())
    self.assertEqual(job.exit_status, 2)
    self.assertFalse(job.errorState())
    self.assertEqual(job.status(), "Finished")
def test_qcpipeline(self):
    """QCPipeline: standard QC run
    """
    # Install mock QC executables on the PATH
    MockIlluminaQcSh.create(os.path.join(self.bin, "illumina_qc.sh"))
    MockMultiQC.create(os.path.join(self.bin, "multiqc"))
    os.environ['PATH'] = "%s:%s" % (self.bin, os.environ['PATH'])
    # Build a mock analysis project with paired-end Fastqs
    fastqs = ("PJB1_S1_R1_001.fastq.gz",
              "PJB1_S1_R2_001.fastq.gz",
              "PJB2_S2_R1_001.fastq.gz",
              "PJB2_S2_R2_001.fastq.gz")
    MockAnalysisProject("PJB", fastqs).create(top_dir=self.wd)
    # Run the QC pipeline
    pipeline = QCPipeline()
    pipeline.add_project(
        AnalysisProject("PJB", os.path.join(self.wd, "PJB")),
        multiqc=True)
    exit_status = pipeline.run(poll_interval=0.5,
                               max_jobs=1,
                               runners={'default': SimpleJobRunner(), })
    # Pipeline should succeed and all outputs should be present
    self.assertEqual(exit_status, 0)
    for output in ("qc",
                   "qc_report.html",
                   "qc_report.PJB.%s.zip" % os.path.basename(self.wd),
                   "multiqc_report.html"):
        self.assertTrue(
            os.path.exists(os.path.join(self.wd, "PJB", output)),
            "Missing %s" % output)
def test_qcpipeline_with_batching_fails_for_missing_outputs(self):
    """QCPipeline: standard QC run with batching fails for missing outputs
    """
    # Mock illumina_qc.sh that exits non-zero without producing
    # Fastqc outputs, plus a working multiqc
    MockIlluminaQcSh.create(os.path.join(self.bin, "illumina_qc.sh"),
                            fastqc=False,
                            exit_code=1)
    MockMultiQC.create(os.path.join(self.bin, "multiqc"))
    os.environ['PATH'] = "%s:%s" % (self.bin, os.environ['PATH'])
    # Build a mock analysis project with paired-end Fastqs
    fastqs = ("PJB1_S1_R1_001.fastq.gz",
              "PJB1_S1_R2_001.fastq.gz",
              "PJB2_S2_R1_001.fastq.gz",
              "PJB2_S2_R2_001.fastq.gz")
    MockAnalysisProject("PJB", fastqs).create(top_dir=self.wd)
    # Run the QC pipeline with batching enabled
    pipeline = QCPipeline()
    pipeline.add_project(
        AnalysisProject("PJB", os.path.join(self.wd, "PJB")),
        multiqc=True)
    exit_status = pipeline.run(poll_interval=0.5,
                               max_jobs=1,
                               batch_size=3,
                               runners={'default': SimpleJobRunner(), })
    # Pipeline should fail: 'qc' dir is created but no reports
    # should be generated
    self.assertEqual(exit_status, 1)
    self.assertTrue(os.path.exists(os.path.join(self.wd, "PJB", "qc")),
                    "Missing 'qc'")
    for output in ("qc_report.html",
                   "qc_report.PJB.%s.zip" % os.path.basename(self.wd),
                   "multiqc_report.html"):
        self.assertFalse(
            os.path.exists(os.path.join(self.wd, "PJB", output)),
            "Found %s, shouldn't be present" % output)
def test_job_with_simplejobrunner(self):
    """Test Job using SimpleJobRunner to run basic shell command

    Checks the Job properties through its full lifecycle:
    before starting, while running, and after completion.
    NOTE(review): timing-based — assumes the 2s 'sleep' job has
    started within 1s and finished after a further 2s; may be
    flaky on a heavily loaded machine.
    """
    # Create a job (a 2 second 'sleep')
    cmd = "sleep"
    args = ("2", )
    job = Job(SimpleJobRunner(), "shell_cmd", self.working_dir, cmd, args)
    # Check properties before starting: no job id, no log,
    # no timestamps, no exit status
    self.assertEqual(job.name, "shell_cmd")
    self.assertEqual(job.working_dir, self.working_dir)
    self.assertEqual(job.script, cmd)
    self.assertEqual(job.args, args)
    self.assertEqual(job.label, None)
    self.assertEqual(job.group_label, None)
    self.assertEqual(job.job_id, None)
    self.assertEqual(job.log, None)
    self.assertEqual(job.start_time, None)
    self.assertEqual(job.end_time, None)
    self.assertEqual(job.exit_status, None)
    # Check status
    self.assertFalse(job.isRunning())
    self.assertFalse(job.errorState())
    self.assertEqual(job.status(), "Waiting")
    # Start the job and check: now has id, log and start time,
    # but no end time or exit status yet
    job_id = job.start()
    time.sleep(1)
    self.assertEqual(job.name, "shell_cmd")
    self.assertEqual(job.working_dir, self.working_dir)
    self.assertEqual(job.script, cmd)
    self.assertEqual(job.args, args)
    self.assertEqual(job.label, None)
    self.assertEqual(job.group_label, None)
    self.assertEqual(job.job_id, job_id)
    self.assertNotEqual(job.log, None)
    self.assertNotEqual(job.start_time, None)
    self.assertEqual(job.end_time, None)
    self.assertEqual(job.exit_status, None)
    self.assertTrue(job.isRunning())
    self.assertFalse(job.errorState())
    self.assertEqual(job.status(), "Running")
    # Wait to let job complete and check last time: end time set,
    # zero exit status, no longer running
    time.sleep(2)
    job.update()
    self.assertEqual(job.name, "shell_cmd")
    self.assertEqual(job.working_dir, self.working_dir)
    self.assertEqual(job.script, cmd)
    self.assertEqual(job.args, args)
    self.assertEqual(job.label, None)
    self.assertEqual(job.group_label, None)
    self.assertEqual(job.job_id, job_id)
    self.assertNotEqual(job.log, None)
    self.assertNotEqual(job.start_time, None)
    self.assertNotEqual(job.end_time, None)
    self.assertFalse(job.isRunning())
    self.assertEqual(job.exit_status, 0)
    self.assertFalse(job.errorState())
    self.assertEqual(job.status(), "Finished")
def test_scheduler_job_wait_timeout_raises_exception(self):
    """SchedulerJob raises exception if 'wait' timeout exceeded
    """
    self.log_dir = tempfile.mkdtemp()
    # Launch a job that runs far longer than the timeout
    long_job = SchedulerJob(SimpleJobRunner(log_dir=self.log_dir),
                            ['sleep', '1000'])
    long_job.start()
    # 'wait' should give up after 5 seconds and raise
    self.assertRaises(SchedulerTimeout,
                      long_job.wait,
                      poll_interval=0.01,
                      timeout=5)
def test_scheduler_job_wait(self):
    """Wait for SchedulerJob to complete
    """
    self.log_dir = tempfile.mkdtemp()
    sleep_job = SchedulerJob(SimpleJobRunner(log_dir=self.log_dir),
                             ['sleep', '5'])
    self.assertFalse(sleep_job.completed)
    # Start the job and wait for it with a generous timeout
    try:
        sleep_job.start()
        sleep_job.wait(poll_interval=0.01, timeout=10)
    except SchedulerTimeout:
        self.fail("'wait' timed out")
    self.assertTrue(sleep_job.completed)
def test_qcpipeline_non_default_log_dir(self):
    """QCPipeline: standard QC run using non-default log dir
    """
    # Install mock QC executables on the PATH
    MockIlluminaQcSh.create(os.path.join(self.bin, "illumina_qc.sh"))
    MockMultiQC.create(os.path.join(self.bin, "multiqc"))
    os.environ['PATH'] = "%s:%s" % (self.bin, os.environ['PATH'])
    # Build a mock analysis project (single paired-end sample)
    MockAnalysisProject(
        "PJB",
        ("PJB1_S1_R1_001.fastq.gz",
         "PJB1_S1_R2_001.fastq.gz")).create(top_dir=self.wd)
    # The custom log dir must not exist before the run
    log_dir = os.path.join(self.wd, "logs")
    self.assertFalse(os.path.exists(log_dir),
                     "Log dir '%s' already exists" % log_dir)
    # Run the QC pipeline pointing at the custom log dir
    pipeline = QCPipeline()
    pipeline.add_project(
        AnalysisProject("PJB", os.path.join(self.wd, "PJB")),
        multiqc=True,
        log_dir=log_dir)
    exit_status = pipeline.run(poll_interval=0.5,
                               max_jobs=1,
                               runners={'default': SimpleJobRunner(), })
    # Pipeline should succeed and all outputs should be present
    self.assertEqual(exit_status, 0)
    self.assertTrue(os.path.isdir(os.path.join(self.wd, "PJB", "qc")),
                    "'qc' directory doesn't exist, but should")
    for output in ("qc_report.html",
                   "qc_report.PJB.%s.zip" % os.path.basename(self.wd),
                   "multiqc_report.html"):
        self.assertTrue(
            os.path.exists(os.path.join(self.wd, "PJB", output)),
            "Missing %s" % output)
    # The custom log directory should now exist
    self.assertTrue(os.path.exists(log_dir),
                    "Log dir '%s' not found" % log_dir)
def run_cellranger_mkfastq(sample_sheet,
                           primary_data_dir,
                           output_dir,
                           lanes=None,
                           bases_mask=None,
                           cellranger_jobmode=None,
                           cellranger_maxjobs=None,
                           cellranger_mempercore=None,
                           cellranger_jobinterval=None,
                           log_dir=None,
                           dry_run=False):
    """
    Wrapper for running 'cellranger mkfastq'

    Runs the 10xGenomics 'cellranger mkfastq' command to
    generate Fastqs from bcl files for Chromium single-cell
    data.

    Arguments:
      sample_sheet (str): path to input samplesheet with
        10xGenomics barcode indices
      primary_data_dir (str): path to the top-level
        directory holding the sequencing data
      output_dir (str): path to the output directory
      lanes (str): optional, specify the subset of lanes
        to process (default is to process all lanes
        in the run)
      bases_mask (str): optional, specify an alternative
        bases mask setting (default is to let cellranger
        determine the bases mask automatically)
      cellranger_jobmode (str): specify the job mode to
        pass to cellranger (default: None)
      cellranger_maxjobs (int): specify the maximum
        number of jobs to pass to cellranger (default:
        None)
      cellranger_mempercore (int): specify the memory
        per core (in Gb) to pass to cellranger (default:
        None)
      cellranger_jobinterval (int): specify the interval
        between launching jobs (in ms) to pass to
        cellranger (default: None)
      log_dir (str): path to a directory to write logs
        (default: current working directory)
      dry_run (bool): if True then only report actions
        that would be performed but don't run anything

    Returns:
      Integer: exit code from the cellranger command
        (None if 'dry_run' is set).
    """
    # Construct the cellranger command
    cmd = Command("cellranger", "mkfastq",
                  "--samplesheet", sample_sheet,
                  "--run", primary_data_dir,
                  "--output-dir", output_dir)
    if lanes is not None:
        cmd.add_args("--lanes=%s" % lanes)
    if bases_mask is not None:
        cmd.add_args("--use-bases-mask=%s" % bases_mask)
    add_cellranger_args(cmd,
                        jobmode=cellranger_jobmode,
                        mempercore=cellranger_mempercore,
                        maxjobs=cellranger_maxjobs,
                        jobinterval=cellranger_jobinterval)
    # Run the command
    # NB print() form (not 'print' statement) for consistency
    # with the Python 3-style code elsewhere in this module;
    # output is identical under Python 2
    print("Running %s" % cmd)
    if not dry_run:
        # Make a log directory
        if log_dir is None:
            log_dir = os.getcwd()
        else:
            log_dir = os.path.abspath(log_dir)
        # Submit the job
        cellranger_mkfastq_job = SchedulerJob(
            SimpleJobRunner(join_logs=True),
            cmd.command_line,
            name='cellranger_mkfastq',
            working_dir=os.getcwd(),
            log_dir=log_dir)
        cellranger_mkfastq_job.start()
        try:
            cellranger_mkfastq_job.wait()
        except KeyboardInterrupt:
            logger.warning("Keyboard interrupt, terminating cellranger")
            cellranger_mkfastq_job.terminate()
            # Bare 'raise' re-raises the interrupt with the
            # original traceback intact
            raise
        exit_code = cellranger_mkfastq_job.exit_code
        print("cellranger mkfastq completed: exit code %s" % exit_code)
        if exit_code != 0:
            logger.error("cellranger mkfastq exited with an error")
            return exit_code
        # Deal with the QC summary report
        flow_cell_dir = flow_cell_id(primary_data_dir)
        if lanes is not None:
            lanes_suffix = "_%s" % lanes.replace(',', '')
        else:
            lanes_suffix = ""
        flow_cell_dir = "%s%s" % (flow_cell_dir, lanes_suffix)
        if not os.path.isdir(flow_cell_dir):
            logger.error("No output directory '%s'" % flow_cell_dir)
            return -1
        json_file = os.path.join(flow_cell_dir, "outs", "qc_summary.json")
        html_file = "cellranger_qc_summary%s.html" % lanes_suffix
        make_qc_summary_html(json_file, html_file)
        return exit_code
nthreads_star = int(math.ceil(32.0/mempercore)) print("-- Threads for STAR: %s" % nthreads_star) # Remove limit on number of jobs print("-- Set maximum no of jobs to 'unlimited'") max_jobs = None # (Re)set cellranger parameters for --local print("-- Cellranger will run in jobmode 'local'") cellranger_jobmode = "local" cellranger_mempercore = None cellranger_jobinterval = None cellranger_localcores = min(max_cores,16) cellranger_localmem = max_mem print("-- Cellranger localcores: %s" % cellranger_localcores) print("-- Cellranger localmem : %s" % cellranger_localmem) # Set up local runners default_runner = SimpleJobRunner() runners = { 'cellranger_runner': SimpleJobRunner(nslots=cellranger_localcores), 'fastqc_runner': SimpleJobRunner(nslots=nthreads), 'fastq_screen_runner': SimpleJobRunner(nslots=nthreads), 'star_runner': SimpleJobRunner(nslots=nthreads_star), 'verify_runner': default_runner, 'report_runner': default_runner, } else: # Set up according to the configuration and # command line options # Set number of threads for QC jobs if args.nthreads: nthreads = args.nthreads else:
def test_pipelinerunner(self):
    # Queue a single 'ls -l' job and run the pipeline to completion
    runner = PipelineRunner(SimpleJobRunner(), poll_interval=1)
    runner.queueJob(self.working_dir, 'ls', '-l')
    runner.run(blocking=True)
def main():
    """Transfer copies of Fastq data from a project to a destination.

    Command-line entry point: parses arguments, resolves the target
    destination (either a pre-defined destination from the
    configuration or an arbitrary '[[USER@]HOST:]DIR' location),
    optionally determines a subdirectory ('random_bin' or 'run_id'),
    then uses a SimpleScheduler to copy Fastqs (via manage_fastqs.py),
    an optional README, the downloader utility, zipped QC reports and
    tar.gz'ed 10xGenomics outputs to the target.

    Returns 1 on error for some failure paths; other failure paths
    use a bare 'return' (i.e. None) — NOTE(review): inconsistent exit
    statuses, confirm intended behaviour before relying on the return
    value.
    """
    # Load configuration
    settings = Settings()
    # Collect defaults
    default_runner = settings.runners.rsync
    # Get pre-defined destinations
    destinations = [name for name in settings.destination]
    # Command line
    p = argparse.ArgumentParser(
        description="Transfer copies of Fastq data from an analysis "
        "project to an arbitrary destination for sharing with other "
        "people")
    p.add_argument('--version', action='version',
                   version=("%%(prog)s %s" % get_version()))
    p.add_argument('--subdir', action='store',
                   choices=('random_bin', 'run_id'),
                   default=None,
                   help="subdirectory naming scheme: 'random_bin' "
                   "locates a random pre-existing empty subdirectory "
                   "under the target directory; 'run_id' creates a "
                   "new subdirectory "
                   "'PLATFORM_DATESTAMP.RUN_ID-PROJECT'. If this "
                   "option is not set then no subdirectory will be "
                   "used")
    p.add_argument('--readme', action='store',
                   metavar='README_TEMPLATE', dest='readme_template',
                   help="template file to generate README file from; "
                   "can be full path to a template file, or the name "
                   "of a file in the 'templates' directory")
    p.add_argument('--weburl', action='store',
                   help="base URL for webserver (sets the value of "
                   "the WEBURL variable in the template README)")
    p.add_argument('--include_downloader', action='store_true',
                   help="copy the 'download_fastqs.py' utility to the "
                   "final location")
    p.add_argument('--include_qc_report', action='store_true',
                   help="copy the zipped QC reports to the final "
                   "location")
    p.add_argument('--include_10x_outputs', action='store_true',
                   help="copy outputs from 10xGenomics pipelines (e.g. "
                   "'cellranger count') to the final location")
    p.add_argument('--link', action='store_true',
                   help="hard link files instead of copying")
    p.add_argument('--runner', action='store',
                   help="specify the job runner to use for executing "
                   "the checksumming, Fastq copy and tar gzipping "
                   "operations (defaults to job runner defined for "
                   "copying in config file [%s])" % default_runner)
    p.add_argument('dest', action='store', metavar="DEST",
                   help="destination to copy Fastqs to; can be the "
                   "name of a destination defined in the configuration "
                   "file, or an arbitrary location of the form "
                   "'[[USER@]HOST:]DIR' (%s)" %
                   (("available destinations: %s" %
                     (','.join("'%s'" % d
                               for d in sorted(destinations))))
                    if destinations else
                    "no destinations currently defined"))
    p.add_argument('project', action='store', metavar="PROJECT",
                   help="path to project directory (or to a Fastqs "
                   "subdirectory in a project) to copy Fastqs from")
    # Process command line
    args = p.parse_args()
    # Check if target is pre-defined destination
    if args.dest in destinations:
        print("Loading settings for destination '%s'" % args.dest)
        dest = settings.destination[args.dest]
        target_dir = dest.directory
        readme_template = dest.readme_template
        subdir = dest.subdir
        include_downloader = dest.include_downloader
        include_qc_report = dest.include_qc_report
        hard_links = dest.hard_links
        weburl = dest.url
    else:
        # Arbitrary destination: no extras by default
        target_dir = args.dest
        readme_template = None
        subdir = None
        include_downloader = False
        include_qc_report = False
        hard_links = False
        weburl = None
    # Update defaults with command line values
    if args.readme_template:
        readme_template = args.readme_template
    if args.subdir:
        subdir = args.subdir
    if args.include_downloader:
        include_downloader = True
    if args.include_qc_report:
        include_qc_report = True
    if args.weburl:
        weburl = args.weburl
    if args.link:
        hard_links = args.link
    # Sort out project directory
    project = AnalysisProject(args.project)
    if not project.is_analysis_dir:
        # Assume it's the Fastq dir: use the parent as the project
        fastq_dir = os.path.basename(args.project)
        project = AnalysisProject(os.path.dirname(args.project))
    else:
        fastq_dir = None
    if not project.is_analysis_dir:
        logger.error("'%s': project not found" % args.project)
        return 1
    project_name = project.name
    # Parent analysis directory
    analysis_dir = AnalysisDir(os.path.dirname(project.dirn))
    # Fastqs directory
    try:
        project.use_fastq_dir(fastq_dir)
    except Exception as ex:
        logger.error("'%s': failed to load Fastq set '%s': %s" %
                     (project.name, fastq_dir, ex))
        return 1
    # Report
    print("Transferring data from '%s' (%s)" % (project.name,
                                                project.dirn))
    print("Fastqs in %s" % project.fastq_dir)
    # Summarise samples and Fastqs
    samples = set()
    nfastqs = 0
    fsize = 0
    for sample in project.samples:
        samples.add(sample.name)
        for fq in sample.fastq:
            # lstat: don't follow symlinks when sizing files
            fsize += os.lstat(fq).st_size
            nfastqs += 1
    nsamples = len(samples)
    dataset = "%s%s dataset" % ("%s " % project.info.single_cell_platform
                                if project.info.single_cell_platform
                                else '',
                                project.info.library_type)
    endedness = "paired-end" if project.info.paired_end else "single-end"
    print("%s with %d Fastqs from %d %s sample%s totalling %s" %
          (dataset, nfastqs, nsamples, endedness,
           's' if nsamples != 1 else '',
           format_file_size(fsize)))
    # Check target dir (local dirs must already exist; remote
    # locations are not checked here)
    if not Location(target_dir).is_remote:
        target_dir = os.path.abspath(target_dir)
        if not exists(target_dir):
            print("'%s': target directory not found" % target_dir)
            return
    else:
        print("Target directory %s" % target_dir)
    # Locate downloader
    if include_downloader:
        print("Locating downloader for inclusion")
        downloader = find_program("download_fastqs.py")
        if downloader is None:
            # NOTE(review): uses the root 'logging' module here but
            # 'logger' elsewhere — confirm which is intended
            logging.error("Unable to locate download_fastqs.py")
            return 1
        print("... found %s" % downloader)
    else:
        downloader = None
    # Locate zipped QC report
    if include_qc_report:
        print("Locating zipped QC reports for inclusion")
        qc_zips = list()
        # Check QC directories and look for zipped reports
        for qc_dir in project.qc_dirs:
            # Get the associated Fastq set
            # NB only compare the basename of the Fastq dir
            # in case full paths weren't updated
            fq_set = os.path.basename(project.qc_info(qc_dir).fastq_dir)
            if fq_set == os.path.basename(project.fastq_dir):
                # Try both run-based and analysis-dir-based zip names
                for qc_base in ("%s_report.%s.%s" %
                                (qc_dir, project.name,
                                 project.info.run),
                                "%s_report.%s.%s" %
                                (qc_dir, project.name,
                                 os.path.basename(
                                     analysis_dir.analysis_dir)),):
                    qc_zip = os.path.join(project.dirn,
                                          "%s.zip" % qc_base)
                    if os.path.exists(qc_zip):
                        print("... found %s" % qc_zip)
                        qc_zips.append(qc_zip)
        if not qc_zips:
            logger.error("No zipped QC reports found")
            return 1
    else:
        qc_zips = None
    # Locate 10xGenomics outputs
    if args.include_10x_outputs:
        print("Locating outputs from 10xGenomics pipelines for "
              "inclusion")
        cellranger_dirs = list()
        for d in ('cellranger_count',
                  'cellranger_multi',):
            cellranger_dir = os.path.join(project.dirn, d)
            if os.path.isdir(cellranger_dir):
                print("... found %s" % cellranger_dir)
                cellranger_dirs.append(cellranger_dir)
        if not cellranger_dirs:
            logger.error("No outputs from 10xGenomics pipelines found")
            return 1
    else:
        cellranger_dirs = None
    # Determine subdirectory
    if subdir == "random_bin":
        # Find a random empty directory under the
        # target directory
        print("Locating random empty bin")
        subdirs = [d for d in os.listdir(target_dir)
                   if os.path.isdir(os.path.join(target_dir, d))]
        if not subdirs:
            print("Failed to locate subdirectories")
            return
        shuffle(subdirs)
        subdir = None
        for d in subdirs:
            if not os.listdir(os.path.join(target_dir, d)):
                # Empty bin
                subdir = d
                break
        if subdir is None:
            print("Failed to locate empty subdirectory")
            return
        print("... found '%s'" % subdir)
        # Update target dir
        target_dir = os.path.join(target_dir, subdir)
    elif subdir == "run_id":
        # Construct subdirectory name based on the
        # run ID
        subdir = "{platform}_{datestamp}.{run_number}-{project}".format(
            platform=analysis_dir.metadata.platform.upper(),
            datestamp=analysis_dir.metadata.instrument_datestamp,
            run_number=analysis_dir.metadata.run_number,
            project=project.name)
        # Check it doesn't already exist
        if exists(os.path.join(target_dir, subdir)):
            logger.error("'%s': subdirectory already exists" % subdir)
            return
        print("Using subdirectory '%s'" % subdir)
        # Update target dir
        target_dir = os.path.join(target_dir, subdir)
    # Make target directory
    if not exists(target_dir):
        mkdir(target_dir)
    # Get runner for copy job
    if args.runner:
        runner = fetch_runner(args.runner)
    else:
        runner = default_runner
    # Set identifier for jobs
    job_id = "%s%s" % (project_name,
                       (".%s" % fastq_dir
                        if fastq_dir is not None else ''))
    # Set the working directory (timestamped, one per invocation)
    working_dir = os.path.abspath("transfer.%s.%s" %
                                  (job_id, int(time.time())))
    mkdir(working_dir)
    print("Created working dir %s" % working_dir)
    # Construct the README
    if readme_template:
        # Check that template file exists (as given, or in the
        # shared 'templates' directory)
        print("Locating README template")
        template = None
        for filen in (readme_template,
                      os.path.join(get_templates_dir(),
                                   readme_template),):
            if os.path.exists(filen):
                template = filen
                break
        if template is None:
            logger.error("'%s': template file not found" %
                         readme_template)
            return 1
        else:
            readme_template = template
        print("... found %s" % readme_template)
        # Read in template
        with open(readme_template, 'rt') as fp:
            readme = fp.read()
        # Substitute template variables (missing values become '?')
        template_vars = {
            'PLATFORM': analysis_dir.metadata.platform.upper(),
            'RUN_NUMBER': analysis_dir.metadata.run_number,
            'DATESTAMP': analysis_dir.metadata.instrument_datestamp,
            'PROJECT': project_name,
            'WEBURL': weburl,
            'BIN': subdir,
            'DIR': target_dir,
            'TODAY': date.today().strftime("%d/%m/%Y"),
        }
        for var in template_vars:
            value = template_vars[var]
            if value is None:
                value = '?'
            else:
                value = str(value)
            readme = re.sub(r"%{var}%".format(var=var), value, readme)
        # Write out a temporary README file
        readme_file = os.path.join(working_dir, "README")
        with open(readme_file, 'wt') as fp:
            fp.write(readme)
    else:
        # No README
        readme_file = None
    # Start a scheduler to run jobs
    sched = SimpleScheduler(runner=runner,
                            reporter=TransferDataSchedulerReporter(),
                            poll_interval=settings.general.poll_interval)
    sched.start()
    # Build command to run manage_fastqs.py
    copy_cmd = Command("manage_fastqs.py")
    if hard_links:
        copy_cmd.add_args("--link")
    copy_cmd.add_args(analysis_dir.analysis_dir,
                      project_name)
    if fastq_dir is not None:
        copy_cmd.add_args(fastq_dir)
    copy_cmd.add_args("copy", target_dir)
    print("Running %s" % copy_cmd)
    copy_job = sched.submit(copy_cmd.command_line,
                            name="copy.%s" % job_id,
                            wd=working_dir)
    # Copy README
    if readme_file is not None:
        print("Copying README file")
        copy_cmd = copy_command(readme_file,
                                os.path.join(target_dir, "README"))
        sched.submit(copy_cmd.command_line,
                     name="copy.%s.readme" % job_id,
                     runner=SimpleJobRunner(),
                     wd=working_dir)
    # Copy download_fastqs.py
    if downloader:
        print("Copying downloader")
        copy_cmd = copy_command(
            downloader,
            os.path.join(target_dir, os.path.basename(downloader)))
        sched.submit(copy_cmd.command_line,
                     name="copy.%s.downloader" % job_id,
                     runner=SimpleJobRunner(),
                     wd=working_dir)
    # Copy QC reports
    if qc_zips:
        for qc_zip in qc_zips:
            print("Copying '%s'" % os.path.basename(qc_zip))
            copy_cmd = copy_command(qc_zip,
                                    os.path.join(
                                        target_dir,
                                        os.path.basename(qc_zip)),
                                    link=hard_links)
            sched.submit(copy_cmd.command_line,
                         name="copy.%s.%s" % (job_id,
                                              os.path.basename(qc_zip)),
                         runner=SimpleJobRunner(),
                         wd=working_dir)
    # Tar and copy 10xGenomics outputs
    if cellranger_dirs:
        for cellranger_dir in cellranger_dirs:
            print("Tar gzipping and copying '%s'" %
                  os.path.basename(cellranger_dir))
            # Tar & gzip data
            targz = os.path.join(working_dir,
                                 "%s.%s.%s.tgz" %
                                 (os.path.basename(cellranger_dir),
                                  project_name,
                                  project.info.run))
            targz_cmd = Command("tar", "czvhf", targz,
                                "-C", os.path.dirname(cellranger_dir),
                                os.path.basename(cellranger_dir))
            print("Running %s" % targz_cmd)
            targz_job = sched.submit(
                targz_cmd.command_line,
                name="targz.%s.%s" % (job_id,
                                      os.path.basename(cellranger_dir)),
                wd=working_dir)
            # Copy the targz file (waits for the tar job)
            # NOTE(review): this rebinds 'copy_job', which was the
            # manage_fastqs.py copy job — the exit-code check below
            # will then inspect the last tgz copy instead of the
            # Fastq copy; confirm this is intended
            copy_cmd = copy_command(
                targz,
                os.path.join(target_dir,
                             os.path.basename(targz)))
            print("Running %s" % copy_cmd)
            copy_job = sched.submit(
                copy_cmd.command_line,
                name="copytgz.%s.%s" % (job_id,
                                        os.path.basename(cellranger_dir)),
                runner=SimpleJobRunner(),
                wd=working_dir,
                wait_for=(targz_job.job_name, ))
    # Wait for scheduler jobs to complete
    sched.wait()
    # Check exit code for Fastq copying
    exit_code = copy_job.exit_code
    if exit_code != 0:
        logger.error("File copy exited with an error")
        return exit_code
    else:
        print("Files now at %s" % target_dir)
        if weburl:
            url = weburl
            if subdir is not None:
                url = os.path.join(url, subdir)
            print("URL: %s" % url)
    print("Done")
def run_cellranger_mkfastq(sample_sheet,
                           primary_data_dir,
                           output_dir,
                           lanes=None,
                           bases_mask=None,
                           ignore_dual_index=False,
                           cellranger_exe='cellranger',
                           cellranger_jobmode='local',
                           cellranger_maxjobs=None,
                           cellranger_mempercore=None,
                           cellranger_jobinterval=None,
                           cellranger_localcores=None,
                           cellranger_localmem=None,
                           working_dir=None,
                           log_dir=None,
                           dry_run=False):
    """
    Wrapper for running 'cellranger mkfastq'

    Runs the 10xGenomics 'cellranger mkfastq' command to
    generate Fastqs from bcl files for Chromium single-cell
    data.

    To run the 'mkfastq' command using a different version
    of cellranger (e.g. cellranger-atac), specify the
    cellranger executable using the 'cellranger_exe'
    argument.

    Arguments:
      sample_sheet (str): path to input samplesheet with
        10xGenomics barcode indices
      primary_data_dir (str): path to the top-level
        directory holding the sequencing data
      output_dir (str): path to the output directory
      lanes (str): optional, specify the subset of lanes
        to process (default is to process all lanes
        in the run)
      bases_mask (str): optional, specify an alternative
        bases mask setting (default is to let cellranger
        determine the bases mask automatically)
      ignore_dual_index (bool): optional, on a dual-indexed
        flowcell where the second index was not used for
        the 10x sample, ignore it
      cellranger_exe (str): optional, name or path to
        cellranger executable (default: "cellranger")
      cellranger_jobmode (str): specify the job mode to
        pass to cellranger (default: "local")
      cellranger_maxjobs (int): specify the maximum
        number of jobs to pass to cellranger (default:
        None)
      cellranger_mempercore (int): specify the memory
        per core (in Gb) to pass to cellranger (default:
        None)
      cellranger_jobinterval (int): specify the interval
        between launching jobs (in ms) to pass to
        cellranger (default: None)
      cellranger_localcores (int): maximum number of cores
        cellranger can request in jobmode 'local'
        (default: None)
      cellranger_localmem (int): maximum memory cellranger
        can request in jobmode 'local' (default: None)
      working_dir (str): path to a directory to use as
        as the working directory (default: current
        working directory)
      log_dir (str): path to a directory to write logs
        (default: current working directory)
      dry_run (bool): if True then only report actions
        that would be performed but don't run anything

    Returns:
      Integer: exit code from the cellranger command
        (None if 'dry_run' is set).
    """
    # Working directory
    if working_dir is None:
        working_dir = os.getcwd()
    # Check for existing cellranger outputs
    flow_cell_dir = os.path.join(working_dir,
                                 flow_cell_id(primary_data_dir))
    if lanes is not None:
        lanes_suffix = "_%s" % lanes.replace(',', '')
    else:
        lanes_suffix = ""
    flow_cell_dir = "%s%s" % (flow_cell_dir, lanes_suffix)
    mro_file = os.path.join(working_dir,
                            "__%s.mro" % os.path.basename(flow_cell_dir))
    if not dry_run:
        # Remove stale outputs from a previous run so cellranger
        # starts from a clean slate
        if os.path.exists(flow_cell_dir):
            logger.warning("Removing existing output directory: %s" %
                           flow_cell_dir)
            shutil.rmtree(flow_cell_dir)
        if os.path.exists(mro_file):
            logger.warning("Removing existing mro file: %s" % mro_file)
            os.remove(mro_file)
    # Construct the cellranger command
    cmd = Command(cellranger_exe,
                  "mkfastq",
                  "--samplesheet", sample_sheet,
                  "--run", primary_data_dir,
                  "--output-dir", output_dir,
                  "--qc")
    if lanes is not None:
        cmd.add_args("--lanes=%s" % lanes)
    if bases_mask is not None:
        cmd.add_args("--use-bases-mask=%s" % bases_mask)
    if ignore_dual_index:
        cmd.add_args("--ignore-dual-index")
    add_cellranger_args(cmd,
                        jobmode=cellranger_jobmode,
                        mempercore=cellranger_mempercore,
                        maxjobs=cellranger_maxjobs,
                        jobinterval=cellranger_jobinterval,
                        localcores=cellranger_localcores,
                        localmem=cellranger_localmem)
    # Run the command
    # NB print() form (not 'print' statement) for consistency
    # with the Python 3-style code elsewhere in this module;
    # output is identical under Python 2
    print("Running %s" % cmd)
    if not dry_run:
        # Normalise the working directory (already defaulted to
        # the cwd above, so only the abspath conversion is needed)
        working_dir = os.path.abspath(working_dir)
        # Make a log directory
        if log_dir is None:
            log_dir = os.getcwd()
        else:
            log_dir = os.path.abspath(log_dir)
        # Submit the job
        cellranger_mkfastq_job = SchedulerJob(
            SimpleJobRunner(join_logs=True),
            cmd.command_line,
            name='cellranger_mkfastq',
            working_dir=working_dir,
            log_dir=log_dir)
        cellranger_mkfastq_job.start()
        try:
            cellranger_mkfastq_job.wait()
        except KeyboardInterrupt:
            logger.warning("Keyboard interrupt, terminating cellranger")
            cellranger_mkfastq_job.terminate()
            # Bare 'raise' re-raises the interrupt with the
            # original traceback intact
            raise
        exit_code = cellranger_mkfastq_job.exit_code
        print("cellranger mkfastq completed: exit code %s" % exit_code)
        if exit_code != 0:
            logger.error("cellranger mkfastq exited with an error")
            return exit_code
        # Check outputs and QC summary report
        if not os.path.isdir(flow_cell_dir):
            logger.error("No output directory '%s'" % flow_cell_dir)
            return -1
        json_file = os.path.join(flow_cell_dir,
                                 "outs",
                                 "qc_summary.json")
        if not os.path.exists(json_file):
            logger.error("cellranger mkfastq failed to make "
                         "JSON QC summary file (%s not found)" %
                         json_file)
            return -1
        # Make HTML QC summary
        html_file = os.path.join(working_dir,
                                 "cellranger_qc_summary%s.html" %
                                 lanes_suffix)
        if os.path.exists(html_file):
            logger.warning("Removing existing HTML QC summary file: "
                           "%s" % html_file)
            os.remove(html_file)
        make_qc_summary_html(json_file, html_file)
        if not os.path.exists(html_file):
            logger.error("Failed to create HTML QC summary file "
                         "(%s not found)" % html_file)
            return -1
        return exit_code