Example #1
0
    def test_simple_scheduler_wait_for(self):
        """Wait for named jobs to complete

        """
        self.log_dir = tempfile.mkdtemp()
        sched = SimpleScheduler(runner=SimpleJobRunner(log_dir=self.log_dir),
                                poll_interval=0.01)
        sched.start()
        # Submit three jobs with different durations; none should have
        # completed immediately after submission
        jobs = [sched.submit(['sleep', t], name='sleep_%s' % t)
                for t in ('10', '30', '5')]
        for job in jobs:
            self.assertFalse(job.completed)
        # Block until the two shorter jobs have finished
        try:
            sched.wait_for(('sleep_5', 'sleep_10'), timeout=10)
        except SchedulerTimeout:
            sched.stop()
            for job in jobs:
                job.terminate()
            self.fail("'wait_for' timed out")
        # The two named jobs completed; the 30s job is still going
        job_10, job_30, job_5 = jobs
        self.assertTrue(job_10.completed)
        self.assertFalse(job_30.completed)
        self.assertTrue(job_5.completed)
        sched.stop()
Example #2
0
 def test_qcpipeline_with_strandedness(self):
     """QCPipeline: standard QC run with strandedness determination
     """
     # Install mock versions of the required external programs
     for mock, exe in ((MockIlluminaQcSh, "illumina_qc.sh"),
                       (MockMultiQC, "multiqc"),
                       (MockFastqStrandPy, "fastq_strand.py")):
         mock.create(os.path.join(self.bin, exe))
     os.environ['PATH'] = "%s:%s" % (self.bin, os.environ['PATH'])
     # Build a mock analysis project with paired-end Fastqs
     project = MockAnalysisProject(
         "PJB", ("PJB1_S1_R1_001.fastq.gz", "PJB1_S1_R2_001.fastq.gz",
                 "PJB2_S2_R1_001.fastq.gz", "PJB2_S2_R2_001.fastq.gz"),
         metadata={'Organism': 'Human'})
     project.create(top_dir=self.wd)
     # Configure and execute the QC pipeline with strandedness indexes
     pipeline = QCPipeline()
     pipeline.add_project(
         AnalysisProject("PJB", os.path.join(self.wd, "PJB")),
         multiqc=True)
     exit_status = pipeline.run(
         fastq_strand_indexes={'human': '/data/hg38/star_index'},
         poll_interval=0.5,
         max_jobs=1,
         runners={'default': SimpleJobRunner()})
     # Verify the pipeline succeeded and produced all expected outputs
     self.assertEqual(exit_status, 0)
     expected = ("qc", "qc_report.html",
                 "qc_report.PJB.%s.zip" % os.path.basename(self.wd),
                 "multiqc_report.html")
     for f in expected:
         self.assertTrue(os.path.exists(os.path.join(self.wd, "PJB", f)),
                         "Missing %s" % f)
Example #3
0
 def test_failing_job_with_simplejobrunner(self):
     """Test Job using SimpleJobRunner to run failing shell command
     """
     # Set up a command that is expected to fail (no matching files)
     script = "ls"
     script_args = ("*.whereisit", )
     job = Job(SimpleJobRunner(), "failing_cmd", self.working_dir,
               script, script_args)
     # Launch the job and give it time to finish
     job_id = job.start()
     time.sleep(1)
     job.update()
     # Static properties are unchanged by running the job
     self.assertEqual(job.name, "failing_cmd")
     self.assertEqual(job.working_dir, self.working_dir)
     self.assertEqual(job.script, script)
     self.assertEqual(job.args, script_args)
     self.assertEqual(job.label, None)
     self.assertEqual(job.group_label, None)
     self.assertEqual(job.job_id, job_id)
     # Runtime state should reflect a completed (but failed) job
     self.assertNotEqual(job.log, None)
     self.assertNotEqual(job.start_time, None)
     self.assertNotEqual(job.end_time, None)
     self.assertFalse(job.isRunning())
     self.assertEqual(job.exit_status, 2)
     self.assertFalse(job.errorState())
     self.assertEqual(job.status(), "Finished")
Example #4
0
 def test_qcpipeline(self):
     """QCPipeline: standard QC run
     """
     # Install mock versions of the required external programs
     for mock, exe in ((MockIlluminaQcSh, "illumina_qc.sh"),
                       (MockMultiQC, "multiqc")):
         mock.create(os.path.join(self.bin, exe))
     os.environ['PATH'] = "%s:%s" % (self.bin, os.environ['PATH'])
     # Build a mock analysis project with paired-end Fastqs
     project = MockAnalysisProject(
         "PJB", ("PJB1_S1_R1_001.fastq.gz", "PJB1_S1_R2_001.fastq.gz",
                 "PJB2_S2_R1_001.fastq.gz", "PJB2_S2_R2_001.fastq.gz"))
     project.create(top_dir=self.wd)
     # Configure and execute the QC pipeline
     pipeline = QCPipeline()
     pipeline.add_project(
         AnalysisProject("PJB", os.path.join(self.wd, "PJB")),
         multiqc=True)
     exit_status = pipeline.run(poll_interval=0.5,
                                max_jobs=1,
                                runners={'default': SimpleJobRunner()})
     # Verify the pipeline succeeded and produced all expected outputs
     self.assertEqual(exit_status, 0)
     expected = ("qc", "qc_report.html",
                 "qc_report.PJB.%s.zip" % os.path.basename(self.wd),
                 "multiqc_report.html")
     for f in expected:
         self.assertTrue(os.path.exists(os.path.join(self.wd, "PJB", f)),
                         "Missing %s" % f)
Example #5
0
 def test_qcpipeline_with_batching_fails_for_missing_outputs(self):
     """QCPipeline: standard QC run with batching fails for missing outputs
     """
     # Mock illumina_qc.sh is rigged to fail and to skip FastQC output
     MockIlluminaQcSh.create(os.path.join(self.bin, "illumina_qc.sh"),
                             fastqc=False,
                             exit_code=1)
     MockMultiQC.create(os.path.join(self.bin, "multiqc"))
     os.environ['PATH'] = "%s:%s" % (self.bin, os.environ['PATH'])
     # Build a mock analysis project with paired-end Fastqs
     project = MockAnalysisProject(
         "PJB", ("PJB1_S1_R1_001.fastq.gz", "PJB1_S1_R2_001.fastq.gz",
                 "PJB2_S2_R1_001.fastq.gz", "PJB2_S2_R2_001.fastq.gz"))
     project.create(top_dir=self.wd)
     # Configure and execute the QC pipeline with batching enabled
     pipeline = QCPipeline()
     pipeline.add_project(
         AnalysisProject("PJB", os.path.join(self.wd, "PJB")),
         multiqc=True)
     exit_status = pipeline.run(poll_interval=0.5,
                                max_jobs=1,
                                batch_size=3,
                                runners={'default': SimpleJobRunner()})
     # Pipeline must fail: the qc dir exists but no reports were made
     self.assertEqual(exit_status, 1)
     self.assertTrue(os.path.exists(os.path.join(self.wd, "PJB", "qc")),
                     "Missing 'qc'")
     for f in ("qc_report.html",
               "qc_report.PJB.%s.zip" % os.path.basename(self.wd),
               "multiqc_report.html"):
         self.assertFalse(os.path.exists(os.path.join(self.wd, "PJB", f)),
                          "Found %s, shouldn't be present" % f)
Example #6
0
 def test_job_with_simplejobrunner(self):
     """Test Job using SimpleJobRunner to run basic shell command
     """
     # Command to run
     script = "sleep"
     script_args = ("2", )
     job = Job(SimpleJobRunner(), "shell_cmd", self.working_dir,
               script, script_args)

     def check_invariants():
         # Properties that never change over the job's lifetime
         self.assertEqual(job.name, "shell_cmd")
         self.assertEqual(job.working_dir, self.working_dir)
         self.assertEqual(job.script, script)
         self.assertEqual(job.args, script_args)
         self.assertEqual(job.label, None)
         self.assertEqual(job.group_label, None)

     # Before starting: no id, no log, no times, not running
     check_invariants()
     self.assertEqual(job.job_id, None)
     self.assertEqual(job.log, None)
     self.assertEqual(job.start_time, None)
     self.assertEqual(job.end_time, None)
     self.assertEqual(job.exit_status, None)
     self.assertFalse(job.isRunning())
     self.assertFalse(job.errorState())
     self.assertEqual(job.status(), "Waiting")
     # While running: id, log and start time set, no end time yet
     job_id = job.start()
     time.sleep(1)
     check_invariants()
     self.assertEqual(job.job_id, job_id)
     self.assertNotEqual(job.log, None)
     self.assertNotEqual(job.start_time, None)
     self.assertEqual(job.end_time, None)
     self.assertEqual(job.exit_status, None)
     self.assertTrue(job.isRunning())
     self.assertFalse(job.errorState())
     self.assertEqual(job.status(), "Running")
     # After completion: end time set and a clean exit status
     time.sleep(2)
     job.update()
     check_invariants()
     self.assertEqual(job.job_id, job_id)
     self.assertNotEqual(job.log, None)
     self.assertNotEqual(job.start_time, None)
     self.assertNotEqual(job.end_time, None)
     self.assertFalse(job.isRunning())
     self.assertEqual(job.exit_status, 0)
     self.assertFalse(job.errorState())
     self.assertEqual(job.status(), "Finished")
Example #7
0
 def test_scheduler_job_wait_timeout_raises_exception(self):
     """SchedulerJob raises exception if 'wait' timeout exceeded
     """
     self.log_dir = tempfile.mkdtemp()
     # Job that runs far longer than the wait timeout below
     long_job = SchedulerJob(SimpleJobRunner(log_dir=self.log_dir),
                             ['sleep', '1000'])
     long_job.start()
     # Waiting for only 5 seconds must raise SchedulerTimeout
     self.assertRaises(SchedulerTimeout,
                       long_job.wait,
                       poll_interval=0.01,
                       timeout=5)
Example #8
0
 def test_scheduler_job_wait(self):
     """Wait for SchedulerJob to complete
     """
     self.log_dir = tempfile.mkdtemp()
     # Short-lived job: 'wait' should return well inside the timeout
     short_job = SchedulerJob(SimpleJobRunner(log_dir=self.log_dir),
                              ['sleep', '5'])
     self.assertFalse(short_job.completed)
     try:
         short_job.start()
         short_job.wait(poll_interval=0.01, timeout=10)
     except SchedulerTimeout:
         self.fail("'wait' timed out")
     self.assertTrue(short_job.completed)
Example #9
0
 def test_qcpipeline_non_default_log_dir(self):
     """QCPipeline: standard QC run using non-default log dir
     """
     # Install mock versions of the required external programs
     MockIlluminaQcSh.create(os.path.join(self.bin, "illumina_qc.sh"))
     MockMultiQC.create(os.path.join(self.bin, "multiqc"))
     os.environ['PATH'] = "%s:%s" % (self.bin, os.environ['PATH'])
     # Build a mock analysis project (single sample, paired-end)
     project = MockAnalysisProject(
         "PJB", ("PJB1_S1_R1_001.fastq.gz", "PJB1_S1_R2_001.fastq.gz"))
     project.create(top_dir=self.wd)
     # Choose a log dir which doesn't exist yet
     log_dir = os.path.join(self.wd, "logs")
     self.assertFalse(os.path.exists(log_dir),
                      "Log dir '%s' already exists" % log_dir)
     # Configure and execute the QC pipeline with the custom log dir
     pipeline = QCPipeline()
     pipeline.add_project(
         AnalysisProject("PJB", os.path.join(self.wd, "PJB")),
         multiqc=True,
         log_dir=log_dir)
     exit_status = pipeline.run(poll_interval=0.5,
                                max_jobs=1,
                                runners={'default': SimpleJobRunner()})
     # Verify the pipeline succeeded and produced all expected outputs
     self.assertEqual(exit_status, 0)
     self.assertTrue(os.path.isdir(os.path.join(self.wd, "PJB", "qc")),
                     "'qc' directory doesn't exist, but should")
     for f in ("qc_report.html",
               "qc_report.PJB.%s.zip" % os.path.basename(self.wd),
               "multiqc_report.html"):
         self.assertTrue(os.path.exists(os.path.join(self.wd, "PJB", f)),
                         "Missing %s" % f)
     # Verify the custom log directory was created
     self.assertTrue(os.path.exists(log_dir),
                     "Log dir '%s' not found" % log_dir)
Example #10
0
def run_cellranger_mkfastq(sample_sheet,
                           primary_data_dir,
                           output_dir,
                           lanes=None,
                           bases_mask=None,
                           cellranger_jobmode=None,
                           cellranger_maxjobs=None,
                           cellranger_mempercore=None,
                           cellranger_jobinterval=None,
                           log_dir=None,
                           dry_run=False):
    """
    Wrapper for running 'cellranger mkfastq'

    Runs the 10xGenomics 'cellranger mkfastq' command to
    generate Fastqs from bcl files for Chromium single-cell
    data.

    Arguments:
      sample_sheet (str): path to input samplesheet with
        10xGenomics barcode indices
      primary_data_dir (str): path to the top-level
        directory holding the sequencing data
      output_dir (str): path to the output directory
      lanes (str): optional, specify the subset of lanes
        to process (default is to process all lanes
        in the run)
      bases_mask (str): optional, specify an alternative
        bases mask setting (default is to let cellranger
        determine the bases mask automatically)
      cellranger_jobmode (str): specify the job mode to
        pass to cellranger (default: None)
      cellranger_maxjobs (int): specify the maximum
        number of jobs to pass to cellranger (default:
        None)
      cellranger_mempercore (int): specify the memory
        per core (in Gb) to pass to cellranger (default:
        None)
      cellranger_jobinterval (int): specify the interval
        between launching jobs (in ms) to pass to
        cellranger (default: None)
      log_dir (str): path to a directory to write logs
        (default: current working directory)
      dry_run (bool): if True then only report actions
        that would be performed but don't run anything

    Returns:
      Integer: exit code from the cellranger command, or
        -1 if the expected output directory was not found
        (None if 'dry_run' is set).
    """
    # Construct the cellranger command
    cmd = Command("cellranger", "mkfastq", "--samplesheet", sample_sheet,
                  "--run", primary_data_dir, "--output-dir", output_dir)
    if lanes is not None:
        cmd.add_args("--lanes=%s" % lanes)
    if bases_mask is not None:
        cmd.add_args("--use-bases-mask=%s" % bases_mask)
    add_cellranger_args(cmd,
                        jobmode=cellranger_jobmode,
                        mempercore=cellranger_mempercore,
                        maxjobs=cellranger_maxjobs,
                        jobinterval=cellranger_jobinterval)
    # Run the command
    # NB use print() function form for Python 3 compatibility
    print("Running %s" % cmd)
    if not dry_run:
        # Make a log directory
        if log_dir is None:
            log_dir = os.getcwd()
        else:
            log_dir = os.path.abspath(log_dir)
        # Submit the job
        cellranger_mkfastq_job = SchedulerJob(SimpleJobRunner(join_logs=True),
                                              cmd.command_line,
                                              name='cellranger_mkfastq',
                                              working_dir=os.getcwd(),
                                              log_dir=log_dir)
        cellranger_mkfastq_job.start()
        try:
            cellranger_mkfastq_job.wait()
        except KeyboardInterrupt:
            # User interrupted the run: stop cellranger then
            # re-raise with the original traceback (bare 'raise'
            # instead of 'raise ex', which would reset it)
            logger.warning("Keyboard interrupt, terminating cellranger")
            cellranger_mkfastq_job.terminate()
            raise
        exit_code = cellranger_mkfastq_job.exit_code
        print("cellranger mkfastq completed: exit code %s" % exit_code)
        if exit_code != 0:
            logger.error("cellranger mkfastq exited with an error")
            return exit_code
        # Deal with the QC summary report
        flow_cell_dir = flow_cell_id(primary_data_dir)
        if lanes is not None:
            lanes_suffix = "_%s" % lanes.replace(',', '')
        else:
            lanes_suffix = ""
        flow_cell_dir = "%s%s" % (flow_cell_dir, lanes_suffix)
        if not os.path.isdir(flow_cell_dir):
            logger.error("No output directory '%s'" % flow_cell_dir)
            return -1
        json_file = os.path.join(flow_cell_dir, "outs", "qc_summary.json")
        html_file = "cellranger_qc_summary%s.html" % lanes_suffix
        make_qc_summary_html(json_file, html_file)
        return exit_code
Example #11
0
         nthreads_star = int(math.ceil(32.0/mempercore))
     print("-- Threads for STAR: %s" % nthreads_star)
     # Remove limit on number of jobs
     print("-- Set maximum no of jobs to 'unlimited'")
     max_jobs = None
     # (Re)set cellranger parameters for --local
     print("-- Cellranger will run in jobmode 'local'")
     cellranger_jobmode = "local"
     cellranger_mempercore = None
     cellranger_jobinterval = None
     cellranger_localcores = min(max_cores,16)
     cellranger_localmem = max_mem
     print("-- Cellranger localcores: %s" % cellranger_localcores)
     print("-- Cellranger localmem  : %s" % cellranger_localmem)
     # Set up local runners
     default_runner = SimpleJobRunner()
     runners = {
         'cellranger_runner': SimpleJobRunner(nslots=cellranger_localcores),
         'fastqc_runner': SimpleJobRunner(nslots=nthreads),
         'fastq_screen_runner': SimpleJobRunner(nslots=nthreads),
         'star_runner': SimpleJobRunner(nslots=nthreads_star),
         'verify_runner': default_runner,
         'report_runner': default_runner,
     }
 else:
     # Set up according to the configuration and
     # command line options
     # Set number of threads for QC jobs
     if args.nthreads:
         nthreads = args.nthreads
     else:
Example #12
0
 def test_pipelinerunner(self):
     # Queue a single 'ls -l' job and run the pipeline to completion
     pipeline = PipelineRunner(SimpleJobRunner(), poll_interval=1)
     pipeline.queueJob(self.working_dir, 'ls', '-l')
     pipeline.run(blocking=True)
Example #13
0
def main():
    """
    Transfer copies of Fastq data from an analysis project.

    Command-line entry point: parses the arguments, resolves the
    destination (either a pre-defined destination from the config
    file or an arbitrary '[[USER@]HOST:]DIR' location) and the
    project directory, optionally generates a README from a
    template, then uses a scheduler to copy the Fastqs — plus an
    optional downloader script, zipped QC reports and tar-gzipped
    10xGenomics outputs — to the target.

    Returns:
      Integer: 1 on error (some error paths use a bare 'return',
        i.e. None); otherwise falls through with no explicit
        return value after reporting success.
    """
    # Load configuration
    settings = Settings()

    # Collect defaults
    default_runner = settings.runners.rsync

    # Get pre-defined destinations
    destinations = [name for name in settings.destination]

    # Command line
    p = argparse.ArgumentParser(
        description="Transfer copies of Fastq data from an analysis "
        "project to an arbitrary destination for sharing with other "
        "people")
    p.add_argument('--version',
                   action='version',
                   version=("%%(prog)s %s" % get_version()))
    p.add_argument('--subdir',
                   action='store',
                   choices=('random_bin', 'run_id'),
                   default=None,
                   help="subdirectory naming scheme: 'random_bin' "
                   "locates a random pre-existing empty subdirectory "
                   "under the target directory; 'run_id' creates a "
                   "new subdirectory "
                   "'PLATFORM_DATESTAMP.RUN_ID-PROJECT'. If this "
                   "option is not set then no subdirectory will be "
                   "used")
    p.add_argument('--readme',
                   action='store',
                   metavar='README_TEMPLATE',
                   dest='readme_template',
                   help="template file to generate README file from; "
                   "can be full path to a template file, or the name "
                   "of a file in the 'templates' directory")
    p.add_argument('--weburl',
                   action='store',
                   help="base URL for webserver (sets the value of "
                   "the WEBURL variable in the template README)")
    p.add_argument('--include_downloader',
                   action='store_true',
                   help="copy the 'download_fastqs.py' utility to the "
                   "final location")
    p.add_argument('--include_qc_report',
                   action='store_true',
                   help="copy the zipped QC reports to the final "
                   "location")
    p.add_argument('--include_10x_outputs',
                   action='store_true',
                   help="copy outputs from 10xGenomics pipelines (e.g. "
                   "'cellranger count') to the final location")
    p.add_argument('--link',
                   action='store_true',
                   help="hard link files instead of copying")
    p.add_argument('--runner',
                   action='store',
                   help="specify the job runner to use for executing "
                   "the checksumming, Fastq copy and tar gzipping "
                   "operations (defaults to job runner defined for "
                   "copying in config file [%s])" % default_runner)
    p.add_argument('dest',
                   action='store',
                   metavar="DEST",
                   help="destination to copy Fastqs to; can be the "
                   "name of a destination defined in the configuration "
                   "file, or an arbitrary location of the form "
                   "'[[USER@]HOST:]DIR' (%s)" %
                   (("available destinations: %s" %
                     (','.join("'%s'" % d for d in sorted(destinations))))
                    if destinations else "no destinations currently defined"))
    p.add_argument('project',
                   action='store',
                   metavar="PROJECT",
                   help="path to project directory (or to a Fastqs "
                   "subdirectory in a project) to copy Fastqs from")

    # Process command line
    args = p.parse_args()

    # Check if target is pre-defined destination
    if args.dest in destinations:
        print("Loading settings for destination '%s'" % args.dest)
        dest = settings.destination[args.dest]
        target_dir = dest.directory
        readme_template = dest.readme_template
        subdir = dest.subdir
        include_downloader = dest.include_downloader
        include_qc_report = dest.include_qc_report
        hard_links = dest.hard_links
        weburl = dest.url
    else:
        # Arbitrary destination: no extras unless set on command line
        target_dir = args.dest
        readme_template = None
        subdir = None
        include_downloader = False
        include_qc_report = False
        hard_links = False
        weburl = None

    # Update defaults with command line values
    if args.readme_template:
        readme_template = args.readme_template
    if args.subdir:
        subdir = args.subdir
    if args.include_downloader:
        include_downloader = True
    if args.include_qc_report:
        include_qc_report = True
    if args.weburl:
        weburl = args.weburl
    if args.link:
        hard_links = args.link

    # Sort out project directory
    project = AnalysisProject(args.project)
    if not project.is_analysis_dir:
        # Assume it's the Fastq dir
        fastq_dir = os.path.basename(args.project)
        project = AnalysisProject(os.path.dirname(args.project))
    else:
        fastq_dir = None
    if not project.is_analysis_dir:
        logger.error("'%s': project not found" % args.project)
        return 1
    project_name = project.name

    # Parent analysis directory
    analysis_dir = AnalysisDir(os.path.dirname(project.dirn))

    # Fastqs directory
    try:
        project.use_fastq_dir(fastq_dir)
    except Exception as ex:
        logger.error("'%s': failed to load Fastq set '%s': %s" %
                     (project.name, fastq_dir, ex))
        return 1

    # Report
    print("Transferring data from '%s' (%s)" % (project.name, project.dirn))
    print("Fastqs in %s" % project.fastq_dir)

    # Summarise samples and Fastqs
    samples = set()
    nfastqs = 0
    fsize = 0
    for sample in project.samples:
        samples.add(sample.name)
        for fq in sample.fastq:
            # lstat: don't follow symlinks when totting up sizes
            fsize += os.lstat(fq).st_size
            nfastqs += 1
    nsamples = len(samples)
    dataset = "%s%s dataset" % ("%s " % project.info.single_cell_platform
                                if project.info.single_cell_platform else '',
                                project.info.library_type)
    endedness = "paired-end" if project.info.paired_end else "single-end"
    print("%s with %d Fastqs from %d %s sample%s totalling %s" %
          (dataset, nfastqs, nsamples, endedness, 's' if nsamples != 1 else '',
           format_file_size(fsize)))

    # Check target dir
    if not Location(target_dir).is_remote:
        target_dir = os.path.abspath(target_dir)
    if not exists(target_dir):
        # NOTE(review): bare 'return' (None) here, whereas other
        # error paths return 1 — confirm intended exit status
        print("'%s': target directory not found" % target_dir)
        return
    else:
        print("Target directory %s" % target_dir)

    # Locate downloader
    if include_downloader:
        print("Locating downloader for inclusion")
        downloader = find_program("download_fastqs.py")
        if downloader is None:
            # NOTE(review): uses 'logging.error' here but
            # 'logger.error' elsewhere — confirm intended
            logging.error("Unable to locate download_fastqs.py")
            return 1
        print("... found %s" % downloader)
    else:
        downloader = None

    # Locate zipped QC report
    if include_qc_report:
        print("Locating zipped QC reports for inclusion")
        qc_zips = list()
        # Check QC directories and look for zipped reports
        for qc_dir in project.qc_dirs:
            # Get the associated Fastq set
            # NB only compare the basename of the Fastq dir
            # in case full paths weren't updated
            fq_set = os.path.basename(project.qc_info(qc_dir).fastq_dir)
            if fq_set == os.path.basename(project.fastq_dir):
                # Try both run-based and analysis-dir-based zip names
                for qc_base in (
                        "%s_report.%s.%s" %
                    (qc_dir, project.name, project.info.run),
                        "%s_report.%s.%s" %
                    (qc_dir, project.name,
                     os.path.basename(analysis_dir.analysis_dir)),
                ):
                    qc_zip = os.path.join(project.dirn, "%s.zip" % qc_base)
                    if os.path.exists(qc_zip):
                        print("... found %s" % qc_zip)
                        qc_zips.append(qc_zip)
        if not qc_zips:
            logger.error("No zipped QC reports found")
            return 1
    else:
        qc_zips = None

    # Locate 10xGenomics outputs
    if args.include_10x_outputs:
        print("Locating outputs from 10xGenomics pipelines for " "inclusion")
        cellranger_dirs = list()
        for d in (
                'cellranger_count',
                'cellranger_multi',
        ):
            cellranger_dir = os.path.join(project.dirn, d)
            if os.path.isdir(cellranger_dir):
                print("... found %s" % cellranger_dir)
                cellranger_dirs.append(cellranger_dir)
        if not cellranger_dirs:
            logger.error("No outputs from 10xGenomics pipelines found")
            return 1
    else:
        cellranger_dirs = None

    # Determine subdirectory
    if subdir == "random_bin":
        # Find a random empty directory under the
        # target directory
        print("Locating random empty bin")
        subdirs = [
            d for d in os.listdir(target_dir)
            if os.path.isdir(os.path.join(target_dir, d))
        ]
        if not subdirs:
            print("Failed to locate subdirectories")
            return
        # Shuffle then take the first empty one found
        shuffle(subdirs)
        subdir = None
        for d in subdirs:
            if not os.listdir(os.path.join(target_dir, d)):
                # Empty bin
                subdir = d
                break
        if subdir is None:
            print("Failed to locate empty subdirectory")
            return
        print("... found '%s'" % subdir)
        # Update target dir
        target_dir = os.path.join(target_dir, subdir)
    elif subdir == "run_id":
        # Construct subdirectory name based on the
        # run ID
        subdir = "{platform}_{datestamp}.{run_number}-{project}".format(
            platform=analysis_dir.metadata.platform.upper(),
            datestamp=analysis_dir.metadata.instrument_datestamp,
            run_number=analysis_dir.metadata.run_number,
            project=project.name)
        # Check it doesn't already exist
        if exists(os.path.join(target_dir, subdir)):
            logger.error("'%s': subdirectory already exists" % subdir)
            return
        print("Using subdirectory '%s'" % subdir)
        # Update target dir
        target_dir = os.path.join(target_dir, subdir)

    # Make target directory
    if not exists(target_dir):
        mkdir(target_dir)

    # Get runner for copy job
    if args.runner:
        runner = fetch_runner(args.runner)
    else:
        runner = default_runner

    # Set identifier for jobs
    job_id = "%s%s" % (project_name,
                       (".%s" % fastq_dir if fastq_dir is not None else ''))

    # Set the working directory
    working_dir = os.path.abspath("transfer.%s.%s" %
                                  (job_id, int(time.time())))
    mkdir(working_dir)
    print("Created working dir %s" % working_dir)

    # Construct the README
    if readme_template:
        # Check that template file exists
        print("Locating README template")
        template = None
        for filen in (
                readme_template,
                os.path.join(get_templates_dir(), readme_template),
        ):
            if os.path.exists(filen):
                template = filen
                break
        if template is None:
            logger.error("'%s': template file not found" % readme_template)
            return 1
        else:
            readme_template = template
        print("... found %s" % readme_template)
        # Read in template
        with open(readme_template, 'rt') as fp:
            readme = fp.read()
        # Substitute template variables
        template_vars = {
            'PLATFORM': analysis_dir.metadata.platform.upper(),
            'RUN_NUMBER': analysis_dir.metadata.run_number,
            'DATESTAMP': analysis_dir.metadata.instrument_datestamp,
            'PROJECT': project_name,
            'WEBURL': weburl,
            'BIN': subdir,
            'DIR': target_dir,
            'TODAY': date.today().strftime("%d/%m/%Y"),
        }
        for var in template_vars:
            value = template_vars[var]
            if value is None:
                # Unset variables render as '?' in the README
                value = '?'
            else:
                value = str(value)
            readme = re.sub(r"%{var}%".format(var=var), value, readme)
        # Write out a temporary README file
        readme_file = os.path.join(working_dir, "README")
        with open(readme_file, 'wt') as fp:
            fp.write(readme)
    else:
        # No README
        readme_file = None

    # Start a scheduler to run jobs
    sched = SimpleScheduler(runner=runner,
                            reporter=TransferDataSchedulerReporter(),
                            poll_interval=settings.general.poll_interval)
    sched.start()

    # Build command to run manage_fastqs.py
    copy_cmd = Command("manage_fastqs.py")
    if hard_links:
        copy_cmd.add_args("--link")
    copy_cmd.add_args(analysis_dir.analysis_dir, project_name)
    if fastq_dir is not None:
        copy_cmd.add_args(fastq_dir)
    copy_cmd.add_args("copy", target_dir)
    print("Running %s" % copy_cmd)
    copy_job = sched.submit(copy_cmd.command_line,
                            name="copy.%s" % job_id,
                            wd=working_dir)

    # Copy README
    if readme_file is not None:
        print("Copying README file")
        copy_cmd = copy_command(readme_file,
                                os.path.join(target_dir, "README"))
        sched.submit(copy_cmd.command_line,
                     name="copy.%s.readme" % job_id,
                     runner=SimpleJobRunner(),
                     wd=working_dir)

    # Copy download_fastqs.py
    if downloader:
        print("Copying downloader")
        copy_cmd = copy_command(
            downloader, os.path.join(target_dir, os.path.basename(downloader)))
        sched.submit(copy_cmd.command_line,
                     name="copy.%s.downloader" % job_id,
                     runner=SimpleJobRunner(),
                     wd=working_dir)

    # Copy QC reports
    if qc_zips:
        for qc_zip in qc_zips:
            print("Copying '%s'" % os.path.basename(qc_zip))
            copy_cmd = copy_command(qc_zip,
                                    os.path.join(target_dir,
                                                 os.path.basename(qc_zip)),
                                    link=hard_links)
            sched.submit(copy_cmd.command_line,
                         name="copy.%s.%s" %
                         (job_id, os.path.basename(qc_zip)),
                         runner=SimpleJobRunner(),
                         wd=working_dir)

    # Tar and copy 10xGenomics outputs
    if cellranger_dirs:
        for cellranger_dir in cellranger_dirs:
            print("Tar gzipping and copying '%s'" %
                  os.path.basename(cellranger_dir))
            # Tar & gzip data
            targz = os.path.join(
                working_dir,
                "%s.%s.%s.tgz" % (os.path.basename(cellranger_dir),
                                  project_name, project.info.run))
            targz_cmd = Command("tar", "czvhf", targz, "-C",
                                os.path.dirname(cellranger_dir),
                                os.path.basename(cellranger_dir))
            print("Running %s" % targz_cmd)
            targz_job = sched.submit(
                targz_cmd.command_line,
                name="targz.%s.%s" %
                (job_id, os.path.basename(cellranger_dir)),
                wd=working_dir)
            # Copy the targz file
            # NOTE(review): rebinds 'copy_job' — the final exit-code
            # check below will then inspect the last tar-gz copy job,
            # not the Fastq copy job submitted earlier; confirm intended
            copy_cmd = copy_command(
                targz, os.path.join(target_dir, os.path.basename(targz)))
            print("Running %s" % copy_cmd)
            copy_job = sched.submit(copy_cmd.command_line,
                                    name="copytgz.%s.%s" %
                                    (job_id, os.path.basename(cellranger_dir)),
                                    runner=SimpleJobRunner(),
                                    wd=working_dir,
                                    wait_for=(targz_job.job_name, ))

    # Wait for scheduler jobs to complete
    sched.wait()

    # Check exit code for Fastq copying
    exit_code = copy_job.exit_code
    if exit_code != 0:
        logger.error("File copy exited with an error")
        return exit_code
    else:
        print("Files now at %s" % target_dir)
        if weburl:
            url = weburl
            if subdir is not None:
                url = os.path.join(url, subdir)
            print("URL: %s" % url)
        print("Done")
def run_cellranger_mkfastq(sample_sheet,
                           primary_data_dir,
                           output_dir,
                           lanes=None,
                           bases_mask=None,
                           ignore_dual_index=False,
                           cellranger_exe='cellranger',
                           cellranger_jobmode='local',
                           cellranger_maxjobs=None,
                           cellranger_mempercore=None,
                           cellranger_jobinterval=None,
                           cellranger_localcores=None,
                           cellranger_localmem=None,
                           working_dir=None,
                           log_dir=None,
                           dry_run=False):
    """
    Wrapper for running 'cellranger mkfastq'

    Runs the 10xGenomics 'cellranger mkfastq' command to
    generate Fastqs from bcl files for Chromium single-cell
    data.

    To run the 'mkfastq' command using a different version
    of cellranger (e.g. cellranger-atac), specify the
    cellranger executable using the 'cellranger_exe'
    argument.

    Arguments:
      sample_sheet (str): path to input samplesheet with
        10xGenomics barcode indices
      primary_data_dir (str): path to the top-level
        directory holding the sequencing data
      output_dir (str): path to the output directory
      lanes (str): optional, specify the subset of lanes
        to process (default is to process all lanes
        in the run)
      bases_mask (str): optional, specify an alternative
        bases mask setting (default is to let cellranger
        determine the bases mask automatically)
      ignore_dual_index (bool): optional, on a dual-indexed
        flowcell where the second index was not used for
        the 10x sample, ignore it
      cellranger_exe (str): optional, name or path to
        cellranger executable (default: "cellranger")
      cellranger_jobmode (str): specify the job mode to
        pass to cellranger (default: "local")
      cellranger_maxjobs (int): specify the maximum
        number of jobs to pass to cellranger (default:
        None)
      cellranger_mempercore (int): specify the memory
        per core (in Gb) to pass to cellranger (default:
        None)
      cellranger_jobinterval (int): specify the interval
        between launching jobs (in ms) to pass to
        cellranger (default: None)
      cellranger_localcores (int): maximum number of cores
        cellranger can request in jobmode 'local'
        (default: None)
      cellranger_localmem (int): maximum memory cellranger
        can request in jobmode 'local' (default: None)
      working_dir (str): path to a directory to use as
        as the working directory (default: current
        working directory)
      log_dir (str): path to a directory to write logs
        (default: current working directory)
      dry_run (bool): if True then only report actions
        that would be performed but don't run anything

    Returns:
      Integer: exit code from the cellranger command
        (or -1 if expected outputs are missing); None
        if 'dry_run' was specified.
    """
    # Working directory (default to current directory)
    if working_dir is None:
        working_dir = os.getcwd()
    # Check for existing cellranger outputs
    flow_cell_dir = os.path.join(working_dir,
                                 flow_cell_id(primary_data_dir))
    if lanes is not None:
        lanes_suffix = "_%s" % lanes.replace(',','')
    else:
        lanes_suffix = ""
    flow_cell_dir = "%s%s" % (flow_cell_dir,lanes_suffix)
    mro_file = os.path.join(working_dir,
                            "__%s.mro" %
                            os.path.basename(flow_cell_dir))
    if not dry_run:
        # Remove stale outputs so cellranger starts clean
        if os.path.exists(flow_cell_dir):
            logger.warning("Removing existing output directory: %s" %
                           flow_cell_dir)
            shutil.rmtree(flow_cell_dir)
        if os.path.exists(mro_file):
            logger.warning("Removing existing mro file: %s" % mro_file)
            os.remove(mro_file)
    # Construct the cellranger command
    cmd = Command(cellranger_exe,
                  "mkfastq",
                  "--samplesheet",sample_sheet,
                  "--run",primary_data_dir,
                  "--output-dir",output_dir,
                  "--qc")
    if lanes is not None:
        cmd.add_args("--lanes=%s" % lanes)
    if bases_mask is not None:
        cmd.add_args("--use-bases-mask=%s" % bases_mask)
    if ignore_dual_index:
        cmd.add_args("--ignore-dual-index")
    add_cellranger_args(cmd,
                        jobmode=cellranger_jobmode,
                        mempercore=cellranger_mempercore,
                        maxjobs=cellranger_maxjobs,
                        jobinterval=cellranger_jobinterval,
                        localcores=cellranger_localcores,
                        localmem=cellranger_localmem)
    # Run the command
    print("Running %s" % cmd)
    if not dry_run:
        # Normalise working directory to an absolute path
        # (guaranteed non-None at this point, see above)
        working_dir = os.path.abspath(working_dir)
        # Make a log directory
        if log_dir is None:
            log_dir = os.getcwd()
        else:
            log_dir = os.path.abspath(log_dir)
        # Submit the job
        cellranger_mkfastq_job = SchedulerJob(
            SimpleJobRunner(join_logs=True),
            cmd.command_line,
            name='cellranger_mkfastq',
            working_dir=working_dir,
            log_dir=log_dir)
        cellranger_mkfastq_job.start()
        try:
            cellranger_mkfastq_job.wait()
        except KeyboardInterrupt:
            # Stop the cellranger job before propagating the
            # interrupt; bare 'raise' preserves the traceback
            logger.warning("Keyboard interrupt, terminating cellranger")
            cellranger_mkfastq_job.terminate()
            raise
        exit_code = cellranger_mkfastq_job.exit_code
        print("cellranger mkfastq completed: exit code %s" % exit_code)
        if exit_code != 0:
            logger.error("cellranger mkfastq exited with an error")
            return exit_code
        # Check outputs and QC summary report
        if not os.path.isdir(flow_cell_dir):
            logger.error("No output directory '%s'" % flow_cell_dir)
            return -1
        json_file = os.path.join(flow_cell_dir,
                                 "outs",
                                 "qc_summary.json")
        if not os.path.exists(json_file):
            logger.error("cellranger mkfastq failed to make "
                         "JSON QC summary file (%s not found)"
                         % json_file)
            return -1
        # Make HTML QC summary
        html_file = os.path.join(working_dir,
                                 "cellranger_qc_summary%s.html" %
                                 lanes_suffix)
        if os.path.exists(html_file):
            logger.warning("Removing existing HTML QC summary file: %s"
                           % html_file)
            os.remove(html_file)
        make_qc_summary_html(json_file,html_file)
        if not os.path.exists(html_file):
            logger.error("Failed to create HTML QC summary file "
                         "(%s not found)" % html_file)
            return -1
        return exit_code