def run_cellranger_count(fastq_dir,
                         reference_data_path,
                         chemistry='auto',
                         cellranger_exe='cellranger',
                         cellranger_jobmode='local',
                         cellranger_maxjobs=None,
                         cellranger_mempercore=None,
                         cellranger_jobinterval=None,
                         cellranger_localcores=None,
                         cellranger_localmem=None,
                         max_jobs=4,
                         log_dir=None,
                         dry_run=False,
                         summary_only=True):
    """
    Wrapper for running 'cellranger count'

    Runs the 10xGenomics 'cellranger count' command to
    perform single library analysis on Fastqs from
    Chromium single-cell samples.

    If the supplied 'fastq_dir' is a 'cellranger mkfastq'
    or 'bcl2fastq' output directory then the analysis
    will be run for each of the projects.

    Samples which already have a
    '<project>/cellranger_count/<sample>/outs' directory
    are skipped: no job is submitted and no outputs are
    collected for them, but they are still linked from
    the per-project report.

    Arguments:
      fastq_dir (str): path of the 'fastq_path' folder
        from 'cellranger mkfastq', or the output folder
        from 'bcl2fastq' (or with a similar structure),
        or any folder containing Fastq files
      reference_data_path (str): path to the cellranger
        compatible transcriptome reference data
        directory (for scRNA-seq) or ATAC reference
        genome data (for scATAC-seq)
      chemistry (str): assay configuration (set to
        'auto' to let cellranger determine this
        automatically; ignored if not scRNA-seq)
      cellranger_exe (str): optional, name or path to
        cellranger executable (default: "cellranger")
      cellranger_jobmode (str): specify the job mode to
        pass to cellranger (default: "local")
      cellranger_maxjobs (int): specify the maximum
        number of jobs to pass to cellranger (default:
        None)
      cellranger_mempercore (int): specify the memory
        per core (in Gb) to pass to cellranger (default:
        None)
      cellranger_jobinterval (int): specify the interval
        between launching jobs (in ms) to pass to
        cellranger (default: None)
      cellranger_localcores (int): maximum number of cores
        cellranger can request in jobmode 'local'
        (default: None)
      cellranger_localmem (int): maximum memory cellranger
        can request in jobmode 'local' (default: None)
      max_jobs (int): maximum number of concurrent
        count jobs to run (default: 4)
      log_dir (str): path to a directory to write logs
        (default: current working directory)
      dry_run (bool): if True then only report actions
        that would be performed but don't run anything
        (no jobs are submitted and no log or working
        directories are created)
      summary_only (bool): if True then only collect
        the output 'web_summary.html' and
        'metrics_summary.csv' files, otherwise
        copy all outputs (warning: this can be very
        large)

    Returns:
      Integer: zero on success; non-zero if the Fastq data
        couldn't be loaded, if any cellranger job finished
        with a non-zero exit code (sum of the exit codes),
        or if expected summary outputs were missing.
    """
    # Cellranger mode is the name of the executable
    cellranger_mode = os.path.basename(cellranger_exe)
    print("Cellranger mode: %s" % cellranger_mode)
    # Input data: map project names to lists of sample names
    sample_names = {}
    try:
        illumina_data = IlluminaData(os.getcwd(),
                                     unaligned_dir=fastq_dir)
        for project in illumina_data.projects:
            sample_names[project.name] = [sample.name
                                          for sample in project.samples]
    except IlluminaDataError:
        logger.critical("Couldn't load data from '%s'" %
                        fastq_dir)
        return 1
    print("Samples: %s" % sample_names)
    projects = list(sample_names)

    # Set up a scheduler (uses the default reporter templates)
    sched_reporter = SchedulerReporter()
    sched = SimpleScheduler(max_concurrent=max_jobs,
                            reporter=sched_reporter)
    sched.start()

    # Make a log directory
    if not dry_run:
        if log_dir is None:
            log_dir = os.getcwd()
        log_dir = get_numbered_subdir("%s_count" % cellranger_mode,
                                      parent_dir=log_dir,
                                      full_path=True)
        print("Log directory: %s" % log_dir)
        mkdirs(log_dir)

    # Submit the cellranger count jobs
    jobs = []
    # (project,sample) pairs that a job was actually submitted for;
    # only these have new outputs to collect afterwards
    submitted = set()
    for project in projects:
        print("Project: %s" % project)
        for sample in sample_names[project]:
            print("Sample: %s" % sample)
            # Check if outputs already exist
            existing_outs = os.path.abspath(
                os.path.join(project,
                             "cellranger_count",
                             sample,
                             "outs"))
            if os.path.isdir(existing_outs):
                print("-- %s: outputs exist, nothing to do" % sample)
                continue
            print("-- %s: setting up %s count" % (sample,
                                                  cellranger_mode))
            # Set up job for this sample
            work_dir = os.path.abspath("tmp.%s_count.%s.%s" %
                                       (cellranger_mode,
                                        project,sample))
            if not dry_run:
                # Don't touch the filesystem for a dry run
                mkdirs(work_dir)
            print("Working directory: %s" % work_dir)
            cmd = Command(cellranger_exe,
                          "count",
                          "--id",sample,
                          "--fastqs",os.path.abspath(fastq_dir),
                          "--sample",sample)
            if cellranger_mode == "cellranger":
                cmd.add_args("--transcriptome",reference_data_path,
                             "--chemistry",chemistry)
            elif cellranger_mode == "cellranger-atac":
                cmd.add_args("--reference",reference_data_path)
            add_cellranger_args(cmd,
                                jobmode=cellranger_jobmode,
                                mempercore=cellranger_mempercore,
                                maxjobs=cellranger_maxjobs,
                                jobinterval=cellranger_jobinterval,
                                localcores=cellranger_localcores,
                                localmem=cellranger_localmem)
            print("Running: %s" % cmd)
            if not dry_run:
                job = sched.submit(cmd,
                                   name="%s_count.%s.%s" %
                                   (cellranger_mode,
                                    project,
                                    sample),
                                   log_dir=log_dir,
                                   wd=work_dir)
                jobs.append(job)
                submitted.add((project,sample))
    sched.wait()
    sched.stop()

    # If dry run then stop here
    if dry_run:
        return 0

    # Finished, check the exit status
    retval = 0
    for job in jobs:
        retval += job.exit_code
    if retval != 0:
        logger.critical("One or more jobs finished with non-zero "
                        "exit code")
        return retval

    # Summary files to collect in 'summary_only' mode; the web
    # summary is always required as the report links to it
    summary_files = {
        'cellranger': ("web_summary.html","metrics_summary.csv"),
        'cellranger-atac': ("web_summary.html","summary.csv"),
    }.get(cellranger_mode,("web_summary.html",))

    # Handle outputs
    for project in projects:
        print("Project: %s" % project)
        for sample in sample_names[project]:
            print("Sample: %s" % sample)
            if (project,sample) not in submitted:
                # Outputs pre-date this run so there is nothing
                # new in a working directory to collect
                print("-- %s: no new outputs to collect" % sample)
                continue
            # Destination for count output
            count_dir = os.path.abspath(
                os.path.join(project,
                             "cellranger_count",
                             sample))
            mkdirs(count_dir)
            dest_outs_dir = os.path.join(count_dir,"outs")
            # Location of the cellranger count outputs
            outs_dir = os.path.join("tmp.%s_count.%s.%s"
                                    % (cellranger_mode,
                                       project,sample),
                                    sample,
                                    "outs")
            if not summary_only:
                # Collect all outputs; 'copytree' needs a destination
                # that doesn't exist yet so copy to the 'outs' subdir
                # rather than the (already created) sample directory
                print("Copying contents of %s to %s" % (outs_dir,
                                                        dest_outs_dir))
                shutil.copytree(outs_dir,dest_outs_dir)
            else:
                # Only collect the web and csv summaries
                mkdirs(dest_outs_dir)
                for f in summary_files:
                    path = os.path.join(outs_dir,f)
                    if not os.path.exists(path):
                        logger.warning("%s: not found in %s" % (f,outs_dir))
                        retval = 1
                    else:
                        print("Copying %s from %s to %s" % (f,
                                                            outs_dir,
                                                            dest_outs_dir))
                        shutil.copy(path,dest_outs_dir)
                # Stop if there was an error
                if retval != 0:
                    logger.critical("Some cellranger count outputs are "
                                    "missing")
                    return retval

    # Create a report and zip archive for each project
    pwd = os.getcwd()
    analysis_dir = os.path.basename(pwd)
    for project in projects:
        # Descend into project dir (paths below are relative to it)
        os.chdir(project)
        try:
            # Set up zip file
            report_name = "cellranger_count_report.%s.%s" % (project,
                                                             analysis_dir)
            zip_file = ZipArchive("%s.zip" % report_name,
                                  prefix=report_name)
            # Construct index page
            print("Making report for project %s" % project)
            count_report = Document("%s: cellranger count" % project)
            count_report.add_css_rule(css_rules.QC_REPORT_CSS_RULES)
            summaries = count_report.add_section()
            summaries.add("Reports from cellranger count for each sample:")
            summary_links = List()
            for sample in sample_names[project]:
                # Link to summary for sample
                web_summary = os.path.join("cellranger_count",
                                           sample,
                                           "outs",
                                           "web_summary.html")
                print("Adding web summary (%s) for %s" % (web_summary,
                                                          sample))
                summary_links.add_item(Link("%s" % sample,
                                            web_summary))
                # Add to the zip file
                zip_file.add_file(web_summary)
            summaries.add(summary_links)
            # Write the report and add to the zip file
            html_file = "cellranger_count_report.html"
            count_report.write(html_file)
            zip_file.add_file(html_file)
            # Finish
            zip_file.close()
        finally:
            # Always return to the starting directory, even if
            # building the report failed
            os.chdir(pwd)
    # Done
    return retval
    def __init__(self,bcl2fastq_dir=None,sample_sheet=None):
        """
        Create a new AnalyseBarcodes pipeline instance

        At least one of the bcl2fastq output directory
        or sample sheet must be supplied when the
        pipeline is instantiated.

        If the bcl2fastq output directory is supplied
        on initialisation then it must exist and
        already contain output Fastq files.

        It is possible to set the pipeline up before the
        bcl2fastq outputs have been generated, as long
        as the sample sheet is supplied. The bcl2fastq
        output directory must then be supplied as an
        input when the pipeline is executed via the
        'run' method.

        Arguments:
          bcl2fastq_dir (str): path to the directory
            with outputs from bcl2fastq
          sample_sheet (str): path to the sample sheet
            file

        Raises:
          Exception: if neither argument is supplied, or
            if the list of projects can't be obtained
            from the supplied bcl2fastq directory or
            sample sheet.
        """
        # Initialise the pipeline superclass
        Pipeline.__init__(self,name="Analyse Barcodes")

        # Internal parameters
        self._bcl2fastq_dir = bcl2fastq_dir
        self._sample_sheet = sample_sheet

        # Define parameters
        self.add_param('bcl2fastq_dir',value=self._bcl2fastq_dir,type=str)
        self.add_param('sample_sheet',value=self._sample_sheet,type=str)
        self.add_param('barcode_analysis_dir',type=str)
        self.add_param('counts_dir',type=str)
        self.add_param('title',type=str)
        self.add_param('lanes',type=list)
        self.add_param('bases_mask',type=str)
        self.add_param('mismatches',type=int)
        self.add_param('cutoff',type=float,value=0.001)
        self.add_param('force',type=bool,value=False)

        # Get a list of projects
        if self._bcl2fastq_dir is not None:
            # Load data from bcl2fastq output
            try:
                # Drop trailing separators first, otherwise
                # 'basename' returns an empty string and the
                # wrong directory would be examined
                unaligned_path = self._bcl2fastq_dir.rstrip(os.sep)
                analysis_dir = os.path.abspath(
                    os.path.dirname(unaligned_path))
                unaligned_dir = os.path.basename(unaligned_path)
                illumina_data = IlluminaData(analysis_dir,
                                             unaligned_dir=unaligned_dir)
            except Exception as ex:
                # Keep the underlying cause in the message
                raise Exception("Unaligned dir '%s' supplied but can't "
                                "load data: %s" % (self._bcl2fastq_dir,
                                                   ex))
            # Get a list of projects
            projects = [p.name for p in illumina_data.projects]
        elif self._sample_sheet is not None:
            # Load data from sample sheet
            try:
                s = SampleSheet(self._sample_sheet)
                # List of unique project names (falling back to
                # the sample ID when the project is blank)
                projects = list(set(
                    [d[s.sample_project_column]
                     if d[s.sample_project_column]
                     else d[s.sample_id_column]
                     for d in s]))
            except Exception as ex:
                # Keep the underlying cause in the message
                raise Exception("Sample sheet '%s' supplied but can't "
                                "get a list of project names: %s" %
                                (self._sample_sheet,ex))
            # Check any empty barcode sequences
            self._check_sample_sheet_indexes(self._sample_sheet)
        else:
            raise Exception("Need to supply either unaligned (bcl2fastq "
                            "output) dir or sample sheet")

        self.report("Expecting projects:")
        for p in projects:
            self.report("- %s" % p)

        ####################
        # Build the pipeline
        ####################

        # Setup barcode analysis and counts directories
        setup_barcode_analysis_dir = SetupBarcodeAnalysisDirs(
            "Setup barcode analysis directory",
            self.params.barcode_analysis_dir,
            self.params.counts_dir,
            force=self.params.force)
        self.add_task(setup_barcode_analysis_dir)

        # Load the data from the unaligned/bcl2fastq output dir
        load_illumina_data = LoadIlluminaData(
            "Load Fastq data for barcode analysis",
            self.params.bcl2fastq_dir)
        self.add_task(load_illumina_data)

        # Every counting task needs both the directories and the
        # loaded Fastq data to be in place first
        count_requirements = (setup_barcode_analysis_dir,
                              load_illumina_data)

        # Generate counts for each project
        count_tasks = []
        for project in projects:
            count_barcodes = CountBarcodes(
                "Count barcodes in '%s'" % project,
                load_illumina_data.output.illumina_data,
                project,
                self.params.counts_dir,
                lanes=self.params.lanes)
            self.add_task(count_barcodes,
                          requires=count_requirements)
            count_tasks.append(count_barcodes)

        # Get counts for 'undetermined'
        count_barcodes = CountBarcodes(
            "Count barcodes in 'undetermined'",
            load_illumina_data.output.illumina_data,
            "__undetermined__",
            self.params.counts_dir,
            lanes=self.params.lanes,
            use_project_name="undetermined")
        self.add_task(count_barcodes,
                      requires=count_requirements)
        count_tasks.append(count_barcodes)

        # List the counts files (once all counting has finished)
        list_counts_files = ListBarcodeCountFiles(
            "Gather the barcode counts files",
            self.params.counts_dir)
        self.add_task(list_counts_files,
                      requires=count_tasks)

        # Analyse counts and report the results
        report_barcodes = ReportBarcodeAnalysis(
            "Report barcode analysis",
            list_counts_files.output.counts_files,
            self.params.barcode_analysis_dir,
            sample_sheet=self.params.sample_sheet,
            lanes=self.params.lanes,
            mismatches=self.params.mismatches,
            cutoff=self.params.cutoff,
            title=self.params.title
        )
        self.add_task(report_barcodes,
                      requires=(list_counts_files,))

        # Add final outputs to the pipeline
        self.add_output('report_file',report_barcodes.output.report_file)
        self.add_output('xls_file',report_barcodes.output.xls_file)
        self.add_output('html_file',report_barcodes.output.html_file)
# Example #3
# 0
def _fetch_url_to_file(url, dest):
    """
    Download the content of a URL into a local text file

    Arguments:
      url (str): URL to fetch the content from
      dest (str): path of the local file to write

    Raises:
      URLError: propagated from 'urlopen' if the URL
        can't be fetched.
    """
    urlfp = urlopen(url)
    try:
        with open(dest, 'w') as fp:
            fp.write(urlfp.read().decode())
    finally:
        # Always release the connection
        urlfp.close()


def _discard_tmp_analysis_dir(ap, tmp_analysis_dir):
    """
    Best-effort removal of a temporary analysis directory

    Does nothing if no temporary directory was made (i.e.
    'tmp_analysis_dir' is None). On successful removal the
    autoprocessor's analysis directory is reset to None.
    Failures are logged but never raised, so the caller's
    own error is the one that gets reported.

    Arguments:
      ap (AutoProcess): autoprocessor using the directory
      tmp_analysis_dir (str): path to the temporary
        directory, or None
    """
    if tmp_analysis_dir is None:
        return
    try:
        shutil.rmtree(tmp_analysis_dir)
        ap.analysis_dir = None
    except Exception as ex:
        logger.warning("Failed to remove temporary directory '%s': %s" %
                       (tmp_analysis_dir, ex))


def setup(ap,
          data_dir,
          analysis_dir=None,
          sample_sheet=None,
          extra_files=None,
          unaligned_dir=None):
    """
    Set up the initial analysis directory

    This does all the initialisation of the analysis directory
    and processing parameters

    A new analysis directory is built under a temporary hidden
    name alongside the final location and only renamed into
    place once set up has succeeded.

    Arguments:
      ap (AutoProcess): autoprocessor pointing to the analysis
        directory to create Fastqs for
      data_dir (str): source data directory
      analysis_dir (str): corresponding analysis directory
        (defaults to '<run_name>_analysis' in the current
        directory)
      sample_sheet (str): name and location of non-default
        sample sheet file; can be a local or remote file, or
        a URL (optional, will use sample sheet from the
        source data directory if present)
      extra_files (list): arbitrary additional files to copy
        into the new analysis directory; each file can be a
        local or remote file or a URL
      unaligned_dir (str): directory with existing Fastqs
        output from CASAVA or bcl2fastq2; if specified then
        Fastqs will be taken from this directory (optional)

    Raises:
      Exception: if the data directory doesn't exist, if no
        sample sheet can be acquired (and no 'unaligned_dir'
        was given), if an extra file can't be fetched, or if
        data can't be loaded from 'unaligned_dir'.
    """
    data_dir = data_dir.rstrip(os.sep)
    if not exists(data_dir):
        raise Exception("Data directory '%s' not found" % data_dir)
    if not Location(data_dir).is_remote:
        data_dir = os.path.abspath(data_dir)
    run_name = os.path.basename(data_dir)
    if analysis_dir is None:
        analysis_dir = os.path.join(os.getcwd(), run_name) + '_analysis'
    else:
        analysis_dir = os.path.abspath(analysis_dir)
    # Path of the temporary analysis dir (stays None if the
    # analysis directory already exists)
    tmp_analysis_dir = None
    # Create the analysis directory structure
    if not os.path.exists(analysis_dir):
        # Make a temporary analysis dir
        tmp_analysis_dir = os.path.join(
            os.path.dirname(analysis_dir),
            ".%s.%s" % (os.path.basename(analysis_dir), uuid.uuid4()))
        ap.analysis_dir = tmp_analysis_dir
        logger.debug("Creating temp directory '%s'" % ap.analysis_dir)
        # Create directory structure
        ap.create_directory(ap.analysis_dir)
        # NOTE(review): these attributes are read only for their
        # side effects (presumably creating the subdirectories)
        # -- confirm against the AutoProcess class
        ap.log_dir
        ap.script_code_dir
    else:
        # Directory already exists
        logger.warning("Analysis directory '%s' already exists" % analysis_dir)
        ap.analysis_dir = analysis_dir
        # check for parameter file
        if ap.has_parameter_file:
            ap.load_parameters()
        else:
            logger.warning("No parameter file found in %s" % ap.analysis_dir)
    # Run datestamp, instrument name and instrument run number
    try:
        datestamp,instrument,run_number,flow_cell_prefix,flow_cell_id = \
                                    split_run_name_full(run_name)
        run_number = run_number.lstrip('0')
        flow_cell = flow_cell_prefix + flow_cell_id
    except Exception as ex:
        logger.warning("Unable to extract information from run name '%s'" \
                       % run_name)
        logger.warning("Exception: %s" % ex)
        datestamp = None
        instrument = None
        run_number = None
        flow_cell = None
    # Identify missing data and attempt to acquire
    # Sequencing platform
    platform = ap.metadata.platform
    if platform is None:
        platform = get_sequencer_platform(data_dir,
                                          instrument=instrument,
                                          settings=ap.settings)
    print("Platform identified as '%s'" % platform)
    # Sequencer model
    model = ap.metadata.sequencer_model
    if model is None:
        try:
            model = ap.settings.sequencers[instrument]['model']
        except KeyError:
            # No model configured for this instrument
            pass
    if model:
        print("Sequencer model identified as '%s'" % model)
    # Log dir
    ap.set_log_dir(ap.get_log_subdir('setup'))
    # Attempt to acquire sample sheet
    try:
        # Custom SampleSheet.csv file
        custom_sample_sheet = ap.params.sample_sheet
        if custom_sample_sheet is not None:
            # Sample sheet already stored
            print("Sample sheet '%s'" % custom_sample_sheet)
        else:
            # Look for sample sheet
            print("Acquiring sample sheet...")
            if sample_sheet is None:
                targets = (
                    'Data/Intensities/BaseCalls/SampleSheet.csv',
                    'SampleSheet.csv',
                )
            else:
                targets = (sample_sheet, )
            # Try each possibility until one sticks
            for target in targets:
                target = Location(target)
                tmp_sample_sheet = os.path.join(ap.tmp_dir,
                                                os.path.basename(target.path))
                if target.is_url:
                    # Try fetching samplesheet from URL
                    print("Trying '%s'" % target.url)
                    try:
                        _fetch_url_to_file(target.url, tmp_sample_sheet)
                    except URLError as ex:
                        # Failed to download from URL
                        raise Exception("Error fetching sample sheet data "
                                        "from '%s': %s" % (target.url, ex))
                    # Got the sample sheet, stop looking
                    break
                else:
                    # Assume target samplesheet is a file on a local
                    # or remote server
                    if target.is_remote:
                        target_sample_sheet = str(target)
                    elif os.path.isabs(target.path):
                        target_sample_sheet = target.path
                    else:
                        # Relative paths are taken to be relative
                        # to the source data directory
                        target_sample_sheet = os.path.join(
                            data_dir, target.path)
                    print("Trying '%s'" % target_sample_sheet)
                    rsync = general_applications.rsync(target_sample_sheet,
                                                       ap.tmp_dir)
                    print("%s" % rsync)
                    status = rsync.run_subprocess(
                        log=ap.log_path('rsync.sample_sheet.log'))
                    if status != 0:
                        logger.warning("Failed to fetch sample sheet '%s'" %
                                       target_sample_sheet)
                        tmp_sample_sheet = None
                    else:
                        break
            # Bail out if no sample sheet was acquired
            if tmp_sample_sheet is None:
                raise Exception("Unable to acquire sample sheet")
            # Keep a copy of the original sample sheet
            original_sample_sheet = os.path.join(ap.analysis_dir,
                                                 'SampleSheet.orig.csv')
            print("Copying original sample sheet to %s" %
                  original_sample_sheet)
            shutil.copyfile(tmp_sample_sheet, original_sample_sheet)
            # Set the permissions for the original SampleSheet
            os.chmod(original_sample_sheet, 0o664)
            # Process acquired sample sheet
            custom_sample_sheet = os.path.join(ap.analysis_dir,
                                               'custom_SampleSheet.csv')
            make_custom_sample_sheet(tmp_sample_sheet, custom_sample_sheet)
    except Exception as ex:
        # Failed to acquire sample sheet
        if not unaligned_dir:
            # Fatal error: remove temporary directory (if any)
            _discard_tmp_analysis_dir(ap, tmp_analysis_dir)
            raise Exception("Failed to acquire sample sheet: %s" % ex)
        else:
            # Don't need sample sheet if Fastqs already exist
            custom_sample_sheet = None
    # Bases mask
    print("Bases mask set to 'auto' (will be determined at run time)")
    bases_mask = "auto"
    # Data source metadata
    data_source = ap.settings.metadata.default_data_source
    # Generate and print predicted outputs and warnings
    if custom_sample_sheet is not None:
        sample_sheet_data = SampleSheet(custom_sample_sheet)
        print(predict_outputs(sample_sheet=sample_sheet_data))
        check_and_warn(sample_sheet=sample_sheet_data)
    # Import additional files
    if extra_files:
        for extra_file in extra_files:
            print("Importing '%s'" % extra_file)
            extra_file = Location(extra_file)
            if extra_file.is_url:
                # Try fetching file from URL
                try:
                    _fetch_url_to_file(
                        extra_file.url,
                        os.path.join(ap.analysis_dir,
                                     os.path.basename(extra_file.path)))
                except URLError as ex:
                    # Failed to download from URL
                    raise Exception("Error fetching '%s': %s" %
                                    (extra_file.url, ex))
            else:
                # File is on a local or remote server
                if extra_file.is_remote:
                    extra_file_path = str(extra_file)
                else:
                    extra_file_path = os.path.abspath(extra_file.path)
                rsync = general_applications.rsync(extra_file_path,
                                                   ap.analysis_dir)
                status = rsync.run_subprocess(
                    log=ap.log_path('rsync.extra_file.log'))
                if status != 0:
                    raise Exception("Failed to fetch '%s'" % extra_file_path)
    # Check supplied unaligned Fastq dir
    if unaligned_dir is not None:
        try:
            illumina_data = IlluminaData(data_dir, unaligned_dir=unaligned_dir)
            unaligned_dir = illumina_data.unaligned_dir
        except IlluminaDataError:
            # Fatal error: remove temporary directory (if any)
            _discard_tmp_analysis_dir(ap, tmp_analysis_dir)
            raise Exception("Can't get data from Fastq dir '%s'" %
                            unaligned_dir)
    else:
        # No unaligned dir supplied
        unaligned_dir = ap.params.unaligned_dir
    # Move analysis dir to final location if necessary
    if ap.analysis_dir != analysis_dir:
        logger.debug("Moving %s to final directory" % ap.analysis_dir)
        os.rename(ap.analysis_dir, analysis_dir)
        ap.analysis_dir = analysis_dir
        # Update the custom sample sheet path
        if custom_sample_sheet is not None:
            custom_sample_sheet = os.path.join(
                analysis_dir, os.path.basename(custom_sample_sheet))
        print("Created analysis directory '%s'" % ap.analysis_dir)
    # Store the parameters
    ap.params['data_dir'] = data_dir
    ap.params['analysis_dir'] = ap.analysis_dir
    ap.params['sample_sheet'] = custom_sample_sheet
    ap.params['bases_mask'] = bases_mask
    ap.params['unaligned_dir'] = unaligned_dir
    ap.params['acquired_primary_data'] = False
    # Store the metadata
    ap.metadata['run_name'] = ap.run_name
    ap.metadata['platform'] = platform
    ap.metadata['instrument_name'] = instrument
    ap.metadata['instrument_datestamp'] = datestamp
    ap.metadata['instrument_run_number'] = run_number
    ap.metadata['instrument_flow_cell_id'] = flow_cell
    ap.metadata['sequencer_model'] = model
    ap.metadata['source'] = data_source
    # Make a 'projects.info' metadata file
    if not ap.params.project_metadata:
        if unaligned_dir is not None:
            ap.make_project_metadata_file()
    # Set flags to allow parameters etc to be saved back
    ap._save_params = True
    ap._save_metadata = True
    ap.save_data()
 def setup(self):
     """
     Load the Fastq data from the bcl2fastq output directory

     Splits the 'bcl2fastq_dir' task argument into its parent
     directory and final path component, loads these into an
     'IlluminaData' instance and stores the result in the
     task's 'illumina_data' output.
     """
     # Load the data from bcl2fastq
     print("Loading data from %s" % self.args.bcl2fastq_dir)
     # Drop trailing separators first, otherwise 'basename'
     # returns an empty string and the wrong directory is loaded
     bcl2fastq_dir = self.args.bcl2fastq_dir.rstrip(os.sep)
     illumina_data = IlluminaData(os.path.dirname(bcl2fastq_dir),
                                  os.path.basename(bcl2fastq_dir))
     self.output.illumina_data.set(illumina_data)