def cellranger_info(path=None,name=None):
    """
    Retrieve information on the cellranger software

    If called without any arguments this will locate the first
    cellranger executable that is available on the user's PATH,
    and attempt to extract the version.

    Alternatively if the path to an executable is supplied then
    the version will be determined from that instead.

    If no version is identified then the script path is still
    returned, but without any version info.

    If a 'path' is supplied then the package name will be taken
    from the basename; otherwise the package name can be supplied
    via the 'name' argument. If neither is supplied then the
    package name defaults to 'cellranger'.

    Returns:
      Tuple: tuple consisting of (PATH,PACKAGE,VERSION) where PATH
        is the full path for the cellranger program, PACKAGE is
        the package name, and VERSION is the package version. If any
        value can't be determined then it will be returned as an
        empty string.
    """
    # Initialise
    cellranger_path = ''
    if name is None:
        if path:
            name = os.path.basename(path)
        else:
            name = 'cellranger'
    package_name = name
    package_version = ''
    # Locate the core script
    if not path:
        cellranger_path = find_program(package_name)
    else:
        cellranger_path = os.path.abspath(path)
    # Identify the version
    if os.path.basename(cellranger_path) == package_name:
        # Run the program to get the version
        version_cmd = Command(cellranger_path,'--version')
        output = version_cmd.subprocess_check_output()[1]
        for line in output.split('\n'):
            if line.startswith(package_name):
                # Extract version from line of the form
                # cellranger  (2.0.1)
                try:
                    package_version = line.split('(')[-1].strip(')')
                except Exception as ex:
                    logger.warning("Unable to get version from '%s': %s" %
                                   (line,ex))
    else:
        # No package supplied or located
        logger.warning("Unable to identify cellranger package "
                       "from '%s'" % cellranger_path)
    # Return what we found
    return (cellranger_path,package_name,package_version)
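A minimal usage sketch (the install path below is illustrative, and
find_program/Command are assumed importable as in the example above):

# Locate the first cellranger on PATH and report what was found
exe, package, version = cellranger_info()
print("%s: %s (version %s)" % (package, exe, version or "unknown"))
# Or query a specific executable instead
exe, package, version = cellranger_info(
    path="/opt/cellranger-6.0.1/cellranger")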
Example #2
def bclconvert_info(path=None):
    """
    Retrieve information on the bcl-convert software

    If called without any arguments this will locate the first
    bcl-convert executable that is available on the user's PATH.

    Alternatively if the path to an executable is supplied then
    the package name and version will be determined from that
    instead.

    If no package is identified then the script path is still
    returned, but without any version info.

    Returns:
      Tuple: tuple consisting of (PATH,PACKAGE,VERSION) where PATH
        is the full path for the bcl-convert program, and PACKAGE
        and VERSION are the package name and version (PACKAGE
        will be 'BCL Convert' if a matching executable is located).
        If any value can't be determined then it will be returned
        as an empty string.

    """
    # Initialise
    bclconvert_path = ''
    package_name = ''
    package_version = ''
    # Locate the bcl-convert program
    if not path:
        bclconvert_path = bcf_utils.find_program('bcl-convert')
    else:
        bclconvert_path = os.path.abspath(path)
    # Identify the version
    if bclconvert_path:
        # Run the program to get the version
        version_cmd = Command(bclconvert_path, '-V')
        output = version_cmd.subprocess_check_output()[1]
        for line in output.split('\n'):
            if line.startswith('bcl-convert'):
                # Extract version from line of the form
                # bcl-convert Version 00.000.000.3.7.5
                package_name = 'BCL Convert'
                try:
                    package_version = '.'.join(line.split('.')[-3:])
                except Exception as ex:
                    logger.warning("Unable to get version from '%s': %s" %
                                   (line, ex))
    else:
        # No package supplied or located
        logger.warning("Unable to identify BCLConvert package from '%s'" %
                       bclconvert_path)
    # Return what we found
    return (bclconvert_path, package_name, package_version)
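The version extraction above relies on BCL Convert's zero-padded
version string; a quick illustration of the slice (sample line taken
from the comment in the code):

line = "bcl-convert Version 00.000.000.3.7.5"
print('.'.join(line.split('.')[-3:]))  # -> "3.7.5"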
Example #3
def info_func(p):
    name = os.path.basename(p)
    exe = find_program(p)
    version = ''
    output = Command(exe).subprocess_check_output()[1]
    for line in output.split('\n'):
        if line.startswith(name):
            try:
                version = line.split()[1]
            except IndexError:
                pass
            break
    return (exe, name.upper(), version)
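A hypothetical call (program name illustrative; the version component
depends on what the program prints when run without arguments):

exe, name, version = info_func("samtools")
# -> (path-to-executable, "SAMTOOLS", version-or-empty-string)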
Example #4
    def __init__(self, conda=None, env_dir=None, channels=None):
        """
        Create a new CondaWrapper instance

        Arguments:
          conda (str): path to conda executable
          env_dir (str): optional, non-default directory
            for conda environments
          channels (list): optional, list of non-default
            channels to use for installing packages
        """
        # Conda executable
        if conda is None:
            conda = find_program("conda")
        self._conda = conda
        if self._conda:
            self._conda = os.path.abspath(self._conda)
            conda_dir = os.sep.join(self._conda.split(os.sep)[:-2])
        else:
            conda_dir = None
        self._conda_dir = conda_dir
        # Default location for environments
        if env_dir:
            env_dir = os.path.abspath(env_dir)
        elif self._conda_dir:
            env_dir = os.path.join(self._conda_dir, 'envs')
        self._env_dir = env_dir
        # Channels
        if channels:
            channels = [c for c in channels]
        elif channels is None:
            channels = DEFAULT_CONDA_CHANNELS
        else:
            channels = list()
        self._channels = channels
        # Lock for blocking operations
        self._lock_manager = ResourceLock()
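The path arithmetic above strips the trailing 'bin/conda' components to
find the conda installation directory; an illustrative sketch (paths
are assumptions):

import os
conda = "/opt/miniconda3/bin/conda"
conda_dir = os.sep.join(conda.split(os.sep)[:-2])  # "/opt/miniconda3"
env_dir = os.path.join(conda_dir, "envs")          # "/opt/miniconda3/envs"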
Example #5
    # Screen files
    mammalian_conf = args.mammalian_conf
    if mammalian_conf is not None:
        mammalian_conf = os.path.abspath(mammalian_conf)
    contaminants_conf = args.contaminants_conf
    if contaminants_conf is not None:
        contaminants_conf = os.path.abspath(contaminants_conf)

    # Check for underlying programs
    required = ["fastq_screen"]
    if args.aligner is not None:
        required.append(args.aligner)
    else:
        logging.warning("Aligner not specified, cannot check")
    for prog in required:
        if find_program(prog) is None:
            logging.critical("couldn't find '%s'" % prog)
            sys.exit(1)

    # Make output dir
    if args.out_dir is not None:
        out_dir = os.path.abspath(args.out_dir)
        mkdir(out_dir)
    else:
        out_dir = os.getcwd()

    # Screen against 'mammalian' genomes
    tagged_fastq = fastq_screen_tag(mammalian_conf,
                                    fqr2,
                                    aligner=args.aligner,
                                    threads=args.threads)
Example #6
    def has_exe(self):
        """Check if the command executable exists

        """
        return (find_program(self.command) is not None)
Example #7
    def has_exe(self):
        """Check if the command executable exists

        """
        return (find_program(self.command) is not None)
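A usage sketch, assuming has_exe is a method of the same Command class
used in the other examples here:

cmd = Command("STAR")  # hypothetical program name
print(cmd.has_exe())   # True only if a 'STAR' executable is on PATH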
Example #8
def fastq_strand(argv, working_dir=None):
    """
    Driver for fastq_strand

    Generate strandedness statistics for a single FASTQ or
    FASTQ pair, by running STAR using one or more genome
    indexes
    """
    # Process command line
    p = argparse.ArgumentParser(
        description="Generate strandedness statistics "
        "for a FASTQ or FASTQ pair, by running STAR using "
        "one or more genome indexes")
    # NB ArgumentParser no longer accepts a 'version' keyword in
    # Python 3, so expose it as an explicit argument instead
    p.add_argument("--version", action="version", version=__version__)
    p.add_argument("r1", metavar="READ1", default=None, help="R1 Fastq file")
    p.add_argument("r2",
                   metavar="READ2",
                   default=None,
                   nargs="?",
                   help="R2 Fastq file")
    p.add_argument("-g",
                   "--genome",
                   dest="star_genomedirs",
                   metavar="GENOMEDIR",
                   default=None,
                   action="append",
                   help="path to directory with STAR index "
                   "for genome to use (use as an alternative "
                   "to -c/--conf; can be specified multiple "
                   "times to include additional genomes)")
    p.add_argument("--subset",
                   type=int,
                   default=10000,
                   help="use a random subset of read pairs "
                   "from the input Fastqs; set to zero to "
                   "use all reads (default: 10000)")
    p.add_argument("-o",
                   "--outdir",
                   default=None,
                   help="specify directory to write final "
                   "outputs to (default: current directory)")
    p.add_argument("-c",
                   "--conf",
                   metavar="FILE",
                   default=None,
                   help="specify delimited 'conf' file with "
                   "list of NAME and STAR index directory "
                   "pairs. NB if a conf file is supplied "
                   "then any indices specifed on the command "
                   "line will be ignored")
    p.add_argument("-n",
                   type=int,
                   default=1,
                   help="number of threads to run STAR with "
                   "(default: 1)")
    p.add_argument("--counts",
                   action="store_true",
                   help="include the count sums for "
                   "unstranded, 1st read strand aligned and "
                   "2nd read strand aligned in the output "
                   "file (default: only include percentages)")
    p.add_argument("--keep-star-output",
                   action="store_true",
                   help="keep the output from STAR (default: "
                   "delete outputs on completion)")
    args = p.parse_args(argv)
    # Print parameters
    print("READ1\t: %s" % args.r1)
    print("READ2\t: %s" % args.r2)
    # Check that STAR is on the path
    star_exe = find_program("STAR")
    if star_exe is None:
        logging.critical("STAR not found")
        return 1
    print("STAR\t: %s" % star_exe)
    # Gather genome indices
    genome_names = {}
    if args.conf is not None:
        print("Conf file\t: %s" % args.conf)
        star_genomedirs = []
        with open(args.conf, 'r') as fp:
            for line in fp:
                if line.startswith('#'):
                    continue
                name, star_genomedir = line.rstrip().split('\t')
                star_genomedirs.append(star_genomedir)
                # Store an associated name
                genome_names[star_genomedir] = name
    else:
        star_genomedirs = args.star_genomedirs
    if not star_genomedirs:
        logging.critical("No genome indices specified")
        return 1
    print("Genomes:")
    for genome in star_genomedirs:
        print("- %s" % genome)
    # Output directory
    if args.outdir is None:
        outdir = os.getcwd()
    else:
        outdir = os.path.abspath(args.outdir)
    if not os.path.exists(outdir):
        logging.critical("Output directory doesn't exist: %s" % outdir)
        return 1
    # Output file
    outfile = "%s_fastq_strand.txt" % os.path.join(
        outdir, os.path.basename(strip_ngs_extensions(args.r1)))
    if os.path.exists(outfile):
        logging.warning("Removing existing output file '%s'" % outfile)
        os.remove(outfile)
    # Prefix for temporary output
    prefix = "fastq_strand_"
    # Working directory
    if working_dir is None:
        working_dir = os.getcwd()
    else:
        working_dir = os.path.abspath(working_dir)
        if not os.path.isdir(working_dir):
            raise Exception("Bad working directory: %s" % working_dir)
    print("Working directory: %s" % working_dir)
    # Make subset of input read pairs
    nreads = sum(1 for i in getreads(os.path.abspath(args.r1)))
    print("%d reads" % nreads)
    if args.subset == 0:
        print("Using all read pairs in Fastq files")
        subset = nreads
    elif args.subset > nreads:
        print("Actual number of read pairs smaller than requested subset")
        subset = nreads
    else:
        subset = args.subset
        print("Using random subset of %d read pairs" % subset)
    if subset == nreads:
        subset_indices = list(range(nreads))
    else:
        subset_indices = random.sample(range(nreads), subset)
    fqs_in = [fq for fq in (args.r1, args.r2) if fq is not None]
    fastqs = []
    for fq in fqs_in:
        fq_subset = os.path.join(working_dir, os.path.basename(fq))
        if fq_subset.endswith(".gz"):
            fq_subset = '.'.join(fq_subset.split('.')[:-1])
        fq_subset = "%s.subset.fq" % '.'.join(fq_subset.split('.')[:-1])
        with open(fq_subset, 'w') as fp:
            for read in getreads_subset(os.path.abspath(fq), subset_indices):
                fp.write('\n'.join(read) + '\n')
        fastqs.append(fq_subset)
    # Make directory to keep output from STAR
    if args.keep_star_output:
        star_output_dir = os.path.join(
            outdir, "STAR.%s.outputs" %
            os.path.basename(strip_ngs_extensions(args.r1)))
        print("Output from STAR will be copied to %s" % star_output_dir)
        # Check if directory already exists from earlier run
        if os.path.exists(star_output_dir):
            # Move out of the way
            i = 0
            backup_dir = "%s.bak" % star_output_dir
            while os.path.exists(backup_dir):
                i += 1
                backup_dir = "%s.bak%s" % (star_output_dir, i)
            logging.warning("Moving existing output directory to %s" %
                            backup_dir)
            os.rename(star_output_dir, backup_dir)
        # Make the directory
        os.mkdir(star_output_dir)
    # Write output to a temporary file (text mode, so strings can be
    # written and re-read under Python 3)
    with tempfile.TemporaryFile(mode='w+t') as fp:
        # Iterate over genome indices
        for star_genomedir in star_genomedirs:
            # Basename for output for this genome
            try:
                name = genome_names[star_genomedir]
            except KeyError:
                name = star_genomedir
            # Build a command line to run STAR
            star_cmd = [star_exe]
            star_cmd.extend([
                '--runMode', 'alignReads', '--genomeLoad', 'NoSharedMemory',
                '--genomeDir',
                os.path.abspath(star_genomedir)
            ])
            star_cmd.extend(['--readFilesIn', fastqs[0]])
            if len(fastqs) > 1:
                star_cmd.append(fastqs[1])
            star_cmd.extend([
                '--quantMode', 'GeneCounts', '--outSAMtype', 'BAM', 'Unsorted',
                '--outSAMstrandField', 'intronMotif', '--outFileNamePrefix',
                prefix, '--runThreadN',
                str(args.n)
            ])
            print("Running %s" % ' '.join(star_cmd))
            try:
                subprocess.check_output(star_cmd, cwd=working_dir)
            except subprocess.CalledProcessError as ex:
                raise Exception("STAR returned non-zero exit code: %s" %
                                ex.returncode)
            # Save the outputs
            if args.keep_star_output:
                # Make a subdirectory for this genome index
                genome_dir = os.path.join(star_output_dir,
                                          name.replace(os.sep, "_"))
                print("Copying STAR outputs to %s" % genome_dir)
                os.mkdir(genome_dir)
                for f in os.listdir(working_dir):
                    if f.startswith(prefix):
                        shutil.copy(os.path.join(working_dir, f),
                                    os.path.join(genome_dir, f))
            # Process the STAR output
            star_tab_file = os.path.join(working_dir,
                                         "%sReadsPerGene.out.tab" % prefix)
            if not os.path.exists(star_tab_file):
                raise Exception("Failed to find .out file: %s" % star_tab_file)
            sum_col2 = 0
            sum_col3 = 0
            sum_col4 = 0
            with open(star_tab_file) as out:
                for i, line in enumerate(out):
                    if i < 4:
                        # Skip first four lines
                        continue
                    # Process remaining delimited columns
                    cols = line.rstrip('\n').split('\t')
                    sum_col2 += int(cols[1])
                    sum_col3 += int(cols[2])
                    sum_col4 += int(cols[3])
            print("Sums:")
            print("- col2: %d" % sum_col2)
            print("- col3: %d" % sum_col3)
            print("- col4: %d" % sum_col4)
            if sum_col2 > 0.0:
                forward_1st = float(sum_col3) / float(sum_col2) * 100.0
                reverse_2nd = float(sum_col4) / float(sum_col2) * 100.0
            else:
                logging.warning("Sum of mapped reads is zero!")
                forward_1st = 0.0
                reverse_2nd = 0.0
            print("Strand percentages:")
            print("- 1st forward: %.2f%%" % forward_1st)
            print("- 2nd reverse: %.2f%%" % reverse_2nd)
            # Append to output file
            data = [name, "%.2f" % forward_1st, "%.2f" % reverse_2nd]
            if args.counts:
                data.extend([sum_col2, sum_col3, sum_col4])
            fp.write("%s\n" % "\t".join([str(d) for d in data]))
        # Finished iterating over genomes
        # Rewind temporary output file
        fp.seek(0)
        with open(outfile, 'w') as out:
            # Header
            out.write("#fastq_strand version: %s\t"
                      "#Aligner: %s\t"
                      "#Reads in subset: %s\n" % (__version__, "STAR", subset))
            columns = ["Genome", "1st forward", "2nd reverse"]
            if args.counts:
                columns.extend([
                    "Unstranded", "1st read strand aligned",
                    "2nd read strand aligned"
                ])
            out.write("#%s\n" % "\t".join(columns))
            # Copy content from temp to final file
            for line in fp:
                out.write(line)
    return 0
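A sketch of driving fastq_strand via a 'conf' file (the file name,
genome names and STAR index paths are all hypothetical, and STAR must
be installed for the run itself to succeed):

with open("genomes.conf", "w") as fp:
    fp.write("human\t/mnt/data/star_indexes/hg38\n")
    fp.write("mouse\t/mnt/data/star_indexes/mm10\n")
retcode = fastq_strand(["-c", "genomes.conf",
                        "R1.fastq.gz", "R2.fastq.gz"])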
Example #9
def main():
    """
    """
    # Load configuration
    settings = Settings()

    # Collect defaults
    default_runner = settings.runners.rsync

    # Get pre-defined destinations
    destinations = [name for name in settings.destination]

    # Command line
    p = argparse.ArgumentParser(
        description="Transfer copies of Fastq data from an analysis "
        "project to an arbitrary destination for sharing with other "
        "people")
    p.add_argument('--version',
                   action='version',
                   version=("%%(prog)s %s" % get_version()))
    p.add_argument('--subdir',
                   action='store',
                   choices=('random_bin', 'run_id'),
                   default=None,
                   help="subdirectory naming scheme: 'random_bin' "
                   "locates a random pre-existing empty subdirectory "
                   "under the target directory; 'run_id' creates a "
                   "new subdirectory "
                   "'PLATFORM_DATESTAMP.RUN_ID-PROJECT'. If this "
                   "option is not set then no subdirectory will be "
                   "used")
    p.add_argument('--readme',
                   action='store',
                   metavar='README_TEMPLATE',
                   dest='readme_template',
                   help="template file to generate README file from; "
                   "can be full path to a template file, or the name "
                   "of a file in the 'templates' directory")
    p.add_argument('--weburl',
                   action='store',
                   help="base URL for webserver (sets the value of "
                   "the WEBURL variable in the template README)")
    p.add_argument('--include_downloader',
                   action='store_true',
                   help="copy the 'download_fastqs.py' utility to the "
                   "final location")
    p.add_argument('--include_qc_report',
                   action='store_true',
                   help="copy the zipped QC reports to the final "
                   "location")
    p.add_argument('--include_10x_outputs',
                   action='store_true',
                   help="copy outputs from 10xGenomics pipelines (e.g. "
                   "'cellranger count') to the final location")
    p.add_argument('--link',
                   action='store_true',
                   help="hard link files instead of copying")
    p.add_argument('--runner',
                   action='store',
                   help="specify the job runner to use for executing "
                   "the checksumming, Fastq copy and tar gzipping "
                   "operations (defaults to job runner defined for "
                   "copying in config file [%s])" % default_runner)
    p.add_argument('dest',
                   action='store',
                   metavar="DEST",
                   help="destination to copy Fastqs to; can be the "
                   "name of a destination defined in the configuration "
                   "file, or an arbitrary location of the form "
                   "'[[USER@]HOST:]DIR' (%s)" %
                   (("available destinations: %s" %
                     (','.join("'%s'" % d for d in sorted(destinations))))
                    if destinations else "no destinations currently defined"))
    p.add_argument('project',
                   action='store',
                   metavar="PROJECT",
                   help="path to project directory (or to a Fastqs "
                   "subdirectory in a project) to copy Fastqs from")

    # Process command line
    args = p.parse_args()

    # Check if target is pre-defined destination
    if args.dest in destinations:
        print("Loading settings for destination '%s'" % args.dest)
        dest = settings.destination[args.dest]
        target_dir = dest.directory
        readme_template = dest.readme_template
        subdir = dest.subdir
        include_downloader = dest.include_downloader
        include_qc_report = dest.include_qc_report
        hard_links = dest.hard_links
        weburl = dest.url
    else:
        target_dir = args.dest
        readme_template = None
        subdir = None
        include_downloader = False
        include_qc_report = False
        hard_links = False
        weburl = None

    # Update defaults with command line values
    if args.readme_template:
        readme_template = args.readme_template
    if args.subdir:
        subdir = args.subdir
    if args.include_downloader:
        include_downloader = True
    if args.include_qc_report:
        include_qc_report = True
    if args.weburl:
        weburl = args.weburl
    if args.link:
        hard_links = args.link

    # Sort out project directory
    project = AnalysisProject(args.project)
    if not project.is_analysis_dir:
        # Assume it's the Fastq dir
        fastq_dir = os.path.basename(args.project)
        project = AnalysisProject(os.path.dirname(args.project))
    else:
        fastq_dir = None
    if not project.is_analysis_dir:
        logger.error("'%s': project not found" % args.project)
        return 1
    project_name = project.name

    # Parent analysis directory
    analysis_dir = AnalysisDir(os.path.dirname(project.dirn))

    # Fastqs directory
    try:
        project.use_fastq_dir(fastq_dir)
    except Exception as ex:
        logger.error("'%s': failed to load Fastq set '%s': %s" %
                     (project.name, fastq_dir, ex))
        return 1

    # Report
    print("Transferring data from '%s' (%s)" % (project.name, project.dirn))
    print("Fastqs in %s" % project.fastq_dir)

    # Summarise samples and Fastqs
    samples = set()
    nfastqs = 0
    fsize = 0
    for sample in project.samples:
        samples.add(sample.name)
        for fq in sample.fastq:
            fsize += os.lstat(fq).st_size
            nfastqs += 1
    nsamples = len(samples)
    dataset = "%s%s dataset" % ("%s " % project.info.single_cell_platform
                                if project.info.single_cell_platform else '',
                                project.info.library_type)
    endedness = "paired-end" if project.info.paired_end else "single-end"
    print("%s with %d Fastqs from %d %s sample%s totalling %s" %
          (dataset, nfastqs, nsamples, endedness, 's' if nsamples != 1 else '',
           format_file_size(fsize)))

    # Check target dir
    if not Location(target_dir).is_remote:
        target_dir = os.path.abspath(target_dir)
    if not exists(target_dir):
        print("'%s': target directory not found" % target_dir)
        return 1
    else:
        print("Target directory %s" % target_dir)

    # Locate downloader
    if include_downloader:
        print("Locating downloader for inclusion")
        downloader = find_program("download_fastqs.py")
        if downloader is None:
            logging.error("Unable to locate download_fastqs.py")
            return 1
        print("... found %s" % downloader)
    else:
        downloader = None

    # Locate zipped QC report
    if include_qc_report:
        print("Locating zipped QC reports for inclusion")
        qc_zips = list()
        # Check QC directories and look for zipped reports
        for qc_dir in project.qc_dirs:
            # Get the associated Fastq set
            # NB only compare the basename of the Fastq dir
            # in case full paths weren't updated
            fq_set = os.path.basename(project.qc_info(qc_dir).fastq_dir)
            if fq_set == os.path.basename(project.fastq_dir):
                for qc_base in (
                        "%s_report.%s.%s" %
                    (qc_dir, project.name, project.info.run),
                        "%s_report.%s.%s" %
                    (qc_dir, project.name,
                     os.path.basename(analysis_dir.analysis_dir)),
                ):
                    qc_zip = os.path.join(project.dirn, "%s.zip" % qc_base)
                    if os.path.exists(qc_zip):
                        print("... found %s" % qc_zip)
                        qc_zips.append(qc_zip)
        if not qc_zips:
            logger.error("No zipped QC reports found")
            return 1
    else:
        qc_zips = None

    # Locate 10xGenomics outputs
    if args.include_10x_outputs:
        print("Locating outputs from 10xGenomics pipelines for " "inclusion")
        cellranger_dirs = list()
        for d in (
                'cellranger_count',
                'cellranger_multi',
        ):
            cellranger_dir = os.path.join(project.dirn, d)
            if os.path.isdir(cellranger_dir):
                print("... found %s" % cellranger_dir)
                cellranger_dirs.append(cellranger_dir)
        if not cellranger_dirs:
            logger.error("No outputs from 10xGenomics pipelines found")
            return 1
    else:
        cellranger_dirs = None

    # Determine subdirectory
    if subdir == "random_bin":
        # Find a random empty directory under the
        # target directory
        print("Locating random empty bin")
        subdirs = [
            d for d in os.listdir(target_dir)
            if os.path.isdir(os.path.join(target_dir, d))
        ]
        if not subdirs:
            print("Failed to locate subdirectories")
            return 1
        shuffle(subdirs)
        subdir = None
        for d in subdirs:
            if not os.listdir(os.path.join(target_dir, d)):
                # Empty bin
                subdir = d
                break
        if subdir is None:
            print("Failed to locate empty subdirectory")
            return 1
        print("... found '%s'" % subdir)
        # Update target dir
        target_dir = os.path.join(target_dir, subdir)
    elif subdir == "run_id":
        # Construct subdirectory name based on the
        # run ID
        subdir = "{platform}_{datestamp}.{run_number}-{project}".format(
            platform=analysis_dir.metadata.platform.upper(),
            datestamp=analysis_dir.metadata.instrument_datestamp,
            run_number=analysis_dir.metadata.run_number,
            project=project.name)
        # Check it doesn't already exist
        if exists(os.path.join(target_dir, subdir)):
            logger.error("'%s': subdirectory already exists" % subdir)
            return 1
        print("Using subdirectory '%s'" % subdir)
        # Update target dir
        target_dir = os.path.join(target_dir, subdir)

    # Make target directory
    if not exists(target_dir):
        mkdir(target_dir)

    # Get runner for copy job
    if args.runner:
        runner = fetch_runner(args.runner)
    else:
        runner = default_runner

    # Set identifier for jobs
    job_id = "%s%s" % (project_name,
                       (".%s" % fastq_dir if fastq_dir is not None else ''))

    # Set the working directory
    working_dir = os.path.abspath("transfer.%s.%s" %
                                  (job_id, int(time.time())))
    mkdir(working_dir)
    print("Created working dir %s" % working_dir)

    # Construct the README
    if readme_template:
        # Check that template file exists
        print("Locating README template")
        template = None
        for filen in (
                readme_template,
                os.path.join(get_templates_dir(), readme_template),
        ):
            if os.path.exists(filen):
                template = filen
                break
        if template is None:
            logger.error("'%s': template file not found" % readme_template)
            return 1
        else:
            readme_template = template
        print("... found %s" % readme_template)
        # Read in template
        with open(readme_template, 'rt') as fp:
            readme = fp.read()
        # Substitute template variables
        template_vars = {
            'PLATFORM': analysis_dir.metadata.platform.upper(),
            'RUN_NUMBER': analysis_dir.metadata.run_number,
            'DATESTAMP': analysis_dir.metadata.instrument_datestamp,
            'PROJECT': project_name,
            'WEBURL': weburl,
            'BIN': subdir,
            'DIR': target_dir,
            'TODAY': date.today().strftime("%d/%m/%Y"),
        }
        for var in template_vars:
            value = template_vars[var]
            if value is None:
                value = '?'
            else:
                value = str(value)
            readme = re.sub(r"%{var}%".format(var=var), value, readme)
        # Write out a temporary README file
        readme_file = os.path.join(working_dir, "README")
        with open(readme_file, 'wt') as fp:
            fp.write(readme)
    else:
        # No README
        readme_file = None

    # Start a scheduler to run jobs
    sched = SimpleScheduler(runner=runner,
                            reporter=TransferDataSchedulerReporter(),
                            poll_interval=settings.general.poll_interval)
    sched.start()

    # Build command to run manage_fastqs.py
    copy_cmd = Command("manage_fastqs.py")
    if hard_links:
        copy_cmd.add_args("--link")
    copy_cmd.add_args(analysis_dir.analysis_dir, project_name)
    if fastq_dir is not None:
        copy_cmd.add_args(fastq_dir)
    copy_cmd.add_args("copy", target_dir)
    print("Running %s" % copy_cmd)
    copy_job = sched.submit(copy_cmd.command_line,
                            name="copy.%s" % job_id,
                            wd=working_dir)

    # Copy README
    if readme_file is not None:
        print("Copying README file")
        copy_cmd = copy_command(readme_file,
                                os.path.join(target_dir, "README"))
        sched.submit(copy_cmd.command_line,
                     name="copy.%s.readme" % job_id,
                     runner=SimpleJobRunner(),
                     wd=working_dir)

    # Copy download_fastqs.py
    if downloader:
        print("Copying downloader")
        copy_cmd = copy_command(
            downloader, os.path.join(target_dir, os.path.basename(downloader)))
        sched.submit(copy_cmd.command_line,
                     name="copy.%s.downloader" % job_id,
                     runner=SimpleJobRunner(),
                     wd=working_dir)

    # Copy QC reports
    if qc_zips:
        for qc_zip in qc_zips:
            print("Copying '%s'" % os.path.basename(qc_zip))
            copy_cmd = copy_command(qc_zip,
                                    os.path.join(target_dir,
                                                 os.path.basename(qc_zip)),
                                    link=hard_links)
            sched.submit(copy_cmd.command_line,
                         name="copy.%s.%s" %
                         (job_id, os.path.basename(qc_zip)),
                         runner=SimpleJobRunner(),
                         wd=working_dir)

    # Tar and copy 10xGenomics outputs
    if cellranger_dirs:
        for cellranger_dir in cellranger_dirs:
            print("Tar gzipping and copying '%s'" %
                  os.path.basename(cellranger_dir))
            # Tar & gzip data
            targz = os.path.join(
                working_dir,
                "%s.%s.%s.tgz" % (os.path.basename(cellranger_dir),
                                  project_name, project.info.run))
            targz_cmd = Command("tar", "czvhf", targz, "-C",
                                os.path.dirname(cellranger_dir),
                                os.path.basename(cellranger_dir))
            print("Running %s" % targz_cmd)
            targz_job = sched.submit(
                targz_cmd.command_line,
                name="targz.%s.%s" %
                (job_id, os.path.basename(cellranger_dir)),
                wd=working_dir)
            # Copy the targz file
            copy_cmd = copy_command(
                targz, os.path.join(target_dir, os.path.basename(targz)))
            print("Running %s" % copy_cmd)
            # NB don't reuse 'copy_job' here: its exit code (from the
            # main Fastq copy) is checked after the scheduler finishes
            sched.submit(copy_cmd.command_line,
                         name="copytgz.%s.%s" %
                         (job_id, os.path.basename(cellranger_dir)),
                         runner=SimpleJobRunner(),
                         wd=working_dir,
                         wait_for=(targz_job.job_name, ))

    # Wait for scheduler jobs to complete
    sched.wait()

    # Check exit code for Fastq copying
    exit_code = copy_job.exit_code
    if exit_code != 0:
        logger.error("File copy exited with an error")
        return exit_code
    else:
        print("Files now at %s" % target_dir)
        if weburl:
            url = weburl
            if subdir is not None:
                url = os.path.join(url, subdir)
            print("URL: %s" % url)
        print("Done")
    return 0
Example #10
def fastq_strand(argv,working_dir=None):
    """
    Driver for fastq_strand

    Generate strandedness statistics for a single FASTQ or
    FASTQ pair, by running STAR using one or more genome
    indexes
    """
    # Process command line
    p = argparse.ArgumentParser(
        description="Generate strandedness statistics "
        "for FASTQ or FASTQpair, by running STAR using "
        "one or more genome indexes",
        version=__version__)
    p.add_argument("r1",metavar="READ1",
                   default=None,
                   help="R1 Fastq file")
    p.add_argument("r2",metavar="READ2",
                   default=None,
                   nargs="?",
                   help="R2 Fastq file")
    p.add_argument("-g","--genome",
                   dest="star_genomedirs",metavar="GENOMEDIR",
                   default=None,
                   action="append",
                   help="path to directory with STAR index "
                   "for genome to use (use as an alternative "
                   "to -c/--conf; can be specified multiple "
                   "times to include additional genomes)")
    p.add_argument("--subset",
                   type=int,
                   default=10000,
                   help="use a random subset of read pairs "
                   "from the input Fastqs; set to zero to "
                   "use all reads (default: 10000)")
    p.add_argument("-o","--outdir",
                   default=None,
                   help="specify directory to write final "
                   "outputs to (default: current directory)")
    p.add_argument("-c","--conf",metavar="FILE",
                   default=None,
                   help="specify delimited 'conf' file with "
                   "list of NAME and STAR index directory "
                   "pairs. NB if a conf file is supplied "
                   "then any indices specifed on the command "
                   "line will be ignored")
    p.add_argument("-n",
                   type=int,
                   default=1,
                   help="number of threads to run STAR with "
                   "(default: 1)")
    p.add_argument("--counts",
                   action="store_true",
                   help="include the count sums for "
                   "unstranded, 1st read strand aligned and "
                   "2nd read strand aligned in the output "
                   "file (default: only include percentages)")
    p.add_argument("--keep-star-output",
                   action="store_true",
                   help="keep the output from STAR (default: "
                   "delete outputs on completion)")
    args = p.parse_args(argv)
    # Print parameters
    print "READ1\t: %s" % args.r1
    print "READ2\t: %s" % args.r2
    # Check that STAR is on the path
    star_exe = find_program("STAR")
    if star_exe is None:
        logging.critical("STAR not found")
        return 1
    print "STAR\t: %s" % star_exe
    # Gather genome indices
    genome_names = {}
    if args.conf is not None:
        print "Conf file\t: %s" % args.conf
        star_genomedirs = []
        with open(args.conf,'r') as fp:
            for line in fp:
                if line.startswith('#'):
                    continue
                name,star_genomedir = line.rstrip().split('\t')
                star_genomedirs.append(star_genomedir)
                # Store an associated name
                genome_names[star_genomedir] = name
    else:
        star_genomedirs = args.star_genomedirs
    if not star_genomedirs:
        logging.critical("No genome indices specified")
        return 1
    print "Genomes:"
    for genome in star_genomedirs:
        print "- %s" % genome
    # Output directory
    if args.outdir is None:
        outdir = os.getcwd()
    else:
        outdir = os.path.abspath(args.outdir)
    if not os.path.exists(outdir):
        logging.critical("Output directory doesn't exist: %s" %
                         outdir)
        return 1
    # Output file
    outfile = "%s_fastq_strand.txt" % os.path.join(
        outdir,
        os.path.basename(strip_ngs_extensions(args.r1)))
    if os.path.exists(outfile):
        logging.warning("Removing existing output file '%s'" % outfile)
        os.remove(outfile)
    # Prefix for temporary output
    prefix = "fastq_strand_"
    # Working directory
    if working_dir is None:
        working_dir = os.getcwd()
    else:
        working_dir = os.path.abspath(working_dir)
        if not os.path.isdir(working_dir):
            raise Exception("Bad working directory: %s" % working_dir)
    print "Working directory: %s" % working_dir
    # Make subset of input read pairs
    nreads = sum(1 for i in getreads(os.path.abspath(args.r1)))
    print "%d reads" % nreads
    if args.subset == 0:
        print "Using all read pairs in Fastq files"
        subset = nreads
    elif args.subset > nreads:
        print "Actual number of read pairs smaller than requested subset"
        subset = nreads
    else:
        subset = args.subset
        print "Using random subset of %d read pairs" % subset
    if subset == nreads:
        subset_indices = [i for i in xrange(nreads)]
    else:
        subset_indices = random.sample(xrange(nreads),subset)
    fqs_in = filter(lambda fq: fq is not None,(args.r1,args.r2))
    fastqs = []
    for fq in fqs_in:
        fq_subset = os.path.join(working_dir,
                                 os.path.basename(fq))
        if fq_subset.endswith(".gz"):
            fq_subset = '.'.join(fq_subset.split('.')[:-1])
        fq_subset = "%s.subset.fq" % '.'.join(fq_subset.split('.')[:-1])
        with open(fq_subset,'w') as fp:
            for read in getreads_subset(os.path.abspath(fq),
                                        subset_indices):
                fp.write('\n'.join(read) + '\n')
        fastqs.append(fq_subset)
    # Make directory to keep output from STAR
    if args.keep_star_output:
        star_output_dir = os.path.join(outdir,
                                       "STAR.%s.outputs" %
                                       os.path.basename(
                                           strip_ngs_extensions(args.r1)))
        print "Output from STAR will be copied to %s" % star_output_dir
        # Check if directory already exists from earlier run
        if os.path.exists(star_output_dir):
            # Move out of the way
            i = 0
            backup_dir = "%s.bak" % star_output_dir
            while os.path.exists(backup_dir):
                i += 1
                backup_dir = "%s.bak%s" % (star_output_dir,i)
            logging.warning("Moving existing output directory to %s" %
                            backup_dir)
            os.rename(star_output_dir,backup_dir)
        # Make the directory
        os.mkdir(star_output_dir)
    # Write output to a temporary file
    with tempfile.TemporaryFile() as fp:
        # Iterate over genome indices
        for star_genomedir in star_genomedirs:
            # Basename for output for this genome
            try:
                name = genome_names[star_genomedir]
            except KeyError:
                name = star_genomedir
            # Build a command line to run STAR
            star_cmd = [star_exe]
            star_cmd.extend([
                '--runMode','alignReads',
                '--genomeLoad','NoSharedMemory',
                '--genomeDir',os.path.abspath(star_genomedir)])
            star_cmd.extend(['--readFilesIn',
                             fastqs[0]])
            if len(fastqs) > 1:
                star_cmd.append(fastqs[1])
            star_cmd.extend([
                '--quantMode','GeneCounts',
                '--outSAMtype','BAM','Unsorted',
                '--outSAMstrandField','intronMotif',
                '--outFileNamePrefix',prefix,
                '--runThreadN',str(args.n)])
            print "Running %s" % ' '.join(star_cmd)
            try:
                subprocess.check_output(star_cmd,cwd=working_dir)
            except subprocess.CalledProcessError as ex:
                raise Exception("STAR returned non-zero exit code: %s" %
                                ex.returncode)
            # Save the outputs
            if args.keep_star_output:
                # Make a subdirectory for this genome index
                genome_dir = os.path.join(star_output_dir,
                                          name.replace(os.sep,"_"))
                print "Copying STAR outputs to %s" % genome_dir
                os.mkdir(genome_dir)
                for f in os.listdir(working_dir):
                    if f.startswith(prefix):
                        shutil.copy(os.path.join(working_dir,f),
                                    os.path.join(genome_dir,f))
            # Process the STAR output
            star_tab_file = os.path.join(working_dir,
                                         "%sReadsPerGene.out.tab" % prefix)
            if not os.path.exists(star_tab_file):
                raise Exception("Failed to find .out file: %s" % star_tab_file)
            sum_col2 = 0
            sum_col3 = 0
            sum_col4 = 0
            with open(star_tab_file) as out:
                for i,line in enumerate(out):
                    if i < 4:
                        # Skip first four lines
                        continue
                    # Process remaining delimited columns
                    cols = line.rstrip('\n').split('\t')
                    sum_col2 += int(cols[1])
                    sum_col3 += int(cols[2])
                    sum_col4 += int(cols[3])
            print "Sums:"
            print "- col2: %d" % sum_col2
            print "- col3: %d" % sum_col3
            print "- col4: %d" % sum_col4
            if sum_col2 > 0.0:
                forward_1st = float(sum_col3)/float(sum_col2)*100.0
                reverse_2nd = float(sum_col4)/float(sum_col2)*100.0
            else:
                logging.warning("Sum of mapped reads is zero!")
                forward_1st = 0.0
                reverse_2nd = 0.0
            print "Strand percentages:"
            print "- 1st forward: %.2f%%" % forward_1st
            print "- 2nd reverse: %.2f%%" % reverse_2nd
            # Append to output file
            data = [name,
                    "%.2f" % forward_1st,
                    "%.2f" % reverse_2nd]
            if args.counts:
                data.extend([sum_col2,sum_col3,sum_col4])
            fp.write("%s\n" % "\t".join([str(d) for d in data]))
        # Finished iterating over genomes
        # Rewind temporary output file
        fp.seek(0)
        with open(outfile,'w') as out:
            # Header
            out.write("#fastq_strand version: %s\t"
                      "#Aligner: %s\t"
                      "#Reads in subset: %s\n" % (__version__,
                                                  "STAR",
                                                  subset))
            columns = ["Genome","1st forward","2nd reverse"]
            if args.counts:
                columns.extend(["Unstranded",
                                "1st read strand aligned",
                                "2nd read strand aligned"])
            out.write("#%s\n" % "\t".join(columns))
            # Copy content from temp to final file
            for line in fp:
                out.write(line)
    return 0
Example #11
def main():
    # Deal with command line
    p = argparse.ArgumentParser(description="Generate QC report for each "
                                "directory DIR")
    p.add_argument('-v',
                   '--version',
                   action='version',
                   version="%(prog)s " + __version__)
    p.add_argument('--protocol',
                   action='store',
                   dest='qc_protocol',
                   default=None,
                   help="explicitly specify QC protocol (must be one of "
                   "%s). Default is to determine the protocol "
                   "automatically (recommended)" %
                   str(','.join(["'%s'" % pr for pr in PROTOCOLS])))
    p.add_argument('--qc_dir',
                   action='store',
                   dest='qc_dir',
                   default=None,
                   help="explicitly specify QC output directory (nb if "
                   "supplied then the same QC_DIR will be used for each "
                   "DIR. Non-absolute paths are assumed to be relative to "
                   "DIR). Default: 'qc'")
    p.add_argument('--fastq_dir',
                   action='store',
                   dest='fastq_dir',
                   default=None,
                   help="explicitly specify subdirectory of DIRs with "
                   "Fastq files to run the QC on")
    reporting = p.add_argument_group('Reporting options')
    reporting.add_argument('-t',
                           '--title',
                           action='store',
                           dest='title',
                           default=None,
                           help="title for output QC reports")
    reporting.add_argument('-f',
                           '--filename',
                           action='store',
                           dest='filename',
                           default=None,
                           help="file name for output HTML QC report "
                           "(default: <DIR>/<QC_DIR>_report.html)")
    reporting.add_argument('--zip',
                           action='store_true',
                           dest='zip',
                           default=False,
                           help="make ZIP archive for the QC report")
    reporting.add_argument('--multiqc',
                           action='store_true',
                           dest='multiqc',
                           default=False,
                           help="generate MultiQC report")
    reporting.add_argument('--force',
                           action='store_true',
                           dest='force',
                           default=False,
                           help="force generation of reports even if "
                           "verification fails")
    data_dir_group = reporting.add_mutually_exclusive_group()
    data_dir_group.add_argument('--data-dir',
                                action='store_true',
                                dest='use_data_dir',
                                help="create a data directory with copies "
                                "of QC artefacts needed for the HTML "
                                "report (NB data directory will always "
                                "be created for multi-project reports, "
                                "unless --no-data-dir is specified)")
    data_dir_group.add_argument('--no-data-dir',
                                action='store_true',
                                dest='no_data_dir',
                                help="don't a data directory with copies "
                                "of QC artefacts (this is the default "
                                "except for multi-project reports)")
    verification = p.add_argument_group('Verification options')
    verification.add_argument('--verify',
                              action='store_true',
                              dest='verify',
                              help="verify the QC products only (don't "
                              "write the report); returns exit code 0 "
                              "if QC is verified, 1 if not")
    deprecated = p.add_argument_group('Deprecated options')
    deprecated.add_argument('-l',
                            '--list-unverified',
                            action='store_true',
                            dest='list_unverified',
                            default=False,
                            help="deprecated: does nothing (Fastqs with "
                            "missing QC outputs can no longer be listed)")
    deprecated.add_argument('--strand_stats',
                            action='store_true',
                            dest='fastq_strand',
                            default=False,
                            help="deprecated: does nothing (strand stats "
                            "are automatically included if present)")
    p.add_argument('dirs',
                   metavar="DIR",
                   nargs='+',
                   help="directory to report QC for; can be a project "
                   "directory (in which case the default QC directory "
                   "will be reported), or a QC directory within a "
                   "project")
    args = p.parse_args()

    # Report name and version
    print("%s version %s" % (os.path.basename(sys.argv[0]), __version__))

    # Report arguments
    if sys.argv[1:]:
        print("\n%s" % ' '.join(
            ['"%s"' % arg if ' ' in arg else arg for arg in sys.argv[1:]]))

    # Report working directory
    print("\nCWD %s" % os.getcwd())

    # Check for MultiQC if required
    if args.multiqc:
        if find_program("multiqc") is None:
            logging.critical("MultiQC report requested but 'multiqc' "
                             "not available")
            sys.exit(1)

    # Get projects and QC dirs from supplied directories
    projects = []
    for d in args.dirs:
        print("\n**** Examining directory %s ****" % d)
        # Check if directory is a QC dir
        qc_dir = None
        # Look for 'qc.info' in current directory
        if os.path.exists(os.path.join(os.path.abspath(d), 'qc.info')):
            print("...located 'qc.info', assuming this is QC dir")
            qc_dir = os.path.abspath(d)
            # Locate parent project dir
            metadata_file = locate_project_info_file(qc_dir)
            if metadata_file is not None:
                p = AnalysisProject(os.path.dirname(metadata_file))
                print("...located parent project: %s" % p.dirn)
            else:
                # Unable to locate project directory
                print("...failed to locate parent project metadata file")
                # Fall back to location of Fastq files
                qc_info = AnalysisProjectQCDirInfo(
                    os.path.join(qc_dir, 'qc.info'))
                if qc_info.fastq_dir is not None:
                    project_dir = os.path.abspath(qc_info.fastq_dir)
                    if os.path.basename(project_dir).startswith('fastqs'):
                        # Use the next level up
                        project_dir = os.path.dirname(project_dir)
                    print("...putative parent project dir: %s (from "
                          " Fastq dir)" % project_dir)
                    p = AnalysisProject(project_dir)
                else:
                    # Failed to locate Fastqs
                    logging.fatal("Unable to locate parent project")
                    # Exit with an error
                    sys.exit(1)
            # Issue a warning if a QC dir was explicitly
            # specified on the command line
            if args.qc_dir is not None:
                logging.warning("--qc_dir has been ignored for this "
                                "directory")
        else:
            # Assume directory is a project
            p = AnalysisProject(os.path.abspath(d))
            print("...assuming this is a project dir")
            # Identify the QC directory
            if args.qc_dir is None:
                qc_dir = p.qc_dir
            else:
                qc_dir = args.qc_dir
            if not os.path.isabs(qc_dir):
                qc_dir = os.path.join(p.dirn, qc_dir)
            print("...QC directory: %s" % qc_dir)
        # Explicitly set the QC directory location
        p.use_qc_dir(qc_dir)
        # Locate the Fastq dir
        qc_info = p.qc_info(qc_dir)
        if args.fastq_dir is None:
            fastq_dir = qc_info.fastq_dir
            if fastq_dir is None:
                fastq_dir = p.fastq_dir
        else:
            fastq_dir = args.fastq_dir
            if qc_info.fastq_dir is not None:
                if os.path.join(p.dirn, qc_info.fastq_dir) != fastq_dir:
                    logging.warning("Stored fastq dir mismatch "
                                    "(%s != %s)" %
                                    (fastq_dir, qc_info.fastq_dir))
        print("...using Fastqs dir: %s" % p.fastq_dir)
        p.use_fastq_dir(fastq_dir, strict=False)
        projects.append(p)

    # Verify QC for projects
    print("\n**** Verifying QC ****")
    retval = 0
    report_projects = []
    for p in projects:
        print("\nProject: %s" % p.name)
        print("-" * (len('Project: ') + len(p.name)))
        print("%d sample%s | %d fastq%s" % (
            len(p.samples),
            's' if len(p.samples) != 1 else '',
            len(p.fastqs),
            's' if len(p.fastqs) != 1 else '',
        ))
        # QC metadata
        qc_dir = p.qc_dir
        qc_info = p.qc_info(qc_dir)
        # Set QC protocol for verification
        if args.qc_protocol is None:
            protocol = qc_info.protocol
            if protocol is None:
                protocol = determine_qc_protocol(p)
        else:
            protocol = args.qc_protocol
        print("Verifying against QC protocol '%s'" % protocol)
        # Verification step
        if len(p.fastqs) == 0:
            logging.critical("No Fastqs!")
            verified = False
        else:
            try:
                verified = verify_project(p, qc_dir, protocol)
            except Exception as ex:
                logging.critical("Error: %s" % ex)
                verified = False
        if not verified:
            print("Verification: FAILED")
            if not args.force:
                retval = 1
                continue
            else:
                print("--force specified, ignoring previous errors")
        else:
            print("Verification: OK")
            if args.verify:
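                # Verify-only mode: don't queue the project for reporting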
                continue
        report_projects.append(p)

    # Generate QC report
    if report_projects:
        # Set defaults from primary project
        p = report_projects[0]
        qc_base = os.path.basename(p.qc_dir)
        # Filename and location for report
        if args.filename is None:
            out_file = '%s_report.html' % qc_base
        else:
            out_file = args.filename
        if not os.path.isabs(out_file):
            out_file = os.path.join(p.dirn, out_file)
        out_dir = os.path.dirname(out_file)
        # MultiQC report
        if args.multiqc:
            multiqc_report = os.path.join(out_dir,
                                          "multi%s_report.html" % qc_base)
            # Check if we need to rerun MultiQC
            if os.path.exists(multiqc_report) and not args.force:
                run_multiqc = False
                for p in report_projects:
                    multiqc_mtime = os.path.getmtime(multiqc_report)
                    for f in os.listdir(p.qc_dir):
                        if os.path.getmtime(os.path.join(p.qc_dir,f)) > \
                           multiqc_mtime:
                            # Input is newer than report
                            run_multiqc = True
                            break
            else:
                run_multiqc = True
            # (Re)run MultiQC
            if run_multiqc:
                multiqc_cmd = Command('multiqc', '--title', '%s' % args.title,
                                      '--filename', '%s' % multiqc_report,
                                      '--force')
                for p in report_projects:
                    multiqc_cmd.add_args(p.qc_dir)
                print("\nRunning %s" % multiqc_cmd)
                multiqc_retval = multiqc_cmd.run_subprocess()
                if multiqc_retval == 0 and os.path.exists(multiqc_report):
                    print("MultiQC: %s\n" % multiqc_report)
                else:
                    print("MultiQC: FAILED")
                    retval += 1
            else:
                print("MultiQC: %s (already exists)\n" % multiqc_report)
        # Create data directory?
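        # (default: use a data directory when reporting more than one
        # project, unless overridden on the command line)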
        use_data_dir = (len(projects) > 1)
        if args.use_data_dir:
            use_data_dir = True
        elif args.no_data_dir:
            use_data_dir = False
        # Generate report
        report_html = report(report_projects,
                             title=args.title,
                             filename=out_file,
                             relative_links=True,
                             use_data_dir=use_data_dir,
                             make_zip=args.zip)
        print("Wrote QC report to %s" % out_file)
    # Finish with appropriate exit code
    print("%s completed: exit code %s (%s)" %
          (os.path.basename(sys.argv[0]), retval,
           ('ok' if retval == 0 else 'error')))
    sys.exit(retval)
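
# A minimal sketch of the mtime-based "rerun MultiQC?" test used above,
# factored into a standalone helper ('multiqc_needs_rerun' is a
# hypothetical name, not part of the original code; assumes 'os' is
# imported as in the examples above):
def multiqc_needs_rerun(report_path, qc_dirs):
    """
    Return True if the MultiQC report is missing, or if any file in
    the supplied QC directories is newer than the report
    """
    if not os.path.exists(report_path):
        return True
    report_mtime = os.path.getmtime(report_path)
    for qc_dir in qc_dirs:
        for f in os.listdir(qc_dir):
            if os.path.getmtime(os.path.join(qc_dir, f)) > report_mtime:
                # At least one input is newer than the report
                return True
    return False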
Example #12
def bcl_to_fastq_10x_chromium_sc_atac(ap,
                                      output_dir,
                                      sample_sheet,
                                      primary_data_dir,
                                      lanes=None,
                                      bases_mask=None,
                                      cellranger_jobmode=None,
                                      cellranger_maxjobs=None,
                                      cellranger_mempercore=None,
                                      cellranger_jobinterval=None,
                                      cellranger_localcores=None,
                                      cellranger_localmem=None,
                                      log_dir=None):
    """
    Generate FASTQ files for 10xGenomics single-cell ATAC-seq run

    Performs FASTQ generation from raw BCL files produced by an
    Illumina sequencer using the 10xGenomics Chromium single-cell
    (sc) ATAC-seq protocol, by running 'cellranger-atac mkfastq'.

    Arguments:
      ap (AutoProcessor): autoprocessor pointing to the analysis
        directory to create Fastqs for
      output_dir (str): output directory for bcl-to-fastq conversion
      sample_sheet (str): path to input sample sheet file
      primary_data_dir (str): path to the top-level directory holding
        the sequencing data
      bases_mask (str): if set then use this as an alternative bases
        mask setting (default is to acquire from the autoprocessor
        parameters)
      ...TBD...
    """
    # Load run data
    illumina_run = IlluminaData.IlluminaRun(primary_data_dir,
                                            platform=ap.metadata.platform)
    # Deal with bases mask
    if bases_mask is None:
        bases_mask = ap.params.bases_mask
    if bases_mask == 'auto':
        # Update bases mask to only use first 8 bases from
        # first index e.g. I8nnnnnnnn and convert second index
        # to read e.g. Y16
        print "Determining bases mask from RunInfo.xml"
        bases_mask = get_bases_mask_10x_atac(illumina_run.runinfo_xml)
        print "Bases mask: %s (updated for 10x scATAC-seq)" % bases_mask
        if not bases_mask_is_valid(bases_mask):
            raise Exception("Invalid bases mask: '%s'" % bases_mask)
    # Check we have cellranger-atac
    cellranger_atac = find_program('cellranger-atac')
    if not cellranger_atac:
        raise Exception("No cellranger package found")
    cellranger_package_info = cellranger_info(cellranger_atac)
    print "Using cellranger-atac %s: %s" % \
        (cellranger_package_info[-1],
         cellranger_atac)
    # Check we have bcl2fastq
    bcl2fastq = find_program('bcl2fastq')
    if not bcl2fastq:
        raise Exception("No bcl2fastq package found")
    bcl2fastq = available_bcl2fastq_versions(
        paths=(os.path.dirname(bcl2fastq), ), reqs='>=2.17')
    if not bcl2fastq:
        raise Exception("No appropriate bcl2fastq software " "located")
    bcl2fastq = bcl2fastq[0]
    bcl2fastq_info = bcl_to_fastq_info(bcl2fastq)
    print "Using bcl2fastq %s: %s" % (bcl2fastq_info[-1], bcl2fastq)
    # Store info on bcl2fastq package
    ap.metadata['bcl2fastq_software'] = bcl2fastq_info
    # Store info on cellranger package
    ap.metadata['cellranger_software'] = cellranger_package_info
    # Put a copy of sample sheet in the log directory
    shutil.copy(sample_sheet, log_dir)
    # Determine output directory absolute path
    if not os.path.isabs(output_dir):
        output_dir = os.path.join(ap.analysis_dir, output_dir)
    # Working directory (set to analysis dir)
    working_dir = ap.analysis_dir
    # Report values and settings
    print "Cellranger-atac exe    : %s" % cellranger_atac
    print "Cellranger-atac version: %s %s" % (cellranger_package_info[1],
                                              cellranger_package_info[2])
    print "Bcl-to-fastq exe       : %s" % bcl2fastq
    print "Bcl-to-fastq version   : %s %s" % (bcl2fastq_info[1],
                                              bcl2fastq_info[2])
    print "Sample sheet           : %s" % os.path.basename(sample_sheet)
    print "Bases mask             : %s" % bases_mask
    print "Cellranger jobmode     : %s" % cellranger_jobmode
    print "Cellranger maxjobs     : %s" % cellranger_maxjobs
    print "Cellranger mempercore  : %s" % cellranger_mempercore
    print "Cellranger jobinterval : %s" % cellranger_jobinterval
    print "Cellranger localcores  : %s" % cellranger_localcores
    print "Cellranger localmem    : %s" % cellranger_localmem
    print "Working directory      : %s" % working_dir
    print "Log directory          : %s" % log_dir
    # Run cellranger-atac mkfastq
    try:
        return run_cellranger_mkfastq(
            sample_sheet=sample_sheet,
            primary_data_dir=primary_data_dir,
            output_dir=output_dir,
            lanes=(None if lanes is None else ','.join([str(l)
                                                        for l in lanes])),
            bases_mask=bases_mask,
            cellranger_exe=cellranger_atac,
            cellranger_jobmode=cellranger_jobmode,
            cellranger_maxjobs=cellranger_maxjobs,
            cellranger_mempercore=cellranger_mempercore,
            cellranger_jobinterval=cellranger_jobinterval,
            cellranger_localcores=cellranger_localcores,
            cellranger_localmem=cellranger_localmem,
            working_dir=working_dir,
            log_dir=log_dir)
    except Exception as ex:
        raise Exception("'cellranger-atac mkfastq' failed: " "'%s'" % ex)
Example #13
def make_fastqs(ap,
                protocol='standard',
                platform=None,
                unaligned_dir=None,
                sample_sheet=None,
                lanes=None,
                ignore_missing_bcl=False,
                ignore_missing_stats=False,
                skip_rsync=False,
                remove_primary_data=False,
                nprocessors=None,
                require_bcl2fastq_version=None,
                bases_mask=None,
                no_lane_splitting=None,
                minimum_trimmed_read_length=None,
                mask_short_adapter_reads=None,
                generate_stats=True,
                stats_file=None,
                per_lane_stats_file=None,
                analyse_barcodes=True,
                barcode_analysis_dir=None,
                skip_fastq_generation=False,
                only_fetch_primary_data=False,
                create_empty_fastqs=None,
                runner=None,
                cellranger_jobmode=None,
                cellranger_mempercore=None,
                cellranger_maxjobs=None,
                cellranger_jobinterval=None,
                cellranger_localcores=None,
                cellranger_localmem=None,
                cellranger_ignore_dual_index=False):
    """Create and summarise FASTQ files

    Wrapper for operations related to FASTQ file generation and analysis.
    The operations are typically:
 
    - get primary data (BCL files)
    - run bcl-to-fastq conversion
    - generate statistics

    If the number of processors and the job runner are not explicitly
    specified then these are taken from the settings for the bcl2fastq
    and the statistics generation steps, which may differ from each other.
    However if either of these values is set explicitly then the same
    value will be used for both steps.

    Arguments:
      ap (AutoProcessor): autoprocessor pointing to the analysis
        directory to create Fastqs for
      protocol (str): if set then specifies the protocol to use
        for fastq generation, otherwise use the 'standard' bcl2fastq
        protocol
      platform (str): if set then specifies the sequencing platform
        (otherwise platform will be determined from the primary data)
      unaligned_dir (str): if set then use this as the output directory
        for bcl-to-fastq conversion. Default is 'bcl2fastq' (unless
        an alternative is already specified in the config file)
      sample_sheet (str): if set then use this as the input samplesheet
      lanes (list): (optional) specify a list of lane numbers to
        use in the processing; lanes not in the list will be excluded
        (default is to include all lanes)
      nprocessors (int) : number of processors to run bclToFastq.py with
      ignore_missing_bcl (bool): if True then run bcl2fastq with
        --ignore-missing-bcl
      ignore_missing_stats (bool): if True then run bcl2fastq with
        --ignore-missing-stats
      skip_rsync (bool): if True then don't rsync primary data at the
        start of bcl2fastq conversion
      remove_primary_data (bool): if True then remove primary data at
        the end of bcl2fastq conversion (default is to keep it)
      generate_stats (bool): if True then (re)generate statistics file
        for fastqs
      analyse_barcodes (bool): if True then (re)analyse barcodes for
        fastqs
      require_bcl2fastq_version (str): (optional) specify bcl2fastq
        version to use. Should be a string of the form '1.8.4' or
        '>2.0'. Set to None to automatically determine required
        bcl2fastq version.
      bases_mask (str): if set then use this as an alternative bases
        mask setting
      no_lane_splitting (bool): if True then run bcl2fastq with
        --no-lane-splitting
      minimum_trimmed_read_length (int): if set then specify minimum
        length for reads after adapter trimming (shorter reads will
        be padded with Ns to make them long enough)
      mask_short_adapter_reads (int): if set then specify the minimum
        length of ACGT bases that must be present in a read after
        adapter trimming for it not to be masked completely
        with Ns.
      stats_file (str): if set then use this as the name of the output
        per-fastq stats file.
      per_lane_stats_file (str): if set then use this as the name of
        the output per-lane stats file.
      barcode_analysis_dir (str): if set then specifies path to the
        output directory for barcode analysis
      skip_fastq_generation (bool): if True then don't perform fastq
        generation
      only_fetch_primary_data (bool): if True then fetch primary data,
        don't do anything else
      create_empty_fastqs (bool): if True then create empty 'placeholder'
        fastq files for any missing fastqs after bcl2fastq
        (must have completed with zero exit status)
      runner (JobRunner): (optional) specify a non-default job runner
        to use for fastq generation
      cellranger_jobmode (str): (optional) job mode to run cellranger in
        (10xGenomics Chromium SC data only)
      cellranger_mempercore (int): (optional) memory assumed per core
        (in Gbs) (10xGenomics Chromium SC data only)
      cellranger_maxjobs (int): (optional) maximum number of concurrent
         jobs to run (10xGenomics Chromium SC data only)
      cellranger_jobinterval (int): (optional) how often jobs are
         submitted (in ms) (10xGenomics Chromium SC data only)
      cellranger_localcores (int): (optional) maximum number of cores
         cellranger can request in jobmode 'local' (10xGenomics Chromium
         SC data only)
      cellranger_localmem (int): (optional) maximum memory cellranger
         can request in jobmode 'local' (10xGenomics Chromium SC data
         only)
      cellranger_ignore_dual_index (bool): (optional) on a dual-indexed
         flowcell where the second index was not used for the 10x
         sample, ignore it (10xGenomics Chromium SC data only)
    """
    # Report protocol
    print "Protocol              : %s" % protocol
    if protocol not in MAKE_FASTQS_PROTOCOLS:
        raise Exception("Unknown protocol: '%s' (must be one of "
                        "%s)" % (protocol, ','.join([MAKE_FASTQS_PROTOCOLS])))
    # Unaligned dir
    if unaligned_dir is not None:
        ap.params['unaligned_dir'] = unaligned_dir
    elif ap.params['unaligned_dir'] is None:
        ap.params['unaligned_dir'] = 'bcl2fastq'
    print "Output dir            : %s" % ap.params.unaligned_dir
    # Sample sheet
    if sample_sheet is None:
        sample_sheet = ap.params.sample_sheet
    if not os.path.isabs(sample_sheet):
        sample_sheet = os.path.join(ap.analysis_dir, sample_sheet)
    if not os.path.isfile(sample_sheet):
        raise Exception("Missing sample sheet '%s'" % sample_sheet)
    ap.params['sample_sheet'] = sample_sheet
    print "Source sample sheet   : %s" % ap.params.sample_sheet
    # Check requested lanes are actually present
    print "Lanes                 : %s" % ('all' if lanes is None else ','.join(
        [str(l) for l in lanes]))
    if lanes is not None:
        s = IlluminaData.SampleSheet(ap.params.sample_sheet)
        if not s.has_lanes:
            raise Exception("Requested subset of lanes but "
                            "samplesheet doesn't contain any "
                            "lane information")
        samplesheet_lanes = list(set([l['Lane'] for l in s]))
        for l in lanes:
            if l not in samplesheet_lanes:
                raise Exception("Requested lane '%d' not present "
                                "in samplesheet" % l)
    # Make a temporary sample sheet
    if lanes:
        lanes_id = ".L%s" % ''.join([str(l) for l in lanes])
    else:
        lanes_id = ""
    sample_sheet = os.path.join(
        ap.tmp_dir,
        "SampleSheet%s.%s.csv" % (lanes_id, time.strftime("%Y%m%d%H%M%S")))
    make_custom_sample_sheet(ap.params.sample_sheet, sample_sheet, lanes=lanes)
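    # (e.g. lanes=[1,2] gives a temporary sample sheet called
    # 'SampleSheet.L12.<timestamp>.csv' under ap.tmp_dir)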
    # Check the temporary sample sheet
    print "Checking temporary sample sheet"
    invalid_barcodes = SampleSheetLinter(
        sample_sheet_file=sample_sheet).has_invalid_barcodes()
    if invalid_barcodes:
        logger.error("Invalid barcodes detected")
        for line in invalid_barcodes:
            logger.critical("%s" % line)
    invalid_characters = SampleSheetLinter(
        sample_sheet_file=sample_sheet).has_invalid_characters()
    if invalid_characters:
        logger.critical("Invalid non-printing/non-ASCII characters "
                        "detected")
    if invalid_barcodes or invalid_characters:
        raise Exception("Errors detected in generated sample sheet")
    # Adjust verification settings for 10xGenomics Chromium SC
    # data if necessary
    verify_include_sample_dir = False
    if has_chromium_sc_indices(sample_sheet):
        if protocol in (
                '10x_chromium_sc',
                '10x_chromium_sc_atac',
        ):
            # Force inclusion of sample-name subdirectories
            # when verifying Chromium SC data
            print "Sample sheet includes Chromium SC indices"
            verify_include_sample_dir = True
        else:
            # Chromium SC indices detected but not using
            # 10x_chromium_sc protocol
            raise Exception("Detected 10xGenomics Chromium SC indices "
                            "in generated sample sheet but protocol "
                            "'%s' has been specified; use an "
                            "appropriate '10x_...' protocol for these "
                            "indices" % protocol)
    # Check for pre-existing Fastq outputs
    if verify_fastq_generation(ap,
                               unaligned_dir=ap.params.unaligned_dir,
                               lanes=lanes,
                               include_sample_dir=verify_include_sample_dir):
        print "Expected Fastq outputs already present"
        skip_rsync = True
        skip_fastq_generation = True
    # Check if there's anything to do
    if (skip_rsync and skip_fastq_generation) and \
       not (generate_stats or analyse_barcodes):
        print "Nothing to do"
        return
    # Log dir
    log_dir = 'make_fastqs'
    if protocol != 'standard':
        log_dir += "_%s" % protocol
    if lanes:
        log_dir += "_L%s" % ''.join([str(l) for l in sorted(lanes)])
    ap.set_log_dir(ap.get_log_subdir(log_dir))
    # Fetch primary data
    if not skip_rsync and not ap.params.acquired_primary_data:
        if get_primary_data(ap) != 0:
            logger.error("Failed to acquire primary data")
            raise Exception("Failed to acquire primary data")
        else:
            ap.params['acquired_primary_data'] = True
    if only_fetch_primary_data:
        return
    # Deal with platform information
    if not platform:
        platform = ap.metadata.platform
    # Do fastq generation using the specified protocol
    if not skip_fastq_generation:
        # Set primary data location and report info
        primary_data_dir = os.path.join(ap.params.primary_data_dir,
                                        os.path.basename(ap.params.data_dir))
        print "Primary data dir      : %s" % primary_data_dir
        try:
            illumina_run = IlluminaData.IlluminaRun(primary_data_dir,
                                                    platform=platform)
        except IlluminaData.IlluminaDataPlatformError as ex:
            logger.critical("Error loading primary data: %s" % ex)
            if platform is None:
                logger.critical("Try specifying platform using --platform?")
            else:
                logger.critical("Check specified platform is valid (or "
                                "omit --platform")
            raise Exception("Error determining sequencer platform")
        print "Platform              : %s" % illumina_run.platform
        print "Bcl format            : %s" % illumina_run.bcl_extension
        # Set platform in metadata
        ap.metadata['platform'] = illumina_run.platform
        # Bases mask
        if bases_mask is not None:
            ap.params['bases_mask'] = bases_mask
        bases_mask = ap.params.bases_mask
        print "Bases mask setting    : %s" % bases_mask
        if protocol not in (
                '10x_chromium_sc',
                '10x_chromium_sc_atac',
        ):
            if bases_mask == "auto":
                print "Determining bases mask from RunInfo.xml"
                bases_mask = get_bases_mask(illumina_run.runinfo_xml,
                                            sample_sheet)
                if not bases_mask_is_valid(bases_mask):
                    raise Exception("Invalid bases mask: '%s'" % bases_mask)
        # Do fastq generation according to protocol
        if protocol == 'icell8':
            # ICell8 data
            # Update bcl2fastq settings appropriately
            print "Updating read trimming and masking for ICell8"
            minimum_trimmed_read_length = 21
            mask_short_adapter_reads = 0
            # Reset the default bases mask
            bases_mask = IlluminaData.IlluminaRunInfo(
                illumina_run.runinfo_xml).bases_mask
            bases_mask = get_icell8_bases_mask(bases_mask,
                                               sample_sheet=sample_sheet)
            if not bases_mask_is_valid(bases_mask):
                raise Exception("Invalid bases mask: '%s'" % bases_mask)
            # Switch to standard protocol
            protocol = 'standard'
        if protocol == 'standard':
            # Standard protocol
            try:
                exit_code = bcl_to_fastq(
                    ap,
                    unaligned_dir=ap.params.unaligned_dir,
                    sample_sheet=sample_sheet,
                    primary_data_dir=primary_data_dir,
                    require_bcl2fastq=require_bcl2fastq_version,
                    bases_mask=bases_mask,
                    ignore_missing_bcl=ignore_missing_bcl,
                    ignore_missing_stats=ignore_missing_stats,
                    no_lane_splitting=no_lane_splitting,
                    minimum_trimmed_read_length=minimum_trimmed_read_length,
                    mask_short_adapter_reads=mask_short_adapter_reads,
                    nprocessors=nprocessors,
                    runner=runner)
            except Exception as ex:
                raise Exception("Bcl2fastq stage failed: '%s'" % ex)
        elif protocol == '10x_chromium_sc':
            # 10xGenomics Chromium SC
            if bases_mask == 'auto':
                bases_mask = None
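                # ('auto' is not meaningful to cellranger, so pass None
                # i.e. don't supply an explicit bases mask)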
            try:
                # Check we have cellranger
                cellranger = find_program('cellranger')
                if not cellranger:
                    raise Exception("No cellranger package found")
                cellranger_software_info = cellranger_info(cellranger)
                print "Using cellranger %s: %s" % \
                    (cellranger_software_info[-1],
                     cellranger)
                # Check we have bcl2fastq
                bcl2fastq = find_program('bcl2fastq')
                if not bcl2fastq:
                    raise Exception("No bcl2fastq package found")
                bcl2fastq = available_bcl2fastq_versions(
                    paths=(os.path.dirname(bcl2fastq), ), reqs='>=2.17')
                if not bcl2fastq:
                    raise Exception("No appropriate bcl2fastq software "
                                    "located")
                bcl2fastq = bcl2fastq[0]
                bcl2fastq_info = bcl_to_fastq_info(bcl2fastq)
                print "Using bcl2fastq %s: %s" % (bcl2fastq_info[-1],
                                                  bcl2fastq)
                # Store info on bcl2fastq package
                ap.metadata['bcl2fastq_software'] = bcl2fastq_info
                # Store info on cellranger package
                ap.metadata['cellranger_software'] = cellranger_software_info
                # Put a copy of sample sheet in the log directory
                shutil.copy(sample_sheet, ap.log_dir)
                # Determine output directory absolute path
                output_dir = ap.params.unaligned_dir
                if not os.path.isabs(output_dir):
                    output_dir = os.path.join(ap.analysis_dir, output_dir)
                # Run cellranger mkfastq
                exit_code = run_cellranger_mkfastq(
                    sample_sheet=sample_sheet,
                    primary_data_dir=primary_data_dir,
                    output_dir=output_dir,
                    lanes=(None if lanes is None else ','.join(
                        [str(l) for l in lanes])),
                    bases_mask=bases_mask,
                    cellranger_exe=cellranger,
                    cellranger_jobmode=cellranger_jobmode,
                    cellranger_maxjobs=cellranger_maxjobs,
                    cellranger_mempercore=cellranger_mempercore,
                    cellranger_jobinterval=cellranger_jobinterval,
                    cellranger_localcores=cellranger_localcores,
                    cellranger_localmem=cellranger_localmem,
                    working_dir=ap.analysis_dir,
                    log_dir=ap.log_dir)
            except Exception as ex:
                raise Exception("'cellranger mkfastq' stage failed: "
                                "'%s'" % ex)
            # Turn off barcode analysis
            analyse_barcodes = False
        elif protocol == '10x_chromium_sc_atac':
            # 10xGenomics Chromium scATAC-seq
            exit_code = bcl_to_fastq_10x_chromium_sc_atac(
                ap,
                output_dir=ap.params.unaligned_dir,
                sample_sheet=sample_sheet,
                primary_data_dir=primary_data_dir,
                lanes=lanes,
                bases_mask=bases_mask,
                cellranger_jobmode=cellranger_jobmode,
                cellranger_maxjobs=cellranger_maxjobs,
                cellranger_mempercore=cellranger_mempercore,
                cellranger_jobinterval=cellranger_jobinterval,
                cellranger_localcores=cellranger_localcores,
                cellranger_localmem=cellranger_localmem,
                log_dir=ap.log_dir)
            # Turn off barcode analysis
            analyse_barcodes = False
        else:
            # Unknown protocol
            raise Exception("Unknown protocol '%s'" % protocol)
        # Check the outputs
        if exit_code != 0:
            raise Exception("Fastq generation finished with error: "
                            "exit code %d" % exit_code)
        if not verify_fastq_generation(
                ap, lanes=lanes, include_sample_dir=verify_include_sample_dir):
            # Check failed
            logger.error("Failed to verify output Fastqs against "
                         "sample sheet")
            # Try to load the data from unaligned dir
            try:
                illumina_data = IlluminaData.IlluminaData(
                    ap.analysis_dir, unaligned_dir=ap.params.unaligned_dir)
            except IlluminaData.IlluminaDataError as ex:
                raise Exception("Unable to load data from %s: %s" %
                                (ap.params.unaligned_dir, ex))
            # Generate a list of missing Fastqs
            missing_fastqs = IlluminaData.list_missing_fastqs(
                illumina_data,
                sample_sheet,
                include_sample_dir=verify_include_sample_dir)
            assert (len(missing_fastqs) > 0)
            missing_fastqs_file = os.path.join(ap.log_dir,
                                               "missing_fastqs.log")
            print "Writing list of missing Fastq files to %s" % \
                missing_fastqs_file
            with open(missing_fastqs_file, 'w') as fp:
                for fq in missing_fastqs:
                    fp.write("%s\n" % fq)
            # Create empty FASTQs
            if create_empty_fastqs is None:
                try:
                    create_empty_fastqs = \
                        ap.settings.platform[ap.metadata.platform].\
                        create_empty_fastqs
                except (KeyError, AttributeError):
                    pass
            if create_empty_fastqs is None:
                create_empty_fastqs = \
                    ap.settings.bcl2fastq.create_empty_fastqs
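            # (a platform-specific setting takes precedence over the
            # global bcl2fastq default)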
            if create_empty_fastqs:
                logger.warning("Making 'empty' placeholder Fastqs")
                for fq in missing_fastqs:
                    fastq = os.path.join(ap.analysis_dir,
                                         ap.params.unaligned_dir, fq)
                    print "-- %s" % fastq
                    if not os.path.exists(os.path.dirname(fastq)):
                        mkdirs(os.path.dirname(fastq))
                    with gzip.GzipFile(filename=fastq, mode='wb') as fp:
                        # NB write bytes, not str, in binary mode
                        fp.write(b'')
            else:
                raise Exception("Fastq generation failed to produce "
                                "expected outputs")
    # Generate statistics
    if generate_stats:
        fastq_statistics(ap,
                         stats_file=stats_file,
                         per_lane_stats_file=per_lane_stats_file,
                         unaligned_dir=ap.params.unaligned_dir,
                         nprocessors=nprocessors,
                         runner=runner)
    # Run barcode analysis
    if analyse_barcodes:
        # Determine output directory
        if barcode_analysis_dir is not None:
            ap.params['barcode_analysis_dir'] = barcode_analysis_dir
        elif ap.params.barcode_analysis_dir is None:
            ap.params['barcode_analysis_dir'] = 'barcode_analysis'
        barcode_analysis_dir = ap.params.barcode_analysis_dir
        if not os.path.isabs(barcode_analysis_dir):
            barcode_analysis_dir = os.path.join(ap.params.analysis_dir,
                                                barcode_analysis_dir)
        # Report title
        title = "Barcode analysis for %s" % ap.metadata.run_name
        # Log file
        log_file = os.path.join(ap.log_dir, "analyse_barcodes.log")
        # Set up runner
        if runner is None:
            runner = ap.settings.general.default_runner
        runner.set_log_dir(ap.log_dir)
        # Get scheduler parameters
        max_jobs = ap.settings.general.max_concurrent_jobs
        poll_interval = ap.settings.general.poll_interval
        # Create and run barcode analysis pipeline
        barcode_analysis = AnalyseBarcodes(
            os.path.join(ap.params.analysis_dir, ap.params.unaligned_dir))
        barcode_analysis.run(barcode_analysis_dir,
                             title=title,
                             lanes=lanes,
                             sample_sheet=sample_sheet,
                             log_file=log_file,
                             runner=runner,
                             max_jobs=max_jobs,
                             poll_interval=poll_interval,
                             verbose=False)
    # Make a 'projects.info' metadata file
    if lanes:
        ap.update_project_metadata_file()
    else:
        ap.make_project_metadata_file()
    # Remove primary data
    # (NB the boolean 'remove_primary_data' argument shadows the helper
    # function of the same name, so do the removal inline)
    if remove_primary_data:
        primary_data = os.path.join(ap.params.primary_data_dir,
                                    os.path.basename(ap.params.data_dir))
        if os.path.isdir(primary_data):
            print("Removing %s" % primary_data)
            shutil.rmtree(primary_data)
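
# A minimal usage sketch, assuming an analysis directory that has
# already been set up (the 'AutoProcessor' import path and the argument
# values are illustrative assumptions):
if __name__ == '__main__':
    from auto_process_ngs.auto_processor import AutoProcessor
    ap = AutoProcessor(analysis_dir="/path/to/ANALYSIS_DIR")
    make_fastqs(ap,
                protocol='standard',
                lanes=[1, 2],
                generate_stats=True,
                analyse_barcodes=True)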