Exemple #1
0
 def info(self):
     """
     Report information about the directory

     Prints a plain-text summary to stdout consisting of:

     - the directory path, its size (in default units and in
       'K' units) and whether it has a cache
     - the number of files, the file extensions, compression
       types, users and groups
     - the oldest and newest entries (timestamp plus path
       relative to the directory)
     - one tab-delimited line per top-level subdirectory
     - the permission flags for the directory as a whole
     - the number of temporary files

     'print' is invoked in function-call form with a single
     pre-formatted string, which gives identical output under
     Python 2 and Python 3 (the statement form is a syntax
     error under Python 3).
     """
     # Report information
     print("Dir   : %s" % self._dirn)
     print("Size  : %s (%s)" % (utils.format_file_size(self.size),
                                utils.format_file_size(self.size, 'K')))
     print("Has cache: %s" % print_yes_no(self.has_cache))
     print("#files: %d" % len(self))
     print("File types: %s" % print_list(self.extensions))
     print("Compression types: %s" % print_list(self.compression))
     print("Users : %s" % print_list(self.users))
     print("Groups: %s" % print_list(self.groups))
     print("Oldest: %s %s" % (self.oldest.datetime.ctime(),
                              self.oldest.relpath(self._dirn)))
     print("Newest: %s %s" % (self.newest.datetime.ctime(),
                              self.newest.relpath(self._dirn)))
     # Top-level subdirectories
     print("Top-level subdirectories:")
     print("# Dir\tFiles\tSize\tFile types\tUsers\tPerms")
     for subdir in utils.list_dirs(self._dirn):
         # Build a DataDir for the subdirectory from the subset
         # of this directory's files that belong to it
         sd = DataDir(os.path.join(self._dirn, subdir),
                      files=self.files(subdir=subdir))
         print("- %s/\t%d\t%s\t%s\t%s\t%s" % (
             subdir,
             len(sd),
             utils.format_file_size(sd.size),
             print_list(sd.extensions),
             print_list(sd.users),
             print_perms(sd.usr_unreadable,
                         sd.grp_unreadable,
                         sd.grp_unwritable)))
     # File permissions
     print("File permissions:")
     print("- unreadable by owner: %s" % print_yes_no(self.usr_unreadable))
     print("- unreadable by group: %s" % print_yes_no(self.grp_unreadable))
     print("- unwritable by group: %s" % print_yes_no(self.grp_unwritable))
     print("#Temp files: %d" % len(self.list_temp()))
 def qc_dirs(self):
     """
     List QC output directories

     Returns:
       List: names of the subdirectories of 'self.dirn' (as
         reported by 'bcf_utils.list_dirs') whose names begin
         with "qc".
     """
     return [name
             for name in bcf_utils.list_dirs(self.dirn)
             if name.startswith("qc")]
Exemple #3
0
def get_numbered_subdir(name,parent_dir=None,full_path=False):
    """
    Build the name for the next numbered log subdirectory

    Numbered subdirectories follow the pattern NNN_<name>,
    for example 001_setup, 002_make_fastqs and so on.

    The index used for the new name is one more than the
    largest index found among the existing subdirectories
    of the parent directory; missing indices ('gaps') in
    the existing sequence are not reused.

    **The directory itself is not created** - that is left
    to the caller, which means that there is scope for a
    race condition between obtaining the name and making
    the directory.

    Arguments:
      name (str): name for the subdirectory (typically
        the name of the processing stage that will write
        its logs there)
      parent_dir (str): path to the directory under which
        the numbered subdirectory would live; the current
        working directory is used if this is not set
      full_path (bool): if True then the full path to the
        new subdirectory is returned; by default only the
        name relative to the parent directory is returned

    Returns:
      String: name for the new log subdirectory (or its
        full path, if 'full_path' was specified).
    """
    # Resolve the parent directory to an absolute path
    base_dir = os.path.abspath(
        os.getcwd() if parent_dir is None else parent_dir)
    # Find the largest numerical prefix amongst the existing
    # subdirectories (names without one are skipped)
    highest = 0
    for entry in bcf_utils.list_dirs(base_dir):
        prefix = entry.split('_')[0]
        try:
            index = int(prefix)
        except ValueError:
            continue
        if index > highest:
            highest = index
    # Assemble the new name, optionally as a full path
    new_name = "%03d_%s" % (highest+1,str(name))
    if not full_path:
        return new_name
    return os.path.join(base_dir,new_name)
Exemple #4
0
 def info(self):
     """
     Report information about the directory

     Prints a plain-text summary to stdout consisting of:

     - the directory path, its size (in default units and in
       'K' units) and whether it has a cache
     - the number of files, the file extensions, compression
       types, users and groups
     - the oldest and newest entries (timestamp plus path
       relative to the directory)
     - one tab-delimited line per top-level subdirectory
     - the permission flags for the directory as a whole
     - the number of temporary files

     'print' is invoked in function-call form with a single
     pre-formatted string, which gives identical output under
     Python 2 and Python 3 (the statement form is a syntax
     error under Python 3).
     """
     # Report information
     print("Dir   : %s" % self._dirn)
     print("Size  : %s (%s)" % (utils.format_file_size(self.size),
                                utils.format_file_size(self.size,'K')))
     print("Has cache: %s" % print_yes_no(self.has_cache))
     print("#files: %d" % len(self))
     print("File types: %s" % print_list(self.extensions))
     print("Compression types: %s" % print_list(self.compression))
     print("Users : %s" % print_list(self.users))
     print("Groups: %s" % print_list(self.groups))
     print("Oldest: %s %s" % (self.oldest.datetime.ctime(),
                              self.oldest.relpath(self._dirn)))
     print("Newest: %s %s" % (self.newest.datetime.ctime(),
                              self.newest.relpath(self._dirn)))
     # Top-level subdirectories
     print("Top-level subdirectories:")
     print("# Dir\tFiles\tSize\tFile types\tUsers\tPerms")
     for subdir in utils.list_dirs(self._dirn):
         # Build a DataDir for the subdirectory from the subset
         # of this directory's files that belong to it
         sd = DataDir(os.path.join(self._dirn,subdir),
                      files=self.files(subdir=subdir))
         print("- %s/\t%d\t%s\t%s\t%s\t%s" % (
             subdir,
             len(sd),
             utils.format_file_size(sd.size),
             print_list(sd.extensions),
             print_list(sd.users),
             print_perms(sd.usr_unreadable,
                         sd.grp_unreadable,
                         sd.grp_unwritable)))
     # File permissions
     print("File permissions:")
     print("- unreadable by owner: %s" % print_yes_no(self.usr_unreadable))
     print("- unreadable by group: %s" % print_yes_no(self.grp_unreadable))
     print("- unwritable by group: %s" % print_yes_no(self.grp_unwritable))
     print("#Temp files: %d" % len(self.list_temp()))
Exemple #5
0
def bcl_to_fastq_info(path=None):
    """
    Retrieve information on the bcl2fastq software

    If called without any arguments this will locate the first
    bcl-to-fastq conversion package executable (either
    'configureBclToFastq.pl' or 'bcl2fastq') that is available on
    the user's PATH (as returned by 'available_bcl2fastq_versions')
    and attempts to guess the package name (either `bcl2fastq` or
    `CASAVA`) and the version that it belongs to.

    Alternatively if the path to an executable is supplied then
    the package name and version will be determined from that
    instead.

    If no package is identified then the script path is still
    returned, but without any version info.

    Arguments:
      path (str): optional path to the executable to examine;
        if not supplied then the first executable returned by
        'available_bcl2fastq_versions' is used

    Returns:
      Tuple: tuple consisting of (PATH,PACKAGE,VERSION) where PATH
        is the full path for the bcl2fastq program or
        configureBclToFastq.pl script and PACKAGE and VERSION are
        guesses for the package/version that it belongs to. If any
        value can't be determined then it will be returned as an
        empty string.

    """
    # Initialise
    bcl2fastq_path = ''
    package_name = ''
    package_version = ''
    # Locate the core script
    if not path:
        exes = available_bcl2fastq_versions()
        if exes:
            bcl2fastq_path = exes[0]
    else:
        bcl2fastq_path = os.path.abspath(path)
    # Identify the version
    exe_name = os.path.basename(bcl2fastq_path)
    if exe_name == 'configureBclToFastq.pl':
        # Found CASAVA or bcl2fastq 1.8.* version
        # The package name and version are taken from the name of
        # a subdirectory of the 'etc' directory which sits alongside
        # the directory holding the script
        bin_dir = os.path.dirname(bcl2fastq_path)
        etc_dir = os.path.join(os.path.dirname(bin_dir), 'etc')
        if os.path.isdir(etc_dir):
            for d in bcf_utils.list_dirs(etc_dir):
                m = re.match(r'^(bcl2fastq|CASAVA)-([0-9.]+)$', d)
                if m:
                    package_name = m.group(1)
                    package_version = m.group(2)
                    break
    elif exe_name == 'bcl2fastq':
        # Found bcl2fastq v2.*
        # Run the program to get the version
        version_cmd = applications.Command(bcl2fastq_path, '--version')
        output = version_cmd.subprocess_check_output()[1]
        for line in output.split('\n'):
            if line.startswith('bcl2fastq'):
                # Extract version from line of the form
                # bcl2fastq v2.17.1.14
                package_name = 'bcl2fastq'
                try:
                    # Second field with the leading 'v' removed
                    package_version = line.split()[1][1:]
                except IndexError as ex:
                    # Line has no second field: keep the package
                    # name but leave the version empty
                    logging.warning("Unable to get version from '%s': %s" %
                                    (line, ex))
    else:
        # No package supplied or located
        logging.warning("Unable to identify bcl-to-fastq conversion package "
                        "from '%s'" % bcl2fastq_path)
    # Return what we found
    return (bcl2fastq_path, package_name, package_version)
                           "directories holding the top-level analysis directories "
                           "corresponding to different runs. The program reports "
                           "total disk usage for projects assigned to each PI across "
                           "all DIRs.")
 p.add_option("--pi",action='store',dest="pi_name",default=None,
              help="List data for PI(s) matching PI_NAME (can use glob-style "
              "patterns)")
 p.add_option("--unassigned",action='store_true',dest="unassigned",default=False,
              help="List data for projects where PI is not assigned")
 opts,args = p.parse_args()
 # Collect data
 audit_data = {}
 unassigned = []
 undetermined = []
 for d in args:
     for dirn in utils.list_dirs(d):
         dirn = os.path.join(d,dirn)
         #print "Examining %s" % dirn
         try:
             run = AnalysisDir(dirn)
             for p in run.get_projects():
                 if p.name == "undetermined":
                     undetermined.append((p,get_size(p.dirn)))
                     continue
                 pi = p.info.PI
                 if pi is None:
                     # PI is not assigned
                     p.info['run'] = os.path.basename(dirn)
                     unassigned.append(p)
                     continue
                 elif opts.pi_name is not None:
Exemple #7
0
    def get_analysis_projects_from_dirs(self, pattern=None, strict=False):
        """
        Discover AnalysisProjects by inspecting subdirectories

        Each subdirectory at the top level of the analysis
        directory is examined in turn; those which load as
        CASAVA/bcl2fastq outputs are discarded, and the rest
        are loaded as AnalysisProject instances and checked.

        In contrast to the `get_analysis_projects` method, the
        project metadata (typically in 'projects.info') is not
        consulted.

        A 'pattern' can be supplied to select a subset of the
        projects by name (see bcf_utils.name_matches).

        Arguments:
          pattern (str): optional pattern to select a subset
            of projects (default: select all projects)
          strict (bool): if True then a discovered project is
            only kept if its 'is_analysis_dir' attribute is
            true; otherwise it is kept if it has at least one
            sample (default: don't apply strict checks)

        Returns:
          List: list of AnalysisProject instances.
        """
        logging.debug("Testing subdirectories to determine analysis projects")
        # No pattern means match everything
        name_pattern = '*' if pattern is None else pattern
        found = []
        for subdir in bcf_utils.list_dirs(self.analysis_dir):
            # Subdirectories that load as bcl2fastq output are
            # not projects
            try:
                IlluminaData.IlluminaData(self.analysis_dir,
                                          unaligned_dir=subdir)
            except IlluminaData.IlluminaDataError:
                # Not bcl2fastq output: carry on
                pass
            except Exception as ex:
                # Unexpected failure: log and carry on
                logging.debug("Exception when attempting to load "
                              "subdir '%s' as CASAVA/bcl2fastq output "
                              "(ignored): %s" % (subdir, ex))
            else:
                logging.debug("* %s: rejected" % subdir)
                continue
            # Load the subdirectory as a candidate project
            candidate = AnalysisProject(
                subdir, os.path.join(self.analysis_dir, subdir))
            if strict:
                # Strict mode: must look like an analysis dir
                if not candidate.is_analysis_dir:
                    logging.debug("* %s: rejected (failed strict checks)" %
                                  subdir)
                    continue
            elif len(candidate.samples) == 0:
                # Basic mode: must have at least one sample
                logging.debug("* %s: rejected (no samples)" % subdir)
                continue
            # Candidate passed the checks
            logging.debug("* %s: analysis directory" % subdir)
            if bcf_utils.name_matches(candidate.name, name_pattern):
                found.append(candidate)
        return found
Exemple #8
0
def archive(ap,archive_dir=None,platform=None,year=None,
            perms=None,group=None,include_bcl2fastq=False,
            read_only_fastqs=True,runner=None,
            final=False,force=False,dry_run=False):
    """
    Copy an analysis directory and contents to an archive area

    Copies the contents of the analysis directory to an archive
    area, which can be on a local or remote system.

    The archive directory is constructed in the form

    <TOP_DIR>/<YEAR>/<PLATFORM>/<DIR>/...

    The YEAR and PLATFORM can be overridden using the appropriate
    arguments.

    By default the data is copied to a 'staging' directory
    called '__ANALYSIS_DIR.pending' in the archive directory.
    The archiving can be finalised by setting the 'final'
    argument to 'True', which performs a last update of the
    staging area before moving the data to its final location.

    Once the archive has been finalised any further archiving
    attempts will be refused.

    Copying of the data is performed using 'rsync'; multiple
    archive operations mirror the contents of the analysis
    directory (so any data removed from the source will also
    be removed from the archive).

    By default the 'bcl2fastq' directory is omitted from the
    archive, unless the fastq files in any projects are links to
    the data. Inclusion of this directory can be forced by
    setting the appropriate argument.

    The fastqs will be switched to be read-only in the archive
    by default.

    Arguments:
      ap (AutoProcessor): autoprocessor pointing to the
        analysis directory to be archived
      archive_dir (str): top level archive directory, of the
        form '[[user@]host:]dir' (if not set then use the value
        from the auto_process.ini file).
      platform (str): set the value of the <PLATFORM> level in
        the archive (if not set then taken from the supplied
        autoprocessor instance).
      year (str): set the value of the <YEAR> level in the
        archive (4 digits) (if not set then it is derived from
        the instrument datestamp stored in the autoprocessor
        metadata)
      perms (str): change the permissions of the destination
        files and directories according to the supplied
        argument (e.g. 'g+w') (if not set then use the value
        from the auto_process.ini file).
      group (str): set the group of the destination files to
        the supplied argument (if not set then use the value
        from the auto_process.ini file).
      include_bcl2fastq (bool): if True then force inclusion
        of the 'bcl2fastq' subdirectory; otherwise only include
        it if fastq files in project subdirectories are symlinks.
      read_only_fastqs (bool): if True then make the fastqs
        read-only in the destination directory; otherwise keep
        the original permissions.
      runner: (optional) specify a non-default job runner to use
        for primary data rsync
      final (bool): if True then finalize the archive by
        moving the '.pending' temporary archive to the final
        location
      force (bool): if True then do archiving even if there are
        errors (e.g. key metadata items not set, permission error
        when setting group etc); otherwise abort archiving
        operation.
      dry_run (bool): report what would be done but don't
        perform any operations.

    Returns:
      UNIX-style integer returncode: 0 = successful termination,
        non-zero indicates an error occurred.

    Raises:
      Exception: if the analysis directory is already a staging
        directory and 'final' is not set; if the archive
        directory or platform can't be determined; if the
        datestamp is invalid; if the final archive already
        exists; if required metadata is missing; if there is
        nothing to archive; or if staging fails and 'force' is
        not set.
      OSError: if the <TOP_DIR>/<YEAR>/<PLATFORM> directory
        doesn't exist.
    """
    # Return value
    retval = 0
    # Check if analysis dir is actually staging directory
    # (i.e. its name has the form '__<NAME>.pending')
    analysis_dir = os.path.basename(ap.analysis_dir)
    is_staging = False
    if analysis_dir.startswith("__") and analysis_dir.endswith(".pending"):
        logger.warning("Operating directly on staged directory")
        if not final:
            raise Exception("Cannot re-stage already staged "
                            "analysis directory")
        else:
            is_staging = True
    # Fetch archive location
    if archive_dir is None:
        archive_dir = ap.settings.archive.dirn
    if archive_dir is None:
        raise Exception("No archive directory specified (use "
                        "--archive_dir option?)")
    # Construct subdirectory structure i.e. platform and year
    if platform is None:
        platform = ap.metadata.platform
    if platform is None:
        raise Exception("No platform specified (use --platform "
                        "option?)")
    if year is None:
        # Derive the year from the instrument datestamp
        datestamp = str(ap.metadata.instrument_datestamp)
        if len(datestamp) == 6:
            # Assume YYMMDD datestamp format
            year = "20%s" % datestamp[0:2]
        elif len(datestamp) == 8:
            # Assume YYYYMMDD datestamp format
            year = datestamp[0:4]
        else:
            raise Exception("Invalid datestamp '%s' (use "
                            "--year option)" % datestamp)
    archive_dir = os.path.join(archive_dir,year,platform)
    if not fileops.exists(archive_dir):
        raise OSError("Archive directory '%s' doesn't exist" %
                      archive_dir)
    # Determine target directory
    if not is_staging:
        final_dest = analysis_dir
        staging = "__%s.pending" % analysis_dir
    else:
        # Strip the leading '__' and trailing '.pending' to
        # recover the name of the final directory
        final_dest = analysis_dir[len("__"):-len(".pending")]
        staging = analysis_dir
    if final:
        dest = final_dest
    else:
        dest = staging
    print("Copying to archive directory: %s" % archive_dir)
    print("Platform   : %s" % platform)
    print("Year       : %s" % year)
    print("Destination: %s %s" % (dest,
                                  "(final)" if final else
                                  "(staging)"))
    # Check if final archive already exists
    if fileops.exists(os.path.join(archive_dir,final_dest)):
        raise Exception("Final archive already exists, stopping")
    # Report available space on target filesystem
    usage = fileops.disk_usage(archive_dir)
    print("Available  : %s/%s (%s%% in use)" %
          (format_file_size(usage.free),
           format_file_size(usage.total),
           usage.percent))
    # Check metadata
    check_metadata = ap.check_metadata(('source','run_number'))
    if not check_metadata:
        # NOTE(review): as written 'force' only bypasses this
        # check when operating directly on a staging directory;
        # the docstring suggests 'force' alone should be
        # sufficient - confirm which is intended
        if not force or not is_staging:
            raise Exception("Some metadata items not set, stopping")
        logger.warning("Some metadata items not set, proceeding")
    # Locate extra bcl2fastq directories
    extra_bcl2fastq_dirs = list()
    for dirn in list_dirs(ap.analysis_dir):
        if dirn.endswith(".bak") or dirn.startswith("save."):
            # Ignore
            continue
        elif dirn == os.path.basename(ap.params.unaligned_dir):
            # Skip the primary bcl2fastq output directory
            continue
        # Try to load data from the directory
        # (a directory counts as bcl2fastq output if it loads
        # without raising an exception; the loaded object itself
        # is not used)
        try:
            illumina_data = IlluminaData(ap.analysis_dir,
                                         unaligned_dir=dirn)
            extra_bcl2fastq_dirs.append(dirn)
        except Exception:
            pass
    # Copying is only done when working from the original
    # analysis directory; when operating directly on a staging
    # directory only the final move below is performed
    if not is_staging:
        # Are there any projects to archive?
        try:
            projects = ap.get_analysis_projects()
        except Exception as ex:
            logging.warning("Error trying to fetch analysis projects: "
                            "%s" % ex)
            projects = []
        if not projects:
            if not force:
                raise Exception("No project directories found, nothing "
                                "to archive")
            # Check if there is a bcl2fastq directory instead
            unaligned_dir = ap.params.unaligned_dir
            if not os.path.isabs(unaligned_dir):
                unaligned_dir = os.path.join(ap.analysis_dir,
                                             unaligned_dir)
            if os.path.exists(unaligned_dir):
                logging.warning("No project directories found, forcing "
                                "archiving of bcl2fastq output directory "
                                "'%s' instead" % ap.params.unaligned_dir)
                include_bcl2fastq = True
            else:
                raise Exception("No project directories or bcl2fastq "
                                "directory output found, nothing to "
                                "archive (even with --force)")
        # Determine which directories to exclude
        excludes = ['--exclude=primary_data',
                    '--exclude=save.*',
                    '--exclude=*.bak',
                    '--exclude=*.tmp',
                    '--exclude=tmp.*',
                    '--exclude=__*',]
        if not include_bcl2fastq:
            # Determine whether bcl2fastq dir should be included implicitly
            # because there are links from the analysis directories
            for project in projects:
                if project.fastqs_are_symlinks:
                    print("Found at least one project with fastq "
                          "symlinks (%s)" % project.name)
                    include_bcl2fastq = True
                    break
        if not include_bcl2fastq:
            print("Excluding '%s' directory from archive" %
                  ap.params.unaligned_dir)
            excludes.append('--exclude=%s' % ap.params.unaligned_dir)
        # Exclude extra bcl2fastq dirs
        for dirn in extra_bcl2fastq_dirs:
            print("Excluding '%s' directory from archive" % dirn)
            excludes.append('--exclude=%s' % dirn)
        # 10xgenomics products to exclude
        excludes.append('--exclude=*.mro')
        excludes.append('--exclude=%s*' %
                        tenx_genomics_utils.flow_cell_id(ap.run_name))
        # Log dir
        log_dir = 'archive%s' % ('_final' if final else '_staging')
        if dry_run:
            log_dir += '_dry_run'
        ap.set_log_dir(ap.get_log_subdir(log_dir))
        # Set up runner
        if runner is None:
            runner = ap.settings.runners.rsync
        runner.set_log_dir(ap.log_dir)
        # Setup a scheduler for multiple rsync jobs
        sched = simple_scheduler.SimpleScheduler(
            runner=runner,
            max_concurrent=ap.settings.general.max_concurrent_jobs,
            poll_interval=ap.settings.general.poll_interval)
        sched.start()
        # Keep track of jobs
        archiving_jobs = []
        # If making fastqs read-only then transfer them separately
        # (this is only done when finalising)
        if read_only_fastqs and final:
            # Make sure excluded directories are excluded
            # (take a copy so that the additions below don't
            # affect the excludes for the main rsync)
            extra_options =  [ex for ex in excludes]
            # Set up to include only the fastq directories in
            # projects
            fastq_dirs = []
            for project in projects:
                for fastq_dir in project.fastq_dirs:
                    fastq_dirs.append(os.path.join(
                        os.path.basename(project.dirn),
                        fastq_dir))
            # Update the extra options with includes/excludes
            extra_options.append('--include=*/')
            for fastq_dir in fastq_dirs:
                extra_options.append('--include=%s/**' % fastq_dir)
            extra_options.append('--exclude=*')
            # Execute the rsync
            # (chmod 'ugo-w' presumably strips write permission
            # for everyone on the copied files - confirm against
            # the rsync wrapper)
            rsync_fastqs = applications.general.rsync(
                "%s/" % ap.analysis_dir,
                os.path.join(archive_dir,staging),
                prune_empty_dirs=False,
                mirror=True,
                dry_run=dry_run,
                chmod='ugo-w',
                extra_options=extra_options)
            print("Running %s" % rsync_fastqs)
            rsync_fastqs_job = sched.submit(rsync_fastqs,
                                            name="rsync.archive_fastqs")
            # Exclude fastqs from main rsync
            for fastq_dir in fastq_dirs:
                excludes.append('--exclude=%s' % fastq_dir)
            # Main rsync must not start until the Fastq rsync
            # has finished
            wait_for = [rsync_fastqs_job.job_name]
            # Add to list of jobs
            archiving_jobs.append(rsync_fastqs_job)
        else:
            # No separate Fastq rsync
            rsync_fastqs_job = None
            wait_for = ()
        # Main rsync command
        # (always targets the staging directory, even when
        # finalising; the move to the final location happens
        # at the end of the function)
        rsync = applications.general.rsync(
            "%s/" % ap.analysis_dir,
            os.path.join(archive_dir,staging),
            prune_empty_dirs=True,
            mirror=True,
            dry_run=dry_run,
            chmod=perms,
            extra_options=excludes)
        print("Running %s" % rsync)
        rsync_job = sched.submit(rsync,name="rsync.archive",
                                 wait_for=wait_for)
        archiving_jobs.append(rsync_job)
        # Wait for jobs to complete
        # (only the main job is waited on explicitly; it was
        # submitted with a dependency on the Fastq job, if any)
        rsync_job.wait()
        # Check exit status on jobs
        for job in archiving_jobs:
            print("%s completed: exit code %s" % (job.name,
                                                  job.exit_code))
        # Overall status is the sum of the job exit codes, so
        # any failure gives a non-zero value
        retval = sum([j.exit_code for j in archiving_jobs])
        if retval != 0:
            logger.warning("One or more archiving jobs failed "
                           "(non-zero exit code returned)")
        else:
            if final:
                # Update the final stored Fastq paths for QC
                staged_analysis_dir = os.path.join(
                    archive_dir,
                    staging)
                archived_analysis_dir = os.path.abspath(
                    os.path.join(
                        archive_dir,
                        final_dest))
                # NOTE(review): this block is not guarded by
                # 'dry_run' - confirm whether the QC info should
                # be updated and saved during a dry run
                for project in AnalysisDir(staged_analysis_dir).get_projects():
                    qc_info = project.qc_info(project.qc_dir)
                    if qc_info.fastq_dir:
                        print("%s: updating stored Fastq directory for QC" %
                              project.name)
                        # Rebase the stored path from the source
                        # analysis directory onto the final
                        # archive location
                        new_fastq_dir = os.path.join(archived_analysis_dir,
                                                     os.path.relpath(
                                                         qc_info.fastq_dir,
                                                         ap.analysis_dir))
                        print("-- updated Fastq directory: %s" % new_fastq_dir)
                        qc_info['fastq_dir'] = new_fastq_dir
                        qc_info.save()
            # Set the group
            # (only attempted if the rsync jobs all succeeded)
            if group is not None:
                print("Setting group of archived files to '%s'" % group)
                if not dry_run:
                    set_group = fileops.set_group_command(
                        group,
                        os.path.join(archive_dir,staging),
                        safe=force,
                        verbose=True)
                    print("Running %s" % set_group)
                    set_group_job = sched.submit(
                        set_group,
                        name="set_group.archive")
                    set_group_job.wait()
                    # Check exit status
                    exit_code = set_group_job.exit_code
                    print("%s completed: exit code %s" % (
                        set_group_job.name,
                        exit_code))
                    if exit_code != 0:
                        logger.warning("Setting group failed (non-zero "
                                       "exit status code returned)")
                    retval = retval + exit_code
        # Finish with scheduler
        sched.wait()
        sched.stop()
        # Bail out if there was a problem
        if retval != 0:
            if not force:
                raise Exception("Staging to archive failed")
            else:
                logger.warning("Staging to archive failed (ignored)")
    # Move to final location
    if final:
        print("Moving to final location: %s" % final_dest)
        if not dry_run:
            fileops.rename(os.path.join(archive_dir,staging),
                           os.path.join(archive_dir,final_dest))
    # Report usage of target filesystem
    usage = fileops.disk_usage(archive_dir)
    print("Usage of archive: %s available (of %s) (%s%% in use)" %
          (format_file_size(usage.free),
           format_file_size(usage.total),
           usage.percent))
    # Finish
    return retval
                           "directories holding the top-level analysis directories "
                           "corresponding to different runs. The program reports "
                           "total disk usage for projects assigned to each PI across "
                           "all DIRs.")
 p.add_option("--pi",action='store',dest="pi_name",default=None,
              help="List data for PI(s) matching PI_NAME (can use glob-style "
              "patterns)")
 p.add_option("--unassigned",action='store_true',dest="unassigned",default=False,
              help="List data for projects where PI is not assigned")
 opts,args = p.parse_args()
 # Collect data
 audit_data = {}
 unassigned = []
 undetermined = []
 for d in args:
     for dirn in utils.list_dirs(d):
         dirn = os.path.join(d,dirn)
         #print "Examining %s" % dirn
         try:
             run = AnalysisDir(dirn)
             for p in run.get_projects():
                 if p.name == "undetermined":
                     undetermined.append((p,get_size(p.dirn)))
                     continue
                 pi = p.info.PI
                 if pi is None:
                     # PI is not assigned
                     p.info['run'] = os.path.basename(dirn)
                     unassigned.append(p)
                     continue
                 elif opts.pi_name is not None:
def bcl_to_fastq_info(path=None):
    """
    Retrieve information on the bcl2fastq software

    If called without any arguments this will locate the first
    bcl-to-fastq conversion package executable (either
    'configureBclToFastq.pl' or 'bcl2fastq') that is available on
    the user's PATH (as returned by 'available_bcl2fastq_versions')
    and attempts to guess the package name (either `bcl2fastq` or
    `CASAVA`) and the version that it belongs to.

    Alternatively if the path to an executable is supplied then
    the package name and version will be determined from that
    instead.

    If no package is identified then the script path is still
    returned, but without any version info.

    Arguments:
      path (str): optional path to the executable to examine;
        if not supplied then the first executable returned by
        'available_bcl2fastq_versions' is used

    Returns:
      Tuple: tuple consisting of (PATH,PACKAGE,VERSION) where PATH
        is the full path for the bcl2fastq program or
        configureBclToFastq.pl script and PACKAGE and VERSION are
        guesses for the package/version that it belongs to. If any
        value can't be determined then it will be returned as an
        empty string.

    """
    # Initialise
    bcl2fastq_path = ''
    package_name = ''
    package_version = ''
    # Locate the core script
    if not path:
        exes = available_bcl2fastq_versions()
        if exes:
            bcl2fastq_path = exes[0]
    else:
        bcl2fastq_path = os.path.abspath(path)
    # Identify the version
    exe_name = os.path.basename(bcl2fastq_path)
    if exe_name == 'configureBclToFastq.pl':
        # Found CASAVA or bcl2fastq 1.8.* version
        # The package name and version are taken from the name of
        # a subdirectory of the 'etc' directory which sits alongside
        # the directory holding the script
        bin_dir = os.path.dirname(bcl2fastq_path)
        etc_dir = os.path.join(os.path.dirname(bin_dir),'etc')
        if os.path.isdir(etc_dir):
            for d in bcf_utils.list_dirs(etc_dir):
                m = re.match(r'^(bcl2fastq|CASAVA)-([0-9.]+)$',d)
                if m:
                    package_name = m.group(1)
                    package_version = m.group(2)
                    break
    elif exe_name == 'bcl2fastq':
        # Found bcl2fastq v2.*
        # Run the program to get the version
        version_cmd = applications.Command(bcl2fastq_path,'--version')
        output = version_cmd.subprocess_check_output()[1]
        for line in output.split('\n'):
            if line.startswith('bcl2fastq'):
                # Extract version from line of the form
                # bcl2fastq v2.17.1.14
                package_name = 'bcl2fastq'
                try:
                    # Second field with the leading 'v' removed
                    package_version = line.split()[1][1:]
                except IndexError as ex:
                    # Line has no second field: keep the package
                    # name but leave the version empty
                    logging.warning("Unable to get version from '%s': %s" %
                                    (line,ex))
    else:
        # No package supplied or located
        logging.warning("Unable to identify bcl-to-fastq conversion package "
                        "from '%s'" % bcl2fastq_path)
    # Return what we found
    return (bcl2fastq_path,package_name,package_version)
def merge_fastq_dirs(ap,
                     primary_unaligned_dir,
                     output_dir=None,
                     dry_run=False):
    """
    Combine multiple 'unaligned' output directories into one

    This method combines the output from multiple runs of
    CASAVA/bcl2fastq into a single 'unaligned'-equivalent
    directory.

    Currently it operates in an automatic mode and should
    detect additional 'unaligned' dirs on its own.

    The merged data are assembled in a temporary
    '<output_dir>.new' directory; once all copy/merge jobs
    have finished the original directories are renamed to
    'save.<name>' and the temporary directory is renamed to
    'output_dir'.

    Arguments:
      ap (AutoProcessor): autoprocessor pointing to the parent
        analysis directory
      primary_unaligned_dir (str): the 'unaligned' dir that
        data from all others will be put into (relative
        path), unless overridden by 'output_dir' argument
      output_dir (str): optional, new 'unaligned' dir that
        will be created to hold merged data (relative path,
        defaults to 'primary_unaligned_dir')
      dry_run (boolean): if True then just report operations
        that would have been performed.

    Returns:
      Integer: zero on success (or if there were no extra
        directories to merge); otherwise the sum of the exit
        statuses of the copy/merge jobs, if that is non-zero.

    Raises:
      Exception: if the primary directory is not defined or
        can't be loaded, if an extra directory has a different
        format or lane-splitting mode to the primary, or if
        the same project (or, when lanes are split, the same
        undetermined sample) appears in more than one of the
        extra directories.
    """
    if primary_unaligned_dir is None:
        raise Exception("Primary unaligned dir not defined")
    # Output directory
    if output_dir is None:
        output_dir = primary_unaligned_dir
    print("Fastqs will be merged into '%s'" % output_dir)
    # Collect unaligned dirs
    print("Collecting bcl2fastq directories")
    primary_illumina_data = None
    unaligned_dirs = {}
    for dirn in list_dirs(ap.analysis_dir):
        try:
            illumina_data = IlluminaData.IlluminaData(ap.analysis_dir,
                                                      unaligned_dir=dirn)
        except Exception as ex:
            # Deliberately broad: any subdirectory that can't be
            # loaded as bcl2fastq output is simply not a candidate
            logger.debug("Rejecting %s: %s" % (dirn, ex))
        else:
            if dirn == primary_unaligned_dir:
                print("* %s (primary dir)" % dirn)
                primary_illumina_data = illumina_data
            elif dirn.endswith(".bak") or dirn.startswith("save."):
                print("Ignoring %s" % dirn)
            else:
                print("* %s" % dirn)
                unaligned_dirs[dirn] = illumina_data
    # Check primary unaligned dir
    if primary_illumina_data is None:
        raise Exception("Primary dir '%s' doesn't exist, or doesn't "
                        "contain data?" % primary_unaligned_dir)
    # Is there anything to do?
    if not unaligned_dirs:
        print("No extra bcl2fastq output directories found, nothing to do")
        return 0
    # Make log directory and set up scheduler (if not dry run)
    if not dry_run:
        ap.set_log_dir(ap.get_log_subdir('merge_fastq_dirs'))
        runner = ap.settings.general.default_runner
        runner.set_log_dir(ap.log_dir)
        sched = SimpleScheduler(
            runner=runner,
            max_concurrent=ap.settings.general.max_concurrent_jobs,
            poll_interval=ap.settings.general.poll_interval)
        sched.start()
        jobs = []
    # Top-level for undetermined reads
    if primary_illumina_data.undetermined.dirn != \
       primary_illumina_data.unaligned_dir:
        undetermined_dir = os.path.basename(
            primary_illumina_data.undetermined.dirn)
    else:
        undetermined_dir = None
    # Do sanity checks before proceeding
    print("Checking primary data directory")
    fmt = primary_illumina_data.format
    paired_end = primary_illumina_data.paired_end
    no_lane_splitting = (len(primary_illumina_data.lanes) == 1) \
                        and (primary_illumina_data.lanes[0] is None)
    print("* Format: %s" % fmt)
    print("* no-lane-splitting: %s" % ('yes' if no_lane_splitting else 'no'))
    print("* paired-end: %s" % ('yes' if paired_end else 'no'))
    print("* undetermined dir: %s" % undetermined_dir)
    consistent_data = True
    for unaligned_dir, illumina_data in unaligned_dirs.items():
        fmt0 = illumina_data.format
        # Evaluate lane splitting for *this* directory (not the
        # primary one) so that mismatches are actually detected
        no_lane_splitting0 = (len(illumina_data.lanes) == 1) \
                             and (illumina_data.lanes[0] is None)
        if (fmt0 != fmt) or (no_lane_splitting0 != no_lane_splitting):
            print("!!! %s: inconsistent format to primary data dir !!!" %
                  unaligned_dir)
            consistent_data = False
    if not consistent_data:
        raise Exception("Data directories not consistent with primary "
                        "dir '%s'" % primary_unaligned_dir)
    # Collect the projects from the extra directories
    projects = []
    undetermined = []
    for unaligned_dir, illumina_data in unaligned_dirs.items():
        print("Examining projects in %s:" % unaligned_dir)
        for project in illumina_data.projects:
            if any(p.name == project.name for p in projects):
                raise Exception("collision: %s already exists" % project.name)
            print("- %s: will be merged in" % project.name)
            projects.append(project)
        # Deal with undetermined reads
        if illumina_data.undetermined is not None:
            print("Examining undetermined samples:")
            if no_lane_splitting:
                # No lane info: should merge undetermined fastqs
                for sample in illumina_data.undetermined.samples:
                    print("- %s: reads will be concatenated" % sample.name)
                    undetermined.append(sample)
            else:
                for sample in illumina_data.undetermined.samples:
                    if any(s.name == sample.name for s in undetermined):
                        raise Exception("collision: %s already exists" %
                                        sample.name)
                    print("- %s: will be merged in" % sample.name)
                    undetermined.append(sample)
        else:
            print("No undetermined samples")
    # Collect any remaining projects from the primary
    # unaligned directory (projects from the extra directories
    # take precedence over same-named ones in the primary)
    print("Examining projects in primary dir %s:" % primary_unaligned_dir)
    for project in primary_illumina_data.projects:
        if any(p.name == project.name for p in projects):
            print("- %s: already exists, will be discarded" % project.name)
        else:
            print("- %s: will be merged in" % project.name)
            projects.append(project)
    # Sort out the undetermined reads
    # (primary samples are inserted at the front of the list)
    print("Examining undetermined samples:")
    if no_lane_splitting:
        # No lane info: should merge undetermined fastqs
        for sample in primary_illumina_data.undetermined.samples:
            print("- %s: reads will be concatenated" % sample.name)
            undetermined.insert(0, sample)
    else:
        for sample in primary_illumina_data.undetermined.samples:
            if any(s.name == sample.name for s in undetermined):
                print("- %s: already exists, will be discarded" % sample.name)
            else:
                print("- %s: will be merged in" % sample.name)
                undetermined.insert(0, sample)
    # Make a new directory for the merging
    merge_dir = os.path.join(ap.analysis_dir, output_dir + ".new")
    if undetermined_dir is not None:
        merge_undetermined_dir = os.path.join(merge_dir, undetermined_dir)
    else:
        merge_undetermined_dir = merge_dir
    if not dry_run:
        print("Making temporary merge directory %s" % merge_dir)
        mkdir(merge_dir)
        if not os.path.exists(merge_undetermined_dir):
            print("Making directory for undetermined %s" %
                  merge_undetermined_dir)
            mkdir(merge_undetermined_dir)
    # Copy the projects
    print("Importing projects:")
    for project in projects:
        print("- %s" % project.name)
        project_dir = os.path.join(merge_dir, os.path.basename(project.dirn))
        cmd = copytree_command(project.dirn, project_dir)
        print("- Running %s" % cmd)
        if not dry_run:
            job = sched.submit(cmd,
                               name="copy_project.%s" % project.name,
                               wd=merge_dir)
            print("Job: %s" % job)
            jobs.append(job)
    # Handle the undetermined reads
    print("Dealing with undetermined reads:")
    if no_lane_splitting:
        # No lane info: merge undetermined fastqs
        if len(undetermined) == 1:
            # Only one undetermined sample - copy Fastqs
            # (take the sample explicitly from the list rather than
            # relying on a loop variable left over from above)
            sample = undetermined[0]
            for read in (1, 2):
                if read == 2 and not paired_end:
                    break
                fastqs = sample.fastq_subset(read_number=read, full_path=True)
                for fq in fastqs:
                    cmd = copy_command(fq, merge_undetermined_dir)
                    print("- Running %s" % cmd)
                    if not dry_run:
                        job = sched.submit(cmd,
                                           name="copy_undetermined.R%s" % read,
                                           wd=merge_dir)
                        print("Job: %s" % job)
                        jobs.append(job)
        else:
            # Multiple undetermined samples - concat Fastqs
            for read in (1, 2):
                if read == 2 and not paired_end:
                    break
                cmd = Command('concat_fastqs.py')
                for sample in undetermined:
                    fastqs = sample.fastq_subset(read_number=read,
                                                 full_path=True)
                    cmd.add_args(*fastqs)
                cmd.add_args(
                    os.path.join(merge_undetermined_dir,
                                 "Undetermined_S0_R%s_001.fastq.gz" % read))
                print("- Running %s" % cmd)
                if not dry_run:
                    job = sched.submit(cmd,
                                       name="merge_undetermined.R%s" % read,
                                       wd=merge_dir)
                    print("Job: %s" % job)
                    jobs.append(job)
    else:
        for sample in undetermined:
            print("- %s" % sample.name)
            if fmt == "bcl2fastq2":
                # Hardlink copy fastqs directly
                sample_dir = merge_undetermined_dir
                if not dry_run:
                    for fq in sample.fastq:
                        src_fq = os.path.join(sample.dirn, fq)
                        dst_fq = os.path.join(sample_dir, fq)
                        os.link(src_fq, dst_fq)
            else:
                # Just copy directory tree wholesale
                sample_dir = os.path.join(merge_undetermined_dir,
                                          os.path.basename(sample.dirn))
                cmd = copytree_command(sample.dirn, sample_dir)
                print("- Running %s" % cmd)
                if not dry_run:
                    job = sched.submit(cmd,
                                       name="copy_sample_dir.%s" % sample.name,
                                       wd=merge_dir)
                    print("Job: %s" % job.name)
                    jobs.append(job)
    # Make expected subdirs for bcl2fastq2
    if not dry_run and fmt == "bcl2fastq2":
        for dirn in ('Reports', 'Stats'):
            mkdir(os.path.join(merge_dir, dirn))
            # Add a hidden placeholder to preserve these directories
            # on rsync -m (prune empty dirs)
            with open(os.path.join(merge_dir, dirn, '.placeholder'),
                      'w') as fp:
                fp.write("")
    # Wait for scheduler jobs to complete
    if not dry_run:
        sched.wait()
        sched.stop()
        # Check job exit status
        exit_status = 0
        for j in jobs:
            exit_status += j.exit_status
            if j.exit_status != 0:
                logger.warning("Job failed: %s" % j)
        if exit_status:
            # Bail out before touching the original directories
            logger.critical("One or more jobs failed (non-zero "
                            "exit status)")
            return exit_status
    # Move all the 'old' directories out of the way
    all_unaligned = list(unaligned_dirs)
    all_unaligned.append(primary_unaligned_dir)
    for unaligned_dir in all_unaligned:
        unaligned_backup = os.path.join(ap.analysis_dir,
                                        "save.%s" % unaligned_dir)
        print("Moving %s to %s" % (unaligned_dir, unaligned_backup))
        if not dry_run:
            shutil.move(os.path.join(ap.analysis_dir, unaligned_dir),
                        unaligned_backup)
    # Rename the merged directory
    print("Renaming %s to %s" % (merge_dir, output_dir))
    if not dry_run:
        shutil.move(merge_dir, os.path.join(ap.analysis_dir, output_dir))
    # Reset the bcl2fastq dir
    if not dry_run:
        ap.params['unaligned_dir'] = output_dir
    # Make a new 'projects.info' metadata file
    project_metadata_file = os.path.join(ap.analysis_dir, 'projects.info')
    if os.path.exists(project_metadata_file):
        print("Moving existing projects.info file out of the way")
        if not dry_run:
            os.rename(project_metadata_file,
                      os.path.join(ap.analysis_dir, 'save.projects.info'))
    print("Creating new projects.info file")
    if not dry_run:
        ap.make_project_metadata_file()
    return 0
Exemple #12
0
    def populate(self, fastq_dir=None):
        """Populate data structure from directory contents

        Loads the info file (if present), works out which
        subdirectories hold fastq files, selects the fastq
        directory to use, and builds the sorted list of samples
        from the fastqs found there.

        Arguments:
          fastq_dir: optional, the fastq directory to use; if
            not supplied then the primary fastq directory
            recorded in the project info is used

        """
        if not os.path.exists(self.dirn):
            # Directory isn't there, so there is nothing to load
            return
        # Pull in stored project info when the file is available
        if os.path.isfile(self.info_file):
            self.info.load(self.info_file)
        # Any subdirectory holding fastqs is a candidate fastq dir
        candidate_dirs = [
            subdir for subdir in bcf_utils.list_dirs(self.dirn)
            if self.find_fastqs(os.path.join(self.dirn, subdir))
        ]
        # The top-level directory can be a candidate too
        if self.find_fastqs(self.dirn):
            candidate_dirs.append('.')
        self.fastq_dirs = candidate_dirs
        logger.debug("Possible fastq dirs: %s" % ','.join(self.fastq_dirs))
        if not self.fastq_dirs:
            logger.debug("No fastq dirs located for %s" % self.dirn)
            return
        # Record a primary fastq dir if the info doesn't have one
        if self.info.primary_fastq_dir is None:
            self.info['primary_fastq_dir'] = \
                'fastqs' if 'fastqs' in self.fastq_dirs \
                else self.fastq_dirs[0]
        if fastq_dir is None:
            fastq_dir = self.info.primary_fastq_dir
        else:
            # Express the requested dir relative to the project
            # directory before checking it against the candidates
            project_prefix = "%s%s" % (self.dirn, os.sep)
            if fastq_dir.startswith(project_prefix):
                requested_dir = os.path.relpath(fastq_dir, self.dirn)
            else:
                requested_dir = fastq_dir
            if requested_dir not in self.fastq_dirs:
                logger.warning("Requested fastqs dir '%s' not in list "
                               "of possible dirs %s" %
                               (fastq_dir, ', '.join(self.fastq_dirs)))
        self.fastq_dir = os.path.normpath(os.path.join(self.dirn, fastq_dir))
        # Gather the fastqs from the selected directory
        fastqs = self.find_fastqs(self.fastq_dir)
        if fastqs:
            self.fastq_format = self.determine_fastq_format(fastqs[0])
        logger.debug("Assigning fastqs to samples...")
        self.samples = []
        for fastq in fastqs:
            sample_name = self.fastq_attrs(fastq).sample_name
            try:
                sample = self.get_sample(sample_name)
            except KeyError:
                # First fastq seen for this sample
                sample = AnalysisSample(sample_name,
                                        fastq_attrs=self.fastq_attrs)
                self.samples.append(sample)
            fastq_path = os.path.normpath(os.path.join(self.fastq_dir, fastq))
            sample.add_fastq(fastq_path)
        # Order the samples by name
        self.samples = sorted(self.samples,
                              key=lambda smpl: split_sample_name(smpl.name))
        logger.debug("Listing samples and files:")
        for sample in self.samples:
            logger.debug("* %s: %s" % (sample.name, sample.fastq))
        # Project is paired end only if every sample is; stop at
        # the first sample which isn't
        paired_end = True
        for sample in self.samples:
            paired_end = sample.paired_end
            if not paired_end:
                break
        self.info['paired_end'] = paired_end
        # Fall back to the default QC output dir if none is set
        if self.qc_dir is None:
            self.use_qc_dir('qc')
Exemple #13
0
    def __init__(self, analysis_dir):
        """Create a new AnalysisDir instance for a specified directory

        Loads the directory metadata (falling back to the
        'auto_process.info' parameter file if 'metadata.info'
        can't be loaded), the projects metadata, and then
        classifies each subdirectory as bcl2fastq output, an
        analysis project, the 'undetermined' project, or
        unidentified ('extra') content.

        Arguments:
          analysis_dir: name (and path) to analysis directory

        Raises:
          Exception: if the metadata file can't be loaded and
            the fallback to the parameter file also fails (the
            exception from the fallback is re-raised)

        """
        # Store location
        self._analysis_dir = os.path.abspath(analysis_dir)
        # Take the name from the normalised path so that a trailing
        # separator (e.g. 'run_analysis/') doesn't give an empty name
        self._name = os.path.basename(self._analysis_dir)
        self._bcl2fastq_dirs = []
        self._project_dirs = []
        self._extra_dirs = []
        self.sequencing_data = []
        self.projects = []
        self.undetermined = None
        # Metadata
        self.metadata = AnalysisDirMetadata()
        # Set the file name before the 'try' so it's always
        # available to the warning in the handler
        metadata_file = os.path.join(self._analysis_dir, "metadata.info")
        try:
            self.metadata.load(metadata_file)
        except Exception as ex:
            logger.warning("Failed to load metadata file %s: %s" %
                           (metadata_file, ex))
            logger.warning("Attempting to load parameter file")
            try:
                params = AnalysisDirParameters()
                parameter_file = os.path.join(self._analysis_dir,
                                              "auto_process.info")
                params.load(parameter_file, strict=False)
                # Attempt to acquire values from parameters
                for param in ('platform', 'run_number', 'source', 'assay'):
                    if param not in params:
                        print("-- %s: missing" % param)
                        continue
                    print("-- %s: setting to '%s'" % (param, params[param]))
                    self.metadata[param] = params[param]
            except Exception as params_ex:
                # No parameter file either
                logger.warning("Failed to load parameters: %s" % params_ex)
                logger.warning("Perhaps this is not an auto_process project?")
                # Bare raise keeps the original traceback
                raise
        # Projects metadata
        try:
            self.projects_metadata = ProjectMetadataFile(
                os.path.join(self._analysis_dir, "projects.info"))
        except Exception as ex:
            logger.warning("Failed to load projects metadata: %s" % ex)
            self.projects_metadata = None
        # Run name
        try:
            self.run_name = self.metadata.run
        except AttributeError:
            # Derive the run name from the directory name, only
            # removing the '_analysis' suffix if it's really there
            run_name = self._analysis_dir
            if run_name.endswith('_analysis'):
                run_name = run_name[:-len('_analysis')]
            self.run_name = run_name
        self.run_name = os.path.basename(self.run_name)
        self.date_stamp,\
            self.instrument_name,\
            self.instrument_run_number = IlluminaData.split_run_name(
                self.run_name)
        # Look for outputs from bclToFastq and analysis projects
        logger.debug("Examining subdirectories of %s" % self._analysis_dir)
        for dirn in bcf_utils.list_dirs(self._analysis_dir):
            # Look for sequencing data
            try:
                data = IlluminaData.IlluminaData(self._analysis_dir,
                                                 unaligned_dir=dirn)
                logger.debug("- %s: sequencing data" % dirn)
                self._bcl2fastq_dirs.append(dirn)
                self.sequencing_data.append(data)
                continue
            except IlluminaData.IlluminaDataError:
                # Not bcl2fastq output: fall through and try it
                # as an analysis project instead
                pass
            except Exception as ex:
                logger.warning("Exception when attempting to load "
                               "subdir '%s' as CASAVA/bcl2fastq output "
                               "(ignored): %s" % (dirn, ex))
            # Look for analysis data
            data = AnalysisProject(dirn, os.path.join(self._analysis_dir,
                                                      dirn))
            if data.is_analysis_dir:
                if dirn == 'undetermined':
                    logger.debug("- %s: undetermined indexes" % dirn)
                    self.undetermined = data
                else:
                    # Check against projects.info, if possible
                    try:
                        if not self.projects_metadata.lookup('Project', dirn):
                            logger.debug("- %s: not in projects.info" % dirn)
                            self._extra_dirs.append(dirn)
                            continue
                    except AttributeError:
                        # projects_metadata is None when projects.info
                        # couldn't be loaded, so the check is skipped
                        pass
                    logger.debug("- %s: project directory" % dirn)
                    self._project_dirs.append(dirn)
                    self.projects.append(data)
                continue
            else:
                # Unidentified contents
                self._extra_dirs.append(dirn)
                logger.debug("- %s: unknown" % dirn)