def info(self): """ Report information about the directory """ # Report information print "Dir : %s" % self._dirn print "Size : %s (%s)" % (utils.format_file_size( self.size), utils.format_file_size(self.size, 'K')) print "Has cache: %s" % print_yes_no(self.has_cache) print "#files: %d" % len(self) print "File types: %s" % print_list(self.extensions) print "Compression types: %s" % print_list(self.compression) print "Users : %s" % print_list(self.users) print "Groups: %s" % print_list(self.groups) print "Oldest: %s %s" % (self.oldest.datetime.ctime(), self.oldest.relpath(self._dirn)) print "Newest: %s %s" % (self.newest.datetime.ctime(), self.newest.relpath(self._dirn)) # Top-level subdirectories print "Top-level subdirectories:" print "# Dir\tFiles\tSize\tFile types\tUsers\tPerms" for subdir in utils.list_dirs(self._dirn): sd = DataDir(os.path.join(self._dirn, subdir), files=self.files(subdir=subdir)) print "- %s/\t%d\t%s\t%s\t%s\t%s" % ( subdir, len(sd), utils.format_file_size( sd.size), print_list(sd.extensions), print_list(sd.users), print_perms(sd.usr_unreadable, sd.grp_unreadable, sd.grp_unwritable)) # File permissions print "File permissions:" print "- unreadable by owner: %s" % print_yes_no(self.usr_unreadable) print "- unreadable by group: %s" % print_yes_no(self.grp_unreadable) print "- unwritable by group: %s" % print_yes_no(self.grp_unwritable) print "#Temp files: %d" % len(self.list_temp())
def qc_dirs(self):
    """
    List QC output directories
    """
    qc_dirs = []
    for d in bcf_utils.list_dirs(self.dirn):
        if d.startswith("qc"):
            qc_dirs.append(d)
    return qc_dirs
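# For orientation, a small self-contained sketch of what qc_dirs() picks
# out: only subdirectory names beginning with "qc" are returned. The
# project layout below is made up purely for illustration.
import os
import tempfile

project = tempfile.mkdtemp()
for d in ("fastqs", "logs", "qc", "qc.cellranger"):
    os.mkdir(os.path.join(project, d))

# Same selection rule as qc_dirs(): keep subdirs whose names start with "qc"
qc = sorted(d for d in os.listdir(project)
            if os.path.isdir(os.path.join(project, d))
            and d.startswith("qc"))
print(qc)   # ['qc', 'qc.cellranger']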
def get_numbered_subdir(name,parent_dir=None,full_path=False):
    """
    Return a name for a new numbered log subdirectory

    Generates the name for a numbered subdirectory.

    Subdirectories are named as NNN_<name> e.g.
    001_setup, 002_make_fastqs etc.

    'Gaps' are ignored, so the number associated with
    the new name will be one plus the highest index that
    already exists.

    **Note that a directory is not created** - this must
    be done by the calling subprogram. As a result there
    is the possibility of a race condition.

    Arguments:
      name (str): name for the subdirectory
        (typically the name of the processing stage that
        will produce logs to be written to the subdir)
      parent_dir (str): path to the parent directory
        where the indexed directory would be created;
        defaults to CWD if not set
      full_path (bool): if True then return the full path
        for the new subdirectory; default is to return the
        name relative to the parent directory

    Returns:
      String: name for the new log subdirectory (will be
        the full path if 'full_path' was specified).
    """
    # Sort out parent directory
    if parent_dir is None:
        parent_dir = os.getcwd()
    parent_dir = os.path.abspath(parent_dir)
    # Get the highest number from the names of
    # any other existing numbered subdirs
    i = 0
    for d in bcf_utils.list_dirs(parent_dir):
        try:
            i = max(i,int(d.split('_')[0]))
        except ValueError:
            pass
    # Generate and return name/path
    subdir = "%03d_%s" % (i+1,str(name))
    if full_path:
        subdir = os.path.join(parent_dir,subdir)
    return subdir
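# The naming rule is easiest to see with an example. This sketch
# reproduces the numbering logic on a throwaway directory (the stage
# names are hypothetical); note how the gap at 002 is ignored and the
# next index is one plus the highest existing one.
import os
import tempfile

parent = tempfile.mkdtemp()
for d in ("001_setup", "003_make_fastqs"):
    os.mkdir(os.path.join(parent, d))

# Same logic as get_numbered_subdir(): find the highest existing index...
i = 0
for d in os.listdir(parent):
    try:
        i = max(i, int(d.split('_')[0]))
    except ValueError:
        pass
# ...and name the new subdirectory with one plus that index
print("%03d_%s" % (i + 1, "run_qc"))   # 004_run_qc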
def info(self): """ Report information about the directory """ # Report information print "Dir : %s" % self._dirn print "Size : %s (%s)" % (utils.format_file_size(self.size), utils.format_file_size(self.size,'K')) print "Has cache: %s" % print_yes_no(self.has_cache) print "#files: %d" % len(self) print "File types: %s" % print_list(self.extensions) print "Compression types: %s" % print_list(self.compression) print "Users : %s" % print_list(self.users) print "Groups: %s" % print_list(self.groups) print "Oldest: %s %s" % (self.oldest.datetime.ctime(),self.oldest.relpath(self._dirn)) print "Newest: %s %s" % (self.newest.datetime.ctime(),self.newest.relpath(self._dirn)) # Top-level subdirectories print "Top-level subdirectories:" print "# Dir\tFiles\tSize\tFile types\tUsers\tPerms" for subdir in utils.list_dirs(self._dirn): sd = DataDir(os.path.join(self._dirn,subdir), files=self.files(subdir=subdir)) print "- %s/\t%d\t%s\t%s\t%s\t%s" % (subdir, len(sd), utils.format_file_size(sd.size), print_list(sd.extensions), print_list(sd.users), print_perms(sd.usr_unreadable, sd.grp_unreadable, sd.grp_unwritable)) # File permissions print "File permissions:" print "- unreadable by owner: %s" % print_yes_no(self.usr_unreadable) print "- unreadable by group: %s" % print_yes_no(self.grp_unreadable) print "- unwritable by group: %s" % print_yes_no(self.grp_unwritable) print "#Temp files: %d" % len(self.list_temp())
def bcl_to_fastq_info(path=None):
    """
    Retrieve information on the bcl2fastq software

    If called without any arguments this will locate the first
    bcl-to-fastq conversion package executable (either
    'configureBclToFastq.pl' or 'bcl2fastq') that is available on
    the user's PATH (as returned by 'available_bcl2fastq_versions')
    and attempts to guess the package name (either `bcl2fastq` or
    `CASAVA`) and the version that it belongs to.

    Alternatively if the path to an executable is supplied then
    the package name and version will be determined from that
    instead.

    If no package is identified then the script path is still
    returned, but without any version info.

    Returns:
      Tuple: tuple consisting of (PATH,PACKAGE,VERSION) where PATH
        is the full path for the bcl2fastq program or
        configureBclToFastq.pl script and PACKAGE and VERSION are
        guesses for the package/version that it belongs to. If any
        value can't be determined then it will be returned as an
        empty string.
    """
    # Initialise
    bcl2fastq_path = ''
    package_name = ''
    package_version = ''
    # Locate the core script
    if not path:
        exes = available_bcl2fastq_versions()
        if exes:
            bcl2fastq_path = exes[0]
    else:
        bcl2fastq_path = os.path.abspath(path)
    # Identify the version
    if os.path.basename(bcl2fastq_path) == 'configureBclToFastq.pl':
        # Found CASAVA or bcl2fastq 1.8.* version
        # Look for the top-level directory
        path = os.path.dirname(bcl2fastq_path)
        # Look for etc directory
        etc_dir = os.path.join(os.path.dirname(path), 'etc')
        if os.path.isdir(etc_dir):
            for d in bcf_utils.list_dirs(etc_dir):
                m = re.match(r'^(bcl2fastq|CASAVA)-([0-9.]+)$', d)
                if m:
                    package_name = m.group(1)
                    package_version = m.group(2)
                    break
    elif os.path.basename(bcl2fastq_path) == 'bcl2fastq':
        # Found bcl2fastq v2.*
        # Run the program to get the version
        version_cmd = applications.Command(bcl2fastq_path, '--version')
        output = version_cmd.subprocess_check_output()[1]
        for line in output.split('\n'):
            if line.startswith('bcl2fastq'):
                # Extract version from line of the form
                # bcl2fastq v2.17.1.14
                package_name = 'bcl2fastq'
                try:
                    package_version = line.split()[1][1:]
                except Exception as ex:
                    logging.warning("Unable to get version from '%s': %s" %
                                    (line, ex))
    else:
        # No package supplied or located
        logging.warning("Unable to identify bcl-to-fastq conversion package "
                        "from '%s'" % bcl2fastq_path)
    # Return what we found
    return (bcl2fastq_path, package_name, package_version)
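# As a concrete illustration of the version parsing for the bcl2fastq
# v2.* branch, this stand-alone sketch applies the same string handling
# to a typical '--version' banner line (the line here is made up, not
# captured from a real run).
line = "bcl2fastq v2.17.1.14"

if line.startswith("bcl2fastq"):
    package_name = "bcl2fastq"
    # Second token is the version with a leading 'v' that gets stripped
    package_version = line.split()[1][1:]

print((package_name, package_version))   # ('bcl2fastq', '2.17.1.14')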
"directories holding the top-level analysis directories " "corresponding to different runs. The program reports " "total disk usage for projects assigned to each PI across " "all DIRs.") p.add_option("--pi",action='store',dest="pi_name",default=None, help="List data for PI(s) matching PI_NAME (can use glob-style " "patterns)") p.add_option("--unassigned",action='store_true',dest="unassigned",default=False, help="List data for projects where PI is not assigned") opts,args = p.parse_args() # Collect data audit_data = {} unassigned = [] undetermined = [] for d in args: for dirn in utils.list_dirs(d): dirn = os.path.join(d,dirn) #print "Examining %s" % dirn try: run = AnalysisDir(dirn) for p in run.get_projects(): if p.name == "undetermined": undetermined.append((p,get_size(p.dirn))) continue pi = p.info.PI if pi is None: # PI is not assigned p.info['run'] = os.path.basename(dirn) unassigned.append(p) continue elif opts.pi_name is not None:
def get_analysis_projects_from_dirs(self, pattern=None, strict=False):
    """
    Return a list of AnalysisProjects in the analysis directory

    Tests each of the subdirectories in the top-level of the
    analysis directory and rejects any that appear to be
    CASAVA/bcl2fastq outputs or which don't successfully load as
    AnalysisProject instances.

    Unlike the `get_analysis_projects` method, no checking against
    the project metadata (typically in 'projects.info') is
    performed.

    If the 'pattern' is not None then it should be a simple pattern
    used to match against available names to select a subset of
    projects (see bcf_utils.name_matches).

    Arguments:
      pattern (str): optional pattern to select a subset
        of projects (default: select all projects)
      strict (bool): if True then apply strict checks on
        each discovered project directory before adding it
        to the list (default: don't apply strict checks)

    Returns:
      List: list of AnalysisProject instances.
    """
    logging.debug("Testing subdirectories to determine analysis projects")
    projects = []
    if pattern is None:
        pattern = '*'
    # Try loading each subdirectory as a project
    for dirn in bcf_utils.list_dirs(self.analysis_dir):
        # Test for bcl2fastq output
        try:
            IlluminaData.IlluminaData(self.analysis_dir,
                                      unaligned_dir=dirn)
            logging.debug("* %s: rejected" % dirn)
            continue
        except IlluminaData.IlluminaDataError:
            pass
        except Exception as ex:
            logging.debug("Exception when attempting to load "
                          "subdir '%s' as CASAVA/bcl2fastq output "
                          "(ignored): %s" % (dirn, ex))
        # Try loading as a project
        test_project = AnalysisProject(
            dirn, os.path.join(self.analysis_dir, dirn))
        if strict:
            # Apply strict checks
            if not test_project.is_analysis_dir:
                logging.debug("* %s: rejected (failed strict checks)" % dirn)
                continue
        else:
            # Basic check: are there any samples?
            if not len(test_project.samples):
                logging.debug("* %s: rejected (no samples)" % dirn)
                continue
        # Passed checks
        logging.debug("* %s: analysis directory" % dirn)
        if bcf_utils.name_matches(test_project.name, pattern):
            projects.append(test_project)
    return projects
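# The 'pattern' argument behaves like a simple glob over project names.
# The sketch below uses fnmatch as a stand-in for bcf_utils.name_matches
# to show the kind of selection involved (project names are hypothetical).
from fnmatch import fnmatch

project_names = ["AB_scRNAseq", "CD_ChIPseq", "undetermined"]

# A pattern of '*' keeps everything; a glob keeps matching names only
pattern = "AB*"
selected = [name for name in project_names if fnmatch(name, pattern)]
print(selected)   # ['AB_scRNAseq']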
def archive(ap,archive_dir=None,platform=None,year=None, perms=None,group=None,include_bcl2fastq=False, read_only_fastqs=True,runner=None, final=False,force=False,dry_run=False): """ Copy an analysis directory and contents to an archive area Copies the contents of the analysis directory to an archive area, which can be on a local or remote system. The archive directory is constructed in the form <TOP_DIR>/<YEAR>/<PLATFORM>/<DIR>/... The YEAR and PLATFORM can be overridden using the appropriate arguments. By default the data is copied to a 'staging' directory called '__ANALYSIS_DIR.pending' in the archive directory. The archiving can be finalised by setting the 'final' argument to 'True', which performs a last update of the staging area before moving the data to its final location. Once the archive has been finalised any further archiving attempts will be refused. Copying of the data is performed using 'rsync'; multiple archive operations mirror the contents of the analysis directory (so any data removed from the source will also be removed from the archive). By default the 'bcl2fastq' directory is omitted from the archive, unless the fastq files in any projects are links to the data. Inclusion of this directory can be forced by setting the appropriate argument. The fastqs will be switched to be read-only in the archive by default. Arguments: ap (AutoProcessor): autoprocessor pointing to the analysis directory to be archived archive_dir (str): top level archive directory, of the form '[[user@]host:]dir' (if not set then use the value from the auto_process.ini file). platform (str): set the value of the <PLATFORM> level in the archive (if not set then taken from the supplied autoprocessor instance). year (str): set the value of the <YEAR> level in the archive (if not set then defaults to the current year) (4 digits) perms (str): change the permissions of the destination files and directories according to the supplied argument (e.g. 'g+w') (if not set then use the value from the auto_process.ini file). group (str): set the group of the destination files to the supplied argument (if not set then use the value from the auto_process.ini file). include_bcl2fastq (bool): if True then force inclusion of the 'bcl2fastq' subdirectory; otherwise only include it if fastq files in project subdirectories are symlinks. read_only_fastqs (bool): if True then make the fastqs read-only in the destination directory; otherwise keep the original permissions. runner: (optional) specify a non-default job runner to use for primary data rsync final (bool): if True then finalize the archive by moving the '.pending' temporary archive to the final location force (bool): if True then do archiving even if there are errors (e.g. key metadata items not set, permission error when setting group etc); otherwise abort archiving operation. dry_run (bool): report what would be done but don't perform any operations. Returns: UNIX-style integer returncode: 0 = successful termination, non-zero indicates an error occurred.
""" # Return value retval = 0 # Check if analysis dir is actually staging directory analysis_dir = os.path.basename(ap.analysis_dir) is_staging = False if analysis_dir.startswith("__") and analysis_dir.endswith(".pending"): logger.warning("Operating directly on staged directory") if not final: raise Exception("Cannot re-stage already staged " "analysis directory") else: is_staging = True # Fetch archive location if archive_dir is None: archive_dir = ap.settings.archive.dirn if archive_dir is None: raise Exception("No archive directory specified (use " "--archive_dir option?)") # Construct subdirectory structure i.e. platform and year if platform is None: platform = ap.metadata.platform if platform is None: raise Exception("No platform specified (use --platform " "option?)") if year is None: datestamp = str(ap.metadata.instrument_datestamp) if len(datestamp) == 6: # Assume YYMMDD datestamp format year = "20%s" % datestamp[0:2] elif len(datestamp) == 8: # Assume YYYYMMDD datestamp format year = datestamp[0:4] else: raise Exception("Invalid datestamp '%s' (use " "--year option)" % datestamp) archive_dir = os.path.join(archive_dir,year,platform) if not fileops.exists(archive_dir): raise OSError("Archive directory '%s' doesn't exist" % archive_dir) # Determine target directory if not is_staging: final_dest = analysis_dir staging = "__%s.pending" % analysis_dir else: final_dest = analysis_dir[len("__"):-len(".pending")] staging = analysis_dir if final: dest = final_dest else: dest = staging print("Copying to archive directory: %s" % archive_dir) print("Platform : %s" % platform) print("Year : %s" % year) print("Destination: %s %s" % (dest, "(final)" if final else "(staging)")) # Check if final archive already exists if fileops.exists(os.path.join(archive_dir,final_dest)): raise Exception("Final archive already exists, stopping") # Report available space on target filesystem usage = fileops.disk_usage(archive_dir) print("Available : %s/%s (%s%% in use)" % (format_file_size(usage.free), format_file_size(usage.total), usage.percent)) # Check metadata check_metadata = ap.check_metadata(('source','run_number')) if not check_metadata: if not force or not is_staging: raise Exception("Some metadata items not set, stopping") logger.warning("Some metadata items not set, proceeding") # Locate extra bcl2fastq directories extra_bcl2fastq_dirs = list() for dirn in list_dirs(ap.analysis_dir): if dirn.endswith(".bak") or dirn.startswith("save."): # Ignore continue elif dirn == os.path.basename(ap.params.unaligned_dir): continue # Try to load data from the directory try: illumina_data = IlluminaData(ap.analysis_dir, unaligned_dir=dirn) extra_bcl2fastq_dirs.append(dirn) except Exception: pass if not is_staging: # Are there any projects to archive? 
try: projects = ap.get_analysis_projects() except Exception as ex: logging.warning("Error trying to fetch analysis projects: " "%s" % ex) projects = [] if not projects: if not force: raise Exception("No project directories found, nothing " "to archive") # Check if there is a bcl2fastq directory instead unaligned_dir = ap.params.unaligned_dir if not os.path.isabs(unaligned_dir): unaligned_dir = os.path.join(ap.analysis_dir, unaligned_dir) if os.path.exists(unaligned_dir): logging.warning("No project directories found, forcing " "archiving of bcl2fastq output directory " "'%s' instead" % ap.params.unaligned_dir) include_bcl2fastq = True else: raise Exception("No project directories or bcl2fastq " "directory output found, nothing to " "archive (even with --force)") # Determine which directories to exclude excludes = ['--exclude=primary_data', '--exclude=save.*', '--exclude=*.bak', '--exclude=*.tmp', '--exclude=tmp.*', '--exclude=__*',] if not include_bcl2fastq: # Determine whether bcl2fastq dir should be included implicitly # because there are links from the analysis directories for project in projects: if project.fastqs_are_symlinks: print("Found at least one project with fastq " "symlinks (%s)" % project.name) include_bcl2fastq = True break if not include_bcl2fastq: print("Excluding '%s' directory from archive" % ap.params.unaligned_dir) excludes.append('--exclude=%s' % ap.params.unaligned_dir) # Exclude extra bcl2fastq dirs for dirn in extra_bcl2fastq_dirs: print("Excluding '%s' directory from archive" % dirn) excludes.append('--exclude=%s' % dirn) # 10xgenomics products to exclude excludes.append('--exclude=*.mro') excludes.append('--exclude=%s*' % tenx_genomics_utils.flow_cell_id(ap.run_name)) # Log dir log_dir = 'archive%s' % ('_final' if final else '_staging') if dry_run: log_dir += '_dry_run' ap.set_log_dir(ap.get_log_subdir(log_dir)) # Set up runner if runner is None: runner = ap.settings.runners.rsync runner.set_log_dir(ap.log_dir) # Setup a scheduler for multiple rsync jobs sched = simple_scheduler.SimpleScheduler( runner=runner, max_concurrent=ap.settings.general.max_concurrent_jobs, poll_interval=ap.settings.general.poll_interval) sched.start() # Keep track of jobs archiving_jobs = [] # If making fastqs read-only then transfer them separately if read_only_fastqs and final: # Make sure excluded directories are excluded extra_options = [ex for ex in excludes] # Set up to include only the fastq directories in # projects fastq_dirs = [] for project in projects: for fastq_dir in project.fastq_dirs: fastq_dirs.append(os.path.join( os.path.basename(project.dirn), fastq_dir)) # Update the extra options with includes/excludes extra_options.append('--include=*/') for fastq_dir in fastq_dirs: extra_options.append('--include=%s/**' % fastq_dir) extra_options.append('--exclude=*') # Execute the rsync rsync_fastqs = applications.general.rsync( "%s/" % ap.analysis_dir, os.path.join(archive_dir,staging), prune_empty_dirs=False, mirror=True, dry_run=dry_run, chmod='ugo-w', extra_options=extra_options) print("Running %s" % rsync_fastqs) rsync_fastqs_job = sched.submit(rsync_fastqs, name="rsync.archive_fastqs") # Exclude fastqs from main rsync for fastq_dir in fastq_dirs: excludes.append('--exclude=%s' % fastq_dir) wait_for = [rsync_fastqs_job.job_name] # Add to list of jobs archiving_jobs.append(rsync_fastqs_job) else: # No separate Fastq rsync rsync_fastqs_job = None wait_for = () # Main rsync command rsync = applications.general.rsync( "%s/" % ap.analysis_dir, os.path.join(archive_dir,staging), 
prune_empty_dirs=True, mirror=True, dry_run=dry_run, chmod=perms, extra_options=excludes) print("Running %s" % rsync) rsync_job = sched.submit(rsync,name="rsync.archive", wait_for=wait_for) archiving_jobs.append(rsync_job) # Wait for jobs to complete rsync_job.wait() # Check exit status on jobs for job in archiving_jobs: print("%s completed: exit code %s" % (job.name, job.exit_code)) retval = sum([j.exit_code for j in archiving_jobs]) if retval != 0: logger.warning("One or more archiving jobs failed " "(non-zero exit code returned)") else: if final: # Update the final stored Fastq paths for QC staged_analysis_dir = os.path.join( archive_dir, staging) archived_analysis_dir = os.path.abspath( os.path.join( archive_dir, final_dest)) for project in AnalysisDir(staged_analysis_dir).get_projects(): qc_info = project.qc_info(project.qc_dir) if qc_info.fastq_dir: print("%s: updating stored Fastq directory for QC" % project.name) new_fastq_dir = os.path.join(archived_analysis_dir, os.path.relpath( qc_info.fastq_dir, ap.analysis_dir)) print("-- updated Fastq directory: %s" % new_fastq_dir) qc_info['fastq_dir'] = new_fastq_dir qc_info.save() # Set the group if group is not None: print("Setting group of archived files to '%s'" % group) if not dry_run: set_group = fileops.set_group_command( group, os.path.join(archive_dir,staging), safe=force, verbose=True) print("Running %s" % set_group) set_group_job = sched.submit( set_group, name="set_group.archive") set_group_job.wait() # Check exit status exit_code = set_group_job.exit_code print("%s completed: exit code %s" % ( set_group_job.name, exit_code)) if exit_code != 0: logger.warning("Setting group failed (non-zero " "exit status code returned)") retval = retval + exit_code # Finish with scheduler sched.wait() sched.stop() # Bail out if there was a problem if retval != 0: if not force: raise Exception("Staging to archive failed") else: logger.warning("Staging to archive failed (ignored)") # Move to final location if final: print("Moving to final location: %s" % final_dest) if not dry_run: fileops.rename(os.path.join(archive_dir,staging), os.path.join(archive_dir,final_dest)) # Report usage of target filesystem usage = fileops.disk_usage(archive_dir) print("Usage of archive: %s available (of %s) (%s%% in use)" % (format_file_size(usage.free), format_file_size(usage.total), usage.percent)) # Finish return retval
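# To make the directory layout concrete, this small sketch assembles the
# staging and final destinations in the same way as archive() above,
# using made-up values for the archive area, year, platform and analysis
# directory.
import os

archive_dir = "/mnt/archive"                        # <TOP_DIR> (hypothetical)
year, platform = "2017", "miseq"                    # <YEAR>, <PLATFORM>
analysis_dir = "170901_M00123_0001_XYZ_analysis"    # <DIR>

archive_dir = os.path.join(archive_dir, year, platform)
staging = "__%s.pending" % analysis_dir             # temporary staging area
final_dest = analysis_dir                           # final location

print(os.path.join(archive_dir, staging))
# /mnt/archive/2017/miseq/__170901_M00123_0001_XYZ_analysis.pending
print(os.path.join(archive_dir, final_dest))
# /mnt/archive/2017/miseq/170901_M00123_0001_XYZ_analysis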
def merge_fastq_dirs(ap, primary_unaligned_dir, output_dir=None, dry_run=False): """ Combine multiple 'unaligned' output directories into one This method combines the output from multiple runs of CASAVA/bcl2fastq into a single 'unaligned'-equivalent directory. Currently it operates in an automatic mode and should detect additional 'unaligned' dirs on its own. Arguments: ap (AutoProcessor): autoprocessor pointing to the parent analysis directory primary_unaligned_dir (str): the 'unaligned' dir that data from all others will be put into (relative path), unless overridden by 'output_dir' argument output_dir (str): optional, new 'unaligned' dir that will be created to hold merged data (relative path, defaults to 'primary_unaligned_dir') dry_run (boolean): if True then just report operations that would have been performed. """ if primary_unaligned_dir is None: raise Exception("Primary unaligned dir not defined") # Output directory if output_dir is None: output_dir = primary_unaligned_dir print("Fastqs will be merged into '%s'" % output_dir) # Collect unaligned dirs print("Collecting bcl2fastq directories") primary_illumina_data = None unaligned_dirs = {} for dirn in list_dirs(ap.analysis_dir): try: illumina_data = IlluminaData.IlluminaData(ap.analysis_dir, unaligned_dir=dirn) if dirn == primary_unaligned_dir: print("* %s (primary dir)" % dirn) primary_illumina_data = illumina_data elif dirn.endswith(".bak") or dirn.startswith("save."): print("Ignoring %s" % dirn) else: print("* %s" % dirn) unaligned_dirs[dirn] = illumina_data except Exception as ex: logger.debug("Rejecting %s: %s" % (dirn, ex)) # Check primary unaligned dir if primary_illumina_data is None: raise Exception("Primary dir '%s' doesn't exist, or doesn't " "contain data?" % primary_unaligned_dir) # Is there anything to do? if not unaligned_dirs: print("No extra bcl2fastq output directories found, nothing to do") return 0 # Make log directory and set up scheduler (if not dry run) if not dry_run: ap.set_log_dir(ap.get_log_subdir('merge_fastq_dirs')) runner = ap.settings.general.default_runner runner.set_log_dir(ap.log_dir) sched = SimpleScheduler( runner=runner, max_concurrent=ap.settings.general.max_concurrent_jobs, poll_interval=ap.settings.general.poll_interval) sched.start() jobs = [] # Top-level for undetermined reads if primary_illumina_data.undetermined.dirn != \ primary_illumina_data.unaligned_dir: undetermined_dir = os.path.basename( primary_illumina_data.undetermined.dirn) else: undetermined_dir = None # Do sanity checks before proceeding print("Checking primary data directory") fmt = primary_illumina_data.format paired_end = primary_illumina_data.paired_end no_lane_splitting = (len(primary_illumina_data.lanes) == 1) \ and (primary_illumina_data.lanes[0] is None) print("* Format: %s" % fmt) print("* no-lane-splitting: %s" % ('yes' if no_lane_splitting else 'no')) print("* paired-end: %s" % ('yes' if paired_end else 'no')) print("* undetermined dir: %s" % undetermined_dir) consistent_data = True for unaligned_dir in unaligned_dirs: illumina_data = unaligned_dirs[unaligned_dir] fmt0 = illumina_data.format no_lane_splitting0 = (len(illumina_data.lanes) == 1) \ and (illumina_data.lanes[0] is None) if (fmt0 != fmt) or (no_lane_splitting0 != no_lane_splitting): print("!!! %s: inconsistent format to primary data dir !!!"
% unaligned_dir) consistent_data = False if not consistent_data: raise Exception("Data directories not consistent with primary " "dir '%s'" % primary_unaligned_dir) # Collect the projects from the extra directories projects = [] undetermined = [] for unaligned_dir in unaligned_dirs: print("Examining projects in %s:" % unaligned_dir) illumina_data = unaligned_dirs[unaligned_dir] for project in illumina_data.projects: if not list(filter(lambda p: p.name == project.name, projects)): print("- %s: will be merged in" % project.name) projects.append(project) else: raise Exception("collision: %s already exists" % project.name) # Deal with undetermined reads if illumina_data.undetermined is not None: print("Examining undetermined samples:") if no_lane_splitting: # No lane info: should merge undetermined fastqs for sample in illumina_data.undetermined.samples: print("- %s: reads will be concatenated" % sample.name) undetermined.append(sample) else: for sample in illumina_data.undetermined.samples: if not list( filter(lambda s: s.name == sample.name, undetermined)): print("- %s: will be merged in" % sample.name) undetermined.append(sample) else: raise Exception("collision: %s already exists" % sample.name) else: print("No undetermined samples") # Collect any remaining projects from the primary # unaligned directory print("Examining projects in primary dir %s:" % primary_unaligned_dir) for project in primary_illumina_data.projects: if not list(filter(lambda p: p.name == project.name, projects)): print("- %s: will be merged in" % project.name) projects.append(project) else: print("- %s: already exists, will be discarded" % project.name) # Sort out the undetermined reads print("Examining undetermined samples:") if no_lane_splitting: # No lane info: should merge undetermined fastqs for sample in primary_illumina_data.undetermined.samples: print("- %s: reads will be concatenated" % sample.name) undetermined.insert(0, sample) else: for sample in primary_illumina_data.undetermined.samples: if not list(filter(lambda s: s.name == sample.name, undetermined)): print("- %s: will be merged in" % sample.name) undetermined.insert(0, sample) else: print("- %s: already exists, will be discarded" % sample.name) # Make a new directory for the merging merge_dir = os.path.join(ap.analysis_dir, output_dir + ".new") if undetermined_dir is not None: merge_undetermined_dir = os.path.join(merge_dir, undetermined_dir) else: merge_undetermined_dir = merge_dir if not dry_run: print("Making temporary merge directory %s" % merge_dir) mkdir(merge_dir) if not os.path.exists(merge_undetermined_dir): print("Making directory for undetermined %s" % merge_undetermined_dir) mkdir(merge_undetermined_dir) # Copy the projects print("Importing projects:") for project in projects: print("- %s" % project.name) project_dir = os.path.join(merge_dir, os.path.basename(project.dirn)) cmd = copytree_command(project.dirn, project_dir) print("- Running %s" % cmd) if not dry_run: job = sched.submit(cmd, name="copy_project.%s" % project.name, wd=merge_dir) print("Job: %s" % job) jobs.append(job) # Handle the undetermined reads print("Dealing with undetermined reads:") if no_lane_splitting: # No lane info: merge undetermined fastqs if len(undetermined) == 1: # Only one undetermined sample - copy Fastqs for read in (1, 2): if read == 2 and not paired_end: break fastqs = sample.fastq_subset(read_number=read, full_path=True) for fq in fastqs: cmd = copy_command(fq, merge_undetermined_dir) print("- Running %s" % cmd) if not dry_run: job = sched.submit(cmd, 
name="copy_undetermined.R%s" % read, wd=merge_dir) print("Job: %s" % job) jobs.append(job) else: # Multiple undetermined samples - concat Fastqs for read in (1, 2): if read == 2 and not paired_end: break cmd = Command('concat_fastqs.py') for sample in undetermined: fastqs = sample.fastq_subset(read_number=read, full_path=True) cmd.add_args(*fastqs) cmd.add_args( os.path.join(merge_undetermined_dir, "Undetermined_S0_R%s_001.fastq.gz" % read)) print("- Running %s" % cmd) if not dry_run: job = sched.submit(cmd, name="merge_undetermined.R%s" % read, wd=merge_dir) print("Job: %s" % job) jobs.append(job) else: for sample in undetermined: print("- %s" % sample.name) if fmt == "bcl2fastq2": # Hardlink copy fastqs directly sample_dir = merge_undetermined_dir if not dry_run: for fq in sample.fastq: src_fq = os.path.join(sample.dirn, fq) dst_fq = os.path.join(sample_dir, fq) os.link(src_fq, dst_fq) else: # Just copy directory tree wholesale sample_dir = os.path.join(merge_undetermined_dir, os.path.basename(sample.dirn)) cmd = copytree_command(sample.dirn, sample_dir) print("- Running %s" % cmd) if not dry_run: job = sched.submit(cmd, name="copy_sample_dir.%s" % sample.name, wd=merge_dir) print("Job: %s" % job.name) jobs.append(job) # Make expected subdirs for bcl2fastq2 if not dry_run and fmt == "bcl2fastq2": for dirn in ('Reports', 'Stats'): mkdir(os.path.join(merge_dir, dirn)) # Add a hidden placeholder to preserve these directories # on rsync -m (prune empty dirs) with open(os.path.join(merge_dir, dirn, '.placeholder'), 'w') as fp: fp.write("") # Wait for scheduler jobs to complete if not dry_run: sched.wait() sched.stop() # Check job exit status exit_status = 0 for j in jobs: exit_status += j.exit_status if j.exit_status != 0: logger.warning("Job failed: %s" % j) if exit_status: logger.critical("One or more jobs failed (non-zero " "exit status)") return exit_status # Move all the 'old' directories out of the way all_unaligned = [u for u in unaligned_dirs] all_unaligned.append(primary_unaligned_dir) for unaligned_dir in all_unaligned: unaligned_backup = os.path.join(ap.analysis_dir, "save.%s" % unaligned_dir) print("Moving %s to %s" % (unaligned_dir, unaligned_backup)) if not dry_run: shutil.move(os.path.join(ap.analysis_dir, unaligned_dir), unaligned_backup) # Rename the merged directory print("Renaming %s to %s" % (merge_dir, output_dir)) if not dry_run: shutil.move(merge_dir, os.path.join(ap.analysis_dir, output_dir)) # Reset the bcl2fastq dir if not dry_run: ap.params['unaligned_dir'] = output_dir # Make a new 'projects.info' metadata file project_metadata_file = os.path.join(ap.analysis_dir, 'projects.info') if os.path.exists(project_metadata_file): print("Moving existing projects.info file out of the way") if not dry_run: os.rename(project_metadata_file, os.path.join(ap.analysis_dir, 'save.projects.info')) print("Creating new projects.info file") if not dry_run: ap.make_project_metadata_file() return 0
def populate(self, fastq_dir=None): """Populate data structure from directory contents """ if not os.path.exists(self.dirn): # Nothing to do, yet return # Get data from info file, if present if os.path.isfile(self.info_file): self.info.load(self.info_file) # Identify possible fastq subdirectories fastq_dirs = [] for d in bcf_utils.list_dirs(self.dirn): fq_dir = os.path.join(self.dirn, d) fastqs = self.find_fastqs(fq_dir) if fastqs: fastq_dirs.append(d) # Also check top-level dir if self.find_fastqs(self.dirn): fastq_dirs.append('.') self.fastq_dirs = fastq_dirs logger.debug("Possible fastq dirs: %s" % ','.join(self.fastq_dirs)) # Set primary fastq file directory if not self.fastq_dirs: logger.debug("No fastq dirs located for %s" % self.dirn) return if self.info.primary_fastq_dir is None: if 'fastqs' in self.fastq_dirs: self.info['primary_fastq_dir'] = 'fastqs' else: self.info['primary_fastq_dir'] = self.fastq_dirs[0] if fastq_dir is None: fastq_dir = self.info.primary_fastq_dir else: if fastq_dir.startswith("%s%s" % (self.dirn, os.sep)): fastq_dir_ = os.path.relpath(fastq_dir, self.dirn) else: fastq_dir_ = fastq_dir if fastq_dir_ not in self.fastq_dirs: logger.warning("Requested fastqs dir '%s' not in list " "of possible dirs %s" % (fastq_dir, ', '.join(self.fastq_dirs))) self.fastq_dir = os.path.normpath(os.path.join(self.dirn, fastq_dir)) # Collect fastq files fastqs = self.find_fastqs(self.fastq_dir) if fastqs: self.fastq_format = self.determine_fastq_format(fastqs[0]) logger.debug("Assigning fastqs to samples...") self.samples = [] for fq in fastqs: name = self.fastq_attrs(fq).sample_name try: sample = self.get_sample(name) except KeyError: sample = AnalysisSample(name, fastq_attrs=self.fastq_attrs) self.samples.append(sample) sample.add_fastq(os.path.normpath(os.path.join(self.fastq_dir, fq))) # Sort samples by name self.samples = sorted(self.samples, key=lambda s: split_sample_name(s.name)) logger.debug("Listing samples and files:") for sample in self.samples: logger.debug("* %s: %s" % (sample.name, sample.fastq)) # Set paired_end flag for project paired_end = True for sample in self.samples: paired_end = (paired_end and sample.paired_end) self.info['paired_end'] = paired_end # Set the QC output dir, if not already set if self.qc_dir is None: self.use_qc_dir('qc')
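# The sample assignment step in populate() groups Fastq files by their
# sample name. The sketch below mimics that grouping with a naive name
# parser standing in for the project's fastq_attrs (filenames are
# hypothetical).
from collections import defaultdict

fastqs = ["PJB1_S1_R1_001.fastq.gz", "PJB1_S1_R2_001.fastq.gz",
          "PJB2_S2_R1_001.fastq.gz", "PJB2_S2_R2_001.fastq.gz"]

# Group by sample name; here the leading token of the filename stands
# in for fastq_attrs(fq).sample_name
samples = defaultdict(list)
for fq in fastqs:
    samples[fq.split("_")[0]].append(fq)

for name in sorted(samples):
    print(name, samples[name])   # PJB1 [...R1..., ...R2...], then PJB2 [...]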
def __init__(self, analysis_dir):
    """Create a new AnalysisDir instance for a specified directory

    Arguments:
      analysis_dir: name (and path) to analysis directory
    """
    # Store location
    self._analysis_dir = os.path.abspath(analysis_dir)
    self._name = os.path.basename(analysis_dir)
    self._bcl2fastq_dirs = []
    self._project_dirs = []
    self._extra_dirs = []
    self.sequencing_data = []
    self.projects = []
    self.undetermined = None
    # Metadata
    self.metadata = AnalysisDirMetadata()
    try:
        metadata_file = os.path.join(self._analysis_dir,
                                     "metadata.info")
        self.metadata.load(metadata_file)
    except Exception as ex:
        logger.warning("Failed to load metadata file %s: %s" %
                       (metadata_file, ex))
        logger.warning("Attempting to load parameter file")
        try:
            params = AnalysisDirParameters()
            parameter_file = os.path.join(self._analysis_dir,
                                          "auto_process.info")
            params.load(parameter_file, strict=False)
            # Attempt to acquire values from parameters
            for param in ('platform', 'run_number', 'source', 'assay'):
                if param not in params:
                    print("-- %s: missing" % param)
                    continue
                print("-- %s: setting to '%s'" % (param, params[param]))
                self.metadata[param] = params[param]
        except Exception as ex:
            # No parameter file either
            logger.warning("Failed to load parameters: %s" % ex)
            logger.warning("Perhaps this is not an auto_process project?")
            raise ex
    # Projects metadata
    try:
        self.projects_metadata = ProjectMetadataFile(
            os.path.join(self._analysis_dir, "projects.info"))
    except Exception as ex:
        logger.warning("Failed to load projects metadata: %s" % ex)
        self.projects_metadata = None
    # Run name
    try:
        self.run_name = self.metadata.run
    except AttributeError:
        self.run_name = self._analysis_dir[0:-len('_analysis')]
    self.run_name = os.path.basename(self.run_name)
    self.date_stamp,\
        self.instrument_name,\
        self.instrument_run_number = IlluminaData.split_run_name(
            self.run_name)
    # Look for outputs from bclToFastq and analysis projects
    logger.debug("Examining subdirectories of %s" % self._analysis_dir)
    for dirn in bcf_utils.list_dirs(self._analysis_dir):
        # Look for sequencing data
        try:
            data = IlluminaData.IlluminaData(self._analysis_dir,
                                             unaligned_dir=dirn)
            logger.debug("- %s: sequencing data" % dirn)
            self._bcl2fastq_dirs.append(dirn)
            self.sequencing_data.append(data)
            continue
        except IlluminaData.IlluminaDataError:
            pass
        except Exception as ex:
            logger.warning("Exception when attempting to load "
                           "subdir '%s' as CASAVA/bcl2fastq output "
                           "(ignored): %s" % (dirn, ex))
        # Look for analysis data
        data = AnalysisProject(dirn,
                               os.path.join(self._analysis_dir, dirn))
        if data.is_analysis_dir:
            if dirn == 'undetermined':
                logger.debug("- %s: undetermined indexes" % dirn)
                self.undetermined = data
            else:
                # Check against projects.info, if possible
                try:
                    if not self.projects_metadata.lookup('Project', dirn):
                        logger.debug("- %s: not in projects.info" % dirn)
                        self._extra_dirs.append(dirn)
                        continue
                except AttributeError:
                    pass
                logger.debug("- %s: project directory" % dirn)
                self._project_dirs.append(dirn)
                self.projects.append(data)
            continue
        else:
            # Unidentified contents
            self._extra_dirs.append(dirn)
            logger.debug("- %s: unknown" % dirn)
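# Usage sketch: an AnalysisDir built from the constructor above exposes
# the classified contents through its attributes. The directory path is
# hypothetical and the import path is an assumption (adjust it to
# wherever this class lives in your checkout).
from auto_process_ngs.analysis import AnalysisDir   # assumed import path

analysis = AnalysisDir("/data/170901_M00123_0001_XYZ_analysis")
print(analysis.run_name)
for project in analysis.projects:
    print(project.name, len(project.samples))
if analysis.undetermined:
    print("Undetermined reads present")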