if modulefiles is not None: for modulefile in modulefiles.split(','): envmod.load(modulefile) # Deal with job runners runners = dict() for runner in args.runners: try: stage, runner_spec = runner.split('=') except ValueError: # too few values to unpack stage = 'default' runner_spec = runner if stage not in stages: logger.fatal("Bad stage for --runner option: %s" % stage) sys.exit(1) runners[stage] = fetch_runner(runner_spec) try: default_runner = runners['default'] except KeyError: default_runner = __settings.runners.icell8 for stage in stages: if stage not in runners: if stage == 'qc': # Use the general QC settings stage_runner = __settings.runners['qc'] else: # Look for Icell8-specific runner try: stage_runner = __settings.runners['icell8_%s' % stage] except KeyError: stage_runner = default_runner
# Set number of threads for QC jobs if args.nthreads: nthreads = args.nthreads else: nthreads = __settings.qc.nprocessors # Cellranger settings cellranger_jobmode = cellranger_settings.cellranger_jobmode cellranger_mempercore = cellranger_settings.cellranger_mempercore cellranger_jobinterval = cellranger_settings.cellranger_jobinterval cellranger_localcores = cellranger_settings.cellranger_localcores cellranger_localmem = cellranger_settings.cellranger_localmem # Set up runners if args.runner is not None: # Runner explicitly supplied on the command line print("Setting up runners supplied on command line") default_runner = fetch_runner(args.runner) runners = { 'cellranger_runner': default_runner, 'fastqc_runner': default_runner, 'fastq_screen_runner': default_runner, 'star_runner': default_runner, 'verify_runner': default_runner, 'report_runner': default_runner, } else: # Runners from configuration print("Setting up runners from configuration") default_runner = __settings.general.default_runner runners = { 'cellranger_runner': __settings.runners.cellranger, 'fastqc_runner': __settings.runners.fastqc,
# Parse the command line args = p.parse_args() # Set up environment if args.modulefiles is None: modulefiles = __modulefiles else: modulefiles = args.modulefiles if modulefiles is not None: announce("Setting up environment") for modulefile in modulefiles.split(','): envmod.load(modulefile) # Job runner qc_runner = fetch_runner(args.runner) # Load the project announce("Loading project data") project_dir = os.path.abspath(args.project_dir) project_name = os.path.basename(project_dir) project = AnalysisProject(project_name,project_dir) # Get list of samples project = AnalysisProject(project_name,project_dir, fastq_dir=args.fastq_dir) print "Subdirectories with Fastqs:" for fastq_dir in project.fastq_dirs: print "- %s" % fastq_dir print "Gathering Fastqs from %s" % project.fastq_dir if args.sample_pattern is not None:
def archive(ap,archive_dir=None,platform=None,year=None,
            perms=None,group=None,include_bcl2fastq=False,
            read_only_fastqs=True,runner=None,
            final=False,force=False,dry_run=False):
    """
    Copy an analysis directory and contents to an archive area

    Copies the contents of the analysis directory to an archive
    area, which can be on a local or remote system.

    The archive directory is constructed in the form

    <TOP_DIR>/<YEAR>/<PLATFORM>/<DIR>/...

    The YEAR and PLATFORM can be overridden using the appropriate
    arguments.

    By default the data is copied to a 'staging' directory
    called '__ANALYSIS_DIR.pending' in the archive directory.
    The archiving can be finalised by setting the 'final'
    argument to 'True', which performs a last update of the
    staging area before moving the data to its final location.

    Once the archive has been finalised any further archiving
    attempts will be refused.

    Copying of the data is performed using 'rsync'; multiple
    archive operations mirror the contents of the analysis
    directory (so any data removed from the source will also
    be removed from the archive).

    By default the 'bcl2fastq' directory is omitted from the
    archive, unless the fastq files in any projects are links to
    the data. Inclusion of this directory can be forced by
    setting the appropriate argument.

    The fastqs will be switched to be read-only in the archive
    by default.

    Arguments:
      ap (AutoProcessor): autoprocessor pointing to the
        analysis directory to be archived
      archive_dir (str): top level archive directory, of the
        form '[[user@]host:]dir' (if not set then use the
        value from the settings.ini file).
      platform (str): set the value of the <PLATFORM> level in
        the archive (if not set then taken from the supplied
        autoprocessor instance).
      year (str): set the value of the <YEAR> level in the
        archive (if not set then defaults to the current year)
        (4 digits)
      perms (str): change the permissions of the destination
        files and directories according to the supplied
        argument (e.g. 'g+w') (if not set then use the value
        from the settings.ini file).
      group (str): set the group of the destination files to
        the supplied argument (if not set then use the value
        from the settings.ini file).
      include_bcl2fastq (bool): if True then force inclusion
        of the 'bcl2fastq' subdirectory; otherwise only
        include it if fastq files in project subdirectories
        are symlinks.
      read_only_fastqs (bool): if True then make the fastqs
        read-only in the destination directory; otherwise
        keep the original permissions.
      runner: (optional) specify a non-default job runner to
        use for primary data rsync
      final (bool): if True then finalize the archive by
        moving the '.pending' temporary archive to the final
        location
      force (bool): if True then do archiving even if key
        metadata items are not set; otherwise abort archiving
        operation.
      dry_run (bool): report what would be done but don't
        perform any operations.

    Returns:
      UNIX-style integer returncode: 0 = successful termination,
      non-zero indicates an error occurred.
    """
    # Return value
    retval = 0
    # Check if analysis dir is actually staging directory
    # (i.e. caller pointed us at a '__NAME.pending' directory
    # that was produced by an earlier staging run)
    analysis_dir = os.path.basename(ap.analysis_dir)
    is_staging = False
    if analysis_dir.startswith("__") and analysis_dir.endswith(".pending"):
        logger.warning("Operating directly on staged directory")
        # Only finalisation is permitted on an already-staged copy;
        # re-staging it would be meaningless
        if not final:
            raise Exception("Cannot re-stage already staged "
                            "analysis directory")
        else:
            is_staging = True
    # Fetch archive location
    if archive_dir is None:
        archive_dir = ap.settings.archive.dirn
    if archive_dir is None:
        raise Exception("No archive directory specified (use "
                        "--archive_dir option?)")
    # Construct subdirectory structure i.e. platform and year
    if platform is None:
        platform = ap.metadata.platform
    if platform is None:
        raise Exception("No platform specified (use --platform "
                        "option?)")
    if year is None:
        # Derive 4-digit year from the 2-digit datestamp prefix
        # (e.g. '180529' -> '2018')
        year = "20%s" % str(ap.metadata.instrument_datestamp)[0:2]
    archive_dir = os.path.join(archive_dir,year,platform)
    if not fileops.exists(archive_dir):
        raise OSError("Archive directory '%s' doesn't exist" %
                      archive_dir)
    # Determine target directory names: 'staging' is the
    # '__NAME.pending' working copy, 'final_dest' the ultimate name
    if not is_staging:
        final_dest = analysis_dir
        staging = "__%s.pending" % analysis_dir
    else:
        # Strip the '__' prefix and '.pending' suffix to recover
        # the final directory name
        final_dest = analysis_dir[len("__"):-len(".pending")]
        staging = analysis_dir
    if final:
        dest = final_dest
    else:
        dest = staging
    print "Copying to archive directory: %s" % archive_dir
    print "Platform : %s" % platform
    print "Year : %s" % year
    print "Destination: %s %s" % (dest,
                                  "(final)" if final else "(staging)")
    # Check if final archive already exists
    if fileops.exists(os.path.join(archive_dir,final_dest)):
        logging.fatal("Final archive already exists, stopping")
        return 1
    # Check metadata
    check_metadata = ap.check_metadata(('source','run_number'))
    if not check_metadata:
        # Missing metadata is fatal unless 'force' is set AND we are
        # finalising an already-staged copy
        if not force or not is_staging:
            logging.fatal("Some metadata items not set, stopping")
            return 1
        logging.warning("Some metadata items not set, proceeding")
    # NOTE(review): the whole rsync/staging phase below is skipped
    # when operating directly on an already-staged directory (the
    # staged copy is simply moved to its final location) -- confirm
    # this matches caller expectations
    if not is_staging:
        # Are there any projects to archive?
        projects = ap.get_analysis_projects()
        if not projects:
            raise Exception("No project directories found, nothing "
                            "to archive")
        # Determine which directories to exclude
        excludes = ['--exclude=primary_data',
                    '--exclude=save.*',
                    '--exclude=*.bak',
                    '--exclude=tmp.*']
        if not include_bcl2fastq:
            # Determine whether bcl2fastq dir should be included implicitly
            # because there are links from the analysis directories
            for project in projects:
                if project.fastqs_are_symlinks:
                    print "Found at least one project with fastq " \
                        "symlinks (%s)" % project.name
                    include_bcl2fastq = True
                    break
        if not include_bcl2fastq:
            print "Excluding '%s' directory from archive" % \
                ap.params.unaligned_dir
            excludes.append('--exclude=%s' % ap.params.unaligned_dir)
        # 10xgenomics products to exclude
        excludes.append('--exclude=*.mro')
        excludes.append('--exclude="%s*"' %
                        tenx_genomics_utils.flow_cell_id(ap.run_name))
        # Log dir
        log_dir = 'archive%s' % ('_final' if final else '_staging')
        if dry_run:
            log_dir += '_dry_run'
        ap.set_log_dir(ap.get_log_subdir(log_dir))
        # Set up runner
        if runner is not None:
            runner = fetch_runner(runner)
        else:
            runner = ap.settings.runners.rsync
        runner.set_log_dir(ap.log_dir)
        # Setup a scheduler for multiple rsync jobs
        sched = simple_scheduler.SimpleScheduler(
            runner=runner,
            max_concurrent=ap.settings.general.max_concurrent_jobs)
        sched.start()
        # Keep track of jobs
        archiving_jobs = []
        # If making fastqs read-only then transfer them separately
        # (a first rsync copies only 'fastqs' subdirs with 'ugo-w';
        # the main rsync then excludes them)
        if read_only_fastqs and final:
            rsync_fastqs = applications.general.rsync(
                "%s/" % ap.analysis_dir,
                os.path.join(archive_dir,staging),
                prune_empty_dirs=True,
                dry_run=dry_run,
                chmod='ugo-w',
                extra_options=('--include=*/',
                               '--include=fastqs/**',
                               '--exclude=*',))
            print "Running %s" % rsync_fastqs
            rsync_fastqs_job = sched.submit(rsync_fastqs,
                                            name="rsync.archive_fastqs")
            # Exclude fastqs from main rsync
            excludes.append('--exclude=fastqs')
            wait_for = [rsync_fastqs_job.job_name]
            # Add to list of jobs
            archiving_jobs.append(rsync_fastqs_job)
        else:
            # No separate Fastq rsync
            rsync_fastqs_job = None
            wait_for = ()
        # Main rsync command (mirrors the analysis dir into the
        # staging area, waiting for the fastq rsync if there is one)
        rsync = applications.general.rsync(
            "%s/" % ap.analysis_dir,
            os.path.join(archive_dir,staging),
            prune_empty_dirs=True,
            mirror=True,
            dry_run=dry_run,
            chmod=perms,
            extra_options=excludes)
        print "Running %s" % rsync
        rsync_job = sched.submit(rsync,name="rsync.archive",
                                 wait_for=wait_for)
        archiving_jobs.append(rsync_job)
        # Wait for jobs to complete
        rsync_job.wait()
        # Check exit status on jobs
        for job in archiving_jobs:
            print "%s completed: exit code %s" % (job.name,
                                                  job.exit_code)
        # Non-zero sum means at least one rsync failed
        retval = sum([j.exit_code for j in archiving_jobs])
        if retval != 0:
            logger.warning("One or more archiving jobs failed "
                           "(non-zero exit code returned)")
        else:
            # Set the group
            if group is not None:
                print "Setting group of archived files to '%s'" % group
                if not dry_run:
                    set_group = fileops.set_group_command(
                        group,
                        os.path.join(archive_dir,staging),
                        verbose=True)
                    print "Running %s" % set_group
                    set_group_job = sched.submit(
                        set_group,
                        name="set_group.archive")
                    set_group_job.wait()
                    # Check exit status
                    exit_code = set_group_job.exit_code
                    print "%s completed: exit code %s" % (
                        set_group_job.name,
                        exit_code)
                    if exit_code != 0:
                        logger.warning("Setting group failed (non-zero "
                                       "exit status code returned)")
                    retval = retval + exit_code
        # Finish with scheduler
        sched.wait()
        sched.stop()
        # Bail out if there was a problem
        if retval != 0:
            raise Exception("Staging to archive failed")
    # Move to final location (renames the staging directory to
    # its final name; after this, further archiving is refused)
    if final:
        print "Moving to final location: %s" % final_dest
        if not dry_run:
            fileops.rename(os.path.join(archive_dir,staging),
                           os.path.join(archive_dir,final_dest))
    # Finish
    return retval
def main():
    """
    Transfer copies of Fastq data from an analysis project to a
    destination for sharing with other people.

    Parses the command line, loads the project (and optionally a
    specific Fastq subdirectory), locates any extra artefacts to
    include (README, downloader script, zipped QC reports,
    tar-gzipped 10xGenomics outputs) and then runs the copy and
    tar operations via a scheduler.

    Returns:
      Integer exit code: 0 on success, non-zero if an error
      occurred (suitable for passing to 'sys.exit').
    """
    # Load configuration
    settings = Settings()
    # Collect defaults
    default_runner = settings.runners.rsync
    # Get pre-defined destinations
    destinations = [name for name in settings.destination]
    # Command line
    p = argparse.ArgumentParser(
        description="Transfer copies of Fastq data from an analysis "
        "project to an arbitrary destination for sharing with other "
        "people")
    p.add_argument('--version', action='version',
                   version=("%%(prog)s %s" % get_version()))
    p.add_argument('--subdir', action='store',
                   choices=('random_bin', 'run_id'),
                   default=None,
                   help="subdirectory naming scheme: 'random_bin' "
                   "locates a random pre-existing empty subdirectory "
                   "under the target directory; 'run_id' creates a "
                   "new subdirectory "
                   "'PLATFORM_DATESTAMP.RUN_ID-PROJECT'. If this "
                   "option is not set then no subdirectory will be "
                   "used")
    p.add_argument('--readme', action='store',
                   metavar='README_TEMPLATE', dest='readme_template',
                   help="template file to generate README file from; "
                   "can be full path to a template file, or the name "
                   "of a file in the 'templates' directory")
    p.add_argument('--weburl', action='store',
                   help="base URL for webserver (sets the value of "
                   "the WEBURL variable in the template README)")
    p.add_argument('--include_downloader', action='store_true',
                   help="copy the 'download_fastqs.py' utility to the "
                   "final location")
    p.add_argument('--include_qc_report', action='store_true',
                   help="copy the zipped QC reports to the final "
                   "location")
    p.add_argument('--include_10x_outputs', action='store_true',
                   help="copy outputs from 10xGenomics pipelines (e.g. "
                   "'cellranger count') to the final location")
    p.add_argument('--link', action='store_true',
                   help="hard link files instead of copying")
    p.add_argument('--runner', action='store',
                   help="specify the job runner to use for executing "
                   "the checksumming, Fastq copy and tar gzipping "
                   "operations (defaults to job runner defined for "
                   "copying in config file [%s])" % default_runner)
    p.add_argument('dest', action='store', metavar="DEST",
                   help="destination to copy Fastqs to; can be the "
                   "name of a destination defined in the configuration "
                   "file, or an arbitrary location of the form "
                   "'[[USER@]HOST:]DIR' (%s)" %
                   (("available destinations: %s" %
                     (','.join("'%s'" % d
                               for d in sorted(destinations))))
                    if destinations else
                    "no destinations currently defined"))
    p.add_argument('project', action='store', metavar="PROJECT",
                   help="path to project directory (or to a Fastqs "
                   "subdirectory in a project) to copy Fastqs from")
    # Process command line
    args = p.parse_args()
    # Check if target is pre-defined destination
    if args.dest in destinations:
        print("Loading settings for destination '%s'" % args.dest)
        dest = settings.destination[args.dest]
        target_dir = dest.directory
        readme_template = dest.readme_template
        subdir = dest.subdir
        include_downloader = dest.include_downloader
        include_qc_report = dest.include_qc_report
        hard_links = dest.hard_links
        weburl = dest.url
    else:
        target_dir = args.dest
        readme_template = None
        subdir = None
        include_downloader = False
        include_qc_report = False
        hard_links = False
        weburl = None
    # Update defaults with command line values
    if args.readme_template:
        readme_template = args.readme_template
    if args.subdir:
        subdir = args.subdir
    if args.include_downloader:
        include_downloader = True
    if args.include_qc_report:
        include_qc_report = True
    if args.weburl:
        weburl = args.weburl
    if args.link:
        hard_links = args.link
    # Sort out project directory
    project = AnalysisProject(args.project)
    if not project.is_analysis_dir:
        # Assume it's the Fastq dir
        fastq_dir = os.path.basename(args.project)
        project = AnalysisProject(os.path.dirname(args.project))
    else:
        fastq_dir = None
    if not project.is_analysis_dir:
        logger.error("'%s': project not found" % args.project)
        return 1
    project_name = project.name
    # Parent analysis directory
    analysis_dir = AnalysisDir(os.path.dirname(project.dirn))
    # Fastqs directory
    try:
        project.use_fastq_dir(fastq_dir)
    except Exception as ex:
        logger.error("'%s': failed to load Fastq set '%s': %s" %
                     (project.name, fastq_dir, ex))
        return 1
    # Report
    print("Transferring data from '%s' (%s)" % (project.name,
                                                project.dirn))
    print("Fastqs in %s" % project.fastq_dir)
    # Summarise samples and Fastqs
    samples = set()
    nfastqs = 0
    fsize = 0
    for sample in project.samples:
        samples.add(sample.name)
        for fq in sample.fastq:
            # lstat so hard/soft linked Fastqs are counted as stored
            fsize += os.lstat(fq).st_size
            nfastqs += 1
    nsamples = len(samples)
    dataset = "%s%s dataset" % ("%s " % project.info.single_cell_platform
                                if project.info.single_cell_platform
                                else '',
                                project.info.library_type)
    endedness = "paired-end" if project.info.paired_end else "single-end"
    print("%s with %d Fastqs from %d %s sample%s totalling %s" %
          (dataset, nfastqs, nsamples, endedness,
           's' if nsamples != 1 else '',
           format_file_size(fsize)))
    # Check target dir
    if not Location(target_dir).is_remote:
        target_dir = os.path.abspath(target_dir)
    if not exists(target_dir):
        print("'%s': target directory not found" % target_dir)
        # Fix: return non-zero so the error isn't reported as success
        return 1
    else:
        print("Target directory %s" % target_dir)
    # Locate downloader
    if include_downloader:
        print("Locating downloader for inclusion")
        downloader = find_program("download_fastqs.py")
        if downloader is None:
            # Fix: use module 'logger' (not root 'logging') for
            # consistency with the rest of this function
            logger.error("Unable to locate download_fastqs.py")
            return 1
        print("... found %s" % downloader)
    else:
        downloader = None
    # Locate zipped QC report
    if include_qc_report:
        print("Locating zipped QC reports for inclusion")
        qc_zips = list()
        # Check QC directories and look for zipped reports
        for qc_dir in project.qc_dirs:
            # Get the associated Fastq set
            # NB only compare the basename of the Fastq dir
            # in case full paths weren't updated
            fq_set = os.path.basename(project.qc_info(qc_dir).fastq_dir)
            if fq_set == os.path.basename(project.fastq_dir):
                # Try both report naming conventions (run reference
                # and analysis dir basename)
                for qc_base in ("%s_report.%s.%s" %
                                (qc_dir, project.name,
                                 project.info.run),
                                "%s_report.%s.%s" %
                                (qc_dir, project.name,
                                 os.path.basename(
                                     analysis_dir.analysis_dir)),):
                    qc_zip = os.path.join(project.dirn,
                                          "%s.zip" % qc_base)
                    if os.path.exists(qc_zip):
                        print("... found %s" % qc_zip)
                        qc_zips.append(qc_zip)
        if not qc_zips:
            logger.error("No zipped QC reports found")
            return 1
    else:
        qc_zips = None
    # Locate 10xGenomics outputs
    if args.include_10x_outputs:
        print("Locating outputs from 10xGenomics pipelines for "
              "inclusion")
        cellranger_dirs = list()
        for d in ('cellranger_count',
                  'cellranger_multi',):
            cellranger_dir = os.path.join(project.dirn, d)
            if os.path.isdir(cellranger_dir):
                print("... found %s" % cellranger_dir)
                cellranger_dirs.append(cellranger_dir)
        if not cellranger_dirs:
            logger.error("No outputs from 10xGenomics pipelines found")
            return 1
    else:
        cellranger_dirs = None
    # Determine subdirectory
    if subdir == "random_bin":
        # Find a random empty directory under the
        # target directory
        print("Locating random empty bin")
        subdirs = [d for d in os.listdir(target_dir)
                   if os.path.isdir(os.path.join(target_dir, d))]
        if not subdirs:
            print("Failed to locate subdirectories")
            # Fix: return non-zero on error (was bare 'return')
            return 1
        shuffle(subdirs)
        subdir = None
        for d in subdirs:
            if not os.listdir(os.path.join(target_dir, d)):
                # Empty bin
                subdir = d
                break
        if subdir is None:
            print("Failed to locate empty subdirectory")
            # Fix: return non-zero on error (was bare 'return')
            return 1
        print("... found '%s'" % subdir)
        # Update target dir
        target_dir = os.path.join(target_dir, subdir)
    elif subdir == "run_id":
        # Construct subdirectory name based on the
        # run ID
        subdir = "{platform}_{datestamp}.{run_number}-{project}".format(
            platform=analysis_dir.metadata.platform.upper(),
            datestamp=analysis_dir.metadata.instrument_datestamp,
            run_number=analysis_dir.metadata.run_number,
            project=project.name)
        # Check it doesn't already exist
        if exists(os.path.join(target_dir, subdir)):
            logger.error("'%s': subdirectory already exists" % subdir)
            # Fix: return non-zero on error (was bare 'return')
            return 1
        print("Using subdirectory '%s'" % subdir)
        # Update target dir
        target_dir = os.path.join(target_dir, subdir)
    # Make target directory
    if not exists(target_dir):
        mkdir(target_dir)
    # Get runner for copy job
    if args.runner:
        runner = fetch_runner(args.runner)
    else:
        runner = default_runner
    # Set identifier for jobs
    job_id = "%s%s" % (project_name,
                       (".%s" % fastq_dir
                        if fastq_dir is not None else ''))
    # Set the working directory
    working_dir = os.path.abspath("transfer.%s.%s" % (job_id,
                                                      int(time.time())))
    mkdir(working_dir)
    print("Created working dir %s" % working_dir)
    # Construct the README
    if readme_template:
        # Check that template file exists (either as given, or in
        # the bundled 'templates' directory)
        print("Locating README template")
        template = None
        for filen in (readme_template,
                      os.path.join(get_templates_dir(),
                                   readme_template),):
            if os.path.exists(filen):
                template = filen
                break
        if template is None:
            logger.error("'%s': template file not found" %
                         readme_template)
            return 1
        else:
            readme_template = template
        print("... found %s" % readme_template)
        # Read in template
        with open(readme_template, 'rt') as fp:
            readme = fp.read()
        # Substitute template variables
        template_vars = {
            'PLATFORM': analysis_dir.metadata.platform.upper(),
            'RUN_NUMBER': analysis_dir.metadata.run_number,
            'DATESTAMP': analysis_dir.metadata.instrument_datestamp,
            'PROJECT': project_name,
            'WEBURL': weburl,
            'BIN': subdir,
            'DIR': target_dir,
            'TODAY': date.today().strftime("%d/%m/%Y"),
        }
        for var in template_vars:
            value = template_vars[var]
            if value is None:
                # Unset variables render as '?'
                value = '?'
            else:
                value = str(value)
            readme = re.sub(r"%{var}%".format(var=var),
                            value,
                            readme)
        # Write out a temporary README file
        readme_file = os.path.join(working_dir, "README")
        with open(readme_file, 'wt') as fp:
            fp.write(readme)
    else:
        # No README
        readme_file = None
    # Start a scheduler to run jobs
    sched = SimpleScheduler(runner=runner,
                            reporter=TransferDataSchedulerReporter(),
                            poll_interval=settings.general.poll_interval)
    sched.start()
    # Build command to run manage_fastqs.py
    copy_cmd = Command("manage_fastqs.py")
    if hard_links:
        copy_cmd.add_args("--link")
    copy_cmd.add_args(analysis_dir.analysis_dir,
                      project_name)
    if fastq_dir is not None:
        copy_cmd.add_args(fastq_dir)
    copy_cmd.add_args("copy", target_dir)
    print("Running %s" % copy_cmd)
    copy_job = sched.submit(copy_cmd.command_line,
                            name="copy.%s" % job_id,
                            wd=working_dir)
    # Copy README
    if readme_file is not None:
        print("Copying README file")
        copy_cmd = copy_command(readme_file,
                                os.path.join(target_dir, "README"))
        sched.submit(copy_cmd.command_line,
                     name="copy.%s.readme" % job_id,
                     runner=SimpleJobRunner(),
                     wd=working_dir)
    # Copy download_fastqs.py
    if downloader:
        print("Copying downloader")
        copy_cmd = copy_command(
            downloader,
            os.path.join(target_dir,
                         os.path.basename(downloader)))
        sched.submit(copy_cmd.command_line,
                     name="copy.%s.downloader" % job_id,
                     runner=SimpleJobRunner(),
                     wd=working_dir)
    # Copy QC reports
    if qc_zips:
        for qc_zip in qc_zips:
            print("Copying '%s'" % os.path.basename(qc_zip))
            copy_cmd = copy_command(qc_zip,
                                    os.path.join(
                                        target_dir,
                                        os.path.basename(qc_zip)),
                                    link=hard_links)
            sched.submit(copy_cmd.command_line,
                         name="copy.%s.%s" % (job_id,
                                              os.path.basename(qc_zip)),
                         runner=SimpleJobRunner(),
                         wd=working_dir)
    # Tar and copy 10xGenomics outputs
    if cellranger_dirs:
        for cellranger_dir in cellranger_dirs:
            print("Tar gzipping and copying '%s'" %
                  os.path.basename(cellranger_dir))
            # Tar & gzip data
            targz = os.path.join(working_dir,
                                 "%s.%s.%s.tgz" %
                                 (os.path.basename(cellranger_dir),
                                  project_name,
                                  project.info.run))
            targz_cmd = Command("tar", "czvhf", targz,
                                "-C", os.path.dirname(cellranger_dir),
                                os.path.basename(cellranger_dir))
            print("Running %s" % targz_cmd)
            targz_job = sched.submit(
                targz_cmd.command_line,
                name="targz.%s.%s" % (job_id,
                                      os.path.basename(cellranger_dir)),
                wd=working_dir)
            # Copy the targz file once the tar job has completed
            # Fix: use a distinct name ('copytgz_job') so the main
            # Fastq 'copy_job' isn't clobbered before its exit code
            # is checked below
            copy_cmd = copy_command(
                targz,
                os.path.join(target_dir,
                             os.path.basename(targz)))
            print("Running %s" % copy_cmd)
            copytgz_job = sched.submit(
                copy_cmd.command_line,
                name="copytgz.%s.%s" % (job_id,
                                        os.path.basename(cellranger_dir)),
                runner=SimpleJobRunner(),
                wd=working_dir,
                wait_for=(targz_job.job_name,))
    # Wait for scheduler jobs to complete
    sched.wait()
    # Check exit code for Fastq copying
    exit_code = copy_job.exit_code
    if exit_code != 0:
        logger.error("File copy exited with an error")
        return exit_code
    else:
        print("Files now at %s" % target_dir)
        if weburl:
            url = weburl
            if subdir is not None:
                url = os.path.join(url, subdir)
            print("URL: %s" % url)
        print("Done")
    return 0
def getrunner(self, section, option, default='SimpleJobRunner'):
    """
    Return the job runner defined by a configuration option.

    Looks up 'option' within 'section' (falling back to 'default'
    when the option is unset) and converts the stored specification
    string into a job runner instance.

    Returns:
      Job runner instance, or None if the lookup or the conversion
      fails for any reason.
    """
    try:
        runner_spec = self.get(section, option, default)
        return fetch_runner(runner_spec)
    except Exception:
        # Missing or malformed specification: signal failure
        # to the caller with None rather than raising
        return None