def info(self):
    """
    Report information about the directory
    """
    # Report information
    print "Dir : %s" % self._dirn
    print "Size : %s (%s)" % (utils.format_file_size(self.size),
                              utils.format_file_size(self.size,'K'))
    print "Has cache: %s" % print_yes_no(self.has_cache)
    print "#files: %d" % len(self)
    print "File types: %s" % print_list(self.extensions)
    print "Compression types: %s" % print_list(self.compression)
    print "Users : %s" % print_list(self.users)
    print "Groups: %s" % print_list(self.groups)
    print "Oldest: %s %s" % (self.oldest.datetime.ctime(),
                             self.oldest.relpath(self._dirn))
    print "Newest: %s %s" % (self.newest.datetime.ctime(),
                             self.newest.relpath(self._dirn))
    # Top-level subdirectories
    print "Top-level subdirectories:"
    print "# Dir\tFiles\tSize\tFile types\tUsers\tPerms"
    for subdir in utils.list_dirs(self._dirn):
        sd = DataDir(os.path.join(self._dirn,subdir),
                     files=self.files(subdir=subdir))
        print "- %s/\t%d\t%s\t%s\t%s\t%s" % (subdir,
                                             len(sd),
                                             utils.format_file_size(sd.size),
                                             print_list(sd.extensions),
                                             print_list(sd.users),
                                             print_perms(sd.usr_unreadable,
                                                         sd.grp_unreadable,
                                                         sd.grp_unwritable))
    # File permissions
    print "File permissions:"
    print "- unreadable by owner: %s" % print_yes_no(self.usr_unreadable)
    print "- unreadable by group: %s" % print_yes_no(self.grp_unreadable)
    print "- unwritable by group: %s" % print_yes_no(self.grp_unwritable)
    print "#Temp files: %d" % len(self.list_temp())
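# Illustrative usage sketch (not part of the original source): report on
# a data directory. Assumes DataDir is importable from this module; the
# path is a made-up example.
example_dir = DataDir("/mnt/data/2015/example_run")  # hypothetical path
example_dir.info()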
def get_stats_for_file(fq,read_counter=FastqReadCounter.zcat_wc):
    """Generate statistics for a single fastq file

    Given a FastqStats object, set the 'nreads' property to
    the number of reads and the 'fsize' property to the file
    size for the corresponding fastq file.

    Arguments:
      fq: FastqStats object with 'fastq' property set to the
        full path for a Fastq file
      read_counter: optional, specify function to use for
        counting reads in the fastq file

    Returns:
      Input FastqStats object with the 'nreads' and 'fsize'
      properties set.

    """
    print "* %s: starting" % fq.name
    start_time = time.time()
    sys.stdout.flush()
    fq.nreads = read_counter(fq.fastq)
    fq.fsize = os.path.getsize(fq.fastq)
    print "- %s: finished" % fq.name
    end_time = time.time()
    print "- %s: %d reads, %s" % (fq.name,
                                  fq.nreads,
                                  bcf_utils.format_file_size(fq.fsize))
    print "- %s: took %.2fs" % (fq.name,(end_time-start_time))
    return fq
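# Hedged example (not from the original source): gather stats for one
# Fastq. The FastqStats(path,project,sample) call mirrors its use
# elsewhere in this code; the path and names are illustrative only.
fq = get_stats_for_file(
    FastqStats('/data/Unaligned/Project_AB/Sample_AB1/AB1_S1_R1_001.fastq.gz',
               'AB','AB1'),
    read_counter=FastqReadCounter.zcat_wc)
print "%s: %d reads (%s)" % (fq.name,fq.nreads,
                             bcf_utils.format_file_size(fq.fsize))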
def find_tmp_files(datadir):
    """
    Report temporary files/directories
    """
    nfiles = 0
    total_size = 0
    for f in DataDir(datadir).list_temp():
        size = get_size(f)
        total_size += size
        nfiles += 1
        print "%s\t%s" % (os.path.relpath(f,datadir),
                          utils.format_file_size(size))
    if not nfiles:
        print "No files or directories found"
        return
    print "%d found, total size: %s" % (nfiles,
                                        utils.format_file_size(total_size))
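# Example call (hypothetical path): report temporary files and their
# total size under an archived run directory.
find_tmp_files('/mnt/archive/2015/miseq/150325_M00879_0001_AXYZ')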
def collect_fastq_data(fqstats):
    """
    Collect data from FASTQ file in a FastqStats instance

    Given a FastqStats instance, collects and sets the
    following properties derived from the corresponding
    FASTQ file stored in that instance:

    - nreads: total number of reads
    - fsize: file size
    - reads_by_lane: (R1 FASTQs only) dictionary where keys
      are lane numbers and values are read counts

    Note that if the FASTQ file is an R2 file then the reads
    per lane will not be set.

    Arguments:
      fqstats (FastqStats): FastqStats instance

    Returns:
      FastqStats: input FastqStats instance with the
        appropriate properties updated.

    """
    fqs = fqstats
    fastq = fqs.fastq
    fastq_name = fqs.name
    print "* %s: starting" % fastq_name
    start_time = time.time()
    sys.stdout.flush()
    if fqs.read_number == 1:
        # Do full processing for R1 fastqs
        lane = IlluminaFastq(fastq_name).lane_number
        if lane is not None:
            # Lane number is in file name
            fqs.reads_by_lane[lane] = \
                FastqReadCounter.zcat_wc(fastq)
        else:
            # Need to get lane(s) from read headers
            fqs.reads_by_lane = \
                FastqReadCounter.reads_per_lane(fastq)
        # Store total reads
        fqs.nreads = sum([fqs.reads_by_lane[x] for x in fqs.lanes])
    else:
        # Only get total reads for R2 fastqs
        fqs.nreads = FastqReadCounter.zcat_wc(fastq)
    fqs.fsize = os.path.getsize(fastq)
    print "- %s: finished" % fastq_name
    end_time = time.time()
    print "- %s: %d reads, %s" % (fastq_name,
                                  fqs.nreads,
                                  bcf_utils.format_file_size(fqs.fsize))
    print "- %s: took %.2fs" % (fastq_name,(end_time-start_time))
    return fqs
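# Sketch (an assumption, not from the source): collect per-lane counts
# for an R1 Fastq; 'reads_by_lane' is only populated for R1 files, as
# noted above. The path and names are illustrative.
fqs = collect_fastq_data(
    FastqStats('/data/Unaligned/Project_AB/Sample_AB1/AB1_S1_L001_R1_001.fastq.gz',
               'AB','AB1'))
for lane in sorted(fqs.reads_by_lane):
    print "L%d: %d reads" % (lane,fqs.reads_by_lane[lane])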
def fastq_statistics(illumina_data,n_processors=1):
    """Generate statistics for fastq outputs from an Illumina run

    Given a directory with fastq(.gz) files arranged in the same
    structure as the output from bcl2fastq (i.e. subdirectory
    'Unaligned', then project directories within this called
    'Project_<NAME>', each containing sample directories called
    'Sample_<NAME>', and each of these containing fastq files),
    generate statistics for each file.

    Arguments:
      illumina_data: populated IlluminaData object describing the
        run.
      n_processors: number of processors to use (if >1 then uses
        the multiprocessing library to run the statistics
        gathering using multiple cores).

    Returns:
      Populated TabFile object containing the statistics.

    """
    stats = TabFile.TabFile(column_names=('Project',
                                          'Sample',
                                          'Fastq',
                                          'Size',
                                          'Nreads',
                                          'Paired_end'))
    fastqs = get_fastqs(illumina_data)
    if n_processors > 1:
        # Multiple cores
        pool = Pool(n_processors)
        results = pool.map(get_stats_for_file,fastqs)
        pool.close()
        pool.join()
    else:
        # Single core
        results = map(get_stats_for_file,fastqs)
    for fastq in results:
        stats.append(data=(fastq.project,
                           fastq.sample,
                           fastq.name,
                           bcf_utils.format_file_size(fastq.fsize),
                           fastq.nreads,
                           'Y' if illumina_data.paired_end else 'N'))
    return stats
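# Hedged usage sketch: generate and dump the statistics table. Assumes
# the IlluminaData class is importable via the IlluminaData module (as
# used elsewhere in this code); the run directory is a made-up example.
# TabFile rows print as tab-delimited lines when iterated.
illumina_data = IlluminaData.IlluminaData('/data/150325_M00879_0001_AXYZ',
                                          unaligned_dir='Unaligned')
stats = fastq_statistics(illumina_data,n_processors=4)
for line in stats:
    print line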
print "\t\t%s" % fastq # Report the names of the samples in each project if options.report: for project in illumina_data.projects: print "%s" % IlluminaData.describe_project(project) # Report statistics for fastq files if options.stats: # Print number of reads for each file, and file size for sample in project.samples: for fastq in sample.fastq: fq = os.path.join(sample.dirn, fastq) nreads = FASTQFile.nreads(fq) fsize = os.path.getsize(fq) print "%s\t%s\t%d" % ( fastq, bcf_utils.format_file_size(fsize), nreads) print "" # Summary: short report suitable for logging file if options.summary: print "%s" % IlluminaData.summarise_projects(illumina_data) # Print number of undetermined reads if options.stats and illumina_data.undetermined is not None: print "Undetermined indices" for lane in illumina_data.undetermined.samples: for fastq in lane.fastq: fq = os.path.join(lane.dirn, fastq) nreads = FASTQFile.nreads(fq) fsize = os.path.getsize(fq) print "%s\t%s\t%d" % (fastq, bcf_utils.format_file_size(fsize),
def archive(ap,archive_dir=None,platform=None,year=None,
            perms=None,group=None,include_bcl2fastq=False,
            read_only_fastqs=True,runner=None,
            final=False,force=False,dry_run=False):
    """
    Copy an analysis directory and contents to an archive area

    Copies the contents of the analysis directory to an archive
    area, which can be on a local or remote system.

    The archive directory is constructed in the form

    <TOP_DIR>/<YEAR>/<PLATFORM>/<DIR>/...

    The YEAR and PLATFORM can be overridden using the appropriate
    arguments.

    By default the data is copied to a 'staging' directory called
    '__ANALYSIS_DIR.pending' in the archive directory. The
    archiving can be finalised by setting the 'final' argument to
    'True', which performs a last update of the staging area
    before moving the data to its final location.

    Once the archive has been finalised any further archiving
    attempts will be refused.

    Copying of the data is performed using 'rsync'; multiple
    archive operations mirror the contents of the analysis
    directory (so any data removed from the source will also be
    removed from the archive).

    By default the 'bcl2fastq' directory is omitted from the
    archive, unless the fastq files in any projects are links to
    the data. Inclusion of this directory can be forced by
    setting the appropriate argument.

    The fastqs will be switched to be read-only in the archive
    by default.

    Arguments:
      ap (AutoProcessor): autoprocessor pointing to the analysis
        directory to be archived
      archive_dir (str): top level archive directory, of the
        form '[[user@]host:]dir' (if not set then use the value
        from the auto_process.ini file).
      platform (str): set the value of the <PLATFORM> level in
        the archive (if not set then taken from the supplied
        autoprocessor instance).
      year (str): set the value of the <YEAR> level in the
        archive (if not set then defaults to the current year)
        (4 digits)
      perms (str): change the permissions of the destination
        files and directories according to the supplied argument
        (e.g. 'g+w') (if not set then use the value from the
        auto_process.ini file).
      group (str): set the group of the destination files to the
        supplied argument (if not set then use the value from
        the auto_process.ini file).
      include_bcl2fastq (bool): if True then force inclusion of
        the 'bcl2fastq' subdirectory; otherwise only include it
        if fastq files in project subdirectories are symlinks.
      read_only_fastqs (bool): if True then make the fastqs
        read-only in the destination directory; otherwise keep
        the original permissions.
      runner: (optional) specify a non-default job runner to use
        for primary data rsync
      final (bool): if True then finalise the archive by moving
        the '.pending' temporary archive to the final location
      force (bool): if True then do archiving even if there are
        errors (e.g. key metadata items not set, permission
        error when setting group etc); otherwise abort archiving
        operation.
      dry_run (bool): report what would be done but don't
        perform any operations.

    Returns:
      UNIX-style integer returncode: 0 = successful termination,
      non-zero indicates an error occurred.
""" # Return value retval = 0 # Check if analysis dir is actually staging directory analysis_dir = os.path.basename(ap.analysis_dir) is_staging = False if analysis_dir.startswith("__") and analysis_dir.endswith(".pending"): logger.warning("Operating directly on staged directory") if not final: raise Exception("Cannot re-stage already staged " "analysis directory") else: is_staging = True # Fetch archive location if archive_dir is None: archive_dir = ap.settings.archive.dirn if archive_dir is None: raise Exception("No archive directory specified (use " "--archive_dir option?)") # Construct subdirectory structure i.e. platform and year if platform is None: platform = ap.metadata.platform if platform is None: raise Exception("No platform specified (use --platform " "option?)") if year is None: datestamp = str(ap.metadata.instrument_datestamp) if len(datestamp) == 6: # Assume YYMMDD datestamp format year = "20%s" % datestamp[0:2] elif len(datestamp) == 8: # Assume YYYYMMDD datestamp format year = datestamp[0:4] else: raise Exception("Invalid datestamp '%s' (use " "--year option)" % datestamp) archive_dir = os.path.join(archive_dir,year,platform) if not fileops.exists(archive_dir): raise OSError("Archive directory '%s' doesn't exist" % archive_dir) # Determine target directory if not is_staging: final_dest = analysis_dir staging = "__%s.pending" % analysis_dir else: final_dest = analysis_dir[len("__"):-len(".pending")] staging = analysis_dir if final: dest = final_dest else: dest = staging print("Copying to archive directory: %s" % archive_dir) print("Platform : %s" % platform) print("Year : %s" % year) print("Destination: %s %s" % (dest, "(final)" if final else "(staging)")) # Check if final archive already exists if fileops.exists(os.path.join(archive_dir,final_dest)): raise Exception("Final archive already exists, stopping") # Report available space on target filesystem usage = fileops.disk_usage(archive_dir) print("Available : %s/%s (%s%% in use)" % (format_file_size(usage.free), format_file_size(usage.total), usage.percent)) # Check metadata check_metadata = ap.check_metadata(('source','run_number')) if not check_metadata: if not force or not is_staging: raise Exception("Some metadata items not set, stopping") logger.warning("Some metadata items not set, proceeding") # Locate extra bcl2fastq directories extra_bcl2fastq_dirs = list() for dirn in list_dirs(ap.analysis_dir): if dirn.endswith(".bak") or dirn.startswith("save."): # Ignore continue elif dirn == os.path.basename(ap.params.unaligned_dir): continue # Try to load data from the directory try: illumina_data = IlluminaData(ap.analysis_dir, unaligned_dir=dirn) extra_bcl2fastq_dirs.append(dirn) except Exception: pass if not is_staging: # Are there any projects to archive? 
        try:
            projects = ap.get_analysis_projects()
        except Exception as ex:
            logging.warning("Error trying to fetch analysis projects: "
                            "%s" % ex)
            projects = []
        if not projects:
            if not force:
                raise Exception("No project directories found, nothing "
                                "to archive")
            # Check if there is a bcl2fastq directory instead
            unaligned_dir = ap.params.unaligned_dir
            if not os.path.isabs(unaligned_dir):
                unaligned_dir = os.path.join(ap.analysis_dir,
                                             unaligned_dir)
            if os.path.exists(unaligned_dir):
                logging.warning("No project directories found, forcing "
                                "archiving of bcl2fastq output directory "
                                "'%s' instead" % ap.params.unaligned_dir)
                include_bcl2fastq = True
            else:
                raise Exception("No project directories or bcl2fastq "
                                "directory output found, nothing to "
                                "archive (even with --force)")
    # Determine which directories to exclude
    excludes = ['--exclude=primary_data',
                '--exclude=save.*',
                '--exclude=*.bak',
                '--exclude=*.tmp',
                '--exclude=tmp.*',
                '--exclude=__*',]
    if not include_bcl2fastq:
        # Determine whether bcl2fastq dir should be included implicitly
        # because there are links from the analysis directories
        for project in projects:
            if project.fastqs_are_symlinks:
                print("Found at least one project with fastq "
                      "symlinks (%s)" % project.name)
                include_bcl2fastq = True
                break
    if not include_bcl2fastq:
        print("Excluding '%s' directory from archive" %
              ap.params.unaligned_dir)
        excludes.append('--exclude=%s' % ap.params.unaligned_dir)
    # Exclude extra bcl2fastq dirs
    for dirn in extra_bcl2fastq_dirs:
        print("Excluding '%s' directory from archive" % dirn)
        excludes.append('--exclude=%s' % dirn)
    # 10xgenomics products to exclude
    excludes.append('--exclude=*.mro')
    excludes.append('--exclude=%s*' %
                    tenx_genomics_utils.flow_cell_id(ap.run_name))
    # Log dir
    log_dir = 'archive%s' % ('_final' if final else '_staging')
    if dry_run:
        log_dir += '_dry_run'
    ap.set_log_dir(ap.get_log_subdir(log_dir))
    # Set up runner
    if runner is None:
        runner = ap.settings.runners.rsync
    runner.set_log_dir(ap.log_dir)
    # Setup a scheduler for multiple rsync jobs
    sched = simple_scheduler.SimpleScheduler(
        runner=runner,
        max_concurrent=ap.settings.general.max_concurrent_jobs,
        poll_interval=ap.settings.general.poll_interval)
    sched.start()
    # Keep track of jobs
    archiving_jobs = []
    # If making fastqs read-only then transfer them separately
    if read_only_fastqs and final:
        # Make sure excluded directories are excluded
        extra_options = [ex for ex in excludes]
        # Set up to include only the fastq directories in
        # projects
        fastq_dirs = []
        for project in projects:
            for fastq_dir in project.fastq_dirs:
                fastq_dirs.append(os.path.join(
                    os.path.basename(project.dirn),
                    fastq_dir))
        # Update the extra options with includes/excludes
        extra_options.append('--include=*/')
        for fastq_dir in fastq_dirs:
            extra_options.append('--include=%s/**' % fastq_dir)
        extra_options.append('--exclude=*')
        # Execute the rsync
        rsync_fastqs = applications.general.rsync(
            "%s/" % ap.analysis_dir,
            os.path.join(archive_dir,staging),
            prune_empty_dirs=False,
            mirror=True,
            dry_run=dry_run,
            chmod='ugo-w',
            extra_options=extra_options)
        print("Running %s" % rsync_fastqs)
        rsync_fastqs_job = sched.submit(rsync_fastqs,
                                        name="rsync.archive_fastqs")
        # Exclude fastqs from main rsync
        for fastq_dir in fastq_dirs:
            excludes.append('--exclude=%s' % fastq_dir)
        wait_for = [rsync_fastqs_job.job_name]
        # Add to list of jobs
        archiving_jobs.append(rsync_fastqs_job)
    else:
        # No separate Fastq rsync
        rsync_fastqs_job = None
        wait_for = ()
    # Main rsync command
    rsync = applications.general.rsync(
        "%s/" % ap.analysis_dir,
        os.path.join(archive_dir,staging),
        prune_empty_dirs=True,
        mirror=True,
        dry_run=dry_run,
        chmod=perms,
        extra_options=excludes)
    print("Running %s" % rsync)
    rsync_job = sched.submit(rsync,name="rsync.archive",
                             wait_for=wait_for)
    archiving_jobs.append(rsync_job)
    # Wait for jobs to complete
    rsync_job.wait()
    # Check exit status on jobs
    for job in archiving_jobs:
        print("%s completed: exit code %s" % (job.name,
                                              job.exit_code))
    retval = sum([j.exit_code for j in archiving_jobs])
    if retval != 0:
        logger.warning("One or more archiving jobs failed "
                       "(non-zero exit code returned)")
    else:
        if final:
            # Update the final stored Fastq paths for QC
            staged_analysis_dir = os.path.join(
                archive_dir,
                staging)
            archived_analysis_dir = os.path.abspath(
                os.path.join(
                    archive_dir,
                    final_dest))
            for project in AnalysisDir(staged_analysis_dir).get_projects():
                qc_info = project.qc_info(project.qc_dir)
                if qc_info.fastq_dir:
                    print("%s: updating stored Fastq directory for QC" %
                          project.name)
                    new_fastq_dir = os.path.join(archived_analysis_dir,
                                                 os.path.relpath(
                                                     qc_info.fastq_dir,
                                                     ap.analysis_dir))
                    print("-- updated Fastq directory: %s" %
                          new_fastq_dir)
                    qc_info['fastq_dir'] = new_fastq_dir
                    qc_info.save()
    # Set the group
    if group is not None:
        print("Setting group of archived files to '%s'" % group)
        if not dry_run:
            set_group = fileops.set_group_command(
                group,
                os.path.join(archive_dir,staging),
                safe=force,
                verbose=True)
            print("Running %s" % set_group)
            set_group_job = sched.submit(
                set_group,
                name="set_group.archive")
            set_group_job.wait()
            # Check exit status
            exit_code = set_group_job.exit_code
            print("%s completed: exit code %s" % (
                set_group_job.name,
                exit_code))
            if exit_code != 0:
                logger.warning("Setting group failed (non-zero "
                               "exit status code returned)")
            retval = retval + exit_code
    # Finish with scheduler
    sched.wait()
    sched.stop()
    # Bail out if there was a problem
    if retval != 0:
        if not force:
            raise Exception("Staging to archive failed")
        else:
            logger.warning("Staging to archive failed (ignored)")
    # Move to final location
    if final:
        print("Moving to final location: %s" % final_dest)
        if not dry_run:
            fileops.rename(os.path.join(archive_dir,staging),
                           os.path.join(archive_dir,final_dest))
    # Report usage of target filesystem
    usage = fileops.disk_usage(archive_dir)
    print("Usage of archive: %s available (of %s) (%s%% in use)" %
          (format_file_size(usage.free),
           format_file_size(usage.total),
           usage.percent))
    # Finish
    return retval
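# Hedged example (not from the source) of driving the archiver: stage a
# dry run first, then finalise. 'ap' is assumed to be an AutoProcessor
# instance pointed at an analysis directory; the argument values are
# illustrative, not defaults.
status = archive(ap,
                 archive_dir='/mnt/archive',  # hypothetical location
                 group='bioinf',              # hypothetical group
                 perms='g+rwX',
                 dry_run=True)
if status == 0:
    status = archive(ap,final=True)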
# Look for fastqs
fastqs = []
for data in sequencing_data:
    print "%s" % data.unaligned_dir
    fastqs.extend(get_fastqs(data,project_pattern=project_pattern,
                             sample_pattern=sample_pattern))
if not fastqs:
    logging.error("No matching Fastqs found")
    sys.exit(1)
# Report file sizes
total_size = 0
for fq in fastqs:
    fsize = os.lstat(fq).st_size
    total_size += fsize
    print "%s\t%s" % (os.path.basename(fq),
                      bcf_utils.format_file_size(fsize))
print "Total: %s" % bcf_utils.format_file_size(total_size)
# Generate MD5 checksum file
if not options.dry_run:
    tmpdir = tempfile.mkdtemp(suffix='checksums.md5',
                              dir=os.getcwd())
    md5_file = os.path.join(tmpdir,'checksums.md5')
    print "Generating MD5 sums in %s" % md5_file
    fp = open(md5_file,'w')
    for fq in fastqs:
        chksum = Md5sum.md5sum(fq)
        fp.write("%s  %s\n" % (chksum,os.path.basename(fq)))
    fp.close()
# Copy the fastqs
print "Copying fastqs"
for fq in fastqs:
total_size = 0
n_fastqs = 0
sample_names = set()
# Collect information
fastq_set = os.path.relpath(project.fastq_dir,
                            project.dirn)
print "Fastq set: %s%s" % (
    ("default" if fastq_set == "fastqs" else fastq_set),
    (" (primary)" if fastq_set == project.info.primary_fastq_dir
     else ""))
for sample_name, fastq, fq in get_fastqs(project,
                                         pattern=options.pattern):
    # File size
    fsize = os.lstat(fq).st_size
    print "%s\t%s%s\t%s" % (sample_name,
                            os.path.basename(fq),
                            ('*' if os.path.islink(fastq) else ''),
                            bcf_utils.format_file_size(fsize))
    sample_names.add(sample_name)
    total_size += fsize
    n_fastqs += 1
# Summary
print "Total:\t%s" % bcf_utils.format_file_size(total_size)
print "%d %ssamples" % (len(sample_names),
                        ('paired-end '
                         if project.info.paired_end else ''))
print "%d fastqs" % n_fastqs
sys.exit(0)
# Perform command
if cmd not in ('copy','zip','md5'):
    p.error("Unrecognised command '%s'\n" % cmd)
    sys.exit(1)
if cmd == 'copy':
# Report the names of the samples in each project
if report:
    for project in illumina_data.projects:
        print("%s" % IlluminaData.describe_project(project))
        # Report statistics for fastq files
        if args.stats:
            # Print number of reads for each file, and file size
            for sample in project.samples:
                for fastq in sample.fastq:
                    fq = os.path.join(sample.dirn,fastq)
                    nreads = FASTQFile.nreads(fq)
                    fsize = os.path.getsize(fq)
                    print("%s\t%s\t%d" %
                          (fastq,
                           bcf_utils.format_file_size(fsize),
                           nreads))
        print("")
# Summary: short report suitable for logging file
if args.summary:
    print("%s" % IlluminaData.summarise_projects(illumina_data))
# Print number of undetermined reads
if args.stats and illumina_data.undetermined is not None:
    print("Undetermined indices")
    for lane in illumina_data.undetermined.samples:
        for fastq in lane.fastq:
            fq = os.path.join(lane.dirn,fastq)
            nreads = FASTQFile.nreads(fq)
            fsize = os.path.getsize(fq)
            print("%s\t%s\t%d" %
                  (fastq,
                   bcf_utils.format_file_size(fsize),
                   nreads))
def main():
    """
    Transfer copies of Fastq data to a sharing destination
    """
    # Load configuration
    settings = Settings()
    # Collect defaults
    default_runner = settings.runners.rsync
    # Get pre-defined destinations
    destinations = [name for name in settings.destination]
    # Command line
    p = argparse.ArgumentParser(
        description="Transfer copies of Fastq data from an analysis "
        "project to an arbitrary destination for sharing with other "
        "people")
    p.add_argument('--version', action='version',
                   version=("%%(prog)s %s" % get_version()))
    p.add_argument('--subdir', action='store',
                   choices=('random_bin', 'run_id'),
                   default=None,
                   help="subdirectory naming scheme: 'random_bin' "
                   "locates a random pre-existing empty subdirectory "
                   "under the target directory; 'run_id' creates a "
                   "new subdirectory "
                   "'PLATFORM_DATESTAMP.RUN_ID-PROJECT'. If this "
                   "option is not set then no subdirectory will be "
                   "used")
    p.add_argument('--readme', action='store',
                   metavar='README_TEMPLATE', dest='readme_template',
                   help="template file to generate README file from; "
                   "can be full path to a template file, or the name "
                   "of a file in the 'templates' directory")
    p.add_argument('--weburl', action='store',
                   help="base URL for webserver (sets the value of "
                   "the WEBURL variable in the template README)")
    p.add_argument('--include_downloader', action='store_true',
                   help="copy the 'download_fastqs.py' utility to the "
                   "final location")
    p.add_argument('--include_qc_report', action='store_true',
                   help="copy the zipped QC reports to the final "
                   "location")
    p.add_argument('--include_10x_outputs', action='store_true',
                   help="copy outputs from 10xGenomics pipelines (e.g. "
                   "'cellranger count') to the final location")
    p.add_argument('--link', action='store_true',
                   help="hard link files instead of copying")
    p.add_argument('--runner', action='store',
                   help="specify the job runner to use for executing "
                   "the checksumming, Fastq copy and tar gzipping "
                   "operations (defaults to job runner defined for "
                   "copying in config file [%s])" % default_runner)
    p.add_argument('dest', action='store', metavar="DEST",
                   help="destination to copy Fastqs to; can be the "
                   "name of a destination defined in the configuration "
                   "file, or an arbitrary location of the form "
                   "'[[USER@]HOST:]DIR' (%s)" %
                   (("available destinations: %s" %
                     (','.join("'%s'" % d
                               for d in sorted(destinations))))
                    if destinations else
                    "no destinations currently defined"))
    p.add_argument('project', action='store', metavar="PROJECT",
                   help="path to project directory (or to a Fastqs "
                   "subdirectory in a project) to copy Fastqs from")
    # Process command line
    args = p.parse_args()
    # Check if target is pre-defined destination
    if args.dest in destinations:
        print("Loading settings for destination '%s'" % args.dest)
        dest = settings.destination[args.dest]
        target_dir = dest.directory
        readme_template = dest.readme_template
        subdir = dest.subdir
        include_downloader = dest.include_downloader
        include_qc_report = dest.include_qc_report
        hard_links = dest.hard_links
        weburl = dest.url
    else:
        target_dir = args.dest
        readme_template = None
        subdir = None
        include_downloader = False
        include_qc_report = False
        hard_links = False
        weburl = None
    # Update defaults with command line values
    if args.readme_template:
        readme_template = args.readme_template
    if args.subdir:
        subdir = args.subdir
    if args.include_downloader:
        include_downloader = True
    if args.include_qc_report:
        include_qc_report = True
    if args.weburl:
        weburl = args.weburl
    if args.link:
        hard_links = args.link
    # Sort out project directory
    project = AnalysisProject(args.project)
    if not project.is_analysis_dir:
        # Assume it's the Fastq dir
        fastq_dir = os.path.basename(args.project)
        project = AnalysisProject(os.path.dirname(args.project))
    else:
        fastq_dir = None
    if not project.is_analysis_dir:
        logger.error("'%s': project not found" % args.project)
        return 1
    project_name = project.name
    # Parent analysis directory
    analysis_dir = AnalysisDir(os.path.dirname(project.dirn))
    # Fastqs directory
    try:
        project.use_fastq_dir(fastq_dir)
    except Exception as ex:
        logger.error("'%s': failed to load Fastq set '%s': %s" %
                     (project.name, fastq_dir, ex))
        return 1
    # Report
    print("Transferring data from '%s' (%s)" % (project.name,
                                                project.dirn))
    print("Fastqs in %s" % project.fastq_dir)
    # Summarise samples and Fastqs
    samples = set()
    nfastqs = 0
    fsize = 0
    for sample in project.samples:
        samples.add(sample.name)
        for fq in sample.fastq:
            fsize += os.lstat(fq).st_size
            nfastqs += 1
    nsamples = len(samples)
    dataset = "%s%s dataset" % ("%s " % project.info.single_cell_platform
                                if project.info.single_cell_platform
                                else '',
                                project.info.library_type)
    endedness = "paired-end" if project.info.paired_end \
                else "single-end"
    print("%s with %d Fastqs from %d %s sample%s totalling %s" %
          (dataset, nfastqs, nsamples, endedness,
           's' if nsamples != 1 else '',
           format_file_size(fsize)))
    # Check target dir
    if not Location(target_dir).is_remote:
        target_dir = os.path.abspath(target_dir)
    if not exists(target_dir):
        logger.error("'%s': target directory not found" % target_dir)
        return 1
    else:
        print("Target directory %s" % target_dir)
    # Locate downloader
    if include_downloader:
        print("Locating downloader for inclusion")
        downloader = find_program("download_fastqs.py")
        if downloader is None:
            logging.error("Unable to locate download_fastqs.py")
            return 1
        print("... found %s" % downloader)
    else:
        downloader = None
    # Locate zipped QC report
    if include_qc_report:
        print("Locating zipped QC reports for inclusion")
        qc_zips = list()
        # Check QC directories and look for zipped reports
        for qc_dir in project.qc_dirs:
            # Get the associated Fastq set
            # NB only compare the basename of the Fastq dir
            # in case full paths weren't updated
            fq_set = os.path.basename(project.qc_info(qc_dir).fastq_dir)
            if fq_set == os.path.basename(project.fastq_dir):
                for qc_base in ("%s_report.%s.%s" %
                                (qc_dir, project.name,
                                 project.info.run),
                                "%s_report.%s.%s" %
                                (qc_dir, project.name,
                                 os.path.basename(
                                     analysis_dir.analysis_dir)),):
                    qc_zip = os.path.join(project.dirn,
                                          "%s.zip" % qc_base)
                    if os.path.exists(qc_zip):
                        print("... found %s" % qc_zip)
                        qc_zips.append(qc_zip)
        if not qc_zips:
            logger.error("No zipped QC reports found")
            return 1
    else:
        qc_zips = None
    # Locate 10xGenomics outputs
    if args.include_10x_outputs:
        print("Locating outputs from 10xGenomics pipelines for "
              "inclusion")
        cellranger_dirs = list()
        for d in ('cellranger_count',
                  'cellranger_multi',):
            cellranger_dir = os.path.join(project.dirn, d)
            if os.path.isdir(cellranger_dir):
                print("... found %s" % cellranger_dir)
                cellranger_dirs.append(cellranger_dir)
        if not cellranger_dirs:
            logger.error("No outputs from 10xGenomics pipelines found")
            return 1
    else:
        cellranger_dirs = None
    # Determine subdirectory
    if subdir == "random_bin":
        # Find a random empty directory under the
        # target directory
        print("Locating random empty bin")
        subdirs = [d for d in os.listdir(target_dir)
                   if os.path.isdir(os.path.join(target_dir, d))]
        if not subdirs:
            logger.error("Failed to locate subdirectories")
            return 1
        shuffle(subdirs)
        subdir = None
        for d in subdirs:
            if not os.listdir(os.path.join(target_dir, d)):
                # Empty bin
                subdir = d
                break
        if subdir is None:
            logger.error("Failed to locate empty subdirectory")
            return 1
        print("... found '%s'" % subdir)
        # Update target dir
        target_dir = os.path.join(target_dir, subdir)
    elif subdir == "run_id":
        # Construct subdirectory name based on the
        # run ID
        subdir = "{platform}_{datestamp}.{run_number}-{project}".format(
            platform=analysis_dir.metadata.platform.upper(),
            datestamp=analysis_dir.metadata.instrument_datestamp,
            run_number=analysis_dir.metadata.run_number,
            project=project.name)
        # Check it doesn't already exist
        if exists(os.path.join(target_dir, subdir)):
            logger.error("'%s': subdirectory already exists" % subdir)
            return 1
        print("Using subdirectory '%s'" % subdir)
        # Update target dir
        target_dir = os.path.join(target_dir, subdir)
    # Make target directory
    if not exists(target_dir):
        mkdir(target_dir)
    # Get runner for copy job
    if args.runner:
        runner = fetch_runner(args.runner)
    else:
        runner = default_runner
    # Set identifier for jobs
    job_id = "%s%s" % (project_name,
                       (".%s" % fastq_dir
                        if fastq_dir is not None else ''))
    # Set the working directory
    working_dir = os.path.abspath("transfer.%s.%s" %
                                  (job_id, int(time.time())))
    mkdir(working_dir)
    print("Created working dir %s" % working_dir)
    # Construct the README
    if readme_template:
        # Check that template file exists
        print("Locating README template")
        template = None
        for filen in (readme_template,
                      os.path.join(get_templates_dir(),
                                   readme_template),):
            if os.path.exists(filen):
                template = filen
                break
        if template is None:
            logger.error("'%s': template file not found" %
                         readme_template)
            return 1
        else:
            readme_template = template
        print("... found %s" % readme_template)
        # Read in template
        with open(readme_template, 'rt') as fp:
            readme = fp.read()
        # Substitute template variables
        template_vars = {
            'PLATFORM': analysis_dir.metadata.platform.upper(),
            'RUN_NUMBER': analysis_dir.metadata.run_number,
            'DATESTAMP': analysis_dir.metadata.instrument_datestamp,
            'PROJECT': project_name,
            'WEBURL': weburl,
            'BIN': subdir,
            'DIR': target_dir,
            'TODAY': date.today().strftime("%d/%m/%Y"),
        }
        for var in template_vars:
            value = template_vars[var]
            if value is None:
                value = '?'
            else:
                value = str(value)
            readme = re.sub(r"%{var}%".format(var=var),
                            value,
                            readme)
        # Write out a temporary README file
        readme_file = os.path.join(working_dir, "README")
        with open(readme_file, 'wt') as fp:
            fp.write(readme)
    else:
        # No README
        readme_file = None
    # Start a scheduler to run jobs
    sched = SimpleScheduler(runner=runner,
                            reporter=TransferDataSchedulerReporter(),
                            poll_interval=settings.general.poll_interval)
    sched.start()
    # Build command to run manage_fastqs.py
    copy_cmd = Command("manage_fastqs.py")
    if hard_links:
        copy_cmd.add_args("--link")
    copy_cmd.add_args(analysis_dir.analysis_dir,
                      project_name)
    if fastq_dir is not None:
        copy_cmd.add_args(fastq_dir)
    copy_cmd.add_args("copy", target_dir)
    print("Running %s" % copy_cmd)
    copy_job = sched.submit(copy_cmd.command_line,
                            name="copy.%s" % job_id,
                            wd=working_dir)
    # Copy README
    if readme_file is not None:
        print("Copying README file")
        copy_cmd = copy_command(readme_file,
                                os.path.join(target_dir, "README"))
        sched.submit(copy_cmd.command_line,
                     name="copy.%s.readme" % job_id,
                     runner=SimpleJobRunner(),
                     wd=working_dir)
    # Copy download_fastqs.py
    if downloader:
        print("Copying downloader")
        copy_cmd = copy_command(
            downloader,
            os.path.join(target_dir,
                         os.path.basename(downloader)))
        sched.submit(copy_cmd.command_line,
                     name="copy.%s.downloader" % job_id,
                     runner=SimpleJobRunner(),
                     wd=working_dir)
    # Copy QC reports
    if qc_zips:
        for qc_zip in qc_zips:
            print("Copying '%s'" % os.path.basename(qc_zip))
            copy_cmd = copy_command(qc_zip,
                                    os.path.join(
                                        target_dir,
                                        os.path.basename(qc_zip)),
                                    link=hard_links)
            sched.submit(copy_cmd.command_line,
                         name="copy.%s.%s" % (job_id,
                                              os.path.basename(qc_zip)),
                         runner=SimpleJobRunner(),
                         wd=working_dir)
    # Tar and copy 10xGenomics outputs
    if cellranger_dirs:
        for cellranger_dir in cellranger_dirs:
            print("Tar gzipping and copying '%s'" %
                  os.path.basename(cellranger_dir))
            # Tar & gzip data
            targz = os.path.join(working_dir,
                                 "%s.%s.%s.tgz" %
                                 (os.path.basename(cellranger_dir),
                                  project_name,
                                  project.info.run))
            targz_cmd = Command("tar", "czvhf", targz,
                                "-C", os.path.dirname(cellranger_dir),
                                os.path.basename(cellranger_dir))
            print("Running %s" % targz_cmd)
            targz_job = sched.submit(
                targz_cmd.command_line,
                name="targz.%s.%s" % (job_id,
                                      os.path.basename(cellranger_dir)),
                wd=working_dir)
            # Copy the targz file
            copy_cmd = copy_command(
                targz,
                os.path.join(target_dir,
                             os.path.basename(targz)))
            print("Running %s" % copy_cmd)
            copy_job = sched.submit(
                copy_cmd.command_line,
                name="copytgz.%s.%s" % (job_id,
                                        os.path.basename(cellranger_dir)),
                runner=SimpleJobRunner(),
                wd=working_dir,
                wait_for=(targz_job.job_name,))
    # Wait for scheduler jobs to complete
    sched.wait()
    # Check exit code for Fastq copying
    exit_code = copy_job.exit_code
    if exit_code != 0:
        logger.error("File copy exited with an error")
        return exit_code
    else:
        print("Files now at %s" % target_dir)
        if weburl:
            url = weburl
            if subdir is not None:
                url = os.path.join(url, subdir)
            print("URL: %s" % url)
        print("Done")
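# Example invocation (a sketch: the script name and paths below are
# assumptions, but the options come from the argparse setup above):
#
#   python transfer_fastqs.py --subdir=run_id --include_qc_report \
#       webserver /data/200101_NB500968_0001_AXYZ_analysis/AB
#
# where 'webserver' would be a destination defined in the configuration
# file, or an arbitrary '[[USER@]HOST:]DIR' location.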
print "\t\t%s" % fastq # Report the names of the samples in each project if options.report: for project in illumina_data.projects: print "%s" % IlluminaData.describe_project(project) # Report statistics for fastq files if options.stats: # Print number of reads for each file, and file size for sample in project.samples: for fastq in sample.fastq: fq = os.path.join(sample.dirn,fastq) nreads = FASTQFile.nreads(fq) fsize = os.path.getsize(fq) print "%s\t%s\t%d" % (fastq, bcf_utils.format_file_size(fsize), nreads) print "" # Summary: short report suitable for logging file if options.summary: print "%s" % IlluminaData.summarise_projects(illumina_data) # Print number of undetermined reads if options.stats and illumina_data.undetermined is not None: print "Undetermined indices" for lane in illumina_data.undetermined.samples: for fastq in lane.fastq: fq = os.path.join(lane.dirn,fastq) nreads = FASTQFile.nreads(fq) fsize = os.path.getsize(fq)
def list_files(datadir,extensions=None,owners=None,groups=None,
               compression=None,subdir=None,sort_keys=None,
               min_size=None,
               fields=('owner','group','relpath','size'),
               delimiter='\t'):
    """
    Report files owned by specific users and/or groups

    'fields' is a list of attributes to display for each file,
    in the specified order. The available fields are:

    'owner'   - User who owns the file
    'group'   - Group the file belongs to
    'path'    - Full path
    'relpath' - Relative path
    'size'    - File size (human readable)

    """
    # Check the fields
    for field in fields:
        if field not in ('owner','group','path','relpath','size',):
            raise Exception("Unrecognised field: '%s'" % field)
    # Collect files and report
    nfiles = 0
    total_size = 0
    if min_size:
        min_size = convert_size(min_size)
    for f in DataDir(datadir).files(extensions=extensions,
                                    compression=compression,
                                    owners=owners,
                                    groups=groups,
                                    subdir=subdir,
                                    sort_keys=sort_keys):
        if min_size and f.size < min_size:
            continue
        total_size += f.size
        nfiles += 1
        # Assemble line from fields
        line = []
        for field in fields:
            if field == 'owner':
                line.append(f.user)
            elif field == 'group':
                line.append(f.group)
            elif field == 'path':
                line.append("%s%s" % (f.path,f.classifier))
            elif field == 'relpath':
                line.append("%s%s" % (f.relpath(datadir),f.classifier))
            elif field == 'size':
                line.append(utils.format_file_size(f.size))
        print delimiter.join([str(x) for x in line])
    if not nfiles:
        print "No files found"
        return
    print "%d found, total size: %s" % (nfiles,
                                        utils.format_file_size(total_size))
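# Illustrative call (the path, user and size threshold are made up):
# list gzipped fastq files larger than 1G owned by one user, showing
# the default fields. Assumes convert_size() accepts human-readable
# sizes such as '1G'.
list_files('/mnt/data/runs',
           extensions=['fastq'],
           compression=['gz'],
           owners=['jsmith'],
           min_size='1G')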
    for project in unassigned:
        print("%s: %s" % (project.name, project.info.run))
    sys.exit(0)
# Report if no PIs were found
if len(pi_list) == 0:
    print("No projects assigned to PIs found")
    sys.exit(0)
# Report PIs, projects etc
print("Summary (PI, # of projects, total usage):")
print("=========================================")
total_projects = 0
total_size = 0
for pi in pi_list:
    n_projects = len(audit_data[pi])
    size = sum([p[1] for p in audit_data[pi]])
    print("%s\t%d\t%s" % (pi, n_projects,
                          utils.format_file_size(size)))
    total_projects += n_projects
    total_size += size
print("Total usage\t%d\t%s" % (total_projects,
                               utils.format_file_size(total_size)))
print("\nBreakdown by PI/project:")
print("========================")
for pi in pi_list:
    print("%s:" % pi)
    for project, size in audit_data[pi]:
        print("\t%s:\t%s\t%s" % (project.info.run,
                                 project.name,
                                 utils.format_file_size(size)))
if undetermined:
    print("\nUsage for 'undetermined' reads:")
    print("===============================")
def _get_data(self, filen=None):
    """
    Collect statistics for FASTQ outputs from an Illumina run
    """
    # Collect FASTQ files
    fastqstats = []
    for project in self._illumina_data.projects:
        for sample in project.samples:
            for fastq in sample.fastq:
                fastqstats.append(
                    FastqStats(os.path.join(sample.dirn, fastq),
                               project.name,
                               sample.name))
    # Gather same information for undetermined reads (if present)
    if self._illumina_data.undetermined is not None:
        for lane in self._illumina_data.undetermined.samples:
            for fastq in lane.fastq:
                fastqstats.append(
                    FastqStats(os.path.join(lane.dirn, fastq),
                               self._illumina_data.undetermined.name,
                               lane.name))
    # Collect the data for each file
    if self._n_processors > 1:
        # Multiple cores
        pool = Pool(self._n_processors)
        results = pool.map(collect_fastq_data, fastqstats)
        pool.close()
        pool.join()
    else:
        # Single core
        results = map(collect_fastq_data, fastqstats)
    # Set up tabfile to hold pre-existing data
    if filen is not None:
        existing_stats = TabFile(filen, first_line_is_header=True)
    else:
        existing_stats = None
    # Set up class to hold all collected data
    self._stats = TabFile(column_names=('Project',
                                        'Sample',
                                        'Fastq',
                                        'Size',
                                        'Nreads',
                                        'Paired_end',
                                        'Read_number'))
    # Split result sets into R1 and R2
    results_r1 = filter(lambda f: f.read_number == 1, results)
    results_r2 = filter(lambda f: f.read_number == 2, results)
    # Determine which lanes are present and append
    # columns for each
    lanes = set()
    for fastq in results_r1:
        logger.debug("-- %s: lanes %s" %
                     (fastq.name,
                      ','.join([str(l) for l in fastq.lanes])))
        for lane in fastq.lanes:
            lanes.add(lane)
    # Add lane numbers from pre-existing stats file
    if existing_stats is not None:
        for c in existing_stats.header():
            if c.startswith('L'):
                lanes.add(int(c[1:]))
    self._lanes = sorted(list(lanes))
    logger.debug("Lanes found: %s" %
                 ','.join([str(l) for l in self._lanes]))
    for lane in self._lanes:
        self._stats.appendColumn("L%s" % lane)
    # Copy pre-existing stats into new tabfile
    if existing_stats:
        for line in existing_stats:
            data = [line['Project'],
                    line['Sample'],
                    line['Fastq'],
                    line['Size'],
                    line['Nreads'],
                    line['Paired_end'],
                    line['Read_number']]
            for lane in lanes:
                try:
                    data.append(line["L%s" % lane])
                except:
                    data.append('')
            self._stats.append(data=data)
    # Copy reads per lane from R1 FASTQs into R2
    for r2_fastq in results_r2:
        # Get corresponding R1 name
        logger.debug("-- Fastq R2: %s" % r2_fastq.name)
        r1_fastq_name = IlluminaFastq(r2_fastq.name)
        r1_fastq_name.read_number = 1
        r1_fastq_name = str(r1_fastq_name)
        logger.debug("-- -> R1: %s" % r1_fastq_name)
        # Locate corresponding data
        r1_fastq = filter(lambda f: f.name.startswith(r1_fastq_name),
                          results_r1)[0]
        r2_fastq.reads_by_lane = dict(r1_fastq.reads_by_lane)
    # Write the data into the tabfile
    paired_end = ('Y' if self._illumina_data.paired_end else 'N')
    for fastq in results:
        # Check for existing entry
        existing_entry = False
        for line in self._stats:
            if (line['Project'] == fastq.project and
                line['Sample'] == fastq.sample and
                line['Fastq'] == fastq.name):
                # Overwrite the existing entry
                existing_entry = True
                break
        # Write the data
        if not existing_entry:
            # Append new entry
            data = [fastq.project,
                    fastq.sample,
                    fastq.name,
                    bcf_utils.format_file_size(fastq.fsize),
                    fastq.nreads,
                    paired_end,
                    fastq.read_number]
            for lane in lanes:
                try:
                    data.append(fastq.reads_by_lane[lane])
                except:
                    data.append('')
            self._stats.append(data=data)
        else:
            # Overwrite existing entry
            logging.warning("Overwriting existing entry for "
                            "%s/%s/%s" % (fastq.project,
                                          fastq.sample,
                                          fastq.name))
            line['Size'] = bcf_utils.format_file_size(fastq.fsize)
            line['Nreads'] = fastq.nreads
            line['Paired_end'] = paired_end
            line['Read_number'] = fastq.read_number
            for lane in lanes:
                lane_name = "L%d" % lane
                try:
                    line[lane_name] = fastq.reads_by_lane[lane]
                except:
                    line[lane_name] = ''
print "%s: %s" % (project.name,project.info.run) sys.exit(0) # Report if no PIs were found if len(pi_list) == 0: print "No projects assigned to PIs found" sys.exit(0) # Report PIs, projects etc print "Summary (PI, # of projects, total usage):" print "=========================================" total_projects = 0 total_size = 0 for pi in pi_list: n_projects = len(audit_data[pi]) size = sum([p[1] for p in audit_data[pi]]) print "%s\t%d\t%s" % (pi,n_projects, utils.format_file_size(size)) total_projects += n_projects total_size += size print "Total usage\t%d\t%s" % (total_projects, utils.format_file_size(total_size)) print "\nBreakdown by PI/project:" print "========================" for pi in pi_list: print "%s:" % pi for project,size in audit_data[pi]: print "\t%s:\t%s\t%s" % (project.info.run,project.name, utils.format_file_size(size)) if undetermined: print "\nUsage for 'undetermined' reads:" print "===============================" total_size = 0