Example #1
 def info(self):
     """
     Report information about the directory 
     """
     # Report information
     print "Dir   : %s" % self._dirn
     print "Size  : %s (%s)" % (utils.format_file_size(
         self.size), utils.format_file_size(self.size, 'K'))
     print "Has cache: %s" % print_yes_no(self.has_cache)
     print "#files: %d" % len(self)
     print "File types: %s" % print_list(self.extensions)
     print "Compression types: %s" % print_list(self.compression)
     print "Users : %s" % print_list(self.users)
     print "Groups: %s" % print_list(self.groups)
     print "Oldest: %s %s" % (self.oldest.datetime.ctime(),
                              self.oldest.relpath(self._dirn))
     print "Newest: %s %s" % (self.newest.datetime.ctime(),
                              self.newest.relpath(self._dirn))
     # Top-level subdirectories
     print "Top-level subdirectories:"
     print "# Dir\tFiles\tSize\tFile types\tUsers\tPerms"
     for subdir in utils.list_dirs(self._dirn):
         sd = DataDir(os.path.join(self._dirn, subdir),
                      files=self.files(subdir=subdir))
         print "- %s/\t%d\t%s\t%s\t%s\t%s" % (
             subdir, len(sd), utils.format_file_size(
                 sd.size), print_list(sd.extensions), print_list(sd.users),
             print_perms(sd.usr_unreadable, sd.grp_unreadable,
                         sd.grp_unwritable))
     # File permissions
     print "File permissions:"
     print "- unreadable by owner: %s" % print_yes_no(self.usr_unreadable)
     print "- unreadable by group: %s" % print_yes_no(self.grp_unreadable)
     print "- unwritable by group: %s" % print_yes_no(self.grp_unwritable)
     print "#Temp files: %d" % len(self.list_temp())
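The info() method above leans heavily on utils.format_file_size() to render byte counts. Below is a minimal, self-contained sketch of such a helper; the auto-scaling behaviour and the optional forced unit (e.g. 'K') are assumptions based on how it is called here, not the library implementation.

def format_file_size(fsize, units=None):
    """Return a human-readable file size; 'units' optionally forces 'K', 'M', 'G' or 'T'."""
    size = float(fsize)
    for unit in ('b', 'K', 'M', 'G', 'T'):
        if unit == units or (units is None and size < 1024.0):
            return "%.1f%s" % (size, unit)
        size = size / 1024.0
    return "%.1fP" % size

# e.g. format_file_size(3400000) -> '3.2M'; format_file_size(3400000, 'K') -> '3320.3K'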
Example #2
def get_stats_for_file(fq,read_counter=FastqReadCounter.zcat_wc):
    """Generate statistics for a single fastq file

    Given a FastqStats object, set the 'nreads' property to
    the number of reads and the 'fsize' property to the file
    size for the corresponding fastq file.

    Arguments:
      fq: FastqStats object with 'fastq' property set to the
        full path for a Fastq file
      read_counter: optional, specify function to use for
        counting reads in the fastq file

    Returns:
      Input FastqStats object with the 'nreads' and 'fsize'
      properties set.

    """
    print "* %s: starting" % fq.name
    start_time = time.time()
    sys.stdout.flush()
    fq.nreads = read_counter(fq.fastq)
    fq.fsize = os.path.getsize(fq.fastq)
    print "- %s: finished" % fq.name
    end_time = time.time()
    print "- %s: %d reads, %s" % (fq.name,
                                  fq.nreads,
                                  bcf_utils.format_file_size(fq.fsize))
    print "- %s: took %.2fs" % (fq.name,(end_time-start_time))
    return fq
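get_stats_for_file() defaults to FastqReadCounter.zcat_wc for counting reads, which (judging by its name) counts lines via 'zcat ... | wc -l'. A hedged pure-Python sketch of the same idea is below; it assumes only that a FASTQ record occupies exactly four lines.

import gzip

def zcat_wc(fastq):
    """Count reads in a (possibly gzipped) FASTQ file: one read per four lines."""
    opener = gzip.open if fastq.endswith('.gz') else open
    nlines = 0
    with opener(fastq, 'rt') as fp:
        for _ in fp:
            nlines += 1
    return nlines // 4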
Example #3
def find_tmp_files(datadir):
    """
    Report temporary files/directories

    """
    nfiles = 0
    total_size = 0
    for f in DataDir(datadir).list_temp():
        size = get_size(f)
        total_size += size
        nfiles += 1
        print "%s\t%s" % (os.path.relpath(
            f, datadir), utils.format_file_size(size))
    if not nfiles:
        print "No files or directories found"
        return
    print "%d found, total size: %s" % (nfiles,
                                        utils.format_file_size(total_size))
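find_tmp_files() relies on a get_size() helper which has to cope with list_temp() returning directories as well as files. A plausible sketch (an assumption, not the original helper):

import os

def get_size(path):
    """Return the size of a file, or the summed size of all files under a directory."""
    if not os.path.isdir(path):
        return os.lstat(path).st_size
    total = 0
    for dirpath, dirnames, filenames in os.walk(path):
        for name in filenames:
            total += os.lstat(os.path.join(dirpath, name)).st_size
    return total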
Example #4
def collect_fastq_data(fqstats):
    """
    Collect data from FASTQ file in a FastqStats instance

    Given a FastqStats instance, collects and sets the
    following properties derived from the corresponding
    FASTQ file stored in that instance:

    - nreads: total number of reads
    - fsize: file size
    - reads_by_lane: (R1 FASTQs only) dictionary
      where keys are lane numbers and values are
      read counts

    Note that if the FASTQ file is an R2 file then the
    reads per lane will not be set.

    Arguments:
      fqstats (FastqStats): FastqStats instance

    Returns:
      FastqStats: input FastqStats instance with the
        appropriate properties updated.
    """
    fqs = fqstats
    fastq = fqs.fastq
    fastq_name = fqs.name
    print "* %s: starting" % fastq_name
    start_time = time.time()
    sys.stdout.flush()
    if fqs.read_number == 1:
        # Do full processing for R1 fastqs
        lane = IlluminaFastq(fastq_name).lane_number
        if lane is not None:
            # Lane number is in file name
            fqs.reads_by_lane[lane] = \
                FastqReadCounter.zcat_wc(fastq)
        else:
            # Need to get lane(s) from read headers
            fqs.reads_by_lane = \
                FastqReadCounter.reads_per_lane(fastq)
        # Store total reads
        fqs.nreads = sum([fqs.reads_by_lane[x]
                          for x in fqs.lanes])
    else:
        # Only get total reads for R2 fastqs
        fqs.nreads = FastqReadCounter.zcat_wc(fastq)
    fqs.fsize = os.path.getsize(fastq)
    print "- %s: finished" % fastq_name
    end_time = time.time()
    print "- %s: %d reads, %s" % (fastq_name,
                                  fqs.nreads,
                                  bcf_utils.format_file_size(fqs.fsize))
    print "- %s: took %.2fs" % (fastq_name,(end_time-start_time))
    return fqs
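When the lane number cannot be parsed from the file name, collect_fastq_data() falls back to FastqReadCounter.reads_per_lane, which must derive lane numbers from the read headers. The sketch below illustrates that idea under the assumption of Casava/bcl2fastq-style headers; the real implementation may differ.

import gzip
from collections import defaultdict

def reads_per_lane(fastq):
    """Map lane number -> read count, parsed from FASTQ header lines.

    Assumes headers of the form '@INSTRUMENT:RUN:FLOWCELL:LANE:TILE:X:Y ...',
    where the lane is the fourth colon-separated field.
    """
    opener = gzip.open if fastq.endswith('.gz') else open
    counts = defaultdict(int)
    with opener(fastq, 'rt') as fp:
        for i, line in enumerate(fp):
            if i % 4 == 0:  # header line of each four-line record
                counts[int(line.split(':')[3])] += 1
    return dict(counts)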
Example #5
 def info(self):
     """
     Report information about the directory 
     """
     # Report information
     print "Dir   : %s" % self._dirn
     print "Size  : %s (%s)" % (utils.format_file_size(self.size),
                                utils.format_file_size(self.size,'K'))
     print "Has cache: %s" % print_yes_no(self.has_cache)
     print "#files: %d" % len(self)
     print "File types: %s" % print_list(self.extensions)
     print "Compression types: %s" % print_list(self.compression)
     print "Users : %s" % print_list(self.users)
     print "Groups: %s" % print_list(self.groups)
     print "Oldest: %s %s" % (self.oldest.datetime.ctime(),self.oldest.relpath(self._dirn))
     print "Newest: %s %s" % (self.newest.datetime.ctime(),self.newest.relpath(self._dirn))
     # Top-level subdirectories
     print "Top-level subdirectories:"
     print "# Dir\tFiles\tSize\tFile types\tUsers\tPerms"
     for subdir in utils.list_dirs(self._dirn):
         sd = DataDir(os.path.join(self._dirn,subdir),
                      files=self.files(subdir=subdir))
         print "- %s/\t%d\t%s\t%s\t%s\t%s" % (subdir,
                                              len(sd),
                                              utils.format_file_size(sd.size),
                                              print_list(sd.extensions),
                                              print_list(sd.users),
                                              print_perms(sd.usr_unreadable,
                                                          sd.grp_unreadable,
                                                          sd.grp_unwritable))
     # File permissions
     print "File permissions:"
     print "- unreadable by owner: %s" % print_yes_no(self.usr_unreadable)
     print "- unreadable by group: %s" % print_yes_no(self.grp_unreadable)
     print "- unwritable by group: %s" % print_yes_no(self.grp_unwritable)
     print "#Temp files: %d" % len(self.list_temp())
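Both versions of info() above use small formatting helpers (print_yes_no, print_list, print_perms) that are not shown in these examples. The sketches below capture plausible behaviour and are assumptions, not the original code.

def print_yes_no(flag):
    """Render a boolean as 'yes' or 'no'."""
    return "yes" if flag else "no"

def print_list(items):
    """Render an iterable as a sorted, comma-separated string."""
    return ', '.join(sorted(str(x) for x in items))

def print_perms(usr_unreadable, grp_unreadable, grp_unwritable):
    """Summarise permission problems compactly; '-' means no problem found."""
    return ''.join(['u' if usr_unreadable else '-',
                    'r' if grp_unreadable else '-',
                    'w' if grp_unwritable else '-'])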
Example #6
def fastq_statistics(illumina_data,n_processors=1):
    """Generate statistics for fastq outputs from an Illumina run

    Given a directory with fastq(.gz) files arranged in the same
    structure as the output from bcl2fastq (i.e. subdirectory
    'Unaligned', then project directories within this called
    'Project_<NAME>', each containing sample directories called
    'Sample_<NAME>', and each of these containing fastq files),
    generate statistics for each file.

    Arguments:
      illumina_data: populated IlluminaData object describing the
        run.
      n_processors: number of processors to use (if >1 then uses
        the multiprocessing library to run the statistics gathering
        using multiple cores).

    Returns:
      Populated TabFile object containing the statistics.

    """
    stats = TabFile.TabFile(column_names=('Project',
                                          'Sample',
                                          'Fastq',
                                          'Size',
                                          'Nreads',
                                          'Paired_end'))
    fastqs = get_fastqs(illumina_data)
    if n_processors > 1:
        # Multiple cores
        pool = Pool(n_processors)
        results = pool.map(get_stats_for_file,fastqs)
        pool.close()
        pool.join()
    else:
        # Single core
        results = map(get_stats_for_file,fastqs)
    for fastq in results:
        stats.append(data=(fastq.project,
                           fastq.sample,
                           fastq.name,
                           bcf_utils.format_file_size(fastq.fsize),
                           fastq.nreads,
                           'Y' if illumina_data.paired_end else 'N'))
    return stats
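A hypothetical driver for fastq_statistics(): the paths are made up, and the iteration and column access on the returned TabFile mirror how those objects are used elsewhere in these examples.

# Illustrative only; IlluminaData and TabFile come from the same
# library as the examples above.
illumina_data = IlluminaData('/data/200101_NB500968_0001_AHXXXXBGXX_analysis',
                             unaligned_dir='Unaligned')
stats = fastq_statistics(illumina_data, n_processors=4)
for line in stats:
    print("%s/%s: %s reads (%s)" % (line['Project'], line['Sample'],
                                    line['Nreads'], line['Size']))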
Example #7
                    print "\t\t%s" % fastq

    # Report the names of the samples in each project
    if options.report:
        for project in illumina_data.projects:
            print "%s" % IlluminaData.describe_project(project)
            # Report statistics for fastq files
            if options.stats:
                # Print number of reads for each file, and file size
                for sample in project.samples:
                    for fastq in sample.fastq:
                        fq = os.path.join(sample.dirn, fastq)
                        nreads = FASTQFile.nreads(fq)
                        fsize = os.path.getsize(fq)
                        print "%s\t%s\t%d" % (
                            fastq, bcf_utils.format_file_size(fsize), nreads)
            print ""

    # Summary: short report suitable for logging file
    if options.summary:
        print "%s" % IlluminaData.summarise_projects(illumina_data)

    # Print number of undetermined reads
    if options.stats and illumina_data.undetermined is not None:
        print "Undetermined indices"
        for lane in illumina_data.undetermined.samples:
            for fastq in lane.fastq:
                fq = os.path.join(lane.dirn, fastq)
                nreads = FASTQFile.nreads(fq)
                fsize = os.path.getsize(fq)
                print "%s\t%s\t%d" % (fastq, bcf_utils.format_file_size(fsize),
                                      nreads)
Example #8
def archive(ap,archive_dir=None,platform=None,year=None,
            perms=None,group=None,include_bcl2fastq=False,
            read_only_fastqs=True,runner=None,
            final=False,force=False,dry_run=False):
    """
    Copy an analysis directory and contents to an archive area

    Copies the contents of the analysis directory to an archive
    area, which can be on a local or remote system.

    The archive directory is constructed in the form

    <TOP_DIR>/<YEAR>/<PLATFORM>/<DIR>/...

    The YEAR and PLATFORM can be overridden using the appropriate
    arguments.

    By default the data is copied to a 'staging' directory
    called '__ANALYSIS_DIR.pending' in the archive directory.
    The archiving can be finalised by setting the 'final'
    argument to 'True', which performs a last update of the
    staging area before moving the data to its final location.

    Once the archive has been finalised any further archiving
    attempts will be refused.

    Copying of the data is performed using 'rsync'; multiple
    archive operations mirror the contents of the analysis
    directory (so any data removed from the source will also
    be removed from the archive).

    By default the 'bcl2fastq' directory is omitted from the
    archive, unless the fastq files in any projects are links to
    the data. Inclusion of this directory can be forced by
    setting the appropriate argument.

    The fastqs will be switched to be read-only in the archive
    by default.

    Arguments:
      ap (AutoProcessor): autoprocessor pointing to the
        analysis directory to be archived
      archive_dir (str): top level archive directory, of the
        form '[[user@]host:]dir' (if not set then use the value
        from the auto_process.ini file).
      platform (str): set the value of the <PLATFORM> level in
        the archive (if not set then taken from the supplied
        autoprocessor instance).
      year (str): set the value of the <YEAR> level in the
        archive as a 4-digit year (if not set then it is
        derived from the run's instrument datestamp)
      perms (str): change the permissions of the destination
        files and directories according to the supplied
        argument (e.g. 'g+w') (if not set then use the value
        from the auto_process.ini file).
      group (str): set the group of the destination files to
        the supplied argument (if not set then use the value
        from the auto_process.ini file).
      include_bcl2fastq (bool): if True then force inclusion
        of the 'bcl2fastq' subdirectory; otherwise only include
        it if fastq files in project subdirectories are symlinks.
      read_only_fastqs (bool): if True then make the fastqs
        read-only in the destination directory; otherwise keep
        the original permissions.
      runner: (optional) specify a non-default job runner to use
        for primary data rsync
      final (bool): if True then finalize the archive by
        moving the '.pending' temporary archive to the final
        location
      force (bool): if True then do archiving even if there are
        errors (e.g. key metadata items not set, permission error
        when setting group etc); otherwise abort archiving
        operation.
      dry_run (bool): report what would be done but don't
        perform any operations.

    Returns:
      UNIX-style integer returncode: 0 = successful termination,
        non-zero indicates an error occurred.
    """
    # Return value
    retval = 0
    # Check if analysis dir is actually staging directory
    analysis_dir = os.path.basename(ap.analysis_dir)
    is_staging = False
    if analysis_dir.startswith("__") and analysis_dir.endswith(".pending"):
        logger.warning("Operating directly on staged directory")
        if not final:
            raise Exception("Cannot re-stage already staged "
                            "analysis directory")
        else:
            is_staging = True
    # Fetch archive location
    if archive_dir is None:
        archive_dir = ap.settings.archive.dirn
    if archive_dir is None:
        raise Exception("No archive directory specified (use "
                        "--archive_dir option?)")
    # Construct subdirectory structure i.e. platform and year
    if platform is None:
        platform = ap.metadata.platform
    if platform is None:
        raise Exception("No platform specified (use --platform "
                        "option?)")
    if year is None:
        datestamp = str(ap.metadata.instrument_datestamp)
        if len(datestamp) == 6:
            # Assume YYMMDD datestamp format
            year = "20%s" % datestamp[0:2]
        elif len(datestamp) == 8:
            # Assume YYYYMMDD datestamp format
            year = datestamp[0:4]
        else:
            raise Exception("Invalid datestamp '%s' (use "
                            "--year option)" % datestamp)
    archive_dir = os.path.join(archive_dir,year,platform)
    if not fileops.exists(archive_dir):
        raise OSError("Archive directory '%s' doesn't exist" %
                      archive_dir)
    # Determine target directory
    if not is_staging:
        final_dest = analysis_dir
        staging = "__%s.pending" % analysis_dir
    else:
        final_dest = analysis_dir[len("__"):-len(".pending")]
        staging = analysis_dir
    if final:
        dest = final_dest
    else:
        dest = staging
    print("Copying to archive directory: %s" % archive_dir)
    print("Platform   : %s" % platform)
    print("Year       : %s" % year)
    print("Destination: %s %s" % (dest,
                                  "(final)" if final else
                                  "(staging)"))
    # Check if final archive already exists
    if fileops.exists(os.path.join(archive_dir,final_dest)):
        raise Exception("Final archive already exists, stopping")
    # Report available space on target filesystem
    usage = fileops.disk_usage(archive_dir)
    print("Available  : %s/%s (%s%% in use)" %
          (format_file_size(usage.free),
           format_file_size(usage.total),
           usage.percent))
    # Check metadata
    check_metadata = ap.check_metadata(('source','run_number'))
    if not check_metadata:
        if not force or not is_staging:
            raise Exception("Some metadata items not set, stopping")
        logger.warning("Some metadata items not set, proceeding")
    # Locate extra bcl2fastq directories
    extra_bcl2fastq_dirs = list()
    for dirn in list_dirs(ap.analysis_dir):
        if dirn.endswith(".bak") or dirn.startswith("save."):
            # Ignore
            continue
        elif dirn == os.path.basename(ap.params.unaligned_dir):
            continue
        # Try to load data from the directory
        try:
            illumina_data = IlluminaData(ap.analysis_dir,
                                         unaligned_dir=dirn)
            extra_bcl2fastq_dirs.append(dirn)
        except Exception:
            pass
    if not is_staging:
        # Are there any projects to archive?
        try:
            projects = ap.get_analysis_projects()
        except Exception as ex:
            logging.warning("Error trying to fetch analysis projects: "
                            "%s" % ex)
            projects = []
        if not projects:
            if not force:
                raise Exception("No project directories found, nothing "
                                "to archive")
            # Check if there is a bcl2fastq directory instead
            unaligned_dir = ap.params.unaligned_dir
            if not os.path.isabs(unaligned_dir):
                unaligned_dir = os.path.join(ap.analysis_dir,
                                             unaligned_dir)
            if os.path.exists(unaligned_dir):
                logging.warning("No project directories found, forcing "
                                "archiving of bcl2fastq output directory "
                                "'%s' instead" % ap.params.unaligned_dir)
                include_bcl2fastq = True
            else:
                raise Exception("No project directories or bcl2fastq "
                                "directory output found, nothing to "
                                "archive (even with --force)")
        # Determine which directories to exclude
        excludes = ['--exclude=primary_data',
                    '--exclude=save.*',
                    '--exclude=*.bak',
                    '--exclude=*.tmp',
                    '--exclude=tmp.*',
                    '--exclude=__*',]
        if not include_bcl2fastq:
            # Determine whether bcl2fastq dir should be included implicitly
            # because there are links from the analysis directories
            for project in projects:
                if project.fastqs_are_symlinks:
                    print("Found at least one project with fastq "
                          "symlinks (%s)" % project.name)
                    include_bcl2fastq = True
                    break
        if not include_bcl2fastq:
            print("Excluding '%s' directory from archive" %
                  ap.params.unaligned_dir)
            excludes.append('--exclude=%s' % ap.params.unaligned_dir)
        # Exclude extra bcl2fastq dirs
        for dirn in extra_bcl2fastq_dirs:
            print("Excluding '%s' directory from archive" % dirn)
            excludes.append('--exclude=%s' % dirn)
        # 10xgenomics products to exclude
        excludes.append('--exclude=*.mro')
        excludes.append('--exclude=%s*' %
                        tenx_genomics_utils.flow_cell_id(ap.run_name))
        # Log dir
        log_dir = 'archive%s' % ('_final' if final else '_staging')
        if dry_run:
            log_dir += '_dry_run'
        ap.set_log_dir(ap.get_log_subdir(log_dir))
        # Set up runner
        if runner is None:
            runner = ap.settings.runners.rsync
        runner.set_log_dir(ap.log_dir)
        # Setup a scheduler for multiple rsync jobs
        sched = simple_scheduler.SimpleScheduler(
            runner=runner,
            max_concurrent=ap.settings.general.max_concurrent_jobs,
            poll_interval=ap.settings.general.poll_interval)
        sched.start()
        # Keep track of jobs
        archiving_jobs = []
        # If making fastqs read-only then transfer them separately
        if read_only_fastqs and final:
            # Make sure excluded directories are excluded
            extra_options = [ex for ex in excludes]
            # Set up to include only the fastq directories in
            # projects
            fastq_dirs = []
            for project in projects:
                for fastq_dir in project.fastq_dirs:
                    fastq_dirs.append(os.path.join(
                        os.path.basename(project.dirn),
                        fastq_dir))
            # Update the extra options with includes/excludes
            extra_options.append('--include=*/')
            for fastq_dir in fastq_dirs:
                extra_options.append('--include=%s/**' % fastq_dir)
            extra_options.append('--exclude=*')
            # Execute the rsync
            rsync_fastqs = applications.general.rsync(
                "%s/" % ap.analysis_dir,
                os.path.join(archive_dir,staging),
                prune_empty_dirs=False,
                mirror=True,
                dry_run=dry_run,
                chmod='ugo-w',
                extra_options=extra_options)
            print("Running %s" % rsync_fastqs)
            rsync_fastqs_job = sched.submit(rsync_fastqs,
                                            name="rsync.archive_fastqs")
            # Exclude fastqs from main rsync
            for fastq_dir in fastq_dirs:
                excludes.append('--exclude=%s' % fastq_dir)
            wait_for = [rsync_fastqs_job.job_name]
            # Add to list of jobs
            archiving_jobs.append(rsync_fastqs_job)
        else:
            # No separate Fastq rsync
            rsync_fastqs_job = None
            wait_for = ()
        # Main rsync command
        rsync = applications.general.rsync(
            "%s/" % ap.analysis_dir,
            os.path.join(archive_dir,staging),
            prune_empty_dirs=True,
            mirror=True,
            dry_run=dry_run,
            chmod=perms,
            extra_options=excludes)
        print("Running %s" % rsync)
        rsync_job = sched.submit(rsync,name="rsync.archive",
                                 wait_for=wait_for)
        archiving_jobs.append(rsync_job)
        # Wait for jobs to complete
        rsync_job.wait()
        # Check exit status on jobs
        for job in archiving_jobs:
            print("%s completed: exit code %s" % (job.name,
                                                  job.exit_code))
        retval = sum([j.exit_code for j in archiving_jobs])
        if retval != 0:
            logger.warning("One or more archiving jobs failed "
                           "(non-zero exit code returned)")
        else:
            if final:
                # Update the final stored Fastq paths for QC
                staged_analysis_dir = os.path.join(
                    archive_dir,
                    staging)
                archived_analysis_dir = os.path.abspath(
                    os.path.join(
                        archive_dir,
                        final_dest))
                for project in AnalysisDir(staged_analysis_dir).get_projects():
                    qc_info = project.qc_info(project.qc_dir)
                    if qc_info.fastq_dir:
                        print("%s: updating stored Fastq directory for QC" %
                              project.name)
                        new_fastq_dir = os.path.join(archived_analysis_dir,
                                                     os.path.relpath(
                                                         qc_info.fastq_dir,
                                                         ap.analysis_dir))
                        print("-- updated Fastq directory: %s" % new_fastq_dir)
                        qc_info['fastq_dir'] = new_fastq_dir
                        qc_info.save()
            # Set the group
            if group is not None:
                print("Setting group of archived files to '%s'" % group)
                if not dry_run:
                    set_group = fileops.set_group_command(
                        group,
                        os.path.join(archive_dir,staging),
                        safe=force,
                        verbose=True)
                    print("Running %s" % set_group)
                    set_group_job = sched.submit(
                        set_group,
                        name="set_group.archive")
                    set_group_job.wait()
                    # Check exit status
                    exit_code = set_group_job.exit_code
                    print("%s completed: exit code %s" % (
                        set_group_job.name,
                        exit_code))
                    if exit_code != 0:
                        logger.warning("Setting group failed (non-zero "
                                       "exit status code returned)")
                    retval = retval + exit_code
        # Finish with scheduler
        sched.wait()
        sched.stop()
        # Bail out if there was a problem
        if retval != 0:
            if not force:
                raise Exception("Staging to archive failed")
            else:
                logger.warning("Staging to archive failed (ignored)")
    # Move to final location
    if final:
        print("Moving to final location: %s" % final_dest)
        if not dry_run:
            fileops.rename(os.path.join(archive_dir,staging),
                           os.path.join(archive_dir,final_dest))
    # Report usage of target filesystem
    usage = fileops.disk_usage(archive_dir)
    print("Usage of archive: %s available (of %s) (%s%% in use)" %
          (format_file_size(usage.free),
           format_file_size(usage.total),
           usage.percent))
    # Finish
    return retval
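A hypothetical call sequence for archive(): stage the analysis directory first, then finalise it. 'ap' is assumed to be the AutoProcessor instance described in the docstring, and the keyword values are illustrative only.

status = archive(ap, group='bioinf', perms='g+rwX', dry_run=False)
if status == 0:
    status = archive(ap, group='bioinf', perms='g+rwX', final=True)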
Example #9
 # Look for fastqs
 fastqs = []
 for data in sequencing_data:
     print "%s" % data.unaligned_dir
     fastqs.extend(get_fastqs(data,project_pattern=project_pattern,
                              sample_pattern=sample_pattern))
 if not fastqs:
     logging.error("No matching Fastqs found")
     sys.exit(1)
 # Report file sizes
 total_size = 0
 for fq in fastqs:
     fsize = os.lstat(fq).st_size
     total_size += fsize
     print "%s\t%s" % (os.path.basename(fq),
                       bcf_utils.format_file_size(fsize))
 print "Total: %s" % bcf_utils.format_file_size(total_size)
 # Generate MD5 checksum file
 if not options.dry_run:
     tmpdir = tempfile.mkdtemp(suffix='checksums.md5',
                               dir=os.getcwd())
     md5_file = os.path.join(tmpdir,'checksums.md5')
     print "Generating MD5 sums in %s" % md5_file
     fp = open(md5_file,'w')
     for fq in fastqs:
         chksum = Md5sum.md5sum(fq)
         fp.write("%s  %s\n" % (chksum,os.path.basename(fq)))
     fp.close()
 # Copy the fastqs
 print "Copying fastqs"
 for fq in fastqs:
Example #10
     total_size = 0
     n_fastqs = 0
     sample_names = set()
     # Collect information
     fastq_set = os.path.relpath(project.fastq_dir, project.dirn)
     print "Fastq set: %s%s" % (
         ("default" if fastq_set == "fastqs" else fastq_set),
         (" (primary)"
          if fastq_set == project.info.primary_fastq_dir else ""))
     for sample_name, fastq, fq in get_fastqs(project,
                                              pattern=options.pattern):
         # File size
         fsize = os.lstat(fq).st_size
         print "%s\t%s%s\t%s" % (sample_name, os.path.basename(fq),
                                 ('*' if os.path.islink(fastq) else ''),
                                 bcf_utils.format_file_size(fsize))
         sample_names.add(sample_name)
         total_size += fsize
         n_fastqs += 1
     # Summary
     print "Total:\t%s" % bcf_utils.format_file_size(total_size)
     print "%d %ssamples" % (len(sample_names),
                             ('paired-end '
                              if project.info.paired_end else ''))
     print "%d fastqs" % n_fastqs
     sys.exit(0)
 # Perform command
 if cmd not in ('copy', 'zip', 'md5'):
     p.error("Unrecognised command '%s'\n" % cmd)
     sys.exit(1)
 if cmd == 'copy':
Example #11
    # Report the names of the samples in each project
    if report:
        for project in illumina_data.projects:
            print("%s" % IlluminaData.describe_project(project))
            # Report statistics for fastq files
            if args.stats:
                # Print number of reads for each file, and file size
                for sample in project.samples:
                    for fastq in sample.fastq:
                        fq = os.path.join(sample.dirn, fastq)
                        nreads = FASTQFile.nreads(fq)
                        fsize = os.path.getsize(fq)
                        print(
                            "%s\t%s\t%d" %
                            (fastq, bcf_utils.format_file_size(fsize), nreads))
            print("")

    # Summary: short report suitable for logging file
    if args.summary:
        print("%s" % IlluminaData.summarise_projects(illumina_data))

    # Print number of undetermined reads
    if args.stats and illumina_data.undetermined is not None:
        print("Undetermined indices")
        for lane in illumina_data.undetermined.samples:
            for fastq in lane.fastq:
                fq = os.path.join(lane.dirn, fastq)
                nreads = FASTQFile.nreads(fq)
                fsize = os.path.getsize(fq)
                print("%s\t%s\t%d" %
                      (fastq, bcf_utils.format_file_size(fsize), nreads))
Example #12
def main():
    """
    Transfer copies of Fastq data from an analysis project to a destination
    """
    # Load configuration
    settings = Settings()

    # Collect defaults
    default_runner = settings.runners.rsync

    # Get pre-defined destinations
    destinations = [name for name in settings.destination]

    # Command line
    p = argparse.ArgumentParser(
        description="Transfer copies of Fastq data from an analysis "
        "project to an arbitrary destination for sharing with other "
        "people")
    p.add_argument('--version',
                   action='version',
                   version=("%%(prog)s %s" % get_version()))
    p.add_argument('--subdir',
                   action='store',
                   choices=('random_bin', 'run_id'),
                   default=None,
                   help="subdirectory naming scheme: 'random_bin' "
                   "locates a random pre-existing empty subdirectory "
                   "under the target directory; 'run_id' creates a "
                   "new subdirectory "
                   "'PLATFORM_DATESTAMP.RUN_ID-PROJECT'. If this "
                   "option is not set then no subdirectory will be "
                   "used")
    p.add_argument('--readme',
                   action='store',
                   metavar='README_TEMPLATE',
                   dest='readme_template',
                   help="template file to generate README file from; "
                   "can be full path to a template file, or the name "
                   "of a file in the 'templates' directory")
    p.add_argument('--weburl',
                   action='store',
                   help="base URL for webserver (sets the value of "
                   "the WEBURL variable in the template README)")
    p.add_argument('--include_downloader',
                   action='store_true',
                   help="copy the 'download_fastqs.py' utility to the "
                   "final location")
    p.add_argument('--include_qc_report',
                   action='store_true',
                   help="copy the zipped QC reports to the final "
                   "location")
    p.add_argument('--include_10x_outputs',
                   action='store_true',
                   help="copy outputs from 10xGenomics pipelines (e.g. "
                   "'cellranger count') to the final location")
    p.add_argument('--link',
                   action='store_true',
                   help="hard link files instead of copying")
    p.add_argument('--runner',
                   action='store',
                   help="specify the job runner to use for executing "
                   "the checksumming, Fastq copy and tar gzipping "
                   "operations (defaults to job runner defined for "
                   "copying in config file [%s])" % default_runner)
    p.add_argument('dest',
                   action='store',
                   metavar="DEST",
                   help="destination to copy Fastqs to; can be the "
                   "name of a destination defined in the configuration "
                   "file, or an arbitrary location of the form "
                   "'[[USER@]HOST:]DIR' (%s)" %
                   (("available destinations: %s" %
                     (','.join("'%s'" % d for d in sorted(destinations))))
                    if destinations else "no destinations currently defined"))
    p.add_argument('project',
                   action='store',
                   metavar="PROJECT",
                   help="path to project directory (or to a Fastqs "
                   "subdirectory in a project) to copy Fastqs from")

    # Process command line
    args = p.parse_args()

    # Check if target is pre-defined destination
    if args.dest in destinations:
        print("Loading settings for destination '%s'" % args.dest)
        dest = settings.destination[args.dest]
        target_dir = dest.directory
        readme_template = dest.readme_template
        subdir = dest.subdir
        include_downloader = dest.include_downloader
        include_qc_report = dest.include_qc_report
        hard_links = dest.hard_links
        weburl = dest.url
    else:
        target_dir = args.dest
        readme_template = None
        subdir = None
        include_downloader = False
        include_qc_report = False
        hard_links = False
        weburl = None

    # Update defaults with command line values
    if args.readme_template:
        readme_template = args.readme_template
    if args.subdir:
        subdir = args.subdir
    if args.include_downloader:
        include_downloader = True
    if args.include_qc_report:
        include_qc_report = True
    if args.weburl:
        weburl = args.weburl
    if args.link:
        hard_links = args.link

    # Sort out project directory
    project = AnalysisProject(args.project)
    if not project.is_analysis_dir:
        # Assume it's the Fastq dir
        fastq_dir = os.path.basename(args.project)
        project = AnalysisProject(os.path.dirname(args.project))
    else:
        fastq_dir = None
    if not project.is_analysis_dir:
        logger.error("'%s': project not found" % args.project)
        return 1
    project_name = project.name

    # Parent analysis directory
    analysis_dir = AnalysisDir(os.path.dirname(project.dirn))

    # Fastqs directory
    try:
        project.use_fastq_dir(fastq_dir)
    except Exception as ex:
        logger.error("'%s': failed to load Fastq set '%s': %s" %
                     (project.name, fastq_dir, ex))
        return 1

    # Report
    print("Transferring data from '%s' (%s)" % (project.name, project.dirn))
    print("Fastqs in %s" % project.fastq_dir)

    # Summarise samples and Fastqs
    samples = set()
    nfastqs = 0
    fsize = 0
    for sample in project.samples:
        samples.add(sample.name)
        for fq in sample.fastq:
            fsize += os.lstat(fq).st_size
            nfastqs += 1
    nsamples = len(samples)
    dataset = "%s%s dataset" % ("%s " % project.info.single_cell_platform
                                if project.info.single_cell_platform else '',
                                project.info.library_type)
    endedness = "paired-end" if project.info.paired_end else "single-end"
    print("%s with %d Fastqs from %d %s sample%s totalling %s" %
          (dataset, nfastqs, nsamples, endedness, 's' if nsamples != 1 else '',
           format_file_size(fsize)))

    # Check target dir
    if not Location(target_dir).is_remote:
        target_dir = os.path.abspath(target_dir)
    if not exists(target_dir):
        print("'%s': target directory not found" % target_dir)
        return
    else:
        print("Target directory %s" % target_dir)

    # Locate downloader
    if include_downloader:
        print("Locating downloader for inclusion")
        downloader = find_program("download_fastqs.py")
        if downloader is None:
            logging.error("Unable to locate download_fastqs.py")
            return 1
        print("... found %s" % downloader)
    else:
        downloader = None

    # Locate zipped QC report
    if include_qc_report:
        print("Locating zipped QC reports for inclusion")
        qc_zips = list()
        # Check QC directories and look for zipped reports
        for qc_dir in project.qc_dirs:
            # Get the associated Fastq set
            # NB only compare the basename of the Fastq dir
            # in case full paths weren't updated
            fq_set = os.path.basename(project.qc_info(qc_dir).fastq_dir)
            if fq_set == os.path.basename(project.fastq_dir):
                for qc_base in (
                        "%s_report.%s.%s" % (qc_dir, project.name,
                                             project.info.run),
                        "%s_report.%s.%s" % (qc_dir, project.name,
                                             os.path.basename(
                                                 analysis_dir.analysis_dir)),
                ):
                    qc_zip = os.path.join(project.dirn, "%s.zip" % qc_base)
                    if os.path.exists(qc_zip):
                        print("... found %s" % qc_zip)
                        qc_zips.append(qc_zip)
        if not qc_zips:
            logger.error("No zipped QC reports found")
            return 1
    else:
        qc_zips = None

    # Locate 10xGenomics outputs
    if args.include_10x_outputs:
        print("Locating outputs from 10xGenomics pipelines for inclusion")
        cellranger_dirs = list()
        for d in (
                'cellranger_count',
                'cellranger_multi',
        ):
            cellranger_dir = os.path.join(project.dirn, d)
            if os.path.isdir(cellranger_dir):
                print("... found %s" % cellranger_dir)
                cellranger_dirs.append(cellranger_dir)
        if not cellranger_dirs:
            logger.error("No outputs from 10xGenomics pipelines found")
            return 1
    else:
        cellranger_dirs = None

    # Determine subdirectory
    if subdir == "random_bin":
        # Find a random empty directory under the
        # target directory
        print("Locating random empty bin")
        subdirs = [
            d for d in os.listdir(target_dir)
            if os.path.isdir(os.path.join(target_dir, d))
        ]
        if not subdirs:
            print("Failed to locate subdirectories")
            return
        shuffle(subdirs)
        subdir = None
        for d in subdirs:
            if not os.listdir(os.path.join(target_dir, d)):
                # Empty bin
                subdir = d
                break
        if subdir is None:
            print("Failed to locate empty subdirectory")
            return
        print("... found '%s'" % subdir)
        # Update target dir
        target_dir = os.path.join(target_dir, subdir)
    elif subdir == "run_id":
        # Construct subdirectory name based on the
        # run ID
        subdir = "{platform}_{datestamp}.{run_number}-{project}".format(
            platform=analysis_dir.metadata.platform.upper(),
            datestamp=analysis_dir.metadata.instrument_datestamp,
            run_number=analysis_dir.metadata.run_number,
            project=project.name)
        # Check it doesn't already exist
        if exists(os.path.join(target_dir, subdir)):
            logger.error("'%s': subdirectory already exists" % subdir)
            return
        print("Using subdirectory '%s'" % subdir)
        # Update target dir
        target_dir = os.path.join(target_dir, subdir)

    # Make target directory
    if not exists(target_dir):
        mkdir(target_dir)

    # Get runner for copy job
    if args.runner:
        runner = fetch_runner(args.runner)
    else:
        runner = default_runner

    # Set identifier for jobs
    job_id = "%s%s" % (project_name,
                       (".%s" % fastq_dir if fastq_dir is not None else ''))

    # Set the working directory
    working_dir = os.path.abspath("transfer.%s.%s" %
                                  (job_id, int(time.time())))
    mkdir(working_dir)
    print("Created working dir %s" % working_dir)

    # Construct the README
    if readme_template:
        # Check that template file exists
        print("Locating README template")
        template = None
        for filen in (
                readme_template,
                os.path.join(get_templates_dir(), readme_template),
        ):
            if os.path.exists(filen):
                template = filen
                break
        if template is None:
            logger.error("'%s': template file not found" % readme_template)
            return 1
        else:
            readme_template = template
        print("... found %s" % readme_template)
        # Read in template
        with open(readme_template, 'rt') as fp:
            readme = fp.read()
        # Substitute template variables
        template_vars = {
            'PLATFORM': analysis_dir.metadata.platform.upper(),
            'RUN_NUMBER': analysis_dir.metadata.run_number,
            'DATESTAMP': analysis_dir.metadata.instrument_datestamp,
            'PROJECT': project_name,
            'WEBURL': weburl,
            'BIN': subdir,
            'DIR': target_dir,
            'TODAY': date.today().strftime("%d/%m/%Y"),
        }
        for var in template_vars:
            value = template_vars[var]
            if value is None:
                value = '?'
            else:
                value = str(value)
            readme = re.sub(r"%{var}%".format(var=var), value, readme)
        # Write out a temporary README file
        readme_file = os.path.join(working_dir, "README")
        with open(readme_file, 'wt') as fp:
            fp.write(readme)
    else:
        # No README
        readme_file = None

    # Start a scheduler to run jobs
    sched = SimpleScheduler(runner=runner,
                            reporter=TransferDataSchedulerReporter(),
                            poll_interval=settings.general.poll_interval)
    sched.start()

    # Build command to run manage_fastqs.py
    copy_cmd = Command("manage_fastqs.py")
    if hard_links:
        copy_cmd.add_args("--link")
    copy_cmd.add_args(analysis_dir.analysis_dir, project_name)
    if fastq_dir is not None:
        copy_cmd.add_args(fastq_dir)
    copy_cmd.add_args("copy", target_dir)
    print("Running %s" % copy_cmd)
    copy_job = sched.submit(copy_cmd.command_line,
                            name="copy.%s" % job_id,
                            wd=working_dir)

    # Copy README
    if readme_file is not None:
        print("Copying README file")
        copy_cmd = copy_command(readme_file,
                                os.path.join(target_dir, "README"))
        sched.submit(copy_cmd.command_line,
                     name="copy.%s.readme" % job_id,
                     runner=SimpleJobRunner(),
                     wd=working_dir)

    # Copy download_fastqs.py
    if downloader:
        print("Copying downloader")
        copy_cmd = copy_command(
            downloader, os.path.join(target_dir, os.path.basename(downloader)))
        sched.submit(copy_cmd.command_line,
                     name="copy.%s.downloader" % job_id,
                     runner=SimpleJobRunner(),
                     wd=working_dir)

    # Copy QC reports
    if qc_zips:
        for qc_zip in qc_zips:
            print("Copying '%s'" % os.path.basename(qc_zip))
            copy_cmd = copy_command(qc_zip,
                                    os.path.join(target_dir,
                                                 os.path.basename(qc_zip)),
                                    link=hard_links)
            sched.submit(copy_cmd.command_line,
                         name="copy.%s.%s" %
                         (job_id, os.path.basename(qc_zip)),
                         runner=SimpleJobRunner(),
                         wd=working_dir)

    # Tar and copy 10xGenomics outputs
    if cellranger_dirs:
        for cellranger_dir in cellranger_dirs:
            print("Tar gzipping and copying '%s'" %
                  os.path.basename(cellranger_dir))
            # Tar & gzip data
            targz = os.path.join(
                working_dir,
                "%s.%s.%s.tgz" % (os.path.basename(cellranger_dir),
                                  project_name, project.info.run))
            targz_cmd = Command("tar", "czvhf", targz, "-C",
                                os.path.dirname(cellranger_dir),
                                os.path.basename(cellranger_dir))
            print("Running %s" % targz_cmd)
            targz_job = sched.submit(
                targz_cmd.command_line,
                name="targz.%s.%s" %
                (job_id, os.path.basename(cellranger_dir)),
                wd=working_dir)
            # Copy the targz file
            copy_cmd = copy_command(
                targz, os.path.join(target_dir, os.path.basename(targz)))
            print("Running %s" % copy_cmd)
            copy_job = sched.submit(copy_cmd.command_line,
                                    name="copytgz.%s.%s" %
                                    (job_id, os.path.basename(cellranger_dir)),
                                    runner=SimpleJobRunner(),
                                    wd=working_dir,
                                    wait_for=(targz_job.job_name, ))

    # Wait for scheduler jobs to complete
    sched.wait()

    # Check exit code for Fastq copying
    exit_code = copy_job.exit_code
    if exit_code != 0:
        logger.error("File copy exited with an error")
        return exit_code
    else:
        print("Files now at %s" % target_dir)
        if weburl:
            url = weburl
            if subdir is not None:
                url = os.path.join(url, subdir)
            print("URL: %s" % url)
        print("Done")
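The README handling in main() above fills a template by replacing '%VAR%' placeholders. A self-contained illustration of that substitution step, with made-up values:

import re

readme = "Fastqs for %PROJECT% (run %RUN_NUMBER%) are in %DIR%, see %WEBURL%"
template_vars = {'PROJECT': 'AB_scRNAseq', 'RUN_NUMBER': 87,
                 'DIR': '/shared/outgoing/ab_scrnaseq', 'WEBURL': None}
for var, value in template_vars.items():
    value = '?' if value is None else str(value)
    readme = re.sub(r"%{var}%".format(var=var), value, readme)
print(readme)
# -> Fastqs for AB_scRNAseq (run 87) are in /shared/outgoing/ab_scrnaseq, see ?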
Example #13
                    print "\t\t%s" % fastq

    # Report the names of the samples in each project
    if options.report:
        for project in illumina_data.projects:
            print "%s" % IlluminaData.describe_project(project)
            # Report statistics for fastq files
            if options.stats:
                # Print number of reads for each file, and file size
                for sample in project.samples:
                    for fastq in sample.fastq:
                        fq = os.path.join(sample.dirn,fastq)
                        nreads = FASTQFile.nreads(fq)
                        fsize = os.path.getsize(fq)
                        print "%s\t%s\t%d" % (fastq,
                                              bcf_utils.format_file_size(fsize),
                                              nreads)
            print ""

    # Summary: short report suitable for logging file
    if options.summary:
        print "%s" % IlluminaData.summarise_projects(illumina_data)

    # Print number of undetermined reads
    if options.stats and illumina_data.undetermined is not None:
        print "Undetermined indices"
        for lane in illumina_data.undetermined.samples:
            for fastq in lane.fastq:
                fq = os.path.join(lane.dirn,fastq)
                nreads = FASTQFile.nreads(fq)
                fsize = os.path.getsize(fq)
Example #14
def list_files(datadir,
               extensions=None,
               owners=None,
               groups=None,
               compression=None,
               subdir=None,
               sort_keys=None,
               min_size=None,
               fields=('owner', 'group', 'relpath', 'size'),
               delimiter='\t'):
    """
    Report files owned by specific users and/or groups

    'fields' is a list of attributes to display for each file, in
    the specified order. The available fields are:

    'owner'   - User who owns the file
    'group'   - Group the file belongs to
    'path'    - Full path
    'relpath' - Relative path
    'size'    - File size (human readable)

    """
    # Check the fields
    for field in fields:
        if field not in (
                'owner',
                'group',
                'path',
                'relpath',
                'size',
        ):
            raise Exception("Unrecognised field: '%s'" % field)
    # Collect files and report
    nfiles = 0
    total_size = 0
    if min_size: min_size = convert_size(min_size)
    for f in DataDir(datadir).files(extensions=extensions,
                                    compression=compression,
                                    owners=owners,
                                    groups=groups,
                                    subdir=subdir,
                                    sort_keys=sort_keys):
        if min_size and f.size < min_size: continue
        total_size += f.size
        nfiles += 1
        # Assemble line from fields
        line = []
        for field in fields:
            if field == 'owner':
                line.append(f.user)
            elif field == 'group':
                line.append(f.group)
            elif field == 'path':
                line.append("%s%s" % (f.path, f.classifier))
            elif field == 'relpath':
                line.append("%s%s" % (f.relpath(datadir), f.classifier))
            elif field == 'size':
                line.append(utils.format_file_size(f.size))
        print delimiter.join([str(x) for x in line])
    if not nfiles:
        print "No files found"
        return
    print "%d found, total size: %s" % (nfiles,
                                        utils.format_file_size(total_size))
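A hypothetical call to list_files() using the arguments documented above; the path and user are made up, and the '1G' value assumes the (unshown) convert_size() helper accepts human-readable sizes.

# Report gzipped fastqs over 1G belonging to user 'jsmith',
# showing only the relative path and size.
list_files('/mnt/archive/2019/miseq/190321_M00879_0087_000000000-ABC123',
           extensions=['fastq'],
           compression=['gz'],
           owners=['jsmith'],
           min_size='1G',
           fields=('relpath', 'size'))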
Example #15
     for project in unassigned:
         print("%s: %s" % (project.name, project.info.run))
     sys.exit(0)
 # Report if no PIs were found
 if len(pi_list) == 0:
     print("No projects assigned to PIs found")
     sys.exit(0)
 # Report PIs, projects etc
 print("Summary (PI, # of projects, total usage):")
 print("=========================================")
 total_projects = 0
 total_size = 0
 for pi in pi_list:
     n_projects = len(audit_data[pi])
     size = sum([p[1] for p in audit_data[pi]])
     print("%s\t%d\t%s" % (pi, n_projects, utils.format_file_size(size)))
     total_projects += n_projects
     total_size += size
 print("Total usage\t%d\t%s" %
       (total_projects, utils.format_file_size(total_size)))
 print("\nBreakdown by PI/project:")
 print("========================")
 for pi in pi_list:
     print("%s:" % pi)
     for project, size in audit_data[pi]:
         print(
             "\t%s:\t%s\t%s" %
             (project.info.run, project.name, utils.format_file_size(size)))
 if undetermined:
     print("\nUsage for 'undetermined' reads:")
     print("===============================")
Example #16
 def _get_data(self, filen=None):
     """
     Collect statistics for FASTQ outputs from an Illumina run
     """
     # Collect FASTQ files
     fastqstats = []
     for project in self._illumina_data.projects:
         for sample in project.samples:
             for fastq in sample.fastq:
                 fastqstats.append(
                     FastqStats(os.path.join(sample.dirn, fastq),
                                project.name, sample.name))
     # Gather same information for undetermined reads (if present)
     if self._illumina_data.undetermined is not None:
         for lane in self._illumina_data.undetermined.samples:
             for fastq in lane.fastq:
                 fastqstats.append(
                     FastqStats(os.path.join(lane.dirn, fastq),
                                self._illumina_data.undetermined.name,
                                lane.name))
     # Collect the data for each file
     if self._n_processors > 1:
         # Multiple cores
         pool = Pool(self._n_processors)
         results = pool.map(collect_fastq_data, fastqstats)
         pool.close()
         pool.join()
     else:
         # Single core
         results = map(collect_fastq_data, fastqstats)
     # Set up tabfile to hold pre-existing data
     if filen is not None:
         existing_stats = TabFile(filen, first_line_is_header=True)
     else:
         existing_stats = None
     # Set up class to hold all collected data
     self._stats = TabFile(column_names=('Project', 'Sample', 'Fastq',
                                         'Size', 'Nreads', 'Paired_end',
                                         'Read_number'))
     # Split result sets into R1 and R2
     results_r1 = filter(lambda f: f.read_number == 1, results)
     results_r2 = filter(lambda f: f.read_number == 2, results)
     # Determine which lanes are present and append
     # columns for each
     lanes = set()
     for fastq in results_r1:
         logger.debug("-- %s: lanes %s" %
                      (fastq.name, ','.join([str(l) for l in fastq.lanes])))
         for lane in fastq.lanes:
             lanes.add(lane)
     # Add lane numbers from pre-existing stats file
     if existing_stats is not None:
         for c in existing_stats.header():
             if c.startswith('L'):
                 lanes.add(int(c[1:]))
     self._lanes = sorted(list(lanes))
     logger.debug("Lanes found: %s" %
                  ','.join([str(l) for l in self._lanes]))
     for lane in self._lanes:
         self._stats.appendColumn("L%s" % lane)
     # Copy pre-existing stats into new tabfile
     if existing_stats:
         for line in existing_stats:
             data = [
                 line['Project'], line['Sample'], line['Fastq'],
                 line['Size'], line['Nreads'], line['Paired_end'],
                 line['Read_number']
             ]
             for lane in lanes:
                 try:
                     data.append(line["L%s" % lane])
                 except:
                     data.append('')
             self._stats.append(data=data)
     # Copy reads per lane from R1 FASTQs into R2
     for r2_fastq in results_r2:
         # Get corresponding R1 name
         logger.debug("-- Fastq R2: %s" % r2_fastq.name)
         r1_fastq_name = IlluminaFastq(r2_fastq.name)
         r1_fastq_name.read_number = 1
         r1_fastq_name = str(r1_fastq_name)
         logger.debug("--    -> R1: %s" % r1_fastq_name)
         # Locate corresponding data
         r1_fastq = filter(lambda f: f.name.startswith(r1_fastq_name),
                           results_r1)[0]
         r2_fastq.reads_by_lane = dict(r1_fastq.reads_by_lane)
     # Write the data into the tabfile
     paired_end = ('Y' if self._illumina_data.paired_end else 'N')
     for fastq in results:
         # Check for existing entry
         existing_entry = False
         for line in self._stats:
             if (line['Project'] == fastq.project
                     and line['Sample'] == fastq.sample
                     and line['Fastq'] == fastq.name):
                 # Overwrite the existing entry
                 existing_entry = True
                 break
         # Write the data
         if not existing_entry:
             # Append new entry
             data = [
                 fastq.project, fastq.sample, fastq.name,
                 bcf_utils.format_file_size(fastq.fsize), fastq.nreads,
                 paired_end, fastq.read_number
             ]
             for lane in lanes:
                 try:
                     data.append(fastq.reads_by_lane[lane])
                 except:
                     data.append('')
             self._stats.append(data=data)
         else:
             # Overwrite existing entry
             logging.warning("Overwriting existing entry for "
                             "%s/%s/%s" %
                             (fastq.project, fastq.sample, fastq.name))
             line['Size'] = bcf_utils.format_file_size(fastq.fsize)
             line['Nreads'] = fastq.nreads
             line['Paired_end'] = paired_end
             line['Read_number'] = fastq.read_number
             for lane in lanes:
                 lane_name = "L%d" % lane
                 try:
                     line[lane_name] = fastq.reads_by_lane[lane]
                 except:
                     line[lane_name] = ''
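The statistics code above (and get_stats_for_file()/collect_fastq_data() earlier) passes FastqStats objects around. Below is a hedged sketch of such a container, consistent with the attributes used in these examples (fastq, project, sample, name, read_number, lanes, reads_by_lane, nreads, fsize) but not the library class itself.

import os

class FastqStats(object):
    """Minimal stand-in for the library's FastqStats container."""
    def __init__(self, fastq, project, sample):
        self.fastq = fastq        # full path to the FASTQ file
        self.project = project    # project name
        self.sample = sample      # sample name
        self.nreads = None        # total read count (filled in later)
        self.fsize = None         # file size in bytes (filled in later)
        self.reads_by_lane = {}   # lane number -> read count

    @property
    def name(self):
        return os.path.basename(self.fastq)

    @property
    def lanes(self):
        return sorted(self.reads_by_lane.keys())

    @property
    def read_number(self):
        # Crude parse of the R1/R2 field of an Illumina-style name,
        # e.g. 'PJB1_S1_L001_R2_001.fastq.gz' -> 2 (assumption only)
        for field in self.name.split('_'):
            if len(field) == 2 and field[0] == 'R' and field[1].isdigit():
                return int(field[1])
        return None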
Example #17
         print "%s: %s" % (project.name,project.info.run)
     sys.exit(0)
 # Report if no PIs were found
 if len(pi_list) == 0:
     print "No projects assigned to PIs found"
     sys.exit(0)
 # Report PIs, projects etc
 print "Summary (PI, # of projects, total usage):"
 print "========================================="
 total_projects = 0
 total_size = 0
 for pi in pi_list:
     n_projects = len(audit_data[pi])
     size = sum([p[1] for p in audit_data[pi]])
     print "%s\t%d\t%s" % (pi,n_projects,
                           utils.format_file_size(size))
     total_projects += n_projects
     total_size += size
 print "Total usage\t%d\t%s" % (total_projects,
                                utils.format_file_size(total_size))
 print "\nBreakdown by PI/project:"
 print "========================"
 for pi in pi_list:
     print "%s:" % pi
     for project,size in audit_data[pi]:
         print "\t%s:\t%s\t%s" % (project.info.run,project.name,
                                  utils.format_file_size(size))
 if undetermined:
     print "\nUsage for 'undetermined' reads:"
     print "==============================="
     total_size = 0
Example #18
         print "%s: %s" % (project.name,project.info.run)
     sys.exit(0)
 # Report if no PIs were found
 if len(pi_list) == 0:
     print "No projects assigned to PIs found"
     sys.exit(0)
 # Report PIs, projects etc
 print "Summary (PI, # of projects, total usage):"
 print "========================================="
 total_projects = 0
 total_size = 0
 for pi in pi_list:
     n_projects = len(audit_data[pi])
     size = sum([p[1] for p in audit_data[pi]])
     print "%s\t%d\t%s" % (pi,n_projects,
                           utils.format_file_size(size))
     total_projects += n_projects
     total_size += size
 print "Total usage\t%d\t%s" % (total_projects,
                                utils.format_file_size(total_size))
 print "\nBreakdown by PI/project:"
 print "========================"
 for pi in pi_list:
     print "%s:" % pi
     for project,size in audit_data[pi]:
         print "\t%s:\t%s\t%s" % (project.info.run,project.name,
                                  utils.format_file_size(size))
 if undetermined:
     print "\nUsage for 'undetermined' reads:"
     print "==============================="
     total_size = 0
Example #19
                    print "\t\t%s" % fastq

    # Report the names of the samples in each project
    if options.report:
        for project in illumina_data.projects:
            print "%s" % IlluminaData.describe_project(project)
            # Report statistics for fastq files
            if options.stats:
                # Print number of reads for each file, and file size
                for sample in project.samples:
                    for fastq in sample.fastq:
                        fq = os.path.join(sample.dirn,fastq)
                        nreads = FASTQFile.nreads(fq)
                        fsize = os.path.getsize(fq)
                        print "%s\t%s\t%d" % (fastq,
                                              bcf_utils.format_file_size(fsize),
                                              nreads)
            print ""

    # Summary: short report suitable for logging file
    if options.summary:
        print "%s" % IlluminaData.summarise_projects(illumina_data)

    # Print number of undetermined reads
    if options.stats and illumina_data.undetermined is not None:
        print "Undetermined indices"
        for lane in illumina_data.undetermined.samples:
            for fastq in lane.fastq:
                fq = os.path.join(lane.dirn,fastq)
                nreads = FASTQFile.nreads(fq)
                fsize = os.path.getsize(fq)