Exemple #1
0
    def get_md5sums(self):
        """
        Generate MD5sums

        Generate and return MD5 sums for the file and
        for the uncompressed contents.

        Sets the 'md5' and 'uncompressed_md5' properties on
        the current instance. Values which have already been
        set are reused rather than being recomputed.

        For 'bz2' and 'gz' compression the contents are
        decompressed on the fly to compute the second sum; for
        any other compression type a warning is logged and
        'uncompressed_md5' is left unset.

        Returns tuple (md5,md5_uncompressed_contents), or
        (None,None) for links and directories.

        """
        if self.is_link or self.is_dir:
            # Ignore links or directories
            return (None, None)
        if self.md5 is None:
            # Generate MD5 sum
            self.md5 = Md5sum.md5sum(self.path)
        if self.uncompressed_md5 is None:
            # Generate MD5 for uncompressed contents
            if not self.compression:
                # Not compressed, so both sums are the same
                self.uncompressed_md5 = self.md5
            elif self.compression == 'bz2':
                # Context manager ensures the stream is closed
                # even if the checksum calculation fails
                with bz2.BZ2File(self.path, 'r') as fp:
                    self.uncompressed_md5 = Md5sum.md5sum(fp)
            elif self.compression == 'gz':
                with gzip.GzipFile(self.path, 'rb') as fp:
                    self.uncompressed_md5 = Md5sum.md5sum(fp)
            else:
                logging.warning("%s: md5sums not implemented for "
                                "compression type '%s'" %
                                (self, self.compression))
        return (self.md5, self.uncompressed_md5)
Exemple #2
0
    def get_md5sums(self):
        """
        Generate MD5sums

        Generate and return MD5 sums for the file and
        for the uncompressed contents.

        Sets the 'md5' and 'uncompressed_md5' properties on
        the current instance; sums which are already set are
        not recalculated.

        Only 'bz2' and 'gz' compression types are handled;
        for anything else a warning is logged and the
        'uncompressed_md5' property is not updated.

        Returns tuple (md5,md5_uncompressed_contents), or
        (None,None) if the file is a link or a directory.

        """
        if self.is_link or self.is_dir:
            # Ignore links or directories
            return (None,None)
        if self.md5 is None:
            # Generate MD5 sum
            self.md5 = Md5sum.md5sum(self.path)
        if self.uncompressed_md5 is None:
            # Generate MD5 for uncompressed contents
            if not self.compression:
                # No compression: reuse the MD5 of the file
                self.uncompressed_md5 = self.md5
            elif self.compression == 'bz2':
                # 'with' closes the decompression stream once
                # the checksum has been computed (or on error)
                with bz2.BZ2File(self.path,'r') as fp:
                    self.uncompressed_md5 = Md5sum.md5sum(fp)
            elif self.compression == 'gz':
                with gzip.GzipFile(self.path,'rb') as fp:
                    self.uncompressed_md5 = Md5sum.md5sum(fp)
            else:
                logging.warning("%s: md5sums not implemented for "
                                "compression type '%s'"
                                % (self,self.compression))
        return (self.md5,self.uncompressed_md5)
Exemple #3
0
def verify_md5sums(chksum_file,verbose=False):
    """Verify files against the MD5 sums listed in a checksum file

    Every entry in the supplied file has its recorded MD5 sum
    compared with a freshly calculated one, and the outcome
    (match or mismatch) is reported.

    The checksum file can be one written by this program, or
    one produced by the Linux 'md5sum' program.

    Arguments:
      chksum_file: name of the file containing the MD5 sums
      verbose: (optional) if True then report the status of
        every file checked followed by a summary; otherwise
        only failures are reported

    Returns:
      Zero on success, 1 if errors were encountered

    """
    # Run the checks and hand the results to a reporter
    results = Md5sum.Md5Checker.verify_md5sums(chksum_file)
    check_reporter = Md5sum.Md5CheckReporter(results,verbose=verbose)
    # The summary is only wanted in verbose mode
    if verbose:
        check_reporter.summary()
    return check_reporter.status
def compute_md5sum_for_file(filen,output_file=None):
    """Compute and write MD5 sum for specified file

    Computes the MD5 sum for a file, and writes the sum and the file
    name either to stdout or to the specified file name.

    Note that the output format is compatible with the Linux
    'md5sum' program's '-c' option.

    Arguments:
      filen: file to compute the MD5 sum for
      output_file: (optional) name of file to write MD5 sum to

    Returns:
      Zero on success, 1 if errors were encountered

    """
    # Assume success until an error is encountered
    retval = 0
    if output_file:
        fp = open(output_file,'w')
    else:
        fp = sys.stdout
    try:
        chksum = Md5sum.md5sum(filen)
        fp.write("%s  %s\n" % (chksum,filen))
    except IOError as ex:
        # Error accessing file, report and skip
        logging.error("%s: error while generating MD5 sum: '%s'" % (filen,ex))
        retval = 1
    finally:
        # Only close files opened here (never close stdout)
        if output_file:
            fp.close()
    return retval
def write_checksums(project, pattern=None, filen=None, relative=True):
    """Write MD5 checksums for fastq files in an AnalysisProject

    One line is written per fastq returned by 'get_fastqs', in
    the form "<md5sum>  <name>".

    Arguments:
      project: AnalysisProject instance
      pattern: if supplied then use the supplied pattern
        to filter fastqs based on filename
      filen: if supplied then checksums will be written
        to this file; otherwise they will be written to
        stdout (default)
      relative: if True (default) then fastq file names
        will be the basename; otherwise they will be the
        full paths.

    """
    if filen:
        # Write to the requested output file
        fp = open(filen, 'w')
    else:
        fp = sys.stdout
    try:
        for _sample_name, _fastq, fq in get_fastqs(project, pattern=pattern):
            if relative:
                name = os.path.basename(fq)
            else:
                name = fq
            fp.write("%s  %s\n" % (md5sum.md5sum(fq), name))
    finally:
        # Close the output file even if a checksum fails, but
        # leave stdout open
        if filen:
            fp.close()
Exemple #6
0
def diff_files(filen1,filen2,verbose=False):
    """Compare two files using their MD5 sums

    The MD5 sum of each file is computed and the two sums are
    compared with each other.

    Arguments:
      filen1: "source" file
      filen2: "target" file to be compared with filen1
      verbose: (optional) if True then print a message giving
        the outcome of the comparison; otherwise print nothing

    Returns:
      Zero on success, 1 if errors were encountered

    """
    # Compare the files and record the outcome against filen1
    check_reporter = Md5sum.Md5CheckReporter()
    outcome = Md5sum.Md5Checker.md5cmp_files(filen1,filen2)
    check_reporter.add_result(filen1,outcome)
    if verbose:
        # Pick the message matching the recorded outcome
        if check_reporter.n_ok:
            message = "OK: MD5 sums match"
        elif check_reporter.n_failed:
            message = "FAILED: MD5 sums don't match"
        else:
            message = "ERROR: unable to compute one or both MD5 sums"
        print(message)
    return check_reporter.status
Exemple #7
0
def compute_md5sum_for_file(filen,output_file=None):
    """Calculate the MD5 sum of a file and write it out

    The checksum and the file name are written as a single
    line, either to stdout or to the named output file.

    The line format is compatible with the '-c' option of
    the Linux 'md5sum' program.

    Arguments:
      filen: file to compute the MD5 sum for
      output_file: (optional) name of file to write MD5 sum to

    Returns:
      Zero on success, 1 if errors were encountered

    """
    # Choose the destination: a new text file, or stdout
    out = io.open(output_file,'wt') if output_file else sys.stdout
    status = 0
    try:
        out.write(u"%s  %s\n" % (Md5sum.md5sum(filen),filen))
    except IOError as ex:
        # Problem reading the input (or writing the output):
        # log it and flag the failure
        logging.error("%s: error while generating MD5 sum: '%s'" % (filen,ex))
        status = 1
    # Only close a file that was opened here
    if output_file:
        out.close()
    return status
def write_checksums(project,pattern=None,filen=None,relative=True):
    """Write MD5 checksums for fastq files in an AnalysisProject

    A line of the form "<md5sum>  <name>" is written for each
    fastq returned by 'get_fastqs'.

    Arguments:
      project: AnalysisProject instance
      pattern: if supplied then use the supplied pattern
        to filter fastqs based on filename
      filen: if supplied then checksums will be written
        to this file; otherwise they will be written to
        stdout (default)
      relative: if True (default) then fastq file names
        will be the basename; otherwise they will be the
        full paths.

    """
    if filen:
        # Output goes to the named file
        fp = open(filen,'w')
    else:
        fp = sys.stdout
    try:
        for _sample_name,_fastq,fq in get_fastqs(project,pattern=pattern):
            if relative:
                name = os.path.basename(fq)
            else:
                name = fq
            fp.write("%s  %s\n" % (md5sum.md5sum(fq),name))
    finally:
        # Make sure the output file is closed on errors too;
        # stdout is left open
        if filen:
            fp.close()
Exemple #9
0
def diff_directories(dirn1,dirn2,verbose=False):
    """Compare the contents of two directories using MD5 sums

    MD5 sums are computed for the contents of the first
    directory and then checked against the second.

    (In effect the compute and verify steps are performed
    together as a single operation.)

    If the two directories don't hold the same set of files
    then the outcome depends on which one is given first.
    For files common to both, the MD5 sums themselves are
    the same whatever the order.

    Arguments:
      dirn1: "source" directory
      dirn2: "target" directory to be compared to dirn1
      verbose: (optional) if True then report status for all
        files checked; otherwise only report summary

    Returns:
      Zero on success, 1 if errors were encountered

    """
    # Run the comparison and pass the results to a reporter
    comparison = Md5sum.Md5Checker.md5cmp_dirs(dirn1,dirn2)
    check_reporter = Md5sum.Md5CheckReporter(comparison,verbose=verbose)
    # Summary is only printed in verbose mode
    if verbose:
        check_reporter.summary()
    return check_reporter.status
def copy_to_dest(f, dirn, chksum=None, link=False):
    """Copy a file into a local or remote directory

    An exception is raised if the source or destination
    can't be found, or if the copy operation fails.

    When 'chksum' is supplied an MD5 check is made after the
    copy, and an exception is raised if it doesn't match. For
    a remote destination the check is made by running
    'md5sum -c' on the remote system.

    Arguments:
      f: file to copy (must be local)
      dirn: target directory, either local or of the form
        "[user@]host:dir"
      chksum: (optional) MD5 sum of the original file
        to match against the copy
      link: (optional) if True then hard link files
        instead of copying

    """
    # Both ends must exist before attempting the copy
    if not exists(f):
        raise Exception("'%s': not found" % f)
    if not exists(dirn):
        raise Exception("'%s': destination not found" % dirn)
    copy(f, dirn, link=link)
    # Nothing more to do unless verification was requested
    if chksum is None:
        return
    user, host, dest = utils.split_user_host_dir(dirn)
    if host is None:
        # Local destination
        # NOTE(review): this hashes the source file 'f' rather
        # than the copy in 'dirn' - confirm this is intended
        if md5sum.md5sum(f) != chksum:
            raise Exception("MD5 checksum failed for copy of %s" % f)
        return
    # Remote destination: run md5sum -c on the remote system
    try:
        remote_file = os.path.join(dest, os.path.basename(f))
        check_cmd = applications.general.ssh_command(
            user, host,
            ('echo', '"%s  %s"' % (chksum, remote_file),
             '|', 'md5sum', '-c'))
        print("Running %s" % check_cmd)
        check_cmd.run_subprocess()
    except Exception as ex:
        raise Exception("Failed to copy %s to %s: %s" % (f, dirn, ex))
Exemple #11
0
def print_md5sums(library):
    """Calculate and print md5sums for primary data files in library

    This will generate a list of md5sums that can be passed to the
    md5sum program to check against a copy of the runs using

    md5sum -c CHECKSUMS

    Only the F3 csfasta file is handled by this function. The file
    name is printed after being passed through 'strip_prefix' with
    the current working directory. Any error is logged rather than
    raised.

    Arguments:
      library: SolidLibrary instance.
    """
    # F3 primary data
    try:
        # Single-argument print() call is valid under both
        # Python 2 and Python 3
        print("%s  %s" % (Md5sum.md5sum(
            library.csfasta), strip_prefix(library.csfasta, os.getcwd())))
    except Exception as ex:
        logging.error("FAILED for F3 csfasta: %s" % ex)
def print_md5sums(library):
    """Calculate and print md5sums for primary data files in library

    This will generate a list of md5sums that can be passed to the
    md5sum program to check against a copy of the runs using

    md5sum -c CHECKSUMS

    This function only deals with the F3 csfasta file; the name
    is printed after 'strip_prefix' has been applied using the
    current working directory. Errors are logged, not raised.

    Arguments:
      library: SolidLibrary instance.
    """
    # F3 primary data
    try:
        # print() with one argument works on Python 2 and 3
        print("%s  %s" % (Md5sum.md5sum(library.csfasta),
                          strip_prefix(library.csfasta,os.getcwd())))
    except Exception as ex:
        logging.error("FAILED for F3 csfasta: %s" % ex)
def copy_to_dest(f,dirn,chksum=None):
    """Copy a file to a local or remote destination

    Raises an exception if the copy operation fails.

    If 'chksum' argument is supplied then an MD5 check is
    also made against this and an exception is raised if
    this fails to match. For remote destinations the check
    is done by running 'md5sum -c' on the remote system.

    Arguments:
      f: file to copy (must be local)
      dirn: target directory, either local or of the form
        "[user@]host:dir"
      chksum: (optional) MD5 sum of the original file
        to match against the copy

    """
    if not os.path.exists(f):
        raise Exception("File %s doesn't exist" % f)
    user,host,dest = utils.split_user_host_dir(dirn)
    remote = (host is not None)
    if not remote:
        # Local copy
        shutil.copy(f,dirn)
        if chksum is not None:
            # NOTE(review): this hashes the source 'f' rather than
            # the copy made in 'dirn' - confirm this is intended
            if md5sum.md5sum(f) != chksum:
                raise Exception("MD5 checksum failed for copy of %s" % f)
    else:
        # Remote copy
        try:
            scp = applications.general.scp(user,host,f,dest)
            # Single-argument print() is valid on Python 2 and 3
            print("Running %s" % scp)
            scp.run_subprocess()
            # Run md5sum -c on the remote system
            if chksum is not None:
                md5sum_check = applications.general.ssh_command(
                    user,host,
                    ('echo',
                     '"%s  %s"' % (chksum,
                                   os.path.join(dest,os.path.basename(f))),
                    '|','md5sum','-c'))
                print("Running %s" % md5sum_check)
                md5sum_check.run_subprocess()
        except Exception as ex:
            # Any failure of the copy or the remote check is
            # reported as a failed copy
            raise Exception("Failed to copy %s to %s: %s" % (f,dirn,ex))
def copy_to_dest(f, dirn, chksum=None):
    """Copy a file to a local or remote destination

    Raises an exception if the copy operation fails.

    If 'chksum' argument is supplied then an MD5 check is
    also made against this and an exception is raised if
    this fails to match. For a remote destination the
    check runs 'md5sum -c' on the remote system.

    Arguments:
      f: file to copy (must be local)
      dirn: target directory, either local or of the form
        "[user@]host:dir"
      chksum: (optional) MD5 sum of the original file
        to match against the copy

    """
    if not os.path.exists(f):
        raise Exception("File %s doesn't exist" % f)
    user, host, dest = utils.split_user_host_dir(dirn)
    remote = (host is not None)
    if not remote:
        # Local copy
        shutil.copy(f, dirn)
        if chksum is not None:
            # NOTE(review): the checksum is taken from the source
            # 'f', not from the copy in 'dirn' - confirm intended
            if md5sum.md5sum(f) != chksum:
                raise Exception("MD5 checksum failed for copy of %s" % f)
    else:
        # Remote copy
        try:
            scp = applications.general.scp(user, host, f, dest)
            # print() with one argument works on Python 2 and 3
            print("Running %s" % scp)
            scp.run_subprocess()
            # Run md5sum -c on the remote system
            if chksum is not None:
                md5sum_check = applications.general.ssh_command(
                    user, host,
                    ('echo', '"%s  %s"' %
                     (chksum, os.path.join(dest, os.path.basename(f))), '|',
                     'md5sum', '-c'))
                print("Running %s" % md5sum_check)
                md5sum_check.run_subprocess()
        except Exception as ex:
            # Failures in either the copy or the remote check
            # are reported as a failed copy
            raise Exception("Failed to copy %s to %s: %s" % (f, dirn, ex))
Exemple #15
0
    This will generate a list of md5sums that can be passed to the
    md5sum program to check against a copy of the runs using

    md5sum -c CHECKSUMS

    Arguments:
      library: SolidLibrary instance.
    """
    # F3 primary data
    try:
        print "%s  %s" % (Md5sum.md5sum(
            library.csfasta), strip_prefix(library.csfasta, os.getcwd()))
    except Exception, ex:
        logging.error("FAILED for F3 csfasta: %s" % ex)
    try:
        print "%s  %s" % (Md5sum.md5sum(
            library.qual), strip_prefix(library.qual, os.getcwd()))
    except Exception, ex:
        logging.error("FAILED for F3 qual: %s" % ex)
    # F5 primary data
    if library.parent_sample.parent_run.is_paired_end:
        try:
            print "%s  %s" % (Md5sum.md5sum(library.csfasta_f5),
                              strip_prefix(library.csfasta_f5, os.getcwd()))
        except Exception, ex:
            logging.error("FAILED for F5 csfasta: %s" % ex)
        try:
            print "%s  %s" % (Md5sum.md5sum(
                library.qual_f5), strip_prefix(library.qual_f5, os.getcwd()))
        except Exception, ex:
            logging.error("FAILED for F5 qual: %s" % ex)
Exemple #16
0
    This will generate a list of md5sums that can be passed to the
    md5sum program to check against a copy of the runs using

    md5sum -c CHECKSUMS

    Arguments:
      library: SolidLibrary instance.
    """
    # F3 primary data
    try:
        print "%s  %s" % (Md5sum.md5sum(library.csfasta),
                          strip_prefix(library.csfasta,os.getcwd()))
    except Exception,ex:
        logging.error("FAILED for F3 csfasta: %s" % ex)
    try:
        print "%s  %s" % (Md5sum.md5sum(library.qual),
                          strip_prefix(library.qual,os.getcwd()))
    except Exception,ex:
        logging.error("FAILED for F3 qual: %s" % ex)
    # F5 primary data
    if library.parent_sample.parent_run.is_paired_end:
        try:
            print "%s  %s" % (Md5sum.md5sum(library.csfasta_f5),
                              strip_prefix(library.csfasta_f5,os.getcwd()))
        except Exception,ex:
            logging.error("FAILED for F5 csfasta: %s" % ex)
        try:
            print "%s  %s" % (Md5sum.md5sum(library.qual_f5),
                              strip_prefix(library.qual_f5,os.getcwd()))
        except Exception,ex:
            logging.error("FAILED for F5 qual: %s" % ex)
Exemple #17
0
        total_size = 0
        for fq in fastqs:
            fsize = os.lstat(fq).st_size
            total_size += fsize
            print "%s\t%s" % (os.path.basename(fq),
                              bcf_utils.format_file_size(fsize))
        print "Total: %s" % bcf_utils.format_file_size(total_size)
        # Generate MD5 checksum file
        if not options.dry_run:
            tmpdir = tempfile.mkdtemp(suffix='checksums.md5',
                                      dir=os.getcwd())
            md5_file = os.path.join(tmpdir,'checksums.md5')
            print "Generating MD5 sums in %s" % md5_file
            fp = open(md5_file,'w')
            for fq in fastqs:
                chksum = Md5sum.md5sum(fq)
                fp.write("%s  %s\n" % (chksum,os.path.basename(fq)))
            fp.close()
        # Copy the fastqs
        print "Copying fastqs"
        for fq in fastqs:
            print "%s" % os.path.basename(fq)
            if not options.dry_run:
                copy_to_dest(fq,args[1])
        if not options.dry_run:
            print "Copying MD5 checksum file"
            copy_to_dest(md5_file,args[1])
            shutil.rmtree(tmpdir)


    This will generate a list of md5sums that can be passed to the
    md5sum program to check against a copy of the runs using

    md5sum -c CHECKSUMS

    Arguments:
      library: SolidLibrary instance.
    """
    # F3 primary data
    try:
        print "%s  %s" % (Md5sum.md5sum(library.csfasta),
                          strip_prefix(library.csfasta,os.getcwd()))
    except Exception,ex:
        logging.error("FAILED for F3 csfasta: %s" % ex)
    try:
        print "%s  %s" % (Md5sum.md5sum(library.qual),
                          strip_prefix(library.qual,os.getcwd()))
    except Exception,ex:
        logging.error("FAILED for F3 qual: %s" % ex)
    # F5 primary data
    if library.parent_sample.parent_run.is_paired_end:
        try:
            print "%s  %s" % (Md5sum.md5sum(library.csfasta_f5),
                              strip_prefix(library.csfasta_f5,os.getcwd()))
        except Exception,ex:
            logging.error("FAILED for F5 csfasta: %s" % ex)
        try:
            print "%s  %s" % (Md5sum.md5sum(library.qual_f5),
                              strip_prefix(library.qual_f5,os.getcwd()))
        except Exception,ex:
            logging.error("FAILED for F5 qual: %s" % ex)
Exemple #19
0
class ArchiveFile(utils.PathInfo):
    """
    Class for storing information about a file

    Extends 'utils.PathInfo' with the file size, timestamp,
    extension and compression type, along with MD5 sums for
    the file and for its uncompressed contents (which are
    only calculated on request via 'get_md5sums').

    """
    def __init__(self, filen):
        """
        Create and populate a new ArchiveFile instance

        Arguments:
          filen: path to the file to store information about

        """
        utils.PathInfo.__init__(self, filen)
        # !!!FIXME should be able to st_size from PathInfo!!!
        self.size = os.lstat(filen).st_size
        self.timestamp = self.mtime
        self.ext, self.compression = get_file_extensions(filen)
        # MD5 sums are populated on demand by get_md5sums
        self.md5 = None
        self.uncompressed_md5 = None

    @property
    def basename(self):
        """
        Return the basename of the file path
        """
        return os.path.basename(self.path)

    @property
    def classifier(self):
        """
        Return classifier for an ArchiveFile object

        Return an indicator consistent with 'ls -F' depending
        on file type:

        / indicates a directory
        @ indicates a link
        * indicates an executable

        Empty string indicates a regular file.
        """
        # Link is tested first, so a link to a directory is
        # reported as a link
        if self.is_link:
            return '@'
        elif self.is_dir:
            return os.sep
        elif self.is_executable:
            return '*'
        return ''

    def get_md5sums(self):
        """
        Generate MD5sums

        Generate and return MD5 sums for the file and
        for the uncompressed contents.

        Sets the 'md5' and 'uncompressed_md5' properties on
        the current instance. Values which have already been
        set are reused rather than being recomputed.

        For 'bz2' and 'gz' compression the contents are
        decompressed on the fly to compute the second sum; for
        any other compression type a warning is logged and
        'uncompressed_md5' is left unset.

        Returns tuple (md5,md5_uncompressed_contents), or
        (None,None) for links and directories.

        """
        if self.is_link or self.is_dir:
            # Ignore links or directories
            return (None, None)
        if self.md5 is None:
            # Generate MD5 sum
            self.md5 = Md5sum.md5sum(self.path)
        if self.uncompressed_md5 is None:
            # Generate MD5 for uncompressed contents
            if not self.compression:
                # Not compressed, so both sums are the same
                self.uncompressed_md5 = self.md5
            elif self.compression == 'bz2':
                # Context manager ensures the stream is closed
                # even if the checksum calculation fails
                with bz2.BZ2File(self.path, 'r') as fp:
                    self.uncompressed_md5 = Md5sum.md5sum(fp)
            elif self.compression == 'gz':
                with gzip.GzipFile(self.path, 'rb') as fp:
                    self.uncompressed_md5 = Md5sum.md5sum(fp)
            else:
                logging.warning("%s: md5sums not implemented for "
                                "compression type '%s'" %
                                (self, self.compression))
        return (self.md5, self.uncompressed_md5)

    def compress(self, dry_run=False):
        """
        Compress the file

        Performs compression using bzip2, and transfers
        the timestamp from the original file to the
        compressed version.

        The compressed data is first written to a temporary
        file in the same directory; the original is only
        replaced once the MD5 sum of the decompressed contents
        has been verified against that of the original. The
        modification time of the parent directory is also
        restored afterwards.

        If 'dry_run' is True then report the compression
        operation but don't perform it.

        Returns status:

        0 indicates success
        -1 indicates nothing to do, no error
        >0 indicates an error

        """
        if self.compression:
            logging.warning("%s: already compressed" % self)
            return -1
        # Check for existing compressed file
        bz2file = self.path + '.bz2'
        if os.path.exists(bz2file):
            logging.warning("%s: compressed copy already exists" % self)
            return -1
        # Get MD5 checksum
        self.get_md5sums()
        checksum = self.md5
        # Capture timestamp for parent directory
        parent_mtime = os.lstat(os.path.dirname(self.path)).st_mtime
        # Compress to a temp file
        bzip2_cmd = applications.Command('bzip2', '-c', self.path)
        # Single-argument print() is valid on Python 2 and 3
        print(bzip2_cmd)
        if dry_run:
            return -1
        fd, tmpbz2 = tempfile.mkstemp(dir=os.path.dirname(self.path),
                                      suffix='.bz2.tmp')
        # Only the file name is used below, so release the open
        # descriptor returned by mkstemp to avoid leaking it
        os.close(fd)
        # Execute the compression command
        # (presumably 'log' captures the bzip2 output in the temp
        # file - confirm against Command.run_subprocess)
        try:
            status = bzip2_cmd.run_subprocess(log=tmpbz2)
        except Exception as ex:
            logging.error("Exception compressing %s: %s" % (self, ex))
            status = 1
        if status != 0:
            logging.error("Compression failed for %s" % self)
        else:
            # Verify the checksum for the contents of the
            # compressed file
            with bz2.BZ2File(tmpbz2, 'r') as fp:
                uncompressed_checksum = Md5sum.md5sum(fp)
            if uncompressed_checksum == checksum:
                # Rename the compressed file, reset the timestamps
                # and remove the source
                os.rename(tmpbz2, bz2file)
                os.utime(bz2file, (self.mtime, self.mtime))
                os.remove(self.path)
                os.utime(os.path.dirname(self.path),
                         (parent_mtime, parent_mtime))
                # Update attributes
                # NOTE(review): within this class '__path' and '__st'
                # are name-mangled to '_ArchiveFile__path' and
                # '_ArchiveFile__st', so these assignments may not
                # update the attributes used by PathInfo - confirm
                self.__path = bz2file
                self.__st = os.lstat(self.__path)
                self.compression = 'bz2'
                # MD5 of the file itself is now stale (the MD5 of
                # the uncompressed contents is unchanged)
                self.md5 = None
            else:
                logging.error("Bad checksum for compressed version of %s" %
                              self)
                status = 1
        # Remove the temp file
        if os.path.exists(tmpbz2):
            os.remove(tmpbz2)
        # Finish
        return status