Example 1
def hashlist_generate(srcpath, opts, source_mode=True, existing_hashlist=None):
    '''
    Generate the hashlist for the given path.

    srcpath - the top-level directory
    opts    - the optparse options dict

    Return a list of FileHash objects representing objects in the srcpath
    filesystem.

    If opts.trim_path is True, strip the srcpath prefix from each filename
    in the hashlist. This makes it easier to work with relative paths.

    opts.no_ignore_dirs and opts.no_ignore_files disable the default
    behaviour, which is to ignore common dirs (CVS, .git, .svn) and files
    (*~, *.swp).

    '''

    log.debug("hashlist_generate: srcpath %s source_mode %s", srcpath,
              source_mode)
    if os.path.exists(srcpath):
        if not os.path.isdir(srcpath):
            raise NonDirFoundAtDirLocationError(
                "'%s' found but is not a directory" % srcpath)
    else:
        os.mkdir(srcpath)

    # If we're the target, defer reading file contents until we're asked to
    # compare. That way, if we're trusting the mtime, we may not have to read
    # the file at all.
    defer_fs_read = not opts.always_checksum

    lookup_existing = None
    source_extramsg = ''

    # If we have an existing hashfile, also defer reading the file, as we may
    # be able to avoid that if we trust mtimes.
    if existing_hashlist is not None:
        lookup_existing = hashlist_to_dict(existing_hashlist)
        defer_fs_read = True
        source_extramsg = ' (with cache)'

    hashlist = get_hashlist(opts)

    if not opts.quiet:
        if source_mode:
            print("Scanning source filesystem%s" % (source_extramsg))
        else:
            print("Comparing local filesystem to signature file%s" %
                  (source_extramsg))

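    # Pick the verb used in progress and verbose messages below.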
    if opts.progress and source_mode:
        verb = "Add"
    else:
        verb = "Scan"

    re_globmatch = re.compile(r'[*?\[\]]')

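    # Split user-supplied exclusions into literal names and glob patterns;
    # anything containing *, ?, [ or ] is matched as a glob later.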
    if opts.exclude_dir:
        excdirs = {d for d in opts.exclude_dir
                   if not re_globmatch.search(d)}
        excdirs_glob = {d for d in opts.exclude_dir if d not in excdirs}
    else:
        excdirs = set()
        excdirs_glob = set()

    ##
    # Walk the filesystem.
    ##
    for root, dirs, files in os.walk(srcpath):

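        # Path of root relative to srcpath (drop the prefix and the
        # following separator).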
        relroot = root[len(srcpath) + 1:]

        if log.isEnabledFor(logging.DEBUG):
            log.debug("os.walk: root %s dirs %s files %s", root, dirs, files)

        # See if the directory list can be pruned.
        # XXX refactor.

        if not opts.no_ignore_dirs:

            copydirs = dirs[:]  # Don't iterate over a list we may change.

            for dirname in copydirs:
                fulldirname = os.path.join(relroot, dirname)
                for di in dirignore:
                    # dirignore is a regex anchored to the start - need to
                    # use the short dirname, e.g. 'CVS', as opposed to the
                    # long dirname, 'stuff/CVS'.
                    if di.search(dirname):
                        if source_mode and opts.verbose:
                            print("Skipping ignore-able dir %s" % dirname)
                        dirs.remove(dirname)
                        log.debug("Exclude dir '%s' full path '%s'", dirname,
                                  fulldirname)

        # Likewise, handle the user's exclusions. This makes the assumption
        # that the list of exclusions will not be much larger than the list of
        # directories.

        if opts.exclude_dir:
            done_skip = False
            # Don't iterate over a list we'll be changing inside the loop.
            copydirs = dirs[:]
            for dirname in copydirs:
                fulldirname = os.path.join(relroot, dirname)

                if is_dir_excluded(fulldirname, excdirs, excdirs_glob):
                    log.debug("Exclude dir '%s' full path '%s'", dirname,
                              fulldirname)
                    dirs.remove(dirname)
                    done_skip = True

            if done_skip:
                log.debug("dirs now %s", dirs)

        # Handle directories.
        for n, dirname in enumerate(dirs, start=1):
            fpath = os.path.join(root, dirname)
            fh = FileHash.init_from_file(fpath,
                                         trim=opts.trim_path,
                                         root=srcpath,
                                         defer_read=defer_fs_read)
            if opts.progress:
                print("D: %s dir %s (dir-in-dir %d/%d)" %
                      (verb, fpath, n, len(dirs)))
            elif opts.verbose:
                print("%s dir: %s" % (verb, fpath))
            hashlist.append(fh)

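        # Sort so files are scanned and reported in a deterministic order.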
        files.sort()

        for n, filename in enumerate(files, start=1):

            fpath = os.path.join(root, filename)

            # Don't include hashfiles or lockfiles.
            if is_hashfile(filename,
                           custom_hashfile=opts.hash_file,
                           guess_sigfiles=opts.guess_sigfiles):
                log.debug("Skipping hash file or lock '%s'", filename)
                continue

            skipped = False
            if not opts.no_ignore_files:
                for fi in fileignore:
                    if fi.search(fpath):
                        if source_mode and opts.verbose:
                            print("Ignore:  %s" % fpath)
                        skipped = True
                        break

            if skipped:
                continue

            log.debug("Add file: %s", fpath)

            if opts.progress:
                print("F: %s [dir %s] file %s (file-in-dir %d/%d)" %
                      (verb, root, filename, n, len(files)))
            elif opts.verbose:
                print("%s file: %s" % (verb, fpath))

            fh = FileHash.init_from_file(fpath,
                                         trim=opts.trim_path,
                                         root=srcpath,
                                         defer_read=defer_fs_read)

            if not opts.always_checksum and fh.is_file:
                # Attempt to bypass the checksum, if the old HSYNC.SIG has it.
                do_checksum = True

                if lookup_existing is not None:
                    if fh.fpath in lookup_existing:
                        oldfh = lookup_existing[fh.fpath]
                        log.debug("'%s': Found old entry (%s)", fh.fpath,
                                  repr(oldfh))
                        if fh.safe_to_skip(oldfh):
                            do_checksum = False
                            fh.inherit_attributes(oldfh)

                if do_checksum:
                    log.debug("'%s': fall back to reading file", fh.fpath)
                    fh.read_file_contents()

            log.debug("'%s': Adding to hash list", fh.fpath)
            assert fh.hashstr != fh.notsethash
            hashlist.append(fh)

    if opts.scan_debug:
        _scan_debug(hashlist)

    log.debug("hashlist_generate: entries %d", len(hashlist))
    return hashlist
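
For illustration, a minimal usage sketch (not part of the original source):
it builds an optparse-style options object carrying the attributes that
hashlist_generate() reads, then scans a tree. The attribute names are taken
from the function body above; the SimpleNamespace stand-in and the example
path are assumptions, and get_hashlist() may consult options not shown here.

from types import SimpleNamespace

# Hypothetical options object; a real caller passes optparse's parsed opts.
opts = SimpleNamespace(
    trim_path=True,         # store paths relative to srcpath
    always_checksum=False,  # trust mtimes where safe
    no_ignore_dirs=False, no_ignore_files=False,
    exclude_dir=['build'],  # literal names and glob patterns both allowed
    quiet=True, progress=False, verbose=False,
    hash_file='HSYNC.SIG', guess_sigfiles=False,
    scan_debug=False)

hashes = hashlist_generate('/srv/tree', opts)
for fh in hashes:
    print(fh.fpath)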
Example 2
def hashlist_check(dstpath,
                   src_hashlist,
                   opts,
                   existing_hashlist=None,
                   opportunistic_write=False,
                   opwrite_path=None,
                   source_side=False):
    '''
    Check the dstpath against the provided hashlist.

    Return a tuple (needed, notneeded, dst_hashlist), where needed is a list
    of FileHash objects that must be fetched from the source, notneeded is a
    list of FileHash objects present on the target but absent from the source
    (and so may be removed), and dst_hashlist is a list of FileHash objects
    for the destination path.
    '''

    log.debug("hashlist_check():")

    src_fdict = hashlist_to_dict(src_hashlist)

    # Take the simple road. Generate a hashlist for the destination.
    dst_hashlist = hashlist_generate(dstpath,
                                     opts,
                                     source_mode=False,
                                     existing_hashlist=existing_hashlist)

    no_compress = bool(source_side)

    if opportunistic_write:
        assert opwrite_path is not None
        sigfile_write(dst_hashlist,
                      opwrite_path,
                      opts,
                      use_tmp=True,
                      verb='Caching scanned',
                      no_compress=no_compress)

    dst_fdict = hashlist_to_dict(dst_hashlist)

    re_globmatch = re.compile(r'[*?\[\]]')

    if opts.exclude_dir:
        direx = {d for d in opts.exclude_dir
                 if not re_globmatch.search(d)}
        direx_glob = {d for d in opts.exclude_dir if d not in direx}
    else:
        direx = set()
        direx_glob = set()

    # Now compare the two dictionaries.
    needed = get_hashlist(opts)
    excluded_dirs = set()

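    # Resolve any user/group name overrides to uid/gid defaults.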
    mapper = UidGidMapper()
    if opts.set_user:
        mapper.set_default_name(opts.set_user)
    if opts.set_group:
        mapper.set_default_group(opts.set_group)

    for fpath, fh in src_fdict.items():

        # Accumulate the expected byte total for transfer statistics.
        if not fh.is_dir and fh.size_is_known:
            opts.stats.bytes_total += fh.size

        assert fpath == fh.fpath

        # Process exclusions.

        filename = os.path.basename(fpath)
        if filename != '' and \
                is_hashfile(filename, custom_hashfile=opts.hash_file,
                            guess_sigfiles=opts.guess_sigfiles):
            log.debug("needed: skipping hash file or lock '%s'", filename)
            continue

        if is_path_pre_excluded(fpath, excluded_dirs):
            continue

        if fh.is_dir:
            if is_dir_excluded(fpath, direx, direx_glob, excluded_dirs):
                log.debug("Dir '%s' excluded", fpath)
                continue

        # If the user overrode ownership, apply the overrides here.
        if opts.set_user:
            fh.uid = mapper.default_uid
            fh.user = mapper.default_name

        if opts.set_group:
            fh.gid = mapper.default_gid
            fh.group = mapper.default_group

        if fpath in dst_fdict:

            if not src_fdict[fpath].compare(
                    dst_fdict[fpath],
                    ignore_mode=opts.ignore_mode,
                    trust_mtime=(not opts.always_checksum)):
                log.debug("%s: needed", fpath)
                # Store a reference to the object at the destination.
                # This can be used to update the dest's HSYNC.SIG file and
                # save on rechecks.
                fh.associated_dest_object = dst_fdict[fpath]
                needed.append(fh)

        else:
            log.debug("%s: needed", fpath)
            fh.dest_missing = True
            needed.append(fh)

    not_needed = get_hashlist(opts)
    for fpath, fh in dst_fdict.items():

        filename = os.path.basename(fpath)
        if filename != '' and \
                is_hashfile(filename, custom_hashfile=opts.hash_file,
                            guess_sigfiles=opts.guess_sigfiles):
            log.debug("not_needed: skipping hash file or lock '%s'", filename)
            continue

        if fpath not in src_fdict:
            log.debug("%s: not found in source", fpath)
            not_needed.append(fh)

    if opts.check_debug:
        _check_debug(needed, not_needed)

    return (needed, not_needed, dst_hashlist)
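
Again for illustration, a hedged sketch of consuming the tuple that
hashlist_check() returns. The extra option attributes (stats, ignore_mode,
check_debug, set_user, set_group) are inferred from the function body above
rather than from any documented API, and the stats holder is an assumption.

# Extend the opts object from the previous sketch; hashlist_check() also
# re-runs hashlist_generate() on the destination internally.
opts.stats = SimpleNamespace(bytes_total=0)
opts.ignore_mode = False
opts.check_debug = False
opts.set_user = None    # no ownership override
opts.set_group = None

src_hashes = hashlist_generate('/srv/src', opts)
needed, not_needed, dst_hashes = hashlist_check('/srv/dst', src_hashes, opts)
print("fetch %d, may remove %d" % (len(needed), len(not_needed)))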