Example #1
0
    def match(self):
        # Find, for every match target, the other database entries that share its
        # (hash, size), optionally re-verifying local candidates on disk first.
        #
        # NOTE(review): only the first part of this method is visible here. The
        # `try` opened below has no `except`/`finally` in view and the excerpt
        # stops inside the duplicate loop, so what happens with each surviving
        # match (and with the `matches` list) cannot be confirmed from here.

        # Record matches (hash,size) pairs that were already processed, so a
        # group of identical files is only handled once.
        matches_done = set()

        # Iterate over targets; self.hash() yields (target, hash, db) tuples.
        # NOTE(review): the loop variable `hash` shadows the builtin and is not
        # read in the visible code (target_data.hash is used instead).
        try:
            for target, hash, db in self.hash(targets=self.settings.match_targets):
                # Not used in the visible part of the method.
                matches = []

                # Find row data for the target's true path; skip when there is none
                target_data = db.path_get_prime(target.true)
                if not target_data:
                    log.debug(PREFIX_SKIP + "%r (unable to find data)" % target.user)
                    continue

                # Already reported? Skip targets whose (hash, size) was seen before
                if (target_data.hash, target_data.size) in matches_done:
                    log.debug(PREFIX_SKIP + "%r (already reported match)" % target.user)
                    continue
                matches_done.add((target_data.hash, target_data.size))

                # Search for duplicates of the target (same hash and size)
                for match_data in db.path_select_duplicates(
                    path=target_data.path, hash=target_data.hash, size=target_data.size
                ):
                    if not match_data.is_remote:
                        # local match: with match_verify enabled, re-check the file on
                        # disk instead of relying on the stored row
                        if self.settings.match_verify:
                            # NOTE(review): `except X, ex` is Python 2-only syntax; both
                            # handlers below do exactly the same thing.
                            try:
                                match_stat = os.lstat(match_data.path)
                            except OSError, ex:
                                log.debug(PREFIX_SKIP + "%r (unable to lstat match): %s" % (match_data.path, ex))
                                continue
                            except IOError, ex:
                                log.debug(PREFIX_SKIP + "%r (unable to lstat match): %s" % (match_data.path, ex))
                                continue

                            if not stat.S_ISREG(match_stat.st_mode):
                                log.debug(PREFIX_SKIP + "%r (not a regular file)" % match_data.path)
                                continue  # No longer a regular file

                            # Verify/update hash: ask the database first; when it returns
                            # None, compute the hash with build_hash and store a
                            # successful result via db.path_setstat
                            match_hash = db.path_hash(match_data.path, match_stat)
                            if match_hash == None:
                                match_hash = build_hash(Walker.Target(match_data.path, match_data.path, match_stat))
                                if match_hash != None:
                                    db.path_setstat(match_data.path, match_stat, match_hash)
                            if match_hash == None:
                                log.debug(PREFIX_SKIP + "%r (unable to determine hash)" % match_data.path)
                                continue

                            # update data with the freshly observed hash/size/mtime
                            # (match_data is presumably a namedtuple-style row, given
                            # the use of _replace -- TODO confirm)
                            match_data = match_data._replace(
                                hash=match_hash, size=match_stat.st_size, time=match_stat.st_mtime, mark=db.mark
                            )
                            if (match_data.hash != target_data.hash) or (match_data.size != target_data.size):
                                log.debug(PREFIX_SKIP + "%r (files no longer match)" % match_data.path)
                                continue  # skip if its no longer identical

                        # Reached for local matches (after passing verification when
                        # it is enabled)
                        log.verbose("comp %s" % match_data.path)
Example #2
0
    def hash(self, targets=None):
        """Walk the targets and yield every file whose hash is known.

        This is a generator. For each target produced by the walker, the hash
        is looked up with ``db.path_hash``; when that returns None it is
        computed with ``build_hash`` and, on success, stored through
        ``db.path_setstat``. Targets whose hash is still None are skipped.

        Database changes are committed once the time or change-count
        threshold has been exceeded since the previous commit, and always
        once more when the generator finishes or is closed.

        Args:
            targets: Targets to walk. When falsy, ``self.settings.hash_targets``
                is used instead.

        Yields:
            ``(target, target_hash, db)`` tuples. Nothing is yielded when the
            database cannot be opened.
        """
        # Setup logging
        log.setLevel(self.settings.verbosity)

        # Display configuration
        log.debug("%s" % self.settings)

        # Setup database
        log.debug("* setup database's...")
        db = HashDatabase2(self.settings.database)
        db.extend_locals(self.settings.databases_locals)
        if not db.open():
            return

        # Setup the walker
        log.debug("* setup walker...")
        walker = Walker()
        walker.walk_depth = self.settings.walk_depth
        walker.extend_targets(targets or self.settings.hash_targets)
        walker.extend_skip_fstypes(self.settings.skip_fstypes)
        walker.extend_skip_paths(self.settings.skip_paths)
        walker.extend_skip_names(self.settings.skip_names)
        walker.extend_skip_dirnames(self.settings.skip_dirnames)
        walker.extend_skip_filenames(self.settings.skip_filenames)
        walker.skip_mounts = self.settings.skip_mounts
        walker.skip_binds = self.settings.skip_binds
        walker.skip_symlinks = self.settings.skip_symlinks

        log.debug("* walk...")

        try:
            # Baseline for the commit throttle: time and change count at the
            # last commit (or at the start of the walk).
            start_time = time.time()
            start_changes = db.connection.total_changes
            for target in walker.walk():
                target_hash = db.path_hash(target.true, target.stat)
                if target_hash is None:
                    # No hash available from the database: compute and store it
                    target_hash = build_hash(target)
                    if target_hash is not None:
                        db.path_setstat(target.true, target.stat, target_hash)
                if target_hash is not None:
                    yield (target, target_hash, db)

                # Only commit every so often since we are limited by disk speed
                now = time.time()
                if ((now - start_time) >= THRESHHOLD_TIMEDELTA) or (
                    (db.connection.total_changes - start_changes) > THRESHHOLD_CHANGEDELTA
                ):
                    log.debug("* committing changes...")
                    db.connection.commit()
                    # Restart the baseline only after an actual commit.
                    # Resetting it on every iteration would compare the
                    # thresholds against a single iteration's delta, so the
                    # periodic commit would practically never trigger.
                    start_time = time.time()
                    start_changes = db.connection.total_changes
        finally:
            # Always flush pending changes, including when the consumer stops
            # iterating early or an error propagates out of the walk.
            log.debug("* committing changes...")
            db.connection.commit()