def read(self, bs=2 * 2**20):
    chunk = self.src.read(bs)

    if self.stat() != self.src_meta:
        # Bail out if file changes while it's being hashed
        self.q(
            'UPDATE files SET dirty = 1,'
            ' last_skip = ? WHERE path = ?',
            (time(), self.meta['path']) )
        return 0

    if chunk:
        self.src_checksum.update(chunk)
        self.fadvise(len(chunk))
    else:
        digest = self.src_checksum.digest()
        size, ctime, mtime = self.src_meta

        if self.meta['checksum'] != digest:  # either new hash or changes
            if self.meta['checksum'] is not None:
                # can still be intentional change w/ reverted mtime
                if max( abs(self.meta['ctime'] - ctime),
                        abs(self.meta['mtime'] - mtime) ) >= 1:
                    self.log.info(force_unicode(
                        'Detected change in'
                        ' file contents and ctime: {}'.format(self.meta['path']) ))
                else:  # bitrot!!!
                    self.log.error(force_unicode(
                        'Detected'
                        ' unmarked changes: {}'.format(self.meta['path']) ))

        # Update with last-seen metadata,
        #  regardless of what was set in metadata_check()
        self.q(
            'UPDATE files SET dirty = 0, clean = 1,'
            ' size = ?, mtime = ?, ctime = ?, checksum = ?, last_scrub = ?,'
            ' last_skip = NULL WHERE path = ?',
            (size, mtime, ctime, digest, time(), self.meta['path']) )

    return len(chunk)
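# Not part of this excerpt: read() above compares self.stat() against the
#  snapshot taken in __init__ and unpacks it as (size, ctime, mtime).
#  A minimal sketch of what such a stat() helper is assumed to look like,
#  illustrative only, not necessarily the author's implementation:
def stat(self):
    fstat = os.fstat(self.src.fileno())
    return fstat.st_size, fstat.st_ctime, fstat.st_mtime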
def get_file_to_scrub(self, skip_for=3 * 3600, skip_until=0):
    while True:
        query_base = 'SELECT * FROM files WHERE generation = ?'\
            ' AND (last_skip IS NULL OR last_skip < ?) {} ORDER BY last_scrub LIMIT 1'
        query_params = [self.generation, skip_until]

        # First try to hash not-yet-seen files
        with self._cursor(query_base.format('AND checksum IS NULL'), query_params) as c:
            row = c.fetchone()
        if not row:
            # Then dirty (changed) files
            with self._cursor(query_base.format('AND dirty = 1'), query_params) as c:
                row = c.fetchone()
        if not row:
            # Then just not-yet-checked for this generation
            with self._cursor(query_base.format('AND clean = 0'), query_params) as c:
                row = c.fetchone()
        if not row and skip_until == 0:
            # Then try to find a path that was skipped a while ("skip_for") ago
            skip_until = time() - skip_for
            if skip_until != 0:
                return self.get_file_to_scrub(skip_until=skip_until)
        if not row:
            return  # nothing more/yet to check

        try:
            src = open(row['path'])
        except (IOError, OSError):
            self._log.debug(force_unicode(
                'Failed to open'
                ' scanned path, skipping it: {}'.format(row['path']) ))
            self.drop_file(row['path'])
            continue

        return FileNode(
            self._query, self._log, src, row,
            checksum=self._checksum, use_fadvise=self._use_fadvise )
def __init__(self, query_func, log, src, row, checksum, use_fadvise=True):
    self.q, self.log, self.meta, self.src = query_func, log, row, src
    self.log.debug(force_unicode('Checking file: {}'.format(row['path'])))

    self.src_meta, self.src_checksum = self.stat(), checksum()

    self.src_fadvise = bool(use_fadvise)
    if use_fadvise\
            and use_fadvise is not True\
            and isinstance(use_fadvise, (int, long)):
        self.src_fadvise_bs = use_fadvise
    self.fadvise(seq=True)
@contextmanager  # used as "with self._cursor(...) as c:" in get_file_to_scrub()
def _cursor(self, query, params=tuple(), **kwz):
    if self._log_sql:
        self._log.debug(force_unicode(
            'Query: {!r}, data: {!r}'.format(query, params) ))
    try:
        with closing(self._db.execute(query, params, **kwz)) as c:
            yield c
    finally:
        # Commit once enough queries or enough time has accumulated since the last one
        self._db_seq, ts = self._db_seq + 1, time()
        if (self._db_ts_limit and (ts - self._db_ts) >= self._db_ts_limit)\
                or (self._db_seq_limit and self._db_seq >= self._db_seq_limit):
            self._db.commit()
            self._db_seq = 0
            self._db_ts = ts
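# Glue not shown in this excerpt: FileNode receives query_func (stored as
#  self.q) to run plain UPDATE statements. Assuming it is a thin wrapper
#  around _cursor() above, a minimal sketch could look like this; the body
#  here is an assumption, only the _query name appears in the excerpt:
def _query(self, query, params=tuple()):
    # Run the statement and discard the cursor;
    #  commit policy is handled inside _cursor()
    with self._cursor(query, params): pass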
def file_list(paths, xdev=True, path_filter=list()):
    _check_filters = ft.partial(check_filters, filters=path_filter)
    paths = set(it.imap(realpath, paths))
    log = logging.getLogger('bitrot_scrubber.walk')

    while paths:
        path_base = paths.pop()
        try:
            path_base_dev = os.stat(path_base).st_dev
        except (OSError, IOError):
            log.info(force_unicode('Unable to access scrub-path: {}'.format(path_base)))
            continue

        for p, dirs, files in os.walk(path_base, topdown=True):
            if xdev and p not in paths and os.stat(p).st_dev != path_base_dev:
                log.info(force_unicode('Skipping mountpoint: {}'.format(p)))
                while dirs:
                    dirs.pop()  # don't descend into anything here
                continue
            paths.discard(p)

            i_off = 0
            for i, name in list(enumerate(dirs)):
                path = join(p, name)
                # Filtered-out dirs won't be descended into
                if not _check_filters(path + '/'):
                    del dirs[i - i_off]
                    i_off += 1  # original list just became shorter

            for name in files:
                path = join(p, name)
                if not _check_filters(path):
                    continue
                try:
                    fstat = os.lstat(path)
                except (IOError, OSError):  # file vanished
                    log.info(force_unicode('Failed to stat path: {}'.format(path)))
                    continue
                if not stat.S_ISREG(fstat.st_mode):
                    continue
                yield path, fstat
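# check_filters() is referenced above but not part of this excerpt. One
#  plausible shape, assumed here purely for illustration: path_filter as a
#  list of (allow, regexp) pairs, first match wins, anything unmatched is
#  allowed by default. Directory paths get a trailing "/" before the check.
def check_filters(path, filters):
    for allow, regexp in filters:
        if regexp.search(path): return bool(allow)
    return True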
def scrub( paths, meta_db, xdev=True, path_filter=list(),
        scan_only=False, resume=False, skip_for=3 * 3600,
        bs=4 * 2**20, rate_limits=None ):
    log = logging.getLogger('bitrot_scrubber.scrub')

    meta_db.set_generation(new=not resume)
    log.info('Scrub generation number: {}'.format(meta_db.generation))

    if not resume:
        scan_limit = getattr(rate_limits, 'scan', None)
    if not scan_only:
        read_limit = getattr(rate_limits, 'read', None)
    ts_scan = ts_read = 0  # deadline for the next iteration

    file_node = None  # currently scrubbed (checksummed) file

    if not resume:
        ## Scan
        for path, fstat in file_list(paths, xdev=xdev, path_filter=path_filter):
            log.debug(force_unicode('Scanning path: {}'.format(path)))

            # Bumps generation number on path as well, to facilitate cleanup
            meta_db.metadata_check( path,
                size=fstat.st_size, mtime=fstat.st_mtime, ctime=fstat.st_ctime )

            # Scan always comes first, unless hits the limit
            if not scan_limit:
                continue
            ts, delay = time(), scan_limit.send(1)
            if not delay:
                continue
            ts_scan = ts + delay

            while True:
                if ts >= ts_scan:
                    break  # get back to scan asap

                if not scan_only and not file_node:  # pick next node
                    file_node = meta_db.get_file_to_scrub(skip_for=skip_for)
                if ts_scan < ts_read or not file_node:
                    delay = ts_scan - ts
                    if delay > 0:
                        # log.debug('Rate-limiting delay (scan): {:.1f}s'.format(delay))
                        sleep(delay)
                    break

                bs_read = file_node.read(bs)
                if not bs_read:  # done with this one
                    file_node.close()
                    file_node = None
                ts = time()

                if read_limit:
                    delay = read_limit.send(bs_read)
                    if delay:
                        ts_read = ts + delay
                        if ts_read < ts_scan:
                            # log.debug('Rate-limiting delay (read): {:.1f}s'.format(delay))
                            sleep(delay)
                            ts = time()

        ## Drop all meta-nodes for files with old generation
        meta_db.metadata_clean()

    if scan_only:
        return

    ## Check the rest of non-clean files in this gen
    while True:
        if not file_node:
            file_node = meta_db.get_file_to_scrub(skip_for=skip_for)
        if not file_node:
            break

        bs_read = file_node.read(bs)
        if not bs_read:
            file_node.close()
            file_node = None

        if read_limit:
            delay = read_limit.send(bs_read)
            if delay:
                # log.debug('Rate-limiting delay (read): {:.1f}s'.format(delay))
                sleep(delay)
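# rate_limits above is assumed to expose .scan and .read as primed generators:
#  gen.send(units) returns either a falsy value (proceed immediately) or a
#  delay in seconds to sleep before continuing. A minimal token-bucket sketch
#  along those lines, illustrative only, not the author's implementation:
def token_bucket(rate, burst):
    # rate: units per second, burst: bucket capacity in units
    tokens, ts = burst, time()
    units = yield  # primed with a plain next() before the first send()
    while True:
        ts_now = time()
        tokens = min(burst, tokens + (ts_now - ts) * rate)
        ts = ts_now
        tokens -= units
        # a negative balance translates into a delay for the caller
        units = yield (None if tokens >= 0 else -tokens / rate)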