Example #1
	def read(self, bs=2 * 2**20):
		chunk = self.src.read(bs)
		if self.stat() != self.src_meta:
			# Bail out if file changes while it's being hashed
			self.q( 'UPDATE files SET dirty = 1,'
				' last_skip = ? WHERE path = ?', (time(), self.meta['path']) )
			return 0
		if chunk:
			self.src_checksum.update(chunk)
			self.fadvise(len(chunk))
		else:
			digest = self.src_checksum.digest()
			size, ctime, mtime = self.src_meta
			if self.meta['checksum'] != digest: # either no stored hash yet or contents changed
				if self.meta['checksum'] is not None: # can still be an intentional change w/ reverted mtime
					if max(abs(self.meta['ctime'] - ctime), abs(self.meta['mtime'] - mtime)) >= 1:
						self.log.info(force_unicode( 'Detected change in'
							' file contents and ctime/mtime: {}'.format(self.meta['path']) ))
					else: # bitrot!!!
						self.log.error(force_unicode( 'Detected'
							' unmarked changes: {}'.format(self.meta['path']) ))
			# Update with last-seen metadata,
			#  regardless of what was set in metadata_check()
			self.q( 'UPDATE files SET dirty = 0, clean = 1,'
					' size = ?, mtime = ?, ctime = ?, checksum = ?, last_scrub = ?,'
					' last_skip = NULL WHERE path = ?',
				(size, mtime, ctime, digest, time(), self.meta['path']) )
		return len(chunk)
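
A minimal sketch of how the read() method above might be driven (the drain_node helper is hypothetical; Example #6 shows the actual rate-limited loop):

def drain_node(node, bs=2 * 2**20):
	# Keep hashing chunks until read() reports completion (or a detected mid-hash change)
	while True:
		if not node.read(bs):
			node.close()
			break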
Example #2
	def get_file_to_scrub(self, skip_for=3 * 3600, skip_until=0):
		while True:
			query_base = 'SELECT * FROM files WHERE generation = ?'\
				' AND (last_skip IS NULL OR last_skip < ?) {} ORDER BY last_scrub LIMIT 1'
			query_params = [self.generation, skip_until]
			# First try to hash not-yet-seen files
			with self._cursor(query_base.format('AND checksum IS NULL'), query_params) as c:
				row = c.fetchone()
			if not row:
				# Then dirty (changed) files
				with self._cursor(query_base.format('AND dirty = 1'), query_params) as c:
					row = c.fetchone()
			if not row:
				# Then just not-yet-checked for this generation
				with self._cursor(query_base.format('AND clean = 0'), query_params) as c:
					row = c.fetchone()
			if not row and skip_until == 0:
				# Then retry paths that were skipped more than "skip_for" seconds ago
				skip_until = time() - skip_for
				continue # re-run the queries above with the relaxed last_skip cutoff
			if not row: return # nothing more/yet to check
			try: src = open(row['path'], 'rb') # binary mode, since the contents get hashed
			except (IOError, OSError):
				self._log.debug(force_unicode( 'Failed to open'
					' scanned path, skipping it: {}'.format(row['path']) ))
				self.drop_file(row['path'])
				continue
			return FileNode( self._query, self._log, src, row,
				checksum=self._checksum, use_fadvise=self._use_fadvise )
Example #3
	def __init__(self, query_func, log, src, row, checksum, use_fadvise=True):
		self.q, self.log, self.meta, self.src = query_func, log, row, src
		self.log.debug(force_unicode('Checking file: {}'.format(row['path'])))
		self.src_meta, self.src_checksum = self.stat(), checksum()

		self.src_fadvise = bool(use_fadvise)
		# use_fadvise can also be an integer, overriding the fadvise block size
		if ( use_fadvise and use_fadvise is not True
				and isinstance(use_fadvise, (int, long)) ):
			self.src_fadvise_bs = use_fadvise
		self.fadvise(seq=True)
Example #4
	@contextmanager # needed since this generator is used via 'with' (see Example #2)
	def _cursor(self, query, params=tuple(), **kwz):
		if self._log_sql:
			self._log.debug(force_unicode('Query: {!r}, data: {!r}'.format(query, params)))
		try:
			with closing(self._db.execute(query, params, **kwz)) as c: yield c
		finally:
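			# Commit in batches - either after enough queries or after enough time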
			self._db_seq, ts = self._db_seq + 1, time()
			if (self._db_ts_limit and (ts - self._db_ts) >= self._db_ts_limit)\
					or (self._db_seq_limit and self._db_seq >= self._db_seq_limit):
				self._db.commit()
				self._db_seq = 0
			self._db_ts = ts
Example #5
def file_list(paths, xdev=True, path_filter=list()):
	_check_filters = ft.partial(check_filters, filters=path_filter)
	paths = set(it.imap(realpath, paths))
	log = logging.getLogger('bitrot_scrubber.walk')

	while paths:
		path_base = paths.pop()
		try: path_base_dev = os.stat(path_base).st_dev
		except (OSError, IOError):
			log.info(force_unicode('Unable to access scrub-path: {}'.format(path_base)))
			continue

		for p, dirs, files in os.walk(path_base, topdown=True):
			if xdev and p not in paths and os.stat(p).st_dev != path_base_dev:
				log.info(force_unicode('Skipping mountpoint: {}'.format(p)))
				while dirs: dirs.pop() # don't descend into anything here
				continue
			paths.discard(p)

			# Filtered-out dirs won't be descended into - os.walk (topdown=True)
			#  respects in-place changes to the "dirs" list
			dirs[:] = list( name for name in dirs
				if _check_filters(join(p, name) + '/') )

			for name in files:
				path = join(p, name)
				if not _check_filters(path): continue
				try: fstat = os.lstat(path)
				except (IOError, OSError): # file vanished
					log.info(force_unicode('Failed to stat path: {}'.format(path)))
					continue
				if not stat.S_ISREG(fstat.st_mode): continue
				yield path, fstat
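
A minimal way to drive file_list() above (the paths and logging setup here are purely illustrative):

if __name__ == '__main__':
	logging.basicConfig(level=logging.INFO)
	# Walk a couple of hypothetical roots, staying on their filesystems (xdev=True)
	for path, fstat in file_list(['/srv/media', '/home/user'], xdev=True):
		print path, fstat.st_size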
Example #6
def scrub( paths, meta_db,
		xdev=True, path_filter=list(), scan_only=False, resume=False,
		skip_for=3 * 3600, bs=4 * 2**20, rate_limits=None ):
	log = logging.getLogger('bitrot_scrubber.scrub')

	meta_db.set_generation(new=not resume)
	log.info('Scrub generation number: {}'.format(meta_db.generation))

	scan_limit = getattr(rate_limits, 'scan', None) if not resume else None
	read_limit = getattr(rate_limits, 'read', None) if not scan_only else None
	ts_scan = ts_read = 0 # deadlines for the next scan/read iteration

	file_node = None # currently scrubbed (checksummed) file

	if not resume:
		## Scan
		for path, fstat in file_list(paths, xdev=xdev, path_filter=path_filter):
			log.debug(force_unicode('Scanning path: {}'.format(path)))
			# Bumps generation number on the path as well, to facilitate cleanup
			meta_db.metadata_check( path,
				size=fstat.st_size, mtime=fstat.st_mtime, ctime=fstat.st_ctime )

			# Scan always comes first, unless it hits the rate limit
			if not scan_limit: continue
			ts, delay = time(), scan_limit.send(1)
			if not delay: continue
			ts_scan = ts + delay

			while True:
				if ts >= ts_scan: break # get back to scan asap

				if not scan_only and not file_node: # pick next node
					file_node = meta_db.get_file_to_scrub(skip_for=skip_for)
				if ts_scan < ts_read or not file_node:
					delay = ts_scan - ts
					if delay > 0:
						# log.debug('Rate-limiting delay (scan): {:.1f}s'.format(delay))
						sleep(delay)
					break

				bs_read = file_node.read(bs)
				if not bs_read: # done with this one
					file_node.close()
					file_node = None
				ts = time()

				if read_limit:
					delay = read_limit.send(bs_read)
					if delay:
						ts_read = ts + delay
						if ts_read < ts_scan:
							# log.debug('Rate-limiting delay (read): {:.1f}s'.format(delay))
							sleep(delay)
							ts = time()

		## Drop all meta-nodes for files with an old generation number
		meta_db.metadata_clean()
		if scan_only: return

	## Check the rest of the non-clean files in this generation
	while True:
		if not file_node: file_node = meta_db.get_file_to_scrub(skip_for=skip_for)
		if not file_node: break
		bs_read = file_node.read(bs)
		if not bs_read:
			file_node.close()
			file_node = None
		if read_limit:
			delay = read_limit.send(bs_read)
			if delay:
				# log.debug('Rate-limiting delay (read): {:.1f}s'.format(delay))
				sleep(delay)
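
scrub() only assumes that rate_limits (if passed) has .scan and .read attributes whose send(units) method returns a delay in seconds (falsy when no throttling is needed). A minimal token-bucket coroutine matching that protocol might look like this (illustrative sketch, not the project's actual limiter):

from time import time

def token_bucket(rate, burst):
	'rate - units per second, burst - bucket size; send(units) -> delay in seconds (0 - go ahead)'
	tokens, ts = float(burst), time()
	units = yield # prime with next() before the first send()
	while True:
		ts_now = time()
		tokens = min(float(burst), tokens + (ts_now - ts) * rate)
		ts, tokens = ts_now, tokens - units
		units = yield (-tokens / rate if tokens < 0 else 0)

class RateLimits(object): pass
rate_limits = RateLimits()
rate_limits.scan = token_bucket(300, 300) # ~300 metadata checks per second
rate_limits.read = token_bucket(5 * 2**20, 16 * 2**20) # ~5 MiB/s of hashing
next(rate_limits.scan); next(rate_limits.read) # prime both coroutines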