def generate_fix_script(self, report, writer):
    last_host = None
    mode = self.settings_value().mode
    for rec in report.records:
        fs, reserve = rec[1], rec[4]
        if isinstance(fs, str):
            continue
        # end skip aggregation records - filesystem is a string in that case
        if last_host and last_host != fs.host:
            writer('# Cannot proceed as the host changed during iteration - please re-run the report with the hosts=name value set\n')
            break
        # end verify we stay on a single host
        if last_host is None:
            writer("# Reservation automation for host '%s'\n" % fs.host)
        # end initial info
        last_host = fs.host
        if reserve < fs.used:
            writer("# Reserve for '%s' is already too low (%s reserved vs %s used), consider increasing its zfs:priority\n"
                   % (fs.url(), int_to_size_string(reserve), int_to_size_string(fs.used)))
        else:
            writer("zfs set %s=%s %s\n" % (mode, int_to_size_string(reserve), fs.name))
        # end handle reserve issue
    # end for each record
    return True
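# A minimal usage sketch for generate_fix_script (the calling instance and the report are
# hypothetical - only the writer contract is taken from the code above: any callable that
# accepts a string works, e.g. sys.stdout.write or an open file's write method):
#
#   import sys
#   handler.generate_fix_script(report, sys.stdout.write)
#
# The emitted script contains one 'zfs set <mode>=<size> <filesystem>' command per record
# whose reserve still covers its used space, and explanatory '#' comments for the rest.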
def progress():
    # Progress callback - nr, totalbcount, st and log are defined in the surrounding scope
    elapsed = time() - st
    log.info(
        "Processed %i files with %s in %.2fs (%.2f files/s | %s MB/s)",
        nr,
        int_to_size_string(totalbcount),
        elapsed,
        nr / elapsed,
        mb(totalbcount) / elapsed,
    )
def _stream_end(self):
    """On-demand progress"""
    super(HashStreamer, self)._stream_end()
    if self._log:
        _mb = mb(self.bytes)
        self._log.info(
            "Done hashing %s in %.2f s (%.2f MB/s)",
            int_to_size_string(self.bytes),
            self.elapsed,
            _mb / self.elapsed,
        )
def _handle_chunk(self, chunk):
    super(HashStreamer, self)._handle_chunk(chunk)
    self._hasher.update(chunk)
    if self._compressor:
        ratio = len(chunk) / float(len(self._compressor(chunk)))
        # handle first chunk
        if self.ratio is None:
            self.ratio = ratio
        else:
            self.ratio = (self.ratio + ratio) / 2.0
        # end compute running average (weights recent chunks more heavily than a true mean)
    # end handle compression
    if self._log:
        self._log.info("Hashed %s", int_to_size_string(self.bytes))
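# Note on the running average above: the recursive update self.ratio = (self.ratio + ratio) / 2.0
# is not an arithmetic mean over all chunks - older chunks decay geometrically. For three chunks
# with per-chunk ratios r1, r2, r3 the result is:
#
#   ((r1 + r2) / 2 + r3) / 2  ==  r1/4 + r2/4 + r3/2
#
# so the most recent chunk always contributes half of the final value. That is usually good
# enough as a rough compressibility estimate, but it is not a size-weighted average of the file.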
def generate(self):
    report = self.ReportType(columns=self.report_schema)
    config = self._sanitize_configuration()
    record = report.records.append
    workers = list()

    def _record_worker_result():
        # poll them, as join will block
        while workers:
            for w in workers[:]:
                if w.is_alive():
                    continue
                # end ignore unfinished workers
                self._error |= w.exception is not None
                record((w.name,
                        w.elapsed_file_generate_read,
                        mb(w.file_size / (w.elapsed_file_generate_read or 1)),
                        w.elapsed_file_generate_write,
                        mb(w.file_size / (w.elapsed_file_generate_write or 1)),
                        w.elapsed_file_generate,
                        w.elapsed_read_volume,
                        w.read_volume,
                        mb(w.read_volume / (w.elapsed_read_volume or 1)),
                        w.elapsed_write_volume and mb(w.read_volume / w.elapsed_write_volume) or 0,
                        w.exception))
                workers.remove(w)
            # end for each worker
            sleep(0.5)
        # end while we have workers to check
        record(report.aggregate_record())
    # end utility

    print >> sys.stderr, self.configuration()
    print >> sys.stderr, "Creating %s dataset, and a %s %s volume, in %i threads" % \
        (int_to_size_string(config.num_threads * config.file_size),
         int_to_size_string(config.num_threads * config.random_read_volume),
         config.random_writes and 'read and write' or 'read',
         config.num_threads)

    use_mmap = config.num_threads == 1
    if use_mmap:
        print >> sys.stderr, "Using mmap in single-threaded mode, hoping to perfectly work around the system's FS cache"
    # end

    try:
        for wid in range(config.num_threads):
            worker = StressorTerminatableThread(config, use_mmap=use_mmap)
            workers.append(worker)
            worker.start()
        # end for each worker
        _record_worker_result()
    except KeyboardInterrupt:
        print >> sys.stderr, "Sending cancellation request to all workers"
        for worker in workers:
            worker.cancel()
        # end for each worker
        print >> sys.stderr, "Waiting for workers to finish - they will stop as soon as possible"
        _record_worker_result()
    # end handle keyboard interrupt
    return report
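# Design note on _record_worker_result above: workers are polled via is_alive() with a 0.5s
# sleep instead of join(), so each worker's result is recorded as soon as that worker finishes,
# and the main thread stays responsive to KeyboardInterrupt, which a plain join() can block on
# Python 2. Each finished worker contributes exactly one record, and the aggregate record is
# appended once all of them are gone.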
def _append_path_record(self, records, path, streamer, log, ex_stat=None, digest_ratio=None):
    """Append meta-data about the given path to the given list of records
    @param ex_stat if you have obtained the stat already, we will not get it again
    @param digest_ratio if not None, we will use the given digest and ratio instead of creating our own
    @return stat structure of the path, or None if the path could not be read"""
    # minimize file access
    try:
        ascii_path = to_ascii(path)
        stat = ex_stat or lstat(ascii_path)
        if digest_ratio:
            digest, ratio = digest_ratio
        else:
            digest, ratio = None, None
        # end handle digest_ratio

        ldest = None
        fd = None
        if islink(stat.st_mode):
            # Don't follow symlinks as this tricks us into thinking we have duplicates.
            # However, we would also have to check for hardlinks, but tracking those
            # can easily cost too much memory. Hardlinks are rare anyway, so it's okay.
            ldest = unicode(readlink(ascii_path))
        elif isreg(stat.st_mode) and not digest:
            fd = os.open(ascii_path, os.O_RDONLY)
        # end open file
    except OSError:
        log.error("Could not stat or open '%s' - skipping", ascii_path, exc_info=False)
        return None
    # end skip failing file

    if fd is not None:
        try:
            extra_progress = stat.st_size >= self.big_file
            if extra_progress:
                log.info("Streaming %s file at '%s'", int_to_size_string(stat.st_size), ascii_path)
            # end extra logging
            try:
                digest = (streamer.set_stream(lambda size: os.read(fd, size))
                                  .set_log(extra_progress and log or None)
                                  .stream()
                                  .digest())
                ratio = streamer.ratio
            except IOError:
                log.error("Failed to stream file '%s' - skipping", ascii_path, exc_info=True)
                return None
            # end handle io errors gracefully
        finally:
            os.close(fd)
        # end assure we close the file
    # end handle regular file streaming

    try:
        path = unicode(path)
    except Exception:
        log.error("Failed to handle encoding of path '%s' - skipping", ascii_path, exc_info=True)
        return None
    # end ignore unicode conversion errors

    # Symlinks keep a null digest - we never hash their contents, as ldest already stores
    # the link target and that is all we care about.
    records.append({
        "path": path,
        "size": stat.st_size,
        "atime": seconds_to_datetime(stat.st_atime),
        "ctime": seconds_to_datetime(stat.st_ctime),
        "mtime": seconds_to_datetime(stat.st_mtime),
        "uid": stat.st_uid,
        "gid": stat.st_gid,
        "nblocks": stat.st_blocks,
        "nlink": stat.st_nlink,
        "mode": stat.st_mode,
        "ldest": ldest,
        "sha1": digest,
        "ratio": ratio,
    })
    return stat
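# A minimal calling sketch (the surrounding walk loop, 'paths', 'log' and the streamer
# construction are assumptions; the method signature is the one defined above):
#
#   records = list()
#   streamer = HashStreamer(...)    # construction details depend on the class' __init__
#   for path in paths:
#       self._append_path_record(records, path, streamer, log)
#   # end for each path
#
# Every successful call appends one dict with path, size, timestamps, ownership, link target,
# sha1 digest and compression ratio; unreadable paths are logged and skipped.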
def _build_database(self, config):
    """@return our database, ready to be used. It will be a list of (prefix, VersionBundleList) pairs.
    Will load from a cache or from an sql database (building the cache in the process)"""
    if not config.cache_path:
        if not config.table:
            raise AssertionError("Please set either db_url and table or the cache_path to specify a data source")
        config.cache_path = self._cache_path(config.table)
        print >>sys.stderr, "Will use default cache at %s" % config.cache_path
    else:
        print >>sys.stderr, "Will attempt to use cache at %s" % config.cache_path
    # end handle cache_path

    db = None
    # prefer to use a cache
    if config.cache_path and config.cache_path.isfile():
        # LOAD EXISTING CACHE
        ######################
        st = time()
        db = self._deserialize_db(config.cache_path)
        elapsed = time() - st
        cstat = config.cache_path.stat()
        print >>sys.stderr, "Loaded cache of size %s from %s in %fs (%fMB/s)" % (
            int_to_size_string(cstat.st_size),
            config.cache_path,
            elapsed,
            (cstat.st_size / elapsed) / (1024 ** 2),
        )
    # end try loading cache
    elif config.db_url and config.table:
        # BUILD CACHE FROM DATABASE
        ############################
        print >>sys.stderr, "reading from database at '%s/%s'" % (config.db_url, config.table)
        engine = create_engine(config.db_url)
        mcon = engine.connect()
        md = MetaData(engine, reflect=True)

        if config.table not in md.tables:
            raise AssertionError("Table named '%s' didn't exist in database at '%s'" % (config.table, config.db_url))
        # end verify table exists

        progress_every = 40000

        def record_iterator():
            c = md.tables[config.table].c
            selector = select(
                [c.path, c.size, c.ctime, c.mtime, c.mode, c.ratio],
                (c.ctime != None) & (c.mtime != None) & (c.sha1 != None),
            ).order_by(c.path)
            st = time()
            for rid, row in enumerate(mcon.execute(selector)):
                if rid % progress_every == 0:
                    elapsed = time() - st
                    print >>sys.stderr, "Read %i records in %fs (%f records/s)" % (rid, elapsed, rid / (elapsed or 1))
                # end handle progress
                yield (row[0], (row[1], to_s(row[2]), to_s(row[3]), row[4], row[5] or 1.0))
            # end for each row
        # end record iterator

        st = time()
        db = FilteringVersionBundler(config).bundle(record_iterator())
        print >>sys.stderr, "Extracted %i version bundles in %fs" % (len(db), time() - st)

        # store cache file
        st = time()
        cpath = self._cache_path(config.table)
        csize = self._serialize_db(db, cpath)
        elapsed = time() - st
        print >>sys.stderr, "Wrote cache with size %s to '%s' in %fs (%f MB/s)" % (
            int_to_size_string(csize),
            cpath,
            elapsed,
            (csize / elapsed) / 1024 ** 2,
        )
    # end obtain raw database
    else:
        raise AssertionError(
            "Could not build cache database - set db_url and table, or cache_path, or just table to reuse a default cache from a previous run"
        )
    # end handle cache or db url

    # REBUILD RAW CACHE
    ####################
    # finally, rebuild and filter into our actual structure
    st = time()
    db = FilteringVersionBundler(config).rebuild_bundle(db)

    def prefix(t):
        return t[0]
    # end prefix getter

    def key_factory(attr):
        def meta_get(t):
            return getattr(t[1], attr)
        return meta_get
    # end factory

    # SORT INTO FLAT LIST
    #####################
    key_fun = config.sort_by == self.report_schema[0][0] and prefix or key_factory(config.sort_by)
    db = sorted(db.iteritems(), key=key_fun, reverse=config.sort_order == self.ORDER_DESC)
    elapsed = time() - st
    print >>sys.stderr, "Filtered database in %fs" % elapsed
    return db
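# Shape of the value returned by _build_database, following its docstring (prefixes are
# illustrative; the bundle type comes from FilteringVersionBundler.rebuild_bundle()):
#
#   [
#       (u'some/prefix/a', <VersionBundleList>),
#       (u'some/prefix/b', <VersionBundleList>),
#       ...
#   ]
#
# i.e. a flat list of (prefix, bundle) pairs, ordered by config.sort_by and reversed when
# config.sort_order == self.ORDER_DESC.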