def _handle_possibly_changed_package(self, lhs_package, rhs_package, modified):
    """Updates an SQL database with changes and schedules new operations to handle those packages"""
    session = self.thread_local.session
    super(SQLPackageDifferMixin, self)._handle_possibly_changed_package(lhs_package, rhs_package, modified)

    if modified:
        log.info("%s changed", rhs_package)
    else:
        log.debug("%s unchanged, stable since %i", rhs_package, rhs_package.stable_since())
    # end handle unchanged

    sql_package = None
    if modified:
        # Update the database with the stable time of the rhs package - it has changed and must be marked as such
        sql_package = session.to_sql_package(rhs_package, rhs_package.stable_since())
        sql_package.stable_since = seconds_to_datetime(rhs_package.stable_since())

        for trans in self._unfinished_transactions_for(session, sql_package.id):
            trans.cancel()
            trans.comment = "canceled as input package was changed before transaction was queued"
        # end for each transaction to cancel
        session.commit()
    # end pass off the package handling as it could be stable

    # We have to re-check every package to see if we can schedule a job on it
    self._handle_possibly_stable_package(rhs_package, session, sql_package)
def _dict_to_bundle_list(self, prefix, bundle_dict):
    """Assure we apply retention to the per-version bundle list"""
    if not self.config.retention_policy and self.config.keep_latest_version_count < 0:
        bundle_list = super(FilteringVersionBundler, self)._dict_to_bundle_list(prefix, bundle_dict)
    else:
        # MARK BUNDLES FOR DELETION
        ###########################
        # NOTE: When using the policy, it is very important that newer versions are also newer regarding the date.
        # This is why we resort to the min_created attribute - the youngest item counts (just in case people overwrite versions)
        bundle_list = self.BundleListType()
        if self.config.retention_policy:
            samples, removed_samples = self.config.retention_policy.filter(
                time(),
                ((seconds_to_datetime(b.min_created), b) for b in self._iter_bundles_in_dict(bundle_dict)),
                ordered=False,
            )
            for t, b in removed_samples:
                b.removed = True
            # end for each sample
            bundle_list.extend(sorted((s[1] for s in chain(samples, removed_samples)), key=lambda b: b.version))
        else:
            bundle_list.extend(self._iter_bundles_in_dict(bundle_dict))
            bundle_list.sort(key=lambda b: b.version)
            # can be negative, yielding nothing to iterate on
            for vid in xrange(len(bundle_list) - self.config.keep_latest_version_count):
                bundle_list[vid].removed = True
            # end for each version to remove
        # end handle policy or plain keep count
    # end handle bundle list conversion
    bundle_list.prefix = prefix
    return bundle_list
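# A standalone sketch (not part of the original module) of the keep-latest-N branch above: sort
# ascending by version, then flag everything except the newest `keep` entries for removal. The
# _Bundle class and _mark_all_but_latest helper are hypothetical stand-ins for the real bundle
# type; only the marking logic mirrors _dict_to_bundle_list.
class _Bundle(object):
    def __init__(self, version):
        self.version = version
        self.removed = False
    # end initializer


def _mark_all_but_latest(bundles, keep):
    """Flag all but the `keep` highest versions; if keep >= len(bundles), nothing is flagged"""
    bundles.sort(key=lambda b: b.version)
    # the range can be empty or negative, yielding nothing to iterate on
    for vid in xrange(len(bundles) - keep):
        bundles[vid].removed = True
    # end for each version to remove
    return bundles

# Example: _mark_all_but_latest([_Bundle(v) for v in (3, 1, 4, 2)], keep=2)
# leaves versions 3 and 4 untouched and sets removed=True on versions 1 and 2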
def _handle_added_package(self, rhs_package):
    """Called to handle a package that was added, compared to the last incarnation of its parent tree"""
    session = self.thread_local.session
    log.info("%s managed", rhs_package)

    # Will possibly create a new instance ...
    sql_package = session.to_sql_package(rhs_package, rhs_package.stable_since())

    # NOTE: Could be a package which is currently being moved ... which just means it's not stable and will
    # soon be put under 'new' management.
    # It can be that the daemon just restarted, but the package existed already with a more useful stable_since
    # date. We will use the one from the database in that case
    if sql_package.stable_since < seconds_to_datetime(rhs_package.stable_since()):
        rhs_package.set_stable_since(datetime_to_seconds(sql_package.stable_since))
    # end handle time conversion

    # In any case, commit the changes right now (possible addition, changes)
    session.commit()

    # See if we can handle the package already
    self._handle_possibly_stable_package(rhs_package, session, sql_package)
def sample(age):
    # Build a (datetime, payload) sample tuple from an age in seconds, relative to the enclosing `now`
    return seconds_to_datetime(now - age), None
def _append_path_record(self, records, path, streamer, log, ex_stat=None, digest_ratio=None):
    """Append meta-data about the given path to the given list of records
    @param ex_stat if you have obtained the stat already, we will not get it again
    @param digest_ratio if not None, we will use the given digest and ratio instead of creating our own
    @return stat structure of the path, or None if the path could not be read"""
    # minimize file access
    try:
        ascii_path = to_ascii(path)
        stat = ex_stat or lstat(ascii_path)
        if digest_ratio:
            digest, ratio = digest_ratio
        else:
            digest, ratio = None, None
        # end handle digest_ratio

        ldest = None
        fd = None
        if islink(stat.st_mode):
            # Don't follow symlinks as this tricks us into thinking we have duplicates.
            # However, we would also have to check for hardlinks, but tracking those
            # can easily cost too much memory. Hardlinks are rare anyway, so it's okay.
            ldest = unicode(readlink(ascii_path))
        elif isreg(stat.st_mode) and not digest:
            fd = os.open(ascii_path, os.O_RDONLY)
        # end open file
    except OSError:
        log.error("Could not stat or open '%s' - skipping", ascii_path, exc_info=False)
        return None
    # end skip failing file

    if fd is not None:
        try:
            extra_progress = stat.st_size >= self.big_file
            if extra_progress:
                log.info("Streaming %s file at '%s'", int_to_size_string(stat.st_size), ascii_path)
            # end extra logging

            try:
                digest = (
                    streamer.set_stream(lambda size: os.read(fd, size))
                    .set_log(extra_progress and log or None)
                    .stream()
                    .digest()
                )
                ratio = streamer.ratio
            except IOError:
                log.error("Failed to stream file '%s' - skipping", ascii_path, exc_info=True)
                return None
            # end handle io errors gracefully
        finally:
            os.close(fd)
        # end assure we close the file
    # end handle regular file

    try:
        path = unicode(path)
    except Exception:
        log.error("Failed to handle encoding of path '%s' - skipping", ascii_path, exc_info=True)
        return None
    # end ignore unicode conversion errors

    # Symlinks keep a null digest.
    # NOTE: We don't care about their contents - it's just a filename and
    # we don't hash it, as we are not interested in its contents
    records.append(
        {
            "path": path,
            "size": stat.st_size,
            "atime": seconds_to_datetime(stat.st_atime),
            "ctime": seconds_to_datetime(stat.st_ctime),
            "mtime": seconds_to_datetime(stat.st_mtime),
            "uid": stat.st_uid,
            "gid": stat.st_gid,
            "nblocks": stat.st_blocks,
            "nlink": stat.st_nlink,
            "mode": stat.st_mode,
            "ldest": ldest,
            "sha1": digest,
            "ratio": ratio,
        }
    )
    return stat
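# A standalone sketch (not part of the original module) of what the streamer chain in
# _append_path_record amounts to: read an already-open file descriptor in fixed-size chunks and
# feed a sha1. The real HashStreamer additionally lz4-compresses the stream to derive `ratio`,
# which is omitted here; the helper name and chunk size are illustrative only.
import hashlib
import os


def _sha1_of_fd(fd, chunk_size=1024 * 1024):
    """@return raw sha1 digest of the file behind the given descriptor, read without loading it all into memory"""
    digest = hashlib.sha1()
    chunk = os.read(fd, chunk_size)
    while chunk:
        digest.update(chunk)
        chunk = os.read(fd, chunk_size)
    # end while there is data to read
    return digest.digest()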
def _fast_update_database(self, engine, args):
    """Update all data contained in the given engine quickly, see --fast
    @return number of processed records"""
    nr = 0
    st = time()
    log = self.log()
    progress_every = 5000
    stats_info_every = 500
    commit_every_seconds = 30
    commit_every_records = 15000
    time_of_last_commit = time()
    connection = engine.connect()
    meta = MetaData(engine, reflect=True)
    fsitem = meta.tables[args.table_name]
    insert = fsitem.insert()
    update = (
        fsitem.update()
        .where(fsitem.c.id == bindparam("rid"))
        .values(
            path=bindparam("path"),
            size=bindparam("size"),
            atime=bindparam("atime"),
            ctime=bindparam("ctime"),
            mtime=bindparam("mtime"),
            uid=bindparam("uid"),
            gid=bindparam("gid"),
            nblocks=bindparam("nblocks"),
            nlink=bindparam("nlink"),
            mode=bindparam("mode"),
            ldest=bindparam("ldest"),
            sha1=bindparam("sha1"),
            ratio=bindparam("ratio"),
        )
    )

    # NOTE: this selector assures we only get the latest version of a file, based on the highest id per path !
    selector = select(
        [
            fsitem.c.id,
            fsitem.c.path,
            fsitem.c.size,
            fsitem.c.atime,
            fsitem.c.ctime,  # marker to see if something is deleted
            fsitem.c.mtime,
            fsitem.c.uid,
            fsitem.c.gid,
            fsitem.c.nblocks,
            fsitem.c.nlink,
            fsitem.c.mode,
            fsitem.c.ldest,
            fsitem.c.sha1,
            fsitem.c.ratio,
        ],
        order_by=[fsitem.c.path, fsitem.c.id.desc()],
    )

    if args.where_like:
        selector = selector.where(fsitem.c.path.like(args.where_like + "%"))
    # end append where clause

    def progress():
        elapsed = time() - st
        log.info("Checked %i files in %.2fs (%.2f files/s)", nr, elapsed, nr / elapsed)
    # end

    join = os.path.join
    isabs = os.path.isabs
    dirname = os.path.dirname
    basename = os.path.basename
    streamer = HashStreamer(hashlib.sha1, lz4dumps)

    ## A mapping from directory names to all of their entries (as names)
    dir_entries = dict()

    # A list of sql operators that will update particular entries. They are executed all at once.
    # Each must include the ID
    updates = list()
    total_num_updates = 0
    modified_count = 0
    added_count = 0
    deleted_count = 0
    last_path = None

    # The window is critical - it is slow for the server, and each query is like a new complete query
    # where only a subset is sent (due to the ordering).
    # Additionally, if there are many changes, we will change the database during iteration, which will
    # basically give us part of the same files (if not the same files) back on the next query, which
    # makes us even more inefficient. Therefore we use memory to our advantage, and use 1mio entries
    # by default. This needs about 1GB of memory, but reduces the amount of queries considerably,
    # especially on large databases
    window = 1000 * 1000
    cur_window = 0

    shortest_path = None
    len_shortest_path = 100000000

    for cursor in self._fetch_record_iterator(connection, selector, window):
        nri = 0  # num rows in iteration
        for row in cursor:
            # NOTE: We are getting multiple entries for the same path, sorted with the latest one first.
            # We prune all paths of a kind we have seen so far.
            # Can be files or directories
            nri += 1
            nr += 1
            rid, path, size, atime, ctime, mtime, uid, gid, nblocks, nlink, mode, ldest, sha1, ratio = row
            if not isabs(path) or path == last_path:
                continue
            # end skip relative paths !

            last_path = path
            ascii_path = to_ascii(path)

            # NOTE: I know, this is killing us, as we will grow rather large by keeping all that data.
            # But I know no other way except for processing directories while we are going.
            # As files and directories will be mixed, it is not too easy though to figure this out.
            # For now, we just go for it and let the CPU/Memory burn
            directory = dirname(path)
            if directory not in dir_entries:
                dir_entries[directory] = set()
            # end count dirs
            dir_entries[directory].add(basename(path))

            # Make sure we don't forget to set the actual directory - otherwise empty directories
            # would never be checked for additions
            if isdir(mode):
                dir_entries.setdefault(path, set())
            # end add each directory that is a directory

            # Find the root path, which should be the origin of it all, and ignore it when
            # finding added items. It's definitely the shortest one
            if len(directory) < len_shortest_path:
                shortest_path = directory
                len_shortest_path = len(directory)
            # end keep shortest path

            try:
                # For some reason, this doesn't get our unicode as it tries to use ascii to deal with it
                # NOTE: We could know the file was deleted by checking fsitem.c.ctime is None, but
                # we check anyway because it could be re-created.
                stat = lstat(ascii_path)
            except OSError:
                # DELETION
                ##########
                # This marks a deletion - we just keep the time of deletion, which is the time when we
                # noticed it ! Not the actual one.
                # It didn't exist, but only append this info if we didn't know about that before
                if ctime is not None:
                    # have to write an entire record, otherwise changes and deletions go out of sync
                    updates.append(
                        {
                            "rid": rid,
                            "path": path,
                            "size": 0,
                            "atime": atime,
                            "ctime": None,
                            "mtime": seconds_to_datetime(time()),
                            "uid": uid,
                            "gid": gid,
                            "nblocks": nblocks,
                            "nlink": nlink,
                            "mode": mode,
                            "ldest": ldest,
                            # Keep sha as last known contents ! This allows to track the file even
                            # across renames and deletions
                            "sha1": sha1,
                            "ratio": ratio,
                        }
                    )

                    deleted_count += 1
                    if deleted_count % stats_info_every == 0:
                        log.info("Found %i DELETED paths", deleted_count)
                    # end handle deleted
                # end handle deletions
            else:
                # MODIFICATION
                ###############
                # File could have been deleted and re-created.
                # We don't know it was an addition (due to previous deletion), but the dataset is the same,
                # so people can figure it out later
                # ordered by likeliness
                if (
                    seconds_to_datetime(stat.st_mtime) != mtime
                    or size != stat.st_size
                    or uid != stat.st_uid
                    or gid != stat.st_gid
                    or mode != stat.st_mode
                    or nlink != stat.st_nlink
                    or (islink(stat.st_mode) and readlink(ascii_path) != ldest)
                ):
                    # NOTE: we are lazy here and say, for now, that the size must change to justify
                    # taking another sha. Otherwise we assume that it's just any other change, which we will
                    # put into the database in the form of a new commit, of course.
                    if self._append_path_record(
                        updates, path, streamer, log, stat, size == stat.st_size and (sha1, ratio) or None
                    ):
                        # add the rid to have everything we need for the update
                        updates[-1]["rid"] = rid
                        modified_count += 1
                        if modified_count % stats_info_every == 0:
                            log.info("Found %i MODIFIED paths", modified_count)
                        # end show information
                    # end handle modification
                # end handle modification
            # end handle deleted file

            if nr % progress_every == 0:
                progress()
            # end handle progress

            if len(updates) >= commit_every_records or time() - time_of_last_commit >= commit_every_seconds:
                total_num_updates += len(updates)
                self.do_execute_records(connection, update, updates, log, st, total_num_updates)
                time_of_last_commit = time()
            # end handle executions
        # end for each file in database window
        cursor.close()

        # Is the database depleted ?
        if nri < window:
            break
        # end handle window
    # end for each cursor

    progress()
    total_num_updates += len(updates)
    self.do_execute_records(connection, update, updates, log, st, total_num_updates)

    ########################
    # HANDLE ADDITIONS ###
    ####################
    # We iterate all actual directories and their entries as known to the database.
    # Now we just have to compare and only check for additions
    new_records = list()

    def list_dir_safely(dir_ascii):
        """@return entries or an empty tuple() if the listing failed"""
        try:
            return os.listdir(dir_ascii)
        except OSError:
            # ignore added dirs which might already be gone
            log.warn("Couldn't access '%s' when trying to add it", dir_ascii)
            return tuple()
        # end handle exception

    # We can't assign a variable in an outer scope, so we have to make it an array
    last_commit_time = [time()]

    def append_records_recursive(path, added_count):
        """Find all entries recursively in path and append them
        @param path directory or file path
        @return amount of added items"""
        # no matter what, add the entry
        if self._append_path_record(new_records, path, streamer, log):
            added_count += 1
            if added_count % stats_info_every == 0:
                log.info("Found %i ADDED paths", added_count)
            # end info printing
            if len(new_records) >= commit_every_records or time() - last_commit_time[0] >= commit_every_seconds:
                self.do_execute_records(connection, insert, new_records, log, st, added_count)
                last_commit_time[0] = time()
        # end handle path

        path_ascii = to_ascii(path)
        if os.path.isdir(path_ascii):
            entries = list_dir_safely(path_ascii)
            for entry in entries:
                added_count = append_records_recursive(join(path, entry), added_count)
            # end for each entry to check
        # end entries
        return added_count
    # end recursion helper

    # Remove the shortest directory, which was generated from the directory of our root !
    # NOTE: if there was no root, this is a false alarm
    try:
        del dir_entries[shortest_path]
    except KeyError:
        pass
    # end ignore root not in dirlist

    log.info("About to check %i directories for added entries ...", len(dir_entries))
    for dir, entries in dir_entries.iteritems():
        added = set(list_dir_safely(to_ascii(dir))) - entries
        for added_entry in added:
            added_count = append_records_recursive(join(dir, added_entry), added_count)
    # end for each directory to check

    if new_records:
        log.info("Committing remaining %i new records", len(new_records))
        self.do_execute_records(connection, insert, new_records, log, st, added_count)
    # end commit new records

    connection.close()

    elapsed = time() - st
    log.info("== Statistics ==")
    log.info("%5i ADDED", added_count)
    log.info("%5i MODIFIED", modified_count)
    log.info("%5i DELETED", deleted_count)
    log.info("================")
    log.info(
        "Updated %i entries in %.2fs (%.2f entries/s)", total_num_updates, elapsed, total_num_updates / elapsed
    )

    return nr
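# A standalone sketch (not part of the original module) of the bindparam/executemany pattern the
# update statement in _fast_update_database is built for - presumably what do_execute_records
# drives. It runs against a throwaway in-memory SQLite table; the table layout, helper name and
# values are illustrative only.
from sqlalchemy import Column, Integer, MetaData, String, Table, bindparam, create_engine


def _executemany_update_sketch():
    """Compile one UPDATE and execute it once per parameter dict in a single executemany call"""
    engine = create_engine('sqlite://')
    meta = MetaData()
    fsitem = Table('fsitem', meta,
                   Column('id', Integer, primary_key=True),
                   Column('path', String),
                   Column('size', Integer))
    meta.create_all(engine)

    connection = engine.connect()
    connection.execute(fsitem.insert(), [{'path': '/a', 'size': 1}, {'path': '/b', 'size': 2}])

    # Each dict has the same shape as the entries accumulated in `updates`, carrying the row id as 'rid'
    update = fsitem.update().where(fsitem.c.id == bindparam('rid')).values(size=bindparam('size'))
    connection.execute(update, [{'rid': 1, 'size': 10}, {'rid': 2, 'size': 20}])
    connection.close()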
def generate(self):
    now = datetime.now()
    now_time = time()
    rep = self.ReportType()
    rep.columns.extend(self.report_schema)

    policy_string = self._config.policy
    name_like = self._config.name_like
    applied_every_string = self._config.applied_every
    debug = self._config.debug

    if not policy_string:
        # todo: find it from a filesystem property
        log.error('Retention policy is not configured')
        return rep
    # end ignore empty retention

    if not name_like:
        log.error("Please specify name_like to be the name of the file system, like '%foo%'")
        return rep
    # end handle name filter not set

    policy = self.PolicyType(policy_string)
    applied_every_string = applied_every_string or None

    # Find all snapshots, ascending by creation date
    query = self._session.query(ZDataset).\
                filter(ZDataset.avail == None).\
                filter(ZDataset.name.like(self._config.name_like)).\
                order_by(ZDataset.host, ZDataset.creation)

    # sort all results by filesystem
    by_fs_map = dict()
    for ss in host_filter(self._config.hosts, ZDataset.host, query):
        by_fs_map.setdefault((ss.host, ss.filesystem_name()), list()).append((ss.creation, ss))
    # end for each dataset

    def count_samples_in_range(samples, from_date, to_date):
        count = 0
        for ctime, _ in samples:
            if from_date < ctime < to_date:
                count += 1
            elif count:
                break
            # end handle early bailout
        # end for each sample
        return count
    # end brute-force sample-counting utility, doesn't make assumptions about order

    kept_comment = 'kept by policy'
    removed_comment = 'removed by policy'
    summaries = list()  # summary-records

    for (fs_host, fs_name), samples in by_fs_map.iteritems():
        # Apply policy and prepare the actual report
        remaining, deleted = policy.filter(now_time, samples)

        # In debug mode, we want to see it even if there are no deletions.
        # Otherwise this is just a shortcut
        if not debug and not deleted:
            continue
        # end handle empty list

        if debug:
            merged_records = list()
            dset = set(deleted)
            for sample in samples:
                is_deleted = sample in dset
                ctime, ss = sample
                rep.records.append([now - ss.updated_at,
                                    fs_host,
                                    ss.name,
                                    is_deleted and self.TYPE_SNAPSHOT or 'debug',
                                    ss.creation,
                                    now - ss.creation,
                                    ss.used,
                                    0,
                                    100.0,
                                    is_deleted and removed_comment or kept_comment])
            # end for each sample

            # Convert rules into a format that is more easily understood: num-samples:date-ago
            rule_tokens = list()
            total_duration = 0
            to_date = now
            for keep, freq, duration in policy._rules:
                total_duration += duration
                from_date = seconds_to_datetime(now_time - total_duration)
                remaining_count = count_samples_in_range(remaining, from_date, to_date)
                del_count = count_samples_in_range(deleted, from_date, to_date)
                # renders as '(total-removed=kept)/<duration/freq>:<age of range start>'
                rule = '(%i-%i=%i)/%i:%s' % (remaining_count + del_count, del_count, remaining_count,
                                             duration / freq, delta_to_tty_string(now - from_date))
                rule_tokens.append(rule)
                to_date = from_date
            # end for each rule

            summaries.append([now - now,
                              fs_host,
                              fs_name,
                              'debug-' + self.TYPE_SUMMARY,
                              now,
                              now - now,
                              0,
                              0,
                              0,
                              ','.join(rule_tokens)])
        # end adjust record source for debugging
        else:
            for creation_time, ss in deleted:
                rep.records.append([now - ss.updated_at,
                                    fs_host,
                                    ss.name,
                                    self.TYPE_SNAPSHOT,
                                    ss.creation,
                                    now - ss.creation,
                                    ss.used,
                                    0,
                                    100.0,
                                    removed_comment])
        # end handle debug

        summary = "%s - Removing %i of %i snapshots; %i remain, policy-max = %i (+%i)" % \
                  ((policy_string, len(deleted), len(samples), len(remaining)) +
                   policy.num_rule_samples(applied_every_string))
        summaries.append([now - now,
                          fs_host,
                          fs_name,
                          self.TYPE_SUMMARY,
                          now,
                          now - now,
                          0,
                          0,
                          0,
                          summary])
    # end for each filesystem

    # AGGREGATE
    ###########
    self._aggregate_records(rep.records, now)
    rep.records.extend(summaries)
    return rep
def seconds_to_delta_string(date_seconds):
    """@return a string representing the given point in the past (in seconds since the epoch) relative to the current time"""
    # `now` is expected to be a datetime provided by the enclosing scope
    return delta_to_tty_string(now - seconds_to_datetime(date_seconds))
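# A standalone sketch (not part of the original module) of the conversion seconds_to_delta_string
# builds on, expressed with stdlib types only: an epoch timestamp in the past becomes a timedelta
# relative to a reference datetime. delta_to_tty_string then renders such a delta for terminal
# output; its exact format is not reproduced here and the helper name below is hypothetical.
from datetime import datetime
from time import time


def _age_of(date_seconds, reference=None):
    """@return timedelta between the reference datetime (default: now) and the given epoch seconds"""
    reference = reference or datetime.now()
    return reference - datetime.fromtimestamp(date_seconds)

# Example: _age_of(time() - 3600) is roughly datetime.timedelta(seconds=3600)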