def scan(self, ctx, prev_num):
  self.compute_stats()
  #
  # Check if we have encountered this file during this scan already
  #
  ctx.num_visited_files_reporter.increment(1)
  ctx.current_scanned_file_reporter.set(self.path())

  if self.scan_hlink(ctx):
    logging.info("File %s: HLINK" % self.path())
    return

  #
  # Check if the file is the same as in one of the upper levels
  #
  if self.scan_prev(ctx, prev_num):
    logging.debug("File %s: PREV" % self.path())
    ctx.num_prev_files_reporter.increment(1)
    return

  # --- File not yet in database, process it
  file_size = 0
  packer = PackerStream.PackerOStream(self.backup, Container.CODE_DATA)
  handle = open(self.path(), "rb")
  for data in FileIO.read_blocks(handle, self.backup.get_block_size()):
    packer.write(data)
    file_size += len(data)
    ctx.num_total_blocks_reporter.increment(1)
    ctx.size_total_blocks_reporter.increment(len(data))
    ctx.update_scan_status()
  handle.close()

  self.digest = packer.get_digest()
  self.level = packer.get_level()
  self.update_hlink(ctx)

  logging.info("Scanned file %s size:%d new_blocks:%d new_blocks_size:%d" %
      (self.path(), file_size, packer.get_num_new_blocks(),
       packer.get_size_new_blocks()))
  ctx.num_scanned_files_reporter.increment(1)
  if packer.get_num_new_blocks() != 0:
    ctx.num_new_blocks_reporter.increment(packer.get_num_new_blocks())
    ctx.size_new_blocks_reporter.increment(packer.get_size_new_blocks())
    ctx.num_changed_files_reporter.increment(1)
    ctx.changed_files_reporter.append(self.path())

  if file_size > 256 * 1024:
    logging.debug("File %s is big enough to register in cndb" % self.path())
    cndb = self.backup.get_completed_nodes_db()
    assert self.stats is not None
    path_digest = Digest.dataDigest(self.path().encode('utf8'))
    encoded = (self.digest +
        IntegerEncodings.binary_encode_int_varlen(self.level) +
        IntegerEncodings.binary_encode_int_varlen(self.get_type()) +
        serialize_stats(self.get_stats()))

    if not cndb.has_key(path_digest) or cndb[path_digest] != encoded:
      cndb[path_digest] = encoded
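# For reference, a sketch of the completed-nodes-db (cndb) record built
# above; the same layout is written for directories by the directory scan
# and parsed back in scan_prev (field encodings follow the code in this
# file):
#
#   digest  raw bytes, Digest.dataDigestSize() long
#   level   IntegerEncodings.binary_encode_int_varlen
#   type    IntegerEncodings.binary_encode_int_varlen
#   stats   serialize_stats (the rest of the record)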
def scan_prev(self, ctx, prev_num):
  """Check whether this node is unchanged relative to a previous increment
  and, if so, reuse its stats, digest and level.
  Returns True if the previous version was reused."""
  ctx.total_nodes += 1

  if prev_num is None:
    cndb = self.backup.get_completed_nodes_db()
    path_digest = Digest.dataDigest(self.path().encode('utf8'))
    if cndb.has_key(path_digest):
      prev_data_is = StringIO.StringIO(cndb[path_digest])
      prev_digest = prev_data_is.read(Digest.dataDigestSize())
      prev_level = IntegerEncodings.binary_read_int_varlen(prev_data_is)
      prev_type = IntegerEncodings.binary_read_int_varlen(prev_data_is)
      prev_stat = unserialize_stats(prev_data_is)
    else:
      ctx.changed_nodes += 1
      return False
  else:
    prev_type, prev_stat, prev_digest, prev_level = prev_num

  if prev_type != self.get_type():
    logging.info("Node type differs in the db")
  elif prev_stat is None:
    logging.info("Base stat not defined")
  elif self.stats[stat.ST_INO] != prev_stat[stat.ST_INO]:
    logging.info("Inode of %s differs: was %d, now %d" %
        (self.path(), prev_stat[stat.ST_INO], self.stats[stat.ST_INO]))
  elif self.stats[stat.ST_MTIME] != prev_stat[stat.ST_MTIME]:
    logging.info("Mtime of %s differs: %d != %d" %
        (self.path(), self.stats[stat.ST_MTIME], prev_stat[stat.ST_MTIME]))
  elif time.time() - self.stats[stat.ST_MTIME] <= 1.0:
    # The time from the last change is less than the resolution
    # of the time() functions, so the file might still be changing.
    logging.info("File %s too recent %d : %d" %
        (self.path(), prev_stat[stat.ST_MTIME], time.time()))
  else:
    #
    # OK, the prev node seems to be the same as this one.
    # Reuse it.
    #
    self.stats = prev_stat
    self.digest = prev_digest
    self.level = prev_level
    return True

  ctx.changed_nodes += 1
  return False
def save_epoch_data(self):
  # So far, the cache is too resource-intensive.
  # Avoid keeping it persistently until it's better optimized.
  return
  longevity_os = StringIO.StringIO()
  for digest, longevity in self.block_longevity.iteritems():
    longevity_os.write(digest)
    longevity_os.write(IE.binary_encode_int_varlen(longevity))
    epoch = self.block_epoch[digest]
    longevity_os.write(IE.binary_encode_int_varlen(epoch))
  self.block_longevity_data["data"] = longevity_os.getvalue()
  self.block_longevity_data["epoch"] = str(self.epoch)
def load_epoch_data(self):
  # So far, the cache is too resource-intensive.
  # Avoid keeping it persistently until it's better optimized.
  return
  if not self.block_longevity_data.has_key("epoch"):
    self.epoch = 0
    return
  self.epoch = int(self.block_longevity_data["epoch"])
  longevity_os = StringIO.StringIO(self.block_longevity_data["data"])
  while True:
    digest = longevity_os.read(Digest.dataDigestSize())
    if len(digest) == 0:
      break
    longevity = IE.binary_read_int_varlen(longevity_os)
    epoch = IE.binary_read_int_varlen(longevity_os)
    self.block_longevity[digest] = longevity
    self.block_epoch[digest] = epoch
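# For reference, the (currently disabled) persistent cache above would
# store one concatenated record per block, mirroring save_epoch_data and
# load_epoch_data:
#
#   digest     raw bytes, Digest.dataDigestSize() long
#   longevity  IE.binary_encode_int_varlen
#   epoch      IE.binary_encode_int_varlen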
def assign_storage_idx(self):
  storage_idxs = self.get_storage_idxs()
  if storage_idxs == []:
    storage_idx = 0
  else:
    storage_idx = max(storage_idxs) + 1
  idxs_str = IE.binary_encode_int_varlen_list(storage_idxs + [storage_idx])
  self.config_db[self._key("storage_idxs")] = idxs_str
  return storage_idx
def get_storage_idxs(self):
  KEY = self._key("storage_idxs")
  if not self.config_db.has_key(KEY):
    logging.debug("--- Storage manager knows of no storage idxs")
    return []
  idxs_str = self.config_db[KEY]
  storage_idxs = IE.binary_decode_int_varlen_list(idxs_str)
  logging.debug("--- Storage manager knows of idxs: %s" % str(storage_idxs))
  return storage_idxs
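# A minimal usage sketch (hypothetical values and a hypothetical `manager`
# object): with idxs [0, 1] already recorded, assign_storage_idx() persists
# the varlen-encoded list [0, 1, 2] under the "storage_idxs" key and
# returns 2; a later get_storage_idxs() decodes the same list back.
#
#   idx = manager.assign_storage_idx()   # idx == 2
#   manager.get_storage_idxs()           # [0, 1, 2]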
def update_hlink(self, ctx):
  if os.name == 'nt':
    # Inode numbers are not reported on Windows, so hard links
    # cannot be tracked.
    return
  if self.stats[stat.ST_NLINK] == 1:
    return
  inode_num = self.stats[stat.ST_INO]
  if ctx.inodes_db.has_key(inode_num):
    return
  ctx.inodes_db[inode_num] = self.digest + \
      IntegerEncodings.binary_encode_int_varlen(self.level)
def register_sequence(self, storage_idx, sequence_id):
  # Generate a new index for this sequence
  logger_sm.debug("new sequence detected in storage %d: %s" %
      (storage_idx, base64.urlsafe_b64encode(sequence_id)))
  sequence_idx = self.next_seq_idx
  self.next_seq_idx += 1
  self.config_db[self._key("next_seq")] = str(self.next_seq_idx)
  self.config_db[self._key("SEQ." + sequence_id)] = \
      IE.binary_encode_int_varlen_list([storage_idx, sequence_idx])
  self.seq_to_index[sequence_id] = (storage_idx, sequence_idx)
  self.index_to_seq[sequence_idx] = (storage_idx, sequence_id)
def read_directory_entries(self, file):
  while True:
    node_type = Format.read_int(file)
    if node_type is None:
      raise StopIteration
    node_name = Format.read_string(file)
    node_digest = file.read(Digest.dataDigestSize())
    node_level = IntegerEncodings.binary_read_int_varlen(file)
    node_stat = unserialize_stats(file)
    try:
      node_name_decoded = unicode(node_name, 'utf8')
    except UnicodeDecodeError:
      # Skip entries whose names are not valid utf8.
      logging.info("Encountered bad file name in " + self.path())
      continue
    yield (node_type, node_name_decoded, node_stat, node_digest, node_level)
def scan_hlink(self, ctx):
  if os.name == 'nt':
    # Inode numbers are not reported, so we cannot detect hard links.
    return False
  if self.stats[stat.ST_NLINK] == 1:
    logging.debug("File %s has NLINK=1, can't be hard link", self.path())
    return False
  inode_num = self.stats[stat.ST_INO]
  if ctx.inodes_db.has_key(inode_num):
    self.digest = ctx.inodes_db[inode_num][:Digest.dataDigestSize()]
    level_str = ctx.inodes_db[inode_num][Digest.dataDigestSize():]
    self.level = IntegerEncodings.binary_decode_int_varlen(level_str)
    return True
  return False
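# A sketch of the inodes_db record shared by scan_hlink and update_hlink
# (values are hypothetical): the digest occupies the first
# Digest.dataDigestSize() bytes, followed by the varlen-encoded level.
#
#   record = digest + IntegerEncodings.binary_encode_int_varlen(level)
#   digest == record[:Digest.dataDigestSize()]
#   level == IntegerEncodings.binary_decode_int_varlen(
#       record[Digest.dataDigestSize():])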
def decode_container_name(name):
  name_re = re.compile(r"([^.]+)\.([^.]+)\.([^.]+)", re.UNICODE)
  match = name_re.match(name)
  if not match:
    logging.warning("File %s is not a manent container" %
        name.encode('utf8'))
    return (None, None, None)
  try:
    sequence_id = base64.urlsafe_b64decode(match.groups()[0].encode('utf8'))
    index = IE.ascii_decode_int_varlen(match.groups()[1].encode('utf8'))
    extension = match.groups()[2]
    return (sequence_id, index, extension)
  except (TypeError, ValueError):
    # File name unparseable. Can be junk coming from something else.
    logging.warning("File %s is not a manent container" %
        name.encode('utf8'))
    return (None, None, None)
def write(self, ctx):
  """Write the info of the current dir to the database"""
  packer = PackerStream.PackerOStream(self.backup, Container.CODE_DIR)
  # Sorting is an optimization to make everybody access files in the
  # same order.
  # TODO: measure if this really makes things faster
  # (probably will with a btree db)
  for child in self.children:
    Format.write_int(packer, child.get_type())
    Format.write_string(packer, child.get_name().encode('utf8'))
    packer.write(child.get_digest())
    packer.write(IntegerEncodings.binary_encode_int_varlen(
        child.get_level()))
    stats_str = serialize_stats(child.get_stats())
    packer.write(stats_str)
  self.digest = packer.get_digest()
  self.level = packer.get_level()
  return (packer.get_num_new_blocks(), packer.get_size_new_blocks())
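# Each child record emitted by write() above is read back field-for-field
# by read_directory_entries():
#
#   type    Format.write_int         / Format.read_int
#   name    Format.write_string      / Format.read_string (utf8)
#   digest  raw Digest.dataDigestSize() bytes
#   level   binary_encode_int_varlen / binary_read_int_varlen
#   stats   serialize_stats          / unserialize_stats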
def __init__(self, db_manager, txn_manager):
  self.db_manager = db_manager
  self.txn_manager = txn_manager
  self.block_manager = BlockManager.BlockManager(self.db_manager,
      self.txn_manager)
  self.block_sequencer = BlockSequencer.BlockSequencer(self.db_manager,
      self.txn_manager, self, self.block_manager)
  self.report_manager = Reporting.DummyReportManager()
  self.block_listeners = []

  self.config_db = db_manager.get_database_btree("config.db", "storage",
      txn_manager)
  logging.debug("Loaded storage manager db")
  for key, val in self.config_db.iteritems():
    logging.debug("Storage manager db: [%s]->[%s]" %
        (base64.b64encode(key), base64.b64encode(val)))
  self.block_container_db = db_manager.get_database_hash("storage.db",
      "blocks", txn_manager)
  logging.debug("Loaded storage manager blocks db")
  for key, val in self.block_container_db.iteritems():
    logging.debug("Storage manager blocks: [%s]->[%s]" %
        (base64.b64encode(key), base64.b64encode(val)))
  # Mapping of storage sequences to indices and vice versa.
  # The storage sequence data consists of the storage index and the
  # sequence ID string.
  # The persistent copy of seq_to_index and index_to_seq is kept in
  # config_db under the following keys:
  #   SEQ.<sequence id> = [storage index, sequence index]
  #   next_seq = <the next sequence index>
  self.seq_to_index = {}
  self.index_to_seq = {}
  NS_KEY = self._key("next_seq")
  if self.config_db.has_key(NS_KEY):
    self.next_seq_idx = int(self.config_db[NS_KEY])
  else:
    self.next_seq_idx = 0
  SEQ_PREFIX = self._key("SEQ.")
  for key, val in self.config_db.iteritems_prefix(SEQ_PREFIX):
    sequence_id = key[len(SEQ_PREFIX):]
    storage_idx, sequence_idx = IE.binary_decode_int_varlen_list(val)
    self.seq_to_index[sequence_id] = (storage_idx, sequence_idx)
    self.index_to_seq[sequence_idx] = (storage_idx, sequence_id)
def scan(self, ctx, prev_num, exclusion_processor):
  """Scan the node, considering data in all the previous increments"""
  logging.debug("Scanning directory " + self.path())
  self.compute_stats()
  ctx.num_visited_dirs_reporter.increment(1)
  #
  # Process data from previous increments.
  #
  ctx.total_nodes += 1
  # prev data indexed by file, for directory scan
  prev_name_data = {}
  subdirs = []

  #
  # Fetch prev information of this node
  #
  # Find the digest of the prev node if it exists
  prev_digest = None
  if prev_num is not None:
    prev_type, prev_stat, prev_digest, prev_level = prev_num
    if prev_type != NODE_TYPE_DIR:
      prev_digest = None
  else:
    cndb = self.backup.get_completed_nodes_db()
    path_digest = Digest.dataDigest(self.path().encode('utf8'))
    if cndb.has_key(path_digest):
      prev_data_is = StringIO.StringIO(cndb[path_digest])
      prev_digest = prev_data_is.read(Digest.dataDigestSize())
      prev_level = IntegerEncodings.binary_read_int_varlen(prev_data_is)
      prev_type = IntegerEncodings.binary_read_int_varlen(prev_data_is)
      prev_stat = unserialize_stats(prev_data_is)
      if prev_type != self.get_type():
        logging.debug("Node from cndb is not a directory!")
        prev_digest = None
  # Load the data of the prev node
  if prev_digest is not None:
    dir_stream = PackerStream.PackerIStream(self.backup, prev_digest,
        prev_level)
    for node_type, node_name, node_stat, node_digest, node_level in \
        self.read_directory_entries(dir_stream):
      if node_type == NODE_TYPE_DIR:
        subdirs.append(node_name)
      prev_name_data[node_name] = (node_type, node_stat, node_digest,
          node_level)

  #
  # Scan the directory
  #
  exclusion_processor.filter_files()

  # Initialize scanning data
  self.children = []
  num_children = len(exclusion_processor.get_included_files() +
      exclusion_processor.get_included_dirs())
  processed_children = 0.0

  # Scan the files in the directory
  for name in exclusion_processor.get_included_files():
    path = os.path.join(self.path(), name)
    file_mode = os.lstat(path)[stat.ST_MODE]
    if prev_name_data.has_key(name):
      cur_prev = prev_name_data[name]
    else:
      cur_prev = None
    try:
      if stat.S_ISLNK(file_mode):
        node = Symlink(self.backup, self, name)
        node.scan(ctx, cur_prev)
        self.children.append(node)
      elif stat.S_ISREG(file_mode):
        node = File(self.backup, self, name)
        node.scan(ctx, cur_prev)
        self.children.append(node)
      else:
        ctx.unrecognized_files_reporter.append(path)
        logging.info("Ignoring unrecognized file type " + path)
    except OSError:
      logging.info("OSError accessing " + path)
      ctx.oserror_files_reporter.append(path)
    except IOError, (errno, strerror):
      logging.info("IOError %s accessing '%s' %s" % (errno, strerror, path))
      ctx.ioerror_files_reporter.append(path)
    finally:
      # Track progress through the directory for status reporting.
      processed_children += 1
      self.processed_percent = processed_children / num_children
def _encode_block_info(seq_idx, container_idx):
  io = StringIO.StringIO()
  io.write(IE.binary_encode_int_varlen(seq_idx))
  io.write(IE.binary_encode_int_varlen(container_idx))
  return io.getvalue()
def _decode_block_info(encoded):
  io = StringIO.StringIO(encoded)
  seq_idx = IE.binary_read_int_varlen(io)
  container_idx = IE.binary_read_int_varlen(io)
  return (seq_idx, container_idx)
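# A minimal round-trip sketch (hypothetical indices); the varlen encoding
# keeps small indices, which are the common case, to one byte per field:
#
#   encoded = _encode_block_info(3, 17)
#   _decode_block_info(encoded)  # (3, 17)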
  #
  # Update the current dir in completed_nodes_db
  #
  cndb = self.backup.get_completed_nodes_db()
  for subdir in subdirs:
    subdir_path = os.path.join(self.path(), subdir)
    subdir_path_digest = Digest.dataDigest(subdir_path.encode('utf8'))
    if cndb.has_key(subdir_path_digest):
      del cndb[subdir_path_digest]
  if self.stats is not None:
    # Stats are empty for the root node, and we don't want to store it in
    # the cndb, because at this point we're already done with the
    # increment anyway.
    digest = Digest.dataDigest(self.path().encode('utf8'))
    encoded = (self.digest +
        IntegerEncodings.binary_encode_int_varlen(self.level) +
        IntegerEncodings.binary_encode_int_varlen(self.get_type()) +
        serialize_stats(self.get_stats()))
    if not cndb.has_key(digest) or cndb[digest] != encoded:
      cndb[digest] = encoded

  if self.digest != prev_digest:
    ctx.changed_nodes += 1

def get_percent_done(self):
  if self.cur_scanned_child is None:
    return self.weight * self.processed_percent
  else:
    return (self.weight * self.processed_percent +
        self.cur_scanned_child.get_percent_done())
def encode_container_name(sequence_id, index, extension):
  return "%s.%s.%s" % (base64.urlsafe_b64encode(sequence_id),
      IE.ascii_encode_int_varlen(index), extension)
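# A round-trip sketch (hypothetical sequence id): encode_container_name
# inverts decode_container_name for well-formed names.
#
#   name = encode_container_name("seq01", 5, "data")
#   decode_container_name(name)  # ("seq01", 5, "data")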