Example #1
0
  def scan(self, ctx, prev_num):
    """Scan a regular file and record its data in the backup.

    Tries the cheap paths first (an already-seen hard link, then the data
    of the previous increment) before reading and packing the file's
    contents block by block.  Files over 256KB are additionally registered
    in the completed-nodes db so the next scan can skip them quickly.

    ctx      -- scan context carrying the progress reporters and caches.
    prev_num -- data of this node from the previous increment, or None.
    """
    self.compute_stats()
    #
    # Check if we have encountered this file during this scan already
    #
    ctx.num_visited_files_reporter.increment(1)
    ctx.current_scanned_file_reporter.set(self.path())

    if self.scan_hlink(ctx):
      logging.info("File %s: HLINK" % self.path())
      return

    #
    # Check if the file is the same as in one of the upper levels
    #
    if self.scan_prev(ctx, prev_num):
      logging.debug("File %s: PREV" % self.path())
      ctx.num_prev_files_reporter.increment(1)
      return

    # --- File not yet in database, process it
    file_size = 0
    packer = PackerStream.PackerOStream(self.backup, Container.CODE_DATA)
    handle = open(self.path(), "rb")
    try:
      for data in FileIO.read_blocks(handle, self.backup.get_block_size()):
        packer.write(data)
        file_size += len(data)
        ctx.num_total_blocks_reporter.increment(1)
        ctx.size_total_blocks_reporter.increment(len(data))
        ctx.update_scan_status()
    finally:
      # Close the descriptor even if packing raises, so a failing file
      # does not leak a handle for the rest of the scan.
      handle.close()

    self.digest = packer.get_digest()
    self.level = packer.get_level()
    self.update_hlink(ctx)

    logging.info("Scanned file %s size:%d new_blocks:%d new_blocks_size:%d" %
        (self.path(), file_size, packer.get_num_new_blocks(),
          packer.get_size_new_blocks()))

    ctx.num_scanned_files_reporter.increment(1)
    if packer.get_num_new_blocks() != 0:
      ctx.num_new_blocks_reporter.increment(packer.get_num_new_blocks())
      ctx.size_new_blocks_reporter.increment(packer.get_size_new_blocks())
      ctx.num_changed_files_reporter.increment(1)
      ctx.changed_files_reporter.append(self.path())

    if file_size > 256 * 1024:
      logging.debug("File %s is big enough to register in cndb" %
          self.path())
      cndb = self.backup.get_completed_nodes_db()
      assert self.stats is not None
      path_digest = Digest.dataDigest(self.path().encode('utf8'))
      # Record layout: digest, varlen level, varlen type, serialized stats.
      encoded = (self.digest +
          IntegerEncodings.binary_encode_int_varlen(self.level) +
          IntegerEncodings.binary_encode_int_varlen(self.get_type()) +
          serialize_stats(self.get_stats()))

      if not cndb.has_key(path_digest) or cndb[path_digest] != encoded:
        cndb[path_digest] = encoded
Example #2
0
  def scan_prev(self, ctx, prev_num):
    """Decide whether this node can be reused from a previous increment.

    Looks the node up either in the supplied prev_num tuple or, when that
    is None, in the completed-nodes db keyed by the digest of the path.
    If the stored type, inode and mtime all match the on-disk stats, the
    stored stats/digest/level are adopted and True is returned.  On any
    mismatch (or when the node was modified too recently for mtime to be
    trusted) ctx.changed_nodes is bumped and False is returned.

    ctx      -- scan context; its total_nodes/changed_nodes counters are
                updated as a side effect.
    prev_num -- (type, stat, digest, level) tuple from the previous
                increment, or None to consult the completed-nodes db.
    """
    ctx.total_nodes += 1
    if prev_num is None:
      # Nothing handed down from the previous increment: try the
      # completed-nodes db, keyed by the digest of the path.
      cndb = self.backup.get_completed_nodes_db()
      path_digest = Digest.dataDigest(self.path().encode('utf8'))
      if cndb.has_key(path_digest):
        # Stored record layout: digest, varlen level, varlen type, stats.
        prev_data_is = StringIO.StringIO(cndb[path_digest])
        prev_digest = prev_data_is.read(Digest.dataDigestSize())
        prev_level = IntegerEncodings.binary_read_int_varlen(prev_data_is)
        prev_type = IntegerEncodings.binary_read_int_varlen(prev_data_is)
        #print "prev_stat_data->", base64.b64encode(prev_data_is.read())
        prev_stat = unserialize_stats(prev_data_is)
      else:
        ctx.changed_nodes += 1
        return False
    else:
      prev_type, prev_stat, prev_digest, prev_level = prev_num

    changed = False

    # The checks are ordered: node type first, then stat availability,
    # then inode and mtime.  Any mismatch marks the node as changed.
    if prev_type != self.get_type():
      logging.info("node type differs in the db")
      changed = True
    #elif (stat.S_IFMT(self.stats[stat.ST_MODE]) !=
           #stat.S_IFMT(prev_stat[stat.ST_MODE])):
      #print "  Node type differs in the fs"
      #changed = True
    elif prev_stat is None:
      logging.info("Base stat not defined")
      changed = True
    elif self.stats[stat.ST_INO] != prev_stat[stat.ST_INO]:
      logging.info("Inode of %s differs: was %d, now %d" %
        (self.path(), prev_stat[stat.ST_INO], self.stats[stat.ST_INO]))
      changed = True
    elif self.stats[stat.ST_MTIME] != prev_stat[stat.ST_MTIME]:
      logging.info("Mtime of %s differs: %d != %d" %
        (self.path(), self.stats[stat.ST_MTIME], prev_stat[stat.ST_MTIME]))
      changed = True
    elif time.time() - self.stats[stat.ST_MTIME] <= 1.0:
      # The time from the last change is less than the resolution
      # of time() functions
      logging.info("File %s too recent %d : %d" %
          (self.path(), prev_stat[stat.ST_MTIME], time.time()))
      changed = True
    else:
      #
      # OK, the prev node seems to be the same as this one.
      # Reuse it.
      #
      self.stats = prev_stat
      self.digest = prev_digest
      self.level = prev_level
      return True

    #print "changed node", self.path()
    ctx.changed_nodes += 1
    return False
Example #3
0
 def save_epoch_data(self):
   """Persist the block longevity/epoch cache (currently disabled)."""
   # So far, the cache is too resource-intensive.
   # Avoid keeping it persistently until it's better optimized.
   return
   # NOTE: everything below is intentionally unreachable until the cache
   # is re-enabled.  Record layout per block: digest, varlen longevity,
   # varlen epoch -- the mirror of load_epoch_data().
   longevity_os = StringIO.StringIO()
   for digest, longevity in self.block_longevity.iteritems():
     longevity_os.write(digest)
     longevity_os.write(IE.binary_encode_int_varlen(longevity))
     epoch = self.block_epoch[digest]
     longevity_os.write(IE.binary_encode_int_varlen(epoch))
   self.block_longevity_data["data"] = longevity_os.getvalue()
   self.block_longevity_data["epoch"] = str(self.epoch)
Example #4
0
 def load_epoch_data(self):
   """Load the block longevity/epoch cache (currently disabled)."""
   # So far, the cache is too resource-intensive.
   # Avoid keeping it persistently.
   return
   # NOTE: everything below is intentionally unreachable until the cache
   # is re-enabled.  Reads back the records written by save_epoch_data():
   # digest, varlen longevity, varlen epoch, until the stream is empty.
   if not self.block_longevity_data.has_key("epoch"):
     self.epoch = 0
     return
   self.epoch = int(self.block_longevity_data["epoch"])
   longevity_os = StringIO.StringIO(self.block_longevity_data["data"])
   while True:
     digest = longevity_os.read(Digest.dataDigestSize())
     if len(digest) == 0:
       break
     longevity = IE.binary_read_int_varlen(longevity_os)
     epoch = IE.binary_read_int_varlen(longevity_os)
     self.block_longevity[digest] = longevity
     self.block_epoch[digest] = epoch
Example #5
0
 def assign_storage_idx(self):
     """Allocate a fresh, unused storage index.

     The new index is one past the highest index currently in use (or 0
     when none exist yet).  The extended index list is persisted to the
     config db before the new index is returned.
     """
     existing = self.get_storage_idxs()
     new_idx = max(existing) + 1 if existing else 0
     encoded = IE.binary_encode_int_varlen_list(existing + [new_idx])
     self.config_db[self._key("storage_idxs")] = encoded
     return new_idx
Example #6
0
 def get_storage_idxs(self):
     """Return the list of storage indices recorded in the config db.

     An empty list means no storage has been registered yet.
     """
     key = self._key("storage_idxs")
     if not self.config_db.has_key(key):
         logging.debug("--- Storage manager knows of no storage idxs")
         return []
     decoded = IE.binary_decode_int_varlen_list(self.config_db[key])
     logging.debug("--- Storage manager knows of idxs: %s" % str(decoded))
     return decoded
Example #7
0
 def update_hlink(self, ctx):
   # Record this file's digest + encoded level under its inode number so
   # that later hard links to the same inode can reuse the packed data.
   # Windows reports no usable inode numbers, so nothing can be tracked.
   if os.name == 'nt':
     return
   # With a link count of 1 no other directory entry shares this inode.
   if self.stats[stat.ST_NLINK] == 1:
     return
   inode_num = self.stats[stat.ST_INO]
   # First scan of an inode wins; later links look up the stored entry.
   if ctx.inodes_db.has_key(inode_num):
     return
   ctx.inodes_db[inode_num] = self.digest +\
     IntegerEncodings.binary_encode_int_varlen(self.level)
Example #8
0
    def register_sequence(self, storage_idx, sequence_id):
        """Assign the next free sequence index to a newly seen sequence.

        Persists both the advanced counter and the (storage, sequence)
        mapping in the config db, then refreshes the in-memory lookup
        tables in both directions.
        """
        logger_sm.debug(
            "new sequence detected in storage %d: %s" % (storage_idx, base64.urlsafe_b64encode(sequence_id))
        )
        # Grab the next free index and immediately persist the counter.
        sequence_idx = self.next_seq_idx
        self.next_seq_idx = sequence_idx + 1
        self.config_db[self._key("next_seq")] = str(self.next_seq_idx)
        encoded = IE.binary_encode_int_varlen_list([storage_idx, sequence_idx])
        self.config_db[self._key("SEQ." + sequence_id)] = encoded

        # Keep both directions of the mapping available in memory.
        self.seq_to_index[sequence_id] = (storage_idx, sequence_idx)
        self.index_to_seq[sequence_idx] = (storage_idx, sequence_id)
Example #9
0
 def read_directory_entries(self, file):
   """Generate (type, name, stat, digest, level) tuples from a packed
   directory stream, skipping entries whose name is not valid utf8.
   """
   while True:
     node_type = Format.read_int(file)
     if node_type is None:
       # End of stream.  Use 'return' instead of 'raise StopIteration':
       # PEP 479 turns a StopIteration raised inside a generator body
       # into a RuntimeError on modern Python.
       return
     node_name = Format.read_string(file)
     node_digest = file.read(Digest.dataDigestSize())
     node_level = IntegerEncodings.binary_read_int_varlen(file)
     node_stat = unserialize_stats(file)
     # Guard only the decode step.  The old bare except also wrapped the
     # yield, which swallowed GeneratorExit when the consumer closed the
     # generator and hid unrelated errors from the caller.
     try:
       node_name_decoded = unicode(node_name, 'utf8')
     except UnicodeError:
       logging.info("Encountered bad file name in " + self.path())
       continue
     yield (node_type, node_name_decoded, node_stat, node_digest, node_level)
Example #10
0
 def scan_hlink(self, ctx):
   """Try to resolve this file as a hard link scanned earlier.

   Returns True (after filling in self.digest and self.level from the
   inodes db) when another link to the same inode was already scanned,
   False otherwise.
   """
   # Windows gives no usable inode numbers, so links can't be detected.
   if os.name == 'nt':
     return False
   if self.stats[stat.ST_NLINK] == 1:
     logging.debug("File %s has NLINK=1, can't be hard link", self.path())
     return False
   inode_num = self.stats[stat.ST_INO]
   if not ctx.inodes_db.has_key(inode_num):
     return False
   # Stored entry layout: digest followed by the varlen-encoded level.
   entry = ctx.inodes_db[inode_num]
   self.digest = entry[:Digest.dataDigestSize()]
   self.level = IntegerEncodings.binary_decode_int_varlen(
       entry[Digest.dataDigestSize():])
   return True
Example #11
0
def decode_container_name(name):
  name_re = re.compile("([^.]+).([^.]+).([^.]+)", re.UNICODE)
  match = name_re.match(name)
  if not match:
    print "Warning: File %s is not a manent container." % name.encode('utf8')
    return (None, None, None)
  try:
    sequence_id = base64.urlsafe_b64decode(match.groups()[0].encode('utf8'))
    index = IE.ascii_decode_int_varlen(match.groups()[1].encode('utf8'))
    extension = match.groups()[2]
    return (sequence_id, index, extension)
  except:
    # File name unparseable. Can be junk coming from something else
    print "Warning: File %s is not a manent container." % name.encode('utf8')
    return (None, None, None)
Example #12
0
 def write(self, ctx):
   """
   Write the info of the current dir to database
   """
   packer = PackerStream.PackerOStream(self.backup, Container.CODE_DIR)
   # Walking the children in a fixed order is an optimization meant to
   # make every run access files identically.
   # TODO: measure if this really makes things faster
   # (probably will with a btree db)
   for node in self.children:
     Format.write_int(packer, node.get_type())
     Format.write_string(packer, node.get_name().encode('utf8'))
     packer.write(node.get_digest())
     level_str = IntegerEncodings.binary_encode_int_varlen(node.get_level())
     packer.write(level_str)
     packer.write(serialize_stats(node.get_stats()))

   self.digest = packer.get_digest()
   self.level = packer.get_level()
   return (packer.get_num_new_blocks(), packer.get_size_new_blocks())
Example #13
0
    def __init__(self, db_manager, txn_manager):
        """Initialize the storage manager: block handling objects, the
        config/blocks databases, and the sequence index maps.

        db_manager  -- factory for the btree/hash databases opened below.
        txn_manager -- transaction manager shared by those databases.
        """
        self.db_manager = db_manager
        self.txn_manager = txn_manager
        self.block_manager = BlockManager.BlockManager(self.db_manager, self.txn_manager)
        self.block_sequencer = BlockSequencer.BlockSequencer(
            self.db_manager, self.txn_manager, self, self.block_manager
        )
        self.report_manager = Reporting.DummyReportManager()
        self.block_listeners = []

        self.config_db = db_manager.get_database_btree("config.db", "storage", txn_manager)
        logging.debug("Loaded storage manager db")
        for key, val in self.config_db.iteritems():
            logging.debug("Storage manager db: [%s]->[%s]" % (base64.b64encode(key), (base64.b64encode(val))))
        self.block_container_db = db_manager.get_database_hash("storage.db", "blocks", txn_manager)
        logging.debug("********** Loaded storage manager logs")
        # Fix: this loop used to iterate config_db a second time, so the
        # "blocks" debug lines never showed the block_container_db contents.
        for key, val in self.block_container_db.iteritems():
            logging.debug("Storage manager blocks: [%s]->[%s]" % (base64.b64encode(key), base64.b64encode(val)))

        # Mapping of storage sequences to indices and vice versa
        # The storage sequence data consists of storage index and sequence
        # ID string
        # In the config_db we store the persistent copy of the information
        # in the seq_to_index and index_to_seq:
        # repo.%index.seq = sequence
        # repo.%index.storage = storage index
        # repo.next_index = <the next index>
        self.seq_to_index = {}
        self.index_to_seq = {}
        NS_KEY = self._key("next_seq")
        if self.config_db.has_key(NS_KEY):
            self.next_seq_idx = int(self.config_db[NS_KEY])
        else:
            self.next_seq_idx = 0
        SEQ_PREFIX = self._key("SEQ.")
        for key, val in self.config_db.iteritems_prefix(SEQ_PREFIX):
            sequence_id = key[len(SEQ_PREFIX) :]
            storage_idx, sequence_idx = IE.binary_decode_int_varlen_list(val)
            self.seq_to_index[sequence_id] = (storage_idx, sequence_idx)
            self.index_to_seq[sequence_idx] = (storage_idx, sequence_id)
Example #14
0
  def scan(self, ctx, prev_num, exclusion_processor):
    """Scan the node, considering data in all the previous increments
    """
    logging.debug("Scanning directory " + self.path())
    self.compute_stats()
    ctx.num_visited_dirs_reporter.increment(1)
    #
    # Process data from previous increments.
    #
    ctx.total_nodes += 1
    # prev data indexed by file, for directory scan
    prev_name_data = {}
    subdirs = []

    #
    # Fetch prev information of this node
    #
    # Find the digest of prev node if it exists
    prev_digest = None
    if prev_num is not None:
      prev_type, prev_stat, prev_digest, prev_level = prev_num
      if prev_type != NODE_TYPE_DIR:
        prev_digest = None
    else:
      cndb = self.backup.get_completed_nodes_db()
      path_digest = Digest.dataDigest(self.path().encode('utf8'))
      if cndb.has_key(path_digest):
        prev_data_is = StringIO.StringIO(cndb[path_digest])
        prev_digest = prev_data_is.read(Digest.dataDigestSize())
        prev_level = IntegerEncodings.binary_read_int_varlen(prev_data_is)
        prev_type = IntegerEncodings.binary_read_int_varlen(prev_data_is)
        #print "prev_stat_data->", base64.b64encode(prev_data_is.read())
        prev_stat = unserialize_stats(prev_data_is)
        if prev_type != self.get_type():
          logging.debug("Node from cndb is not a directory!")
          prev_digest = None
    # Load the data of the prev node
    if prev_digest is not None:
      dir_stream = PackerStream.PackerIStream(self.backup, prev_digest,
        prev_level)
      for node_type, node_name, node_stat, node_digest, node_level in\
            self.read_directory_entries(dir_stream):
        if node_type == NODE_TYPE_DIR:
          subdirs.append(node_name)
        prev_name_data[node_name] = ((node_type, node_stat,
                                      node_digest, node_level))

    #
    # Scan the directory
    #
    exclusion_processor.filter_files()

    # Initialize scanning data
    self.children = []
    num_children = len(exclusion_processor.get_included_files() +
        exclusion_processor.get_included_dirs())
    processed_children = 0.0

    # Scan the files in the directory
    for name in exclusion_processor.get_included_files():
      path = os.path.join(self.path(), name)
      file_mode = os.lstat(path)[stat.ST_MODE]

      if prev_name_data.has_key(name):
        cur_prev = prev_name_data[name]
      else:
        cur_prev = None

      try:
        if stat.S_ISLNK(file_mode):
          node = Symlink(self.backup, self, name)
          node.scan(ctx, cur_prev)
          self.children.append(node)
        elif stat.S_ISREG(file_mode):
          node = File(self.backup, self, name)
          node.scan(ctx, cur_prev)
          self.children.append(node)
        else:
          ctx.unrecognized_files_reporter.append(path)
          logging.info("Ignoring unrecognized file type " + path)
      except OSError:
        logging.info("OSError accessing " + path)
        ctx.oserror_files_reporter.append(path)
        # traceback.print_exc()
      except IOError, (errno, strerror):
        logging.info("IOError %s accessing '%s' %s" % (errno, strerror, path))
        ctx.ioerror_files_reporter.append(path)
        # traceback.print_exc()
      finally:
Example #15
0
def _encode_block_info(seq_idx, container_idx):
    """Pack a (sequence index, container index) pair into a byte string."""
    # Two varlen integers back to back; _decode_block_info is the inverse.
    return (IE.binary_encode_int_varlen(seq_idx) +
            IE.binary_encode_int_varlen(container_idx))
Example #16
0
def _decode_block_info(encoded):
    """Unpack the byte string built by _encode_block_info.

    Returns the (seq_idx, container_idx) tuple.
    """
    stream = StringIO.StringIO(encoded)
    # Tuple elements are evaluated left to right, so the reads happen in
    # the same order the values were written.
    return (IE.binary_read_int_varlen(stream),
            IE.binary_read_int_varlen(stream))
Example #17
0
    #
    # Update the current dir in completed_nodes_db
    #
    cndb = self.backup.get_completed_nodes_db()
    for subdir in subdirs:
      subdir_path = os.path.join(self.path(), subdir)
      subdir_path_digest = Digest.dataDigest(subdir_path.encode('utf8'))
      if cndb.has_key(subdir_path_digest):
        del cndb[subdir_path_digest]
    if self.stats is not None:
      # Stats are empty for the root node, but we don't want to store
      # it in the cndb, because at this point we're already done with the
      # increment anyway
      digest = Digest.dataDigest(self.path().encode('utf8'))
      encoded = (self.digest +
          IntegerEncodings.binary_encode_int_varlen(self.level) +
          IntegerEncodings.binary_encode_int_varlen(self.get_type()) +
          serialize_stats(self.get_stats()))

      if not cndb.has_key(digest) or cndb[digest] != encoded:
        cndb[digest] = encoded
        
    if self.digest != prev_digest:
      #print "changed node", self.path()
      ctx.changed_nodes += 1

  def get_percent_done(self):
    if self.cur_scanned_child is None:
      return self.weight * self.processed_percent
    else:
      return (self.weight * self.processed_percent +
Example #18
0
def encode_container_name(sequence_id, index, extension):
  """Build a container file name: <b64 sequence>.<ascii-varlen index>.<ext>."""
  parts = [base64.urlsafe_b64encode(sequence_id),
           IE.ascii_encode_int_varlen(index),
           extension]
  return ".".join(parts)