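## Imports used by the class below, listed here for readability. The
## standard-library modules are certain; the logger setup and the
## ``PoolManager`` / ``natural_size`` / ``natural_time_interval`` import
## paths are assumptions -- adjust them to wherever those helpers live in
## this project.
import json
import logging
import os
import stat
import time

from .pool import PoolManager  # assumed module path
from .utils import natural_size, natural_time_interval  # assumed module path

logger = logging.getLogger(__name__)  # assumed logger setup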
class FilesystemBackupTool(object):
    def __init__(self, input_dir, output_dir, exclude_list=None,
                 exclude_patterns=None, file_chunk_size=None):
        """
        :param input_dir: The directory to be backed up
        :param output_dir: The directory in which to store the backup
        :param exclude_list: List of files/directories to be excluded
            recursively
        :param exclude_patterns: List of patterns to be matched against
            full paths of files to be backed up, to decide whether to
            exclude. Usually regular expressions; the only requirement
            is that each object in this list implements a ``match()``
            method.
        :param file_chunk_size: Size of the chunks in which files will
            be split for storage / versioning.
        """
        self.input_dir = input_dir
        self.output_dir = output_dir
        self.exclude_list = exclude_list \
            if exclude_list is not None else []
        self.exclude_patterns = exclude_patterns \
            if exclude_patterns is not None else []
        self.file_chunk_size = file_chunk_size \
            if file_chunk_size is not None else 1024 ** 2

        self.pool_dir = os.path.join(output_dir, 'pool')
        self.log_dir = os.path.join(output_dir, 'log')
        self.index_dir = os.path.join(output_dir, 'index')

        for directory in self.pool_dir, self.log_dir, self.index_dir:
            if not os.path.exists(directory):
                os.makedirs(directory)

        ## Our key/value blobs storage
        self.pool = PoolManager(self.pool_dir)

        ## Used for stats
        self._processed_files_count = 0
        self._ignored_files_count = 0
        self._failed_files = []
        self._total_size = 0

    def run(self, backup_name=None):
        if backup_name is None:
            backup_name = time.strftime('%Y-%m-%d_%H-%M-%S')

        start_time = time.time()
        result = self.process_node(self.input_dir)
        end_time = time.time()

        bck_table_file = os.path.join(
            self.index_dir, '{}.bktable'.format(backup_name))
        bck_info_file = os.path.join(
            self.index_dir, '{}.bkinfo'.format(backup_name))

        # todo: the backup info/table files should be written periodically
        #       in order to prevent data loss in case something goes wrong..

        backup_speed = self._total_size / (end_time - start_time)

        with open(bck_table_file, 'w') as f:
            f.write(json.dumps(result))

        with open(bck_info_file, 'w') as f:
            f.write(json.dumps({
                'root_dir': self.input_dir,
                'start_time': start_time,
                'end_time': end_time,
                'files_count': self._processed_files_count,
                'exclude_count': self._ignored_files_count,
                'failed_files': self._failed_files,
                'total_size': self._total_size,
                'backup_speed': backup_speed,
            }))

        logger.debug('FILE bck-table {}'.format(bck_table_file))
        logger.debug('FILE bck-info {}'.format(bck_info_file))
        logger.debug('STAT start_time {}'.format(start_time))
        logger.debug('STAT end_time {}'.format(end_time))
        logger.debug('STAT bck_time {}'.format(
            natural_time_interval(end_time - start_time)))
        logger.debug('STAT files_count {}'.format(
            self._processed_files_count))
        logger.debug('STAT exclude_count {}'.format(
            self._ignored_files_count))
        logger.debug('STAT failed_files_count {}'.format(
            len(self._failed_files)))
        logger.debug('STAT total_size {}'.format(
            natural_size(self._total_size)))
        logger.debug('STAT backup_speed {}/s'.format(
            natural_size(backup_speed)))

        return result

    def process_node(self, node_name):
        """Process a filesystem node"""

        if not self.should_process_node(node_name):
            logger.debug("EXCLUDE {}".format(node_name))
            self._ignored_files_count += 1
            return

        logger.debug("PROCESS {}".format(node_name))
        self._processed_files_count += 1
        node_ifmt = stat.S_IFMT(os.lstat(node_name).st_mode)

        try:
            # todo: handle unknown node_ifmt too?
            method = '_process_{}'.format(self._ifmt_name(node_ifmt))
            if hasattr(self, method):
                return getattr(self, method)(node_name)
            return self.stat(node_name)

        except:
            # Anything might happen here, we catch everything..
            logger.error("FAILED {}".format(node_name), exc_info=1)
            self._failed_files.append(node_name)

    def _process_dir(self, path):
        st = self.stat(path)
        ## process_node() returns None for excluded children: skip them,
        ## so they don't end up as null entries in the backup table.
        children = (
            self.process_node(os.path.join(path, p))
            for p in os.listdir(path))
        st['children'] = [c for c in children if c is not None]
        return st

    def _process_link(self, path):
        st = self.stat(path)
        st['link_dest'] = os.readlink(path)
        return st

    def _process_file(self, path):
        st = self.stat(path)
        self._total_size += st['size']
        st['chunks'] = list(self._store_file(path))
        return st

    def stat(self, path):
        """Wrapper around the ``lstat()`` call

        This method is responsible for returning a dictionary containing
        all the information needed to correctly restore the node: name,
        permissions, ownership, timestamps, etc. (chunks for regular
        files are added separately by ``_process_file()``).
        """
        st = os.lstat(path)
        return {
            'name': os.path.basename(path),
            'mode': st.st_mode,
            'inode': st.st_ino,
            'device': st.st_dev,
            'nlink': st.st_nlink,
            'uid': st.st_uid,
            'gid': st.st_gid,
            'size': st.st_size,
            'atime': st.st_atime,
            'mtime': st.st_mtime,
            'ctime': st.st_ctime,
            'imode': stat.S_IMODE(st.st_mode),
            'ifmt': stat.S_IFMT(st.st_mode),
            'type': self._ifmt_name(stat.S_IFMT(st.st_mode)),
        }

    def should_process_node(self, node_name):
        """Check whether this node should be processed"""

        for pattern in self.exclude_list:
            if os.path.sep in pattern:
                ## This is a full path: exclude matching files.
                ## All paths are relative to input_dir.
                pattern = os.path.join(self.input_dir, pattern)
                ## samefile() raises if the excluded path doesn't exist,
                ## so guard against that first.
                if os.path.exists(pattern) \
                        and os.path.samefile(node_name, pattern):
                    return False
            else:
                ## Check against the file name only
                if os.path.basename(node_name) == pattern:
                    return False

        for pattern in self.exclude_patterns:
            if pattern.match(node_name):
                return False

        return True

    def _store_file(self, filename):
        with open(filename, 'rb') as f:
            while True:
                chunk = f.read(self.file_chunk_size)
                if not chunk:
                    return
                chunk_id = self.pool.store_blob(chunk)
                yield chunk_id

    def _ifmt_name(self, ifmt):
        return {
            stat.S_IFDIR: "dir",
            stat.S_IFCHR: "char",
            stat.S_IFBLK: "block",
            stat.S_IFREG: "file",
            stat.S_IFIFO: "fifo",
            stat.S_IFLNK: "link",
            stat.S_IFSOCK: "socket",
        }.get(ifmt)
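

## Minimal usage sketch (not part of the original module): it shows how the
## constructor arguments fit together. The paths and patterns below are
## made up for illustration; ``exclude_patterns`` only needs objects with a
## ``match()`` method, so compiled regular expressions work fine.
if __name__ == '__main__':
    import re

    tool = FilesystemBackupTool(
        input_dir='/home/user/documents',        # hypothetical source dir
        output_dir='/mnt/backups/documents',     # hypothetical destination
        exclude_list=['.cache', 'tmp/scratch'],  # by name, or path relative to input_dir
        exclude_patterns=[re.compile(r'.*\.pyc$')],
        file_chunk_size=4 * 1024 ** 2)           # 4 MiB chunks instead of the 1 MiB default

    ## run() walks input_dir, stores file chunks in <output_dir>/pool and
    ## writes the <name>.bktable / <name>.bkinfo files under
    ## <output_dir>/index; it returns the backup table as a dict tree.
    table = tool.run(backup_name='documents-weekly')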