Example #1
    def __init__(self, input_dir, output_dir, exclude_list=None,
                 exclude_patterns=None, file_chunk_size=None):
        """
        :param input_dir:
            The directory to be backed up
        :param output_dir:
            The directory in which to store the backup
        :param exclude_list:
            List of files/directories to be excluded recursively
        :param exclude_patterns:
            List of patterns to be matched against the full paths of the
            files to be backed up, to decide whether to exclude them.
            These are usually compiled regular expressions; the only
            requirement is that each object in this list implements a
            ``match()`` method.
        :param file_chunk_size:
            Size of the chunks into which files will be split for
            storage / versioning.
        """

        self.input_dir = input_dir
        self.output_dir = output_dir
        self.exclude_list = exclude_list \
            if exclude_list is not None else []
        self.exclude_patterns = exclude_patterns \
            if exclude_patterns is not None else []
        self.file_chunk_size = file_chunk_size \
            if file_chunk_size is not None else 1024 ** 2

        self.pool_dir = os.path.join(output_dir, 'pool')
        self.log_dir = os.path.join(output_dir, 'log')
        self.index_dir = os.path.join(output_dir, 'index')
        for directory in self.pool_dir, self.log_dir, self.index_dir:
            if not os.path.exists(directory):
                os.makedirs(directory)

        ## Our key/value blob storage
        self.pool = PoolManager(self.pool_dir)

        ## Used for stats
        self._processed_files_count = 0
        self._ignored_files_count = 0
        self._failed_files = []
        self._total_size = 0
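
The docstring above only requires that each ``exclude_patterns`` entry expose a ``match()`` method: compiled regular expressions satisfy this, but so does any small custom object. A minimal sketch of such an object (the GlobPattern class below is illustrative, not part of the original code):

import fnmatch
import re

class GlobPattern(object):
    """Illustrative pattern object: anything with a match() method works."""
    def __init__(self, glob):
        self._regex = re.compile(fnmatch.translate(glob))

    def match(self, path):
        return self._regex.match(path)

exclude_patterns = [re.compile(r'.*~$'), GlobPattern('*.tmp')]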
Example #2
import json
import logging
import os
import stat
import time

# PoolManager, natural_size and natural_time_interval are project-specific
# helpers assumed to be importable from the surrounding package.
logger = logging.getLogger(__name__)


class FilesystemBackupTool(object):
    def __init__(self, input_dir, output_dir, exclude_list=None,
                 exclude_patterns=None, file_chunk_size=None):
        """
        :param input_dir:
            The directory to be backed up
        :param output_dir:
            The directory in which to store the backup
        :param exclude_list:
            List of files/directories to be excluded recursively
        :param exclude_patterns:
            List of patterns to be matched against the full paths of the
            files to be backed up, to decide whether to exclude them.
            These are usually compiled regular expressions; the only
            requirement is that each object in this list implements a
            ``match()`` method.
        :param file_chunk_size:
            Size of the chunks into which files will be split for
            storage / versioning.
        """

        self.input_dir = input_dir
        self.output_dir = output_dir
        self.exclude_list = exclude_list \
            if exclude_list is not None else []
        self.exclude_patterns = exclude_patterns \
            if exclude_patterns is not None else []
        self.file_chunk_size = file_chunk_size \
            if file_chunk_size is not None else 1024 ** 2

        self.pool_dir = os.path.join(output_dir, 'pool')
        self.log_dir = os.path.join(output_dir, 'log')
        self.index_dir = os.path.join(output_dir, 'index')
        for directory in self.pool_dir, self.log_dir, self.index_dir:
            if not os.path.exists(directory):
                os.makedirs(directory)

        ## Our key/value blob storage
        self.pool = PoolManager(self.pool_dir)

        ## Used for stats
        self._processed_files_count = 0
        self._ignored_files_count = 0
        self._failed_files = []
        self._total_size = 0

    def run(self, backup_name=None):
        if backup_name is None:
            backup_name = time.strftime('%Y-%m-%d_%H-%M-%S')

        start_time = time.time()
        result = self.process_node(self.input_dir)
        end_time = time.time()

        bck_table_file = os.path.join(
            self.index_dir, '{}.bktable'.format(backup_name))
        bck_info_file = os.path.join(
            self.index_dir, '{}.bkinfo'.format(backup_name))

        # todo: the backup info/table file should be written periodically
        # in order to prevent data loss in case something goes wrong..

        backup_speed = self._total_size / (end_time - start_time)

        with open(bck_table_file, 'w') as f:
            f.write(json.dumps(result))

        with open(bck_info_file, 'w') as f:
            f.write(json.dumps({
                'root_dir': self.input_dir,
                'start_time': start_time,
                'end_time': end_time,
                'files_count': self._processed_files_count,
                'exclude_count': self._ignored_files_count,
                'failed_files': self._failed_files,
                'total_size': self._total_size,
                'backup_speed': backup_speed,
            }))

        logger.debug('FILE bck-table {}'.format(bck_table_file))
        logger.debug('FILE bck-info {}'.format(bck_info_file))

        logger.debug('STAT start_time {}'.format(start_time))
        logger.debug('STAT end_time {}'.format(end_time))
        logger.debug('STAT bck_time {}'.format(
            natural_time_interval(end_time - start_time)))
        logger.debug('STAT files_count {}'.format(self._processed_files_count))
        logger.debug('STAT exclude_count {}'.format(self._ignored_files_count))
        logger.debug('STAT failed_files_count {}'.format(
            len(self._failed_files)))
        logger.debug('STAT total_size {}'.format(
            natural_size(self._total_size)))
        logger.debug('STAT backup_speed {}/s'.format(
            natural_size(backup_speed)))

        return result

    def process_node(self, node_name):
        """Process a filesystem node"""
        if not self.should_process_node(node_name):
            logger.debug("EXCLUDE {}".format(node_name))
            self._ignored_files_count += 1
            return
        logger.debug("PROCESS {}".format(node_name))
        self._processed_files_count += 1

        node_ifmt = stat.S_IFMT(os.lstat(node_name).st_mode)

        try:
            # todo: handle unknown node_ifmt too?
            method = '_process_{}'.format(self._ifmt_name(node_ifmt))
            if hasattr(self, method):
                return getattr(self, method)(node_name)
            return self.stat(node_name)

        except Exception:  # Anything might happen here; log it and keep going.
            logger.error("FAILED {}".format(node_name), exc_info=True)
            self._failed_files.append(node_name)

    def _process_dir(self, path):
        st = self.stat(path)
        st['children'] = [
            self.process_node(os.path.join(path, p))
            for p in os.listdir(path)
        ]
        return st

    def _process_link(self, path):
        st = self.stat(path)
        st['link_dest'] = os.readlink(path)
        return st

    def _process_file(self, path):
        st = self.stat(path)
        self._total_size += st['size']
        st['chunks'] = list(self._store_file(path))
        return st

    def stat(self, path):
        """Wrapper around ``lstat()`` call

        This function is responsible for returning a dictionary containing
        all the information needed to correctly restore the file, including
        name, permissions, chunks, etc.
        """
        st = os.lstat(path)
        return {
            'name': os.path.basename(path),
            'mode': st.st_mode,
            'inode': st.st_ino,
            'device': st.st_dev,
            'nlink': st.st_nlink,
            'uid': st.st_uid,
            'gid': st.st_gid,
            'size': st.st_size,
            'atime': st.st_atime,
            'mtime': st.st_mtime,
            'ctime': st.st_ctime,
            'imode': stat.S_IMODE(st.st_mode),
            'ifmt': stat.S_IFMT(st.st_mode),
            'type': self._ifmt_name(stat.S_IFMT(st.st_mode)),
        }

    def should_process_node(self, node_name):
        """Check whether this node should be processed"""

        for pattern in self.exclude_list:
            if os.path.sep in pattern:
                ## This is a path relative to input_dir: exclude the
                ## node it points to, if it exists.
                pattern = os.path.join(self.input_dir, pattern)
                if os.path.exists(pattern) \
                        and os.path.samefile(node_name, pattern):
                    return False

            else:
                ## Check against file name
                if os.path.basename(node_name) == pattern:
                    return False

        for pattern in self.exclude_patterns:
            if pattern.match(node_name):
                return False

        return True

    def _store_file(self, filename):
        with open(filename, 'rb') as f:
            while True:
                chunk = f.read(self.file_chunk_size)
                if not chunk:
                    return
                chunk_id = self.pool.store_blob(chunk)
                yield chunk_id

    def _ifmt_name(self, ifmt):
        return {
            stat.S_IFDIR: "dir",
            stat.S_IFCHR: "char",
            stat.S_IFBLK: "block",
            stat.S_IFREG: "file",
            stat.S_IFIFO: "fifo",
            stat.S_IFLNK: "link",
            stat.S_IFSOCK: "socket",
        }.get(ifmt)
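
A hedged usage sketch of the class above (the paths, backup name and exclude rules are illustrative only, and the project's PoolManager and helper functions still need to be importable):

import re

tool = FilesystemBackupTool(
    input_dir='/home/user/data',
    output_dir='/mnt/backups/data',
    exclude_list=['.cache', 'tmp/scratch'],       # by file name, or path relative to input_dir
    exclude_patterns=[re.compile(r'.*\.pyc$')],   # any object exposing match() works
    file_chunk_size=4 * 1024 ** 2,                # 4 MiB chunks instead of the 1 MiB default
)
result = tool.run('nightly')
# run() writes nightly.bktable and nightly.bkinfo under <output_dir>/index
# and returns the tree of stat dictionaries rooted at input_dir.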