def do_checksum(self):
    """Print a checksum line for every file under __MAIN_INPUT_FILE__, then a total checksum.

    Resolves the target path and the WTAR_IGNORE_FILES list from var_stack,
    computes per-file checksums recursively, and prints them as aligned columns
    followed by the aggregated 'total_checksum'.
    """
    path_to_checksum = var_stack.ResolveVarToStr("__MAIN_INPUT_FILE__")
    ignore_files = var_stack.ResolveVarToList("WTAR_IGNORE_FILES", default=list())
    checksums_dict = utils.get_recursive_checksums(path_to_checksum, ignore=ignore_files)
    # 'total_checksum' is a synthetic key, not a file; pop it so it is printed last.
    total_checksum = checksums_dict.pop('total_checksum', "Unknown total checksum")
    # sorted(dict.items()) already yields a list of (path, checksum) tuples;
    # the previous identity comprehension around it was redundant.
    path_and_checksum_list = sorted(checksums_dict.items())
    width_list, align_list = utils.max_widths(path_and_checksum_list)
    col_formats = utils.gen_col_format(width_list, align_list)
    for p_and_c in path_and_checksum_list:
        print(col_formats[len(p_and_c)].format(*p_and_c))
    print()
    print(col_formats[2].format("total checksum", total_checksum))
def do_checksum(self):
    """Print a checksum line for every file under __MAIN_INPUT_FILE__, then a total checksum.

    Resolves the target path and the WTAR_IGNORE_FILES list from config_vars,
    computes per-file checksums recursively, and prints them as aligned columns
    followed by the aggregated 'total_checksum'.
    """
    path_to_checksum = os.fspath(config_vars["__MAIN_INPUT_FILE__"])
    ignore_files = list(config_vars.get("WTAR_IGNORE_FILES", []))
    checksums_dict = utils.get_recursive_checksums(path_to_checksum, ignore=ignore_files)
    # 'total_checksum' is a synthetic key, not a file; pop it so it is printed last.
    total_checksum = checksums_dict.pop('total_checksum', "Unknown total checksum")
    # sorted(dict.items()) already yields a list of (path, checksum) tuples;
    # the previous identity comprehension around it was redundant.
    path_and_checksum_list = sorted(checksums_dict.items())
    width_list, align_list = utils.max_widths(path_and_checksum_list)
    col_formats = utils.gen_col_format(width_list, align_list)
    for p_and_c in path_and_checksum_list:
        print(col_formats[len(p_and_c)].format(*p_and_c))
    print()
    print(col_formats[2].format("total checksum", total_checksum))
def __call__(self, *args, **kwargs) -> None:
    """ Create a new wtar archive for a file or folder provided in self.what_to_wtar

        If self.resolved_where_to_put_wtar is None the new wtar file will be created
        next to the input with extension '.wtar'.
        e.g. the call: Wtar(/a/b/c) will create the wtar file at path: /a/b/c.wtar

        If self.resolved_where_to_put_wtar is an existing file, the new wtar will
        overwrite this existing file, wtar extension will NOT be added.
        e.g. assuming /d/e/f.txt is an existing file, the call:
            Wtar(/a/b/c, /d/e/f.txt)
        will create the wtar file at path: /d/e/f.txt

        if self.resolved_where_to_put_wtar is an existing folder the wtar file will
        be created inside this folder with extension '.wtar'.
        e.g. assuming /g/h/i is an existing folder, the call:
            Wtar(/a/b/c, /g/h/i)
        will create the wtar file at path: /g/h/i/c.wtar

        if self.resolved_where_to_put_wtar is not None but does not exist, the folder
        will be created and the wtar file will be created inside the new folder with
        extension '.wtar'.
        e.g. assuming /j/k/l is a non existing folder, the call:
            Wtar(/a/b/c, /j/k/l)
        will create the wtar file at path: /j/k/l/c.wtar

        "total_checksum" field is added to the pax_headers. This checksum is a checksum
        of all individual file checksums as calculated by utils.get_recursive_checksums.
        See utils.get_recursive_checksums doc string for details on how checksums are
        calculated. Individual file checksums are not added to the pax_headers because
        during unwtarring tarfile code goes over all the pax_headers for each file,
        making the process exponentially slow for large archives.

        if wtar file(s) with the same base name as self.what_to_wtar exist, the
        total_checksum of the existing wtar will be checked against the total_checksum
        of the self.what_to_wtar file/folder. If total_checksums are identical, the
        wtar will not be created. This will protect against a new wtar being created
        when only the modification date of files in the self.what_to_wtar file/folder
        has changed.
        If total_checksums are not identical the old wtar files will be removed and a
        new wtar created. Removing the old wtars ensures that if the number of new wtar
        split files is smaller than the number of old split files, no extra files will
        remain. E.g. if before [a.wtar.aa, a.wtar.ab, a.wtar.ac] and after
        [a.wtar.aa, a.wtar.ab], a.wtar.ac will be removed.
        Format of the tar is PAX_FORMAT.
        Compression is bzip2.
    """
    PythonBatchCommandBase.__call__(self, *args, **kwargs)
    resolved_what_to_wtar = utils.ExpandAndResolvePath(self.what_to_wtar)

    if self.where_to_put_wtar is not None:
        resolved_where_to_put_wtar = utils.ExpandAndResolvePath(self.where_to_put_wtar)
    else:
        resolved_where_to_put_wtar = resolved_what_to_wtar.parent
        if not resolved_where_to_put_wtar:
            resolved_where_to_put_wtar = Path(os.curdir).resolve()

    if resolved_where_to_put_wtar.is_file():
        # an existing file was given as the target: overwrite it, no '.wtar' added
        target_wtar_file = resolved_where_to_put_wtar
    else:  # assuming it's a folder
        resolved_where_to_put_wtar.mkdir(parents=True, exist_ok=True)
        target_wtar_file = resolved_where_to_put_wtar.joinpath(resolved_what_to_wtar.name + ".wtar")

    # remove previous wtarred files
    if target_wtar_file.is_file():
        target_wtar_file.unlink()
    # also look for previously split parts (e.g. 'c.wtar.aa', 'c.wtar.ab', ...).
    # BUGFIX: target_wtar_file.name already ends with '.wtar', so the old pattern
    # (name + ".wtar.??") produced 'c.wtar.wtar.??' which never matched and left
    # stale split parts behind; the parts are named '<target name>.??'.
    target_wtar_dir = target_wtar_file.parent
    parts = target_wtar_dir.glob(target_wtar_file.name + ".??")
    for part in parts:
        part.unlink()

    tar_total_checksum = utils.get_wtar_total_checksum(target_wtar_file)
    ignore_files = list(config_vars.get("WTAR_IGNORE_FILES", []))

    self.doing = f"""wtarring '{resolved_what_to_wtar}' to '{target_wtar_file}'"""
    with utils.ChangeDirIfExists(resolved_what_to_wtar.parent):
        pax_headers = {"total_checksum": utils.get_recursive_checksums(resolved_what_to_wtar.name, ignore=ignore_files)["total_checksum"]}

        def check_tarinfo(tarinfo):
            # tarfile filter: skip ignored files, normalize ownership, and attach
            # deterministic per-file pax_headers.
            for ig in ignore_files:
                if tarinfo.name.endswith(ig):
                    return None
            tarinfo.uid = tarinfo.gid = 0
            tarinfo.uname = tarinfo.gname = "waves"
            if os.path.isfile(tarinfo.path):
                # wtar should be idempotent. tarfile code adds "mtime" to each
                # file's pax_headers, and we add "checksum". These two values are
                # written to the tar file in no particular order, so taring the
                # same file twice might produce different results. By supplying
                # the mtime ourselves AND passing an OrderedDict as the
                # pax_headers the final tar should be the same for different runs.
                file_pax_headers = OrderedDict()
                file_pax_headers["checksum"] = utils.get_file_checksum(tarinfo.path)
                mode_time = str(float(os.lstat(tarinfo.path)[stat.ST_MTIME]))
                file_pax_headers["mtime"] = mode_time
                tarinfo.pax_headers = file_pax_headers
            return tarinfo

        compresslevel = 1
        if pax_headers["total_checksum"] != tar_total_checksum:
            # contents changed: drop any existing split parts before re-creating
            if utils.is_first_wtar_file(target_wtar_file):
                existing_wtar_parts = utils.find_split_files_from_base_file(target_wtar_file)
                for existing_part in existing_wtar_parts:
                    utils.safe_remove_file(existing_part)
            with tarfile.open(target_wtar_file, "w:bz2", format=tarfile.PAX_FORMAT, pax_headers=pax_headers, compresslevel=compresslevel) as tar:
                tar.add(resolved_what_to_wtar.name, filter=check_tarinfo)
            # re-split the new archive if it exceeds the split threshold
            with SplitFile(target_wtar_file, max_size=self.split_threshold, own_progress_count=0) as sf:
                sf()
        else:
            log.debug(f"{resolved_what_to_wtar.name} skipped since {resolved_what_to_wtar.name}.wtar already exists and has the same contents")
def unwtar_a_file(self, wtar_file_path: Path, destination_folder: Path, no_artifacts=False, ignore=None, copy_owner=False):
    """Extract a wtar archive (possibly split into parts) into destination_folder.

    If the archive carries a 'total_checksum' pax header and the destination
    already exists with an identical recursive checksum, extraction is skipped.
    When extraction is needed, an existing destination is removed first
    (retrying after a chmod/chown on PermissionError).

    :param wtar_file_path: path to the wtar file (or the first split part)
    :param destination_folder: folder to extract into
    :param no_artifacts: when True, delete the wtar file(s) afterwards
    :param ignore: file names to exclude from the disk checksum comparison
    :param copy_owner: when True, propagate the wtar file's uid/gid to the destination
    :raises OSError, tarfile.TarError: logged and re-raised
    """
    try:
        wtar_file_paths = utils.find_split_files(wtar_file_path)
        log.debug(f"unwtar {wtar_file_path} to {destination_folder}")
        if ignore is None:
            ignore = ()
        first_wtar_file_dir, first_wtar_file_name = os.path.split(wtar_file_paths[0])
        destination_leaf_name = utils.original_name_from_wtar_name(first_wtar_file_name)
        destination_path = Path(destination_folder, destination_leaf_name)
        self.doing = f"""unwtar file '{wtar_file_path}' to '{destination_folder} ({"already exists" if destination_path.exists() else "not exists"})'"""

        do_the_unwtarring = True
        with utils.MultiFileReader("br", wtar_file_paths) as fd:
            with tarfile.open(fileobj=fd) as tar:
                tar_total_checksum = tar.pax_headers.get("total_checksum")
                if tar_total_checksum:
                    if os.path.exists(destination_path):
                        with utils.ChangeDirIfExists(destination_folder):
                            disk_total_checksum = utils.get_recursive_checksums(destination_leaf_name, ignore=ignore).get("total_checksum", "disk_total_checksum_was_not_found")
                            if disk_total_checksum == tar_total_checksum:
                                # destination already matches the archive - nothing to do
                                do_the_unwtarring = False
                                log.debug(f"{wtar_file_paths[0]} skipping unwtarring because item exists and is identical to archive")
                if do_the_unwtarring:
                    if os.path.exists(destination_path):
                        try:
                            utils.safe_remove_file_system_object(destination_path, ignore_errors=False)
                        except PermissionError:
                            # loosen permissions/ownership and retry the removal (best effort)
                            ChmodAndChown(destination_path, "a+rw", int(config_vars.get("ACTING_UID", -1)), int(config_vars.get("ACTING_GID", -1)), recursive=True, own_progress_count=0)()
                            log.debug(f"failed to remove {destination_path}, retrying after ChmodAndChow")
                            utils.safe_remove_file_system_object(destination_path, ignore_errors=True)
                            log.debug(f"2nd safe_remove_file_system_object on {destination_path} done")
                    tar.extractall(destination_folder)
                    if copy_owner:
                        from pybatch import Chown
                        first_wtar_file_st = os.stat(wtar_file_paths[0])
                        Chown(destination_folder, first_wtar_file_st[stat.ST_UID], first_wtar_file_st[stat.ST_GID], recursive=True)()
        if no_artifacts:
            for wtar_file in wtar_file_paths:
                os.remove(wtar_file)
    except OSError:
        # NOTE(review): if find_split_files itself raised, wtar_file_paths is
        # unbound and this log line would raise NameError - confirm upstream.
        log.warning(f"Invalid stream on split file with {wtar_file_paths[0]}")
        # bare raise preserves the original traceback (was 'raise e')
        raise
    except tarfile.TarError:
        log.warning(f"tarfile error while opening file {os.path.abspath(wtar_file_paths[0])}")
        raise
def do_wtar(self):
    """ Create a new wtar archive for a file or folder provided in '--in' command line option

        If --out is not supplied on the command line the new wtar file will be created
        next to the input with extension '.wtar'.
        e.g. the command:
            instl wtar --in /a/b/c
        will create the wtar file at path: /a/b/c.wtar

        If '--out' is supplied and it's an existing file, the new wtar will overwrite
        this existing file, wtar extension will NOT be added.
        e.g. assuming /d/e/f.txt is an existing file, the command:
            instl wtar --in /a/b/c --out /d/e/f.txt
        will create the wtar file at path: /d/e/f.txt

        if '--out' is supplied and is an existing folder the wtar file will be created
        inside this folder with extension '.wtar'.
        e.g. assuming /g/h/i is an existing folder, the command:
            instl wtar --in /a/b/c --out /g/h/i
        will create the wtar file at path: /g/h/i/c.wtar

        if '--out' is supplied and does not exist, the folder will be created
        and the wtar file will be created inside the new folder with extension '.wtar'.
        e.g. assuming /j/k/l is a non existing folder, the command:
            instl wtar --in /a/b/c --out /j/k/l
        will create the wtar file at path: /j/k/l/c.wtar

        "total_checksum" field is added to the pax_headers. This checksum is a checksum
        of all individual file checksums as calculated by utils.get_recursive_checksums.
        See utils.get_recursive_checksums doc string for details on how checksums are
        calculated. Individual file checksums are not added to the pax_headers because
        during unwtarring tarfile code goes over all the pax_headers for each file,
        making the process exponentially slow for large archives.

        if wtar file(s) with the same base name as the --in file/folder exist, the
        total_checksum of the existing wtar will be checked against the total_checksum
        of the --in file/folder. If total_checksums are identical, the wtar will not be
        created. This will protect against a new wtar being created when only the
        modification date of files in the --in file/folder has changed.
        If total_checksums are not identical the old wtar files will be removed and a
        new wtar created. Removing the old wtars ensures that if the number of new wtar
        split files is smaller than the number of old split files, no extra files will
        remain. E.g. if before [a.wtar.aa, a.wtar.ab, a.wtar.ac] and after
        [a.wtar.aa, a.wtar.ab], a.wtar.ac will be removed.
        Format of the tar is PAX_FORMAT.
        Compression is bzip2.
    """
    what_to_work_on = var_stack.ResolveVarToStr("__MAIN_INPUT_FILE__")
    if not os.path.exists(what_to_work_on):
        print(what_to_work_on, "does not exists")
        return

    what_to_work_on_dir, what_to_work_on_leaf = os.path.split(what_to_work_on)

    where_to_put_wtar = None
    if "__MAIN_OUT_FILE__" in var_stack:
        where_to_put_wtar = var_stack.ResolveVarToStr("__MAIN_OUT_FILE__")
    else:
        where_to_put_wtar = what_to_work_on_dir
        if not where_to_put_wtar:
            # input was a bare leaf name: create the wtar in the current dir
            where_to_put_wtar = "."

    if os.path.isfile(where_to_put_wtar):
        # an existing file was given as --out: overwrite it, no '.wtar' added
        target_wtar_file = where_to_put_wtar
    else:  # assuming it's a folder
        os.makedirs(where_to_put_wtar, exist_ok=True)
        target_wtar_file = os.path.join(where_to_put_wtar, what_to_work_on_leaf+".wtar")

    # checksum recorded in an existing wtar (if any), used to decide whether to re-create
    tar_total_checksum = utils.get_wtar_total_checksum(target_wtar_file)
    ignore_files = var_stack.ResolveVarToList("WTAR_IGNORE_FILES", default=list())

    with utils.ChangeDirIfExists(what_to_work_on_dir):
        pax_headers = {"total_checksum": utils.get_recursive_checksums(what_to_work_on_leaf, ignore=ignore_files)["total_checksum"]}

        def check_tarinfo(tarinfo):
            # tarfile filter: skip ignored files, normalize ownership, and attach
            # deterministic per-file pax_headers.
            for ig in ignore_files:
                if tarinfo.name.endswith(ig):
                    return None
            tarinfo.uid = tarinfo.gid = 0
            tarinfo.uname = tarinfo.gname = "waves"
            if os.path.isfile(tarinfo.path):
                # wtar should be idempotent. tarfile code adds "mtime" to each
                # file's pax_headers, and we add "checksum". These two values are
                # written to the tar file in no particular order, so taring the
                # same file twice might produce different results. By supplying
                # the mtime ourselves AND passing an OrderedDict as the
                # pax_headers the tar files should be the same each time.
                file_pax_headers = OrderedDict()
                file_pax_headers["checksum"] = utils.get_file_checksum(tarinfo.path)
                mode_time = str(float(os.lstat(tarinfo.path)[stat.ST_MTIME]))
                file_pax_headers["mtime"] = mode_time
                tarinfo.pax_headers = file_pax_headers
            return tarinfo

        compresslevel = 1
        if pax_headers["total_checksum"] != tar_total_checksum:
            # contents changed: remove stale split parts before creating the new wtar.
            # NOTE(review): other call sites pass the wtar target path to
            # find_split_files_from_base_file, not the bare leaf - confirm which
            # argument this helper expects.
            existing_wtar_parts = utils.find_split_files_from_base_file(what_to_work_on_leaf)
            [utils.safe_remove_file(f) for f in existing_wtar_parts]
            with tarfile.open(target_wtar_file, "w:bz2", format=tarfile.PAX_FORMAT, pax_headers=pax_headers, compresslevel=compresslevel) as tar:
                tar.add(what_to_work_on_leaf, filter=check_tarinfo)
        else:
            print("{0} skipped since {0}.wtar already exists and has the same contents".format(what_to_work_on))
def unwtar_a_file(self, wtar_file_path: Path, destination_folder: Path, no_artifacts=False, ignore=None, copy_owner=False):
    """Extract a wtar archive (possibly split into parts) into destination_folder.

    If the archive carries a 'total_checksum' pax header and the destination
    already exists with an identical recursive checksum, extraction is skipped.
    When extraction is needed, an existing destination is removed first via RmDir.

    :param wtar_file_path: path to the wtar file (or the first split part)
    :param destination_folder: folder to extract into
    :param no_artifacts: when True, delete the wtar file(s) afterwards
    :param ignore: file names to exclude from the disk checksum comparison
    :param copy_owner: when True, propagate the wtar file's uid/gid to the destination
    :raises OSError, tarfile.TarError: logged and re-raised
    """
    if ignore is None:
        ignore = ()
    try:
        self.wtar_file_paths = utils.find_split_files(wtar_file_path)
        log.debug(f"unwtar {wtar_file_path} to {destination_folder}")
        destination_leaf_name = utils.original_name_from_wtar_name(self.wtar_file_paths[0].name)
        destination_path = destination_folder.joinpath(destination_leaf_name)
        self.doing = f"""unwtar file '{wtar_file_path}' to '{destination_folder} ({"already exists" if destination_path.exists() else "not exists"})'"""

        do_the_unwtarring = True
        with utils.MultiFileReader("br", self.wtar_file_paths) as fd:
            with tarfile.open(fileobj=fd) as tar:
                tar_total_checksum = tar.pax_headers.get("total_checksum")
                if tar_total_checksum:
                    try:
                        if destination_path.exists():
                            with utils.ChangeDirIfExists(destination_folder):
                                disk_total_checksum = utils.get_recursive_checksums(destination_leaf_name, ignore=ignore).get("total_checksum", "disk_total_checksum_was_not_found")
                            if disk_total_checksum == tar_total_checksum:
                                log.debug(f"{self.wtar_file_paths[0]} skipping unwtarring because item(s) exist and are identical to archive")
                                do_the_unwtarring = False
                    except Exception:
                        # deliberate best-effort: if checking the checksum failed
                        # for any reason -> do the unwtarring. Was a bare 'except:'
                        # which also swallowed SystemExit/KeyboardInterrupt.
                        pass
                if do_the_unwtarring:
                    # RmDir will also remove a file and will not raise if destination_path does not exist
                    with RmDir(destination_path, report_own_progress=False, recursive=True) as dir_remover:
                        dir_remover()
                    tar.extractall(destination_folder)
                    if copy_owner:
                        from pybatch import Chown
                        first_wtar_file_st = self.wtar_file_paths[0].stat()
                        Chown(destination_folder, first_wtar_file_st[stat.ST_UID], first_wtar_file_st[stat.ST_GID], recursive=True)()
                else:
                    log.info(f"skip unwtar of {destination_path} because it exists and matches wtar file checksum")
        if no_artifacts:
            for wtar_file in self.wtar_file_paths:
                with RmFile(wtar_file, report_own_progress=False) as wtar_remover:
                    wtar_remover()
    except OSError:
        # NOTE(review): if find_split_files itself raised, self.wtar_file_paths may
        # be unset and this log line would raise AttributeError - confirm upstream.
        log.warning(f"Invalid stream on split file with {self.wtar_file_paths[0]}")
        # bare raise preserves the original traceback (was 'raise e')
        raise
    except tarfile.TarError:
        log.warning(f"tarfile error while unwtarring file {self.wtar_file_paths[0]}")
        raise