Example #1
 def do_checksum(self):
     path_to_checksum = var_stack.ResolveVarToStr("__MAIN_INPUT_FILE__")
     ignore_files = var_stack.ResolveVarToList("WTAR_IGNORE_FILES", default=list())
     checksums_dict = utils.get_recursive_checksums(path_to_checksum, ignore=ignore_files)
     total_checksum = checksums_dict.pop('total_checksum', "Unknown total checksum")
     path_and_checksum_list = [(path, checksum) for path, checksum in sorted(checksums_dict.items())]
     width_list, align_list = utils.max_widths(path_and_checksum_list)
     col_formats = utils.gen_col_format(width_list, align_list)
     for p_and_c in path_and_checksum_list:
         print(col_formats[len(p_and_c)].format(*p_and_c))
     print()
     print(col_formats[2].format("total checksum", total_checksum))
Example #2
 def do_checksum(self):
     path_to_checksum = os.fspath(config_vars["__MAIN_INPUT_FILE__"])
     ignore_files = list(config_vars.get("WTAR_IGNORE_FILES", []))
     checksums_dict = utils.get_recursive_checksums(path_to_checksum, ignore=ignore_files)
     total_checksum = checksums_dict.pop('total_checksum', "Unknown total checksum")
     path_and_checksum_list = [(path, checksum) for path, checksum in sorted(checksums_dict.items())]
     width_list, align_list = utils.max_widths(path_and_checksum_list)
     col_formats = utils.gen_col_format(width_list, align_list)
     for p_and_c in path_and_checksum_list:
         print(col_formats[len(p_and_c)].format(*p_and_c))
     print()
     print(col_formats[2].format("total checksum", total_checksum))
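Examples 1 and 2 are the same do_checksum command before and after the move from var_stack to config_vars; both lean on utils.get_recursive_checksums and then pop the special "total_checksum" entry before printing the per-path table. As a simplified, hypothetical sketch of the idea behind that helper (sha1 and the traversal order are assumptions, not instl's actual implementation):

import hashlib
from pathlib import Path

def sketch_recursive_checksums(top_path, ignore=()):
    # Simplified sketch only, not instl's utils.get_recursive_checksums:
    # one checksum per file under top_path, plus a "total_checksum" that is
    # a checksum over the sorted individual checksums, as the Wtar/do_wtar
    # docstrings below describe.
    top_path = Path(top_path)
    files = [top_path] if top_path.is_file() else sorted(
        p for p in top_path.rglob("*") if p.is_file())
    checksums = {}
    for f in files:
        if any(f.name.endswith(ig) for ig in ignore):
            continue
        checksums[str(f)] = hashlib.sha1(f.read_bytes()).hexdigest()
    joined = "".join(checksums[k] for k in sorted(checksums))
    checksums["total_checksum"] = hashlib.sha1(joined.encode("utf-8")).hexdigest()
    return checksums

The remaining (path, checksum) pairs are then laid out with utils.max_widths and utils.gen_col_format, which pick column widths and a matching format string for rows of that length.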
Example #3
    def __call__(self, *args, **kwargs) -> None:
        """ Create a new wtar archive for a file or folder provided in self.what_to_wtar

            If self.resolved_where_to_put_wtar is None the new wtar file will be created
                next to the input with extension '.wtar'.
                e.g. the call:
                    Wtar(/a/b/c)
                will create the wtar file at path:
                    /a/b/c.wtar

            If self.resolved_where_to_put_wtar is an existing file, the new wtar will overwrite
                this existing file; the '.wtar' extension will NOT be added.
                e.g. assuming /d/e/f.txt is an existing file, the call:
                    Wtar(/a/b/c, /d/e/f.txt)
                will create the wtar file at path:
                    /d/e/f.txt

            if self.resolved_where_to_put_wtar is an existing folder, the wtar file will be created
                inside this folder with extension '.wtar'.
                e.g. assuming /g/h/i is an existing folder, the call:
                    Wtar(/a/b/c, /g/h/i)
                will create the wtar file at path:
                    /g/h/i/c.wtar

            if self.resolved_where_to_put_wtar is not None but does not exist, the folder will be created
                and the wtar file will be created inside the new folder with extension '.wtar'.
                e.g. assuming /j/k/l is a non existing folder, the call:
                    Wtar(/a/b/c, /j/k/l)
                will create the wtar file at path:
                    /j/k/l/c.wtar

            "total_checksum" field is added to the pax_headers. This checksum is a checksum of all individual
                file checksums as calculated by utils.get_recursive_checksums. See utils.get_recursive_checksums
                doc string for details on how checksums are calculated. Individual file checksums are not added
                to the pax_headers because during unwtarring tarfile code goes over all the pax_headers for each file
                making the process exponentially slow for large archives.

            if wtar file(s) with the same base name as self.what_to_wtar already exist, the total_checksum of the
                existing wtar will be checked against the total_checksum of the self.what_to_wtar file/folder.
                If the total_checksums are identical, the wtar will not be created. This protects against a new
                wtar being created when only the modification date of files in the self.what_to_wtar file/folder
                has changed.
                If the total_checksums are not identical, the old wtar files will be removed and a new wtar created.
                Removing the old wtars ensures that if the number of new wtar split files is smaller than the number
                of old split files, no extra files will remain. E.g. if before [a.wtar.aa, a.wtar.ab, a.wtar.ac]
                and after [a.wtar.aa, a.wtar.ab], a.wtar.ac will be removed.
            Format of the tar is PAX_FORMAT.
            Compression is bzip2.

        """

        PythonBatchCommandBase.__call__(self, *args, **kwargs)
        resolved_what_to_wtar = utils.ExpandAndResolvePath(self.what_to_wtar)

        if self.where_to_put_wtar is not None:
            resolved_where_to_put_wtar = utils.ExpandAndResolvePath(
                self.where_to_put_wtar)
        else:
            resolved_where_to_put_wtar = resolved_what_to_wtar.parent
            if not resolved_where_to_put_wtar:
                resolved_where_to_put_wtar = Path(os.curdir).resolve()

        if resolved_where_to_put_wtar.is_file():
            target_wtar_file = resolved_where_to_put_wtar
        else:  # assuming it's a folder
            resolved_where_to_put_wtar.mkdir(parents=True, exist_ok=True)
            target_wtar_file = resolved_where_to_put_wtar.joinpath(
                resolved_what_to_wtar.name + ".wtar")

        # remove previous wtarred files
        if target_wtar_file.is_file():
            target_wtar_file.unlink()
        # also look for parts
        target_wtar_dir = target_wtar_file.parent
        parts = target_wtar_dir.glob(target_wtar_file.name + ".wtar.??")
        [p.unlink() for p in parts]

        tar_total_checksum = utils.get_wtar_total_checksum(target_wtar_file)
        ignore_files = list(config_vars.get("WTAR_IGNORE_FILES", []))

        self.doing = f"""wtarring '{resolved_what_to_wtar}' to '{target_wtar_file}''"""
        with utils.ChangeDirIfExists(resolved_what_to_wtar.parent):
            pax_headers = {
                "total_checksum":
                utils.get_recursive_checksums(
                    resolved_what_to_wtar.name,
                    ignore=ignore_files)["total_checksum"]
            }

            def check_tarinfo(tarinfo):
                for ig in ignore_files:
                    if tarinfo.name.endswith(ig):
                        return None
                tarinfo.uid = tarinfo.gid = 0
                tarinfo.uname = tarinfo.gname = "waves"
                if os.path.isfile(tarinfo.path):
                    # wtar should be idempotent. tarfile code adds "mtime" to
                    # each file's pax_headers. We add "checksum" to pax_headers.
                    # The result is that these two values are written to the tar
                    # file in no particular order, and tarring the same file twice
                    # might produce different results. By supplying the mtime
                    # ourselves AND passing an OrderedDict as the pax_headers,
                    # hopefully the final tar will be the same for different runs.
                    file_pax_headers = OrderedDict()
                    file_pax_headers["checksum"] = utils.get_file_checksum(
                        tarinfo.path)
                    mode_time = str(
                        float(os.lstat(tarinfo.path)[stat.ST_MTIME]))
                    file_pax_headers["mtime"] = mode_time
                    tarinfo.pax_headers = file_pax_headers
                return tarinfo

            compresslevel = 1
            if pax_headers["total_checksum"] != tar_total_checksum:
                if utils.is_first_wtar_file(target_wtar_file):
                    existing_wtar_parts = utils.find_split_files_from_base_file(
                        target_wtar_file)
                    [utils.safe_remove_file(f) for f in existing_wtar_parts]
                with tarfile.open(target_wtar_file,
                                  "w:bz2",
                                  format=tarfile.PAX_FORMAT,
                                  pax_headers=pax_headers,
                                  compresslevel=compresslevel) as tar:
                    tar.add(resolved_what_to_wtar.name, filter=check_tarinfo)

                with SplitFile(target_wtar_file,
                               max_size=self.split_threshold,
                               own_progress_count=0) as sf:
                    sf()
            else:
                log.debug(
                    f"{resolved_what_to_wtar.name} skipped since {resolved_what_to_wtar.name}.wtar already exists and has the same contents"
                )
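The skip decision above compares the freshly computed total_checksum of the source against whatever utils.get_wtar_total_checksum reads from a previously created archive. A minimal sketch of such a read-back for a single, unsplit wtar, using only the standard tarfile API and the header key used in __call__:

import tarfile

def read_total_checksum(wtar_path):
    # Minimal sketch, not instl's utils.get_wtar_total_checksum: return the
    # global "total_checksum" pax header of an existing wtar, or None when
    # the archive is missing, unreadable, or carries no such header.
    try:
        with tarfile.open(wtar_path, "r:*") as tar:
            return tar.pax_headers.get("total_checksum")
    except (OSError, tarfile.TarError):
        return None

If the value matches the total_checksum just computed for the source, __call__ skips re-creating the archive; otherwise the old wtar and its split parts are removed and a new one is written.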
Example #4
    def unwtar_a_file(self,
                      wtar_file_path: Path,
                      destination_folder: Path,
                      no_artifacts=False,
                      ignore=None,
                      copy_owner=False):
        try:
            wtar_file_paths = utils.find_split_files(wtar_file_path)

            log.debug(f"unwtar {wtar_file_path} to {destination_folder}")
            if ignore is None:
                ignore = ()

            first_wtar_file_dir, first_wtar_file_name = os.path.split(
                wtar_file_paths[0])
            destination_leaf_name = utils.original_name_from_wtar_name(
                first_wtar_file_name)
            destination_path = Path(destination_folder, destination_leaf_name)
            self.doing = f"""unwtar file '{wtar_file_path}' to '{destination_folder} ({"already exists" if destination_path.exists() else "not exists"})'"""

            do_the_unwtarring = True
            with utils.MultiFileReader("br", wtar_file_paths) as fd:
                with tarfile.open(fileobj=fd) as tar:
                    tar_total_checksum = tar.pax_headers.get("total_checksum")
                    # log.debug(f"total checksum for tarfile(s) {wtar_file_paths} {tar_total_checksum}")
                    if tar_total_checksum:
                        if os.path.exists(destination_path):
                            with utils.ChangeDirIfExists(destination_folder):
                                disk_total_checksum = utils.get_recursive_checksums(
                                    destination_leaf_name, ignore=ignore).get(
                                        "total_checksum",
                                        "disk_total_checksum_was_not_found")
                                # log.debug(f"total checksum for destination {destination_folder} {disk_total_checksum}")

                            if disk_total_checksum == tar_total_checksum:
                                do_the_unwtarring = False
                                log.debug(
                                    f"{wtar_file_paths[0]} skipping unwtarring because item exists and is identical to archive"
                                )
                    if do_the_unwtarring:
                        if os.path.exists(destination_path):
                            try:
                                utils.safe_remove_file_system_object(
                                    destination_path, ignore_errors=False)
                            except PermissionError as pe:
                                ChmodAndChown(
                                    destination_path,
                                    "a+rw",
                                    int(config_vars.get("ACTING_UID", -1)),
                                    int(config_vars.get("ACTING_GID", -1)),
                                    recursive=True,
                                    own_progress_count=0)()
                                log.debug(
                                    f"failed to remove {destination_path}, retrying after ChmodAndChow"
                                )
                                utils.safe_remove_file_system_object(
                                    destination_path, ignore_errors=True)
                                log.debug(
                                    f"2nd safe_remove_file_system_object on on {destination_path} done"
                                )
                        tar.extractall(destination_folder)

                        if copy_owner:
                            from pybatch import Chown
                            first_wtar_file_st = os.stat(wtar_file_paths[0])
                            # log.debug(f"copy_owner: {destination_folder} {first_wtar_file_st[stat.ST_UID]}:{first_wtar_file_st[stat.ST_GID]}")
                            Chown(destination_folder,
                                  first_wtar_file_st[stat.ST_UID],
                                  first_wtar_file_st[stat.ST_GID],
                                  recursive=True)()

            if no_artifacts:
                for wtar_file in wtar_file_paths:
                    os.remove(wtar_file)

        except OSError as e:
            log.warning(
                f"Invalid stream on split file with {wtar_file_paths[0]}")
            raise e

        except tarfile.TarError:
            log.warning(
                f"tarfile error while opening file {os.path.abspath(wtar_file_paths[0])}"
            )
            raise
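unwtar_a_file first expands wtar_file_path into the ordered list of split parts via utils.find_split_files and then reads them back as a single stream through utils.MultiFileReader. A rough, hypothetical sketch of the part-discovery side, assuming the .wtar.aa / .wtar.ab naming shown in the Wtar docstring (this is not instl's actual helper):

from pathlib import Path

def guess_split_parts(wtar_file_path):
    # Hypothetical sketch, not instl's utils.find_split_files: return the
    # ordered split parts ("a.wtar.aa", "a.wtar.ab", ...) of an archive, or
    # the single file itself when the archive was not split.
    p = Path(wtar_file_path)
    if p.suffix == ".wtar":
        parts = sorted(p.parent.glob(p.name + ".??"))
        return parts if parts else [p]
    # called with one of the parts, e.g. "a.wtar.aa"
    base = p.name.rsplit(".", 1)[0]          # "a.wtar.aa" -> "a.wtar"
    return sorted(p.parent.glob(base + ".??"))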
Example #5
    def do_wtar(self):
        """ Create a new wtar archive for a file or folder provided in '--in' command line option

            If --out is not supplied on the command line the new wtar file will be created
                next to the input with extension '.wtar'.
                e.g. the command:
                    instl wtar --in /a/b/c
                will create the wtar file at path:
                    /a/b/c.wtar

            If '--out' is supplied and it's an existing file, the new wtar will overwrite
                this existing file; the '.wtar' extension will NOT be added.
                e.g. assuming /d/e/f.txt is an existing file, the command:
                    instl wtar --in /a/b/c --out /d/e/f.txt
                will create the wtar file at path:
                    /d/e/f.txt

            if '--out' is supplied and is an existing folder, the wtar file will be created
                inside this folder with extension '.wtar'.
                e.g. assuming /g/h/i is an existing folder, the command:
                    instl wtar --in /a/b/c --out /g/h/i
                will create the wtar file at path:
                    /g/h/i/c.wtar

            if '--out' is supplied and does not exist, the folder will be created
                and the wtar file will be created inside the new folder with extension '.wtar'.
                e.g. assuming /j/k/l is a non existing folder, the command:
                    instl wtar --in /a/b/c --out /j/k/l
                will create the wtar file at path:
                    /j/k/l/c.wtar

            "total_checksum" field is added to the pax_headers. This checksum is a checksum of all individual
                file checksums as calculated by utils.get_recursive_checksums. See utils.get_recursive_checksums
                doc string for details on how checksums are calculated. Individual file checksums are not added
                to the pax_headers because during unwtarring tarfile code goes over all the pax_headers for each file
                making the process exponentially slow for large archives.

            if wtar file(s) with the same base name as the --in file/folder already exist, the total_checksum of the
                existing wtar will be checked against the total_checksum of the --in file/folder.
                If the total_checksums are identical, the wtar will not be created. This protects against a new
                wtar being created when only the modification date of files in the --in file/folder has changed.
                If the total_checksums are not identical, the old wtar files will be removed and a new wtar created.
                Removing the old wtars ensures that if the number of new wtar split files is smaller than the number
                of old split files, no extra files will remain. E.g. if before [a.wtar.aa, a.wtar.ab, a.wtar.ac]
                and after [a.wtar.aa, a.wtar.ab], a.wtar.ac will be removed.
            Format of the tar is PAX_FORMAT.
            Compression is bzip2.

        """
        what_to_work_on = var_stack.ResolveVarToStr("__MAIN_INPUT_FILE__")
        if not os.path.exists(what_to_work_on):
            print(what_to_work_on, "does not exist")
            return

        what_to_work_on_dir, what_to_work_on_leaf = os.path.split(what_to_work_on)

        where_to_put_wtar = None
        if "__MAIN_OUT_FILE__" in var_stack:
            where_to_put_wtar = var_stack.ResolveVarToStr("__MAIN_OUT_FILE__")
        else:
            where_to_put_wtar = what_to_work_on_dir
            if not where_to_put_wtar:
                where_to_put_wtar = "."

        if os.path.isfile(where_to_put_wtar):
            target_wtar_file = where_to_put_wtar
        else:  # assuming it's a folder
            os.makedirs(where_to_put_wtar, exist_ok=True)
            target_wtar_file = os.path.join(where_to_put_wtar, what_to_work_on_leaf+".wtar")

        tar_total_checksum = utils.get_wtar_total_checksum(target_wtar_file)
        ignore_files = var_stack.ResolveVarToList("WTAR_IGNORE_FILES", default=list())
        with utils.ChangeDirIfExists(what_to_work_on_dir):
            pax_headers = {"total_checksum": utils.get_recursive_checksums(what_to_work_on_leaf, ignore=ignore_files)["total_checksum"]}

            def check_tarinfo(tarinfo):
                for ig in ignore_files:
                    if tarinfo.name.endswith(ig):
                        return None
                tarinfo.uid = tarinfo.gid = 0
                tarinfo.uname = tarinfo.gname = "waves"
                if os.path.isfile(tarinfo.path):
                    # wtar should be idempotent. tarfile code adds "mtime" to
                    # each file's pax_headers. We add "checksum" to pax_headers.
                    # The result is that these two values are written to the tar
                    # file in no particular order, and tarring the same file twice
                    # might produce different results. By supplying the mtime
                    # ourselves AND passing an OrderedDict as the pax_headers,
                    # hopefully the tar files will be the same each time.
                    file_pax_headers = OrderedDict()
                    file_pax_headers["checksum"] = utils.get_file_checksum(tarinfo.path)
                    mode_time = str(float(os.lstat(tarinfo.path)[stat.ST_MTIME]))
                    file_pax_headers["mtime"] = mode_time
                    tarinfo.pax_headers = file_pax_headers
                return tarinfo
            compresslevel = 1
            if pax_headers["total_checksum"] != tar_total_checksum:
                existing_wtar_parts = utils.find_split_files_from_base_file(what_to_work_on_leaf)
                [utils.safe_remove_file(f) for f in existing_wtar_parts]
                with tarfile.open(target_wtar_file, "w:bz2", format=tarfile.PAX_FORMAT, pax_headers=pax_headers, compresslevel=compresslevel) as tar:
                    tar.add(what_to_work_on_leaf, filter=check_tarinfo)
            else:
                print("{0} skipped since {0}.wtar already exists and has the same contents".format(what_to_work_on))
Example #6
    def unwtar_a_file(self,
                      wtar_file_path: Path,
                      destination_folder: Path,
                      no_artifacts=False,
                      ignore=None,
                      copy_owner=False):
        if ignore is None:
            ignore = ()
        try:
            self.wtar_file_paths = utils.find_split_files(wtar_file_path)

            log.debug(f"unwtar {wtar_file_path} to {destination_folder}")

            destination_leaf_name = utils.original_name_from_wtar_name(
                self.wtar_file_paths[0].name)
            destination_path = destination_folder.joinpath(
                destination_leaf_name)
            self.doing = f"""unwtar file '{wtar_file_path}' to '{destination_folder} ({"already exists" if destination_path.exists() else "not exists"})'"""

            do_the_unwtarring = True
            with utils.MultiFileReader("br", self.wtar_file_paths) as fd:
                with tarfile.open(fileobj=fd) as tar:
                    tar_total_checksum = tar.pax_headers.get("total_checksum")
                    # log.debug(f"total checksum for tarfile(s) {self.wtar_file_paths} {tar_total_checksum}")
                    if tar_total_checksum:
                        try:
                            if destination_path.exists():
                                with utils.ChangeDirIfExists(
                                        destination_folder):
                                    disk_total_checksum = utils.get_recursive_checksums(
                                        destination_leaf_name, ignore=ignore
                                    ).get("total_checksum",
                                          "disk_total_checksum_was_not_found")
                                    # log.debug(f"total checksum for destination {destination_folder} {disk_total_checksum}")

                                if disk_total_checksum == tar_total_checksum:
                                    log.debug(
                                        f"{self.wtar_file_paths[0]} skipping unwtarring because item(s) exist and are identical to archive"
                                    )
                                    do_the_unwtarring = False
                        except Exception:
                            # if checking the checksum failed for any reason -> do the unwtarring
                            pass
                    if do_the_unwtarring:
                        with RmDir(destination_path,
                                   report_own_progress=False,
                                   recursive=True) as dir_remover:
                            # RmDir will also remove a file and will not raise if destination_path does not exist
                            dir_remover()
                        tar.extractall(destination_folder)

                        if copy_owner:
                            from pybatch import Chown
                            first_wtar_file_st = self.wtar_file_paths[0].stat()
                            # log.debug(f"copy_owner: {destination_folder} {first_wtar_file_st[stat.ST_UID]}:{first_wtar_file_st[stat.ST_GID]}")
                            Chown(destination_folder,
                                  first_wtar_file_st[stat.ST_UID],
                                  first_wtar_file_st[stat.ST_GID],
                                  recursive=True)()
                    else:
                        log.info(
                            f"skip uwtar of {destination_path} because it exists and matches wtar file checksum"
                        )
            if no_artifacts:
                for wtar_file in self.wtar_file_paths:
                    with RmFile(wtar_file,
                                report_own_progress=False) as wtar_remover:
                        wtar_remover()

        except OSError as e:
            log.warning(
                f"Invalid stream on split file with {self.wtar_file_paths[0]}")
            raise e

        except tarfile.TarError:
            log.warning(
                f"tarfile error while unwtarring file {self.wtar_file_paths[0]}"
            )
            raise
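When copy_owner is set, both unwtar_a_file variants mirror the owner of the first wtar part onto the extracted tree with pybatch's Chown. As a rough simplification of what that recursive chown boils down to on a POSIX system (hypothetical; the real Chown is a batch command with its own progress accounting):

import os
import stat

def chown_recursive(root, uid, gid):
    # Hypothetical simplification of a recursive chown (POSIX only).
    os.chown(root, uid, gid)
    for dirpath, dirnames, filenames in os.walk(root):
        for name in dirnames + filenames:
            os.chown(os.path.join(dirpath, name), uid, gid)

# e.g. mirror the owner of the first split part onto destination_folder:
# first_wtar_file_st = os.stat(wtar_file_paths[0])
# chown_recursive(destination_folder, first_wtar_file_st[stat.ST_UID],
#                 first_wtar_file_st[stat.ST_GID])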