Example #1
    def backup_prepare(self):
        """ Prepare requested backup operation.

        Raises:
            IOError: Failed to transfer backup file.
        """
        # Copy backup file from EOS to the local disk
        self.logger.info(("Prepare backup copy from {0} to {1}"
                          "").format(self.efile_full, self.tx_file))
        eos_fs = client.FileSystem(self.efile_full.encode("utf-8"))
        st, _ = eos_fs.copy(
            (self.efile_full + "?eos.ruid=0&eos.rgid=0").encode("utf-8"),
            self.tx_file.encode("utf-8"), True)

        if not st.ok:
            err_msg = (
                "Failed to copy backup file={0} to local disk at={1} err_msg={2}"
                "").format(self.efile_full, self.tx_file, st.message)
            self.logger.error(err_msg)
            raise IOError(err_msg)

        # Create the ArchiveFile object for the backup which is similar to a
        # tape to disk transfer
        self.archive = ArchiveFile(self.tx_file, False)

        # Check that the destination directory exists and has mode 777, if
        # forced then skip checks
        if not self.force:
            surl = self.archive.header['dst']
            url = client.URL(surl.encode("utf-8"))
            fs = self.archive.get_fs(surl)
            st_stat, resp_stat = fs.stat(
                (url.path + "?eos.ruid=0&eos.rgid=0").encode("utf-8"))

            if not st_stat.ok:
                err_msg = ("Failed to stat backup destination url={0}"
                           "").format(surl)
                self.logger.error(err_msg)
                raise IOError(err_msg)

            if resp_stat.flags != (client.StatInfoFlags.IS_READABLE
                                   | client.StatInfoFlags.IS_WRITABLE):
                err_msg = ("Backup destination url={0} must have move 777"
                           ).format(surl)
                self.logger.error(err_msg)
                raise IOError(err_msg)
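
For reference, the copy pattern used above maps onto the XRootD Python
bindings roughly as follows. This is a minimal sketch, assuming the pyxrootd
client package is installed; the root:// URL and local path are illustrative
placeholders, not values from the examples.

# Minimal sketch of FileSystem.copy() as used in backup_prepare().
from XRootD import client

eos_fs = client.FileSystem("root://eos.example.org")
st, _ = eos_fs.copy(
    "root://eos.example.org//eos/demo/backup.tx?eos.ruid=0&eos.rgid=0",
    "/tmp/backup.tx",
    force=True)  # same role as the third positional argument above

if not st.ok:
    raise IOError("copy failed: {0}".format(st.message))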
Example #2
    def archive_prepare(self):
        """ Prepare requested archive operation.

        Raises:
            IOError: Failed to rename or transfer archive file.
        """
        # Rename archive file in EOS
        efile_url = client.URL(self.efile_full.encode("utf-8"))
        eosf_rename = ''.join(
            [self.efile_root, self.config.ARCH_FN, ".", self.oper, ".err"])
        rename_url = client.URL(eosf_rename.encode("utf-8"))
        frename = ''.join([
            rename_url.protocol, "://", rename_url.hostid,
            "//proc/user/?mgm.cmd=file&mgm.subcmd=rename"
            "&mgm.path=", efile_url.path, "&mgm.file.source=", efile_url.path,
            "&mgm.file.target=", rename_url.path
        ])
        (status, __, stderr) = exec_cmd(frename)

        if not status:
            err_msg = ("Failed to rename archive file {0} to {1}, msg={2}"
                       "").format(self.efile_full, rename_url, stderr)
            self.logger.error(err_msg)
            raise IOError(err_msg)

        # Copy archive file from EOS to the local disk
        self.efile_full = eosf_rename
        eos_fs = client.FileSystem(self.efile_full.encode("utf-8"))
        st, _ = eos_fs.copy(self.efile_full + "?eos.ruid=0&eos.rgid=0",
                            self.tx_file, True)

        if not st.ok:
            err_msg = ("Failed to copy archive file={0} to local disk at={1}"
                       "").format(self.efile_full, self.tx_file)
            self.logger.error(err_msg)
            raise IOError(err_msg)

        # Create the ArchiveFile object
        d2t = (self.oper == self.config.PUT_OP)
        self.archive = ArchiveFile(self.tx_file, d2t)
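
The rename above goes through the EOS MGM "proc" interface, and client.URL is
used to split a root:// URL into the pieces needed to build that command. A
small sketch of the decomposition, with a placeholder URL:

# Minimal sketch of client.URL decomposition, as used in archive_prepare().
from XRootD import client

url = client.URL("root://eos.example.org:1094//eos/demo/archive.init")
print(url.protocol)  # root
print(url.hostid)    # eos.example.org:1094
print(url.path)      # /eos/demo/archive.init

# Reassembled into a proc rename command, following the code above:
frename = ''.join([url.protocol, "://", url.hostid,
                   "//proc/user/?mgm.cmd=file&mgm.subcmd=rename",
                   "&mgm.path=", url.path])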
Example #3
    def backup_prepare(self):
        """ Prepare requested backup operation.

        Raises:
            IOError: Failed to transfer backup file.
        """
        # Copy backup file from EOS to the local disk
        self.logger.info(("Prepare backup copy from {0} to {1}"
                          "").format(self.efile_full, self.tx_file))
        eos_fs = client.FileSystem(self.efile_full.encode("utf-8"))
        st, _ = eos_fs.copy(
            (self.efile_full + "?eos.ruid=0&eos.rgid=0").encode("utf-8"),
            self.tx_file.encode("utf-8"), True)

        if not st.ok:
            err_msg = ("Failed to copy backup file={0} to local disk at={1}"
                       "").format(self.efile_full, self.tx_file)
            self.logger.error(err_msg)
            raise IOError(err_msg)

        # Create the ArchiveFile object for the backup which is similar to a
        # tape to disk transfer
        self.archive = ArchiveFile(self.tx_file, False)
Example #4
    def backup_prepare(self):
        """ Prepare requested backup operation.

        Raises:
            IOError: Failed to transfer backup file.
        """
        # Copy backup file from EOS to the local disk
        self.logger.info(("Prepare backup copy from {0} to {1}"
                          "").format(self.efile_full, self.tx_file))
        eos_fs = client.FileSystem(self.efile_full.encode("utf-8"))
        st, _ = eos_fs.copy((self.efile_full + "?eos.ruid=0&eos.rgid=0").encode("utf-8"),
                            self.tx_file.encode("utf-8"), True)

        if not st.ok:
            err_msg = ("Failed to copy backup file={0} to local disk at={1} err_msg={2}"
                       "").format(self.efile_full, self.tx_file, st.message)
            self.logger.error(err_msg)
            raise IOError(err_msg)

        # Create the ArchiveFile object for the backup which is similar to a
        # tape to disk transfer
        self.archive = ArchiveFile(self.tx_file, False)

        # Check that the destination directory exists and has mode 777, if
        # forced then skip checks
        if not self.force:
            surl = self.archive.header['dst']
            url = client.URL(surl.encode("utf-8"))
            fs = self.archive.get_fs(surl)
            st_stat, resp_stat = fs.stat((url.path + "?eos.ruid=0&eos.rgid=0").encode("utf-8"))

            if not st_stat.ok:
                err_msg = ("Failed to stat backup destination url={0}"
                           "").format(surl)
                self.logger.error(err_msg)
                raise IOError(err_msg)

            if resp_stat.flags != (client.StatInfoFlags.IS_READABLE |
                                   client.StatInfoFlags.IS_WRITABLE):
                err_msg = ("Backup destination url={0} must have move 777").format(surl)
                self.logger.error(err_msg)
                raise IOError(err_msg)
Example #5
    def backup_prepare(self):
        """ Prepare requested backup operation.

        Raises:
            IOError: Failed to transfer backup file.
        """
        # Copy backup file from EOS to the local disk
        self.logger.info(("Prepare backup copy from {0} to {1}"
                          "").format(self.efile_full, self.tx_file))
        eos_fs = client.FileSystem(self.efile_full.encode("utf-8"))
        st, _ = eos_fs.copy((self.efile_full + "?eos.ruid=0&eos.rgid=0").encode("utf-8"),
                            self.tx_file.encode("utf-8"), True)

        if not st.ok:
            err_msg = ("Failed to copy backup file={0} to local disk at={1}"
                       "").format(self.efile_full, self.tx_file)
            self.logger.error(err_msg)
            raise IOError(err_msg)

        # Create the ArchiveFile object for the backup which is similar to a
        # tape to disk transfer
        self.archive = ArchiveFile(self.tx_file, False)
Example #6
    def archive_prepare(self):
        """ Prepare requested archive operation.

        Raises:
            IOError: Failed to rename or transfer archive file.
        """
        # Rename archive file in EOS
        efile_url = client.URL(self.efile_full.encode("utf-8"))
        eosf_rename = ''.join([self.efile_root, self.config.ARCH_FN, ".", self.oper, ".err"])
        rename_url = client.URL(eosf_rename.encode("utf-8"))
        frename = ''.join([rename_url.protocol, "://", rename_url.hostid,
                           "//proc/user/?mgm.cmd=file&mgm.subcmd=rename"
                           "&mgm.path=", efile_url.path,
                           "&mgm.file.source=", efile_url.path,
                           "&mgm.file.target=", rename_url.path])
        (status, __, stderr) = exec_cmd(frename)

        if not status:
            err_msg = ("Failed to rename archive file {0} to {1}, msg={2}"
                       "").format(self.efile_full, rename_url, stderr)
            self.logger.error(err_msg)
            raise IOError(err_msg)

        # Copy archive file from EOS to the local disk
        self.efile_full = eosf_rename
        eos_fs = client.FileSystem(self.efile_full.encode("utf-8"))
        st, _ = eos_fs.copy(self.efile_full + "?eos.ruid=0&eos.rgid=0",
                            self.tx_file, True)

        if not st.ok:
            err_msg = ("Failed to copy archive file={0} to local disk at={1}"
                       "").format(self.efile_full, self.tx_file)
            self.logger.error(err_msg)
            raise IOError(err_msg)

        # Create the ArchiveFile object
        d2t = (self.oper == self.config.PUT_OP)
        self.archive = ArchiveFile(self.tx_file, d2t)
Example #7
class Transfer(object):
    """ Trasfer archive object

    Attributes:
        req_json (JSON): Command received from the EOS MGM. Needs to contain
            the following entries: cmd, src, opt, uid, gid
        threads (list): List of threads doing partial transfers (CopyProcess jobs)
    """
    def __init__(self, req_json, config):
        self.config = config
        self.oper = req_json['cmd']
        self.uid, self.gid = req_json['uid'], req_json['gid']
        self.do_retry = (req_json['opt'] == self.config.OPT_RETRY)
        self.force = (req_json['opt'] == self.config.OPT_FORCE)
        self.efile_full = req_json['src']
        self.efile_root = self.efile_full[:-(len(self.efile_full) -
                                             self.efile_full.rfind('/') - 1)]
        self.root_dir = self.efile_root[self.efile_root.rfind('//') + 1:]
        self.uuid = sha256(self.root_dir).hexdigest()
        local_file = join(self.config.DIR[self.oper], self.uuid)
        self.tx_file = local_file + ".tx"
        self.list_jobs, self.threads = [], []
        self.pid = os.getpid()
        self.archive = None
        # Special case for the initial PUT as we also need to copy the archive file
        self.init_put = self.efile_full.endswith(self.config.ARCH_INIT)
        self.status = "initializing"
        self.lock_status = threading.Lock()
        self.timestamp = time.time()
        self.logger = logging.getLogger("transfer")
        self.thread_status = ThreadStatus(self)

    def get_status(self):
        """ Get current status

        Returns:
            String representing the status
        """
        self.lock_status.acquire()
        ret = self.status
        self.lock_status.release()
        return ret

    def set_status(self, msg):
        """ Set current status

        Args:
            msg (string): New status
        """
        self.lock_status.acquire()
        self.status = msg
        self.lock_status.release()
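
    # Note: the explicit acquire()/release() pairs above are equivalent to
    # using the lock as a context manager, which also releases the lock if
    # the body raises. A sketch of the same accessors in that form:
    #
    #     def get_status(self):
    #         with self.lock_status:
    #             return self.status
    #
    #     def set_status(self, msg):
    #         with self.lock_status:
    #             self.status = msg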

    def run(self):
        """ Run requested operation - fist call prepare

        Raises:
            IOError
        """
        self.thread_status.start()

        if self.oper in [self.config.PUT_OP, self.config.GET_OP]:
            self.archive_prepare()

            if self.do_retry:
                self.do_retry_transfer()
            else:
                self.do_transfer()
        elif self.oper in [self.config.PURGE_OP, self.config.DELETE_OP]:
            self.archive_prepare()
            self.do_delete((self.oper == self.config.DELETE_OP))
        elif self.oper == self.config.BACKUP_OP:
            self.backup_prepare()
            self.do_backup()

    def archive_prepare(self):
        """ Prepare requested archive operation.

        Raises:
            IOError: Failed to rename or transfer archive file.
        """
        # Rename archive file in EOS
        efile_url = client.URL(self.efile_full.encode("utf-8"))
        eosf_rename = ''.join(
            [self.efile_root, self.config.ARCH_FN, ".", self.oper, ".err"])
        rename_url = client.URL(eosf_rename.encode("utf-8"))
        frename = ''.join([
            rename_url.protocol, "://", rename_url.hostid,
            "//proc/user/?mgm.cmd=file&mgm.subcmd=rename"
            "&mgm.path=", efile_url.path, "&mgm.file.source=", efile_url.path,
            "&mgm.file.target=", rename_url.path
        ])
        (status, __, stderr) = exec_cmd(frename)

        if not status:
            err_msg = ("Failed to rename archive file {0} to {1}, msg={2}"
                       "").format(self.efile_full, rename_url, stderr)
            self.logger.error(err_msg)
            raise IOError(err_msg)

        # Copy archive file from EOS to the local disk
        self.efile_full = eosf_rename
        eos_fs = client.FileSystem(self.efile_full.encode("utf-8"))
        st, _ = eos_fs.copy(self.efile_full + "?eos.ruid=0&eos.rgid=0",
                            self.tx_file, True)

        if not st.ok:
            err_msg = ("Failed to copy archive file={0} to local disk at={1}"
                       "").format(self.efile_full, self.tx_file)
            self.logger.error(err_msg)
            raise IOError(err_msg)

        # Create the ArchiveFile object
        d2t = (self.oper == self.config.PUT_OP)
        self.archive = ArchiveFile(self.tx_file, d2t)

    def do_delete(self, tape_delete):
        """ Delete archive either from disk (purge) or from tape (delete)

        Args:
            tape_delete (boolean): If true delete data from tape, otherwise
            from disk.

        Raises:
            IOError: Failed to delete an entry.
        """
        del_dirs = []
        self.logger.info("Do delete with tape_delete={0}".format(tape_delete))
        # Delete also the archive file saved on tape
        if tape_delete:
            self.archive.del_entry(self.config.ARCH_INIT, False, tape_delete)

        # First remove all the files and then the directories
        for fentry in self.archive.files():
            # d2t is false for both purge and deletion
            self.archive.del_entry(fentry[1], False, tape_delete)

        for dentry in self.archive.dirs():
            # Don't remove the root directory when purging
            if not tape_delete and dentry[1] == './':
                continue

            del_dirs.append(dentry[1])

        # Remove the directories from bottom up
        while len(del_dirs):
            dpath = del_dirs.pop()
            self.archive.del_entry(dpath, True, tape_delete)

        # Remove immutable flag from the EOS sub-tree
        if tape_delete:
            self.archive.make_mutable()

        self.archive_tx_clean(True)

    def do_transfer(self):
        """ Execute a put or get operation.

        Raises:
            IOError: When an IO operation fails.
        """
        t0 = time.time()
        indx_dir = 0

        # Create directories
        for dentry in self.archive.dirs():
            if dentry[1] == "./":
                self.archive.check_root_dir()

            indx_dir += 1
            self.archive.mkdir(dentry)
            msg = "create dir {0}/{1}".format(indx_dir,
                                              self.archive.header['num_dirs'])
            self.set_status(msg)

        # For GET issue the Prepare2Get for all the files on tape
        self.prepare2get()

        # Copy files
        self.copy_files()

        # For GET set file ownership and permissions
        self.update_file_access()

        # Verify the transferred entries
        self.set_status("verifying")
        check_ok, __ = self.archive.verify(False)

        # For PUT operations wait until all the files are on tape
        # TODO: enable this when we run with XRootD 4.* and have the
        # BACKUP_EXISTS flag
        # if self.archive.d2t:
        #    self.set_status("wait_on_tape")
        #    self.wait_on_tape()

        self.set_status("cleaning")
        self.logger.info("TIMING_transfer={0} sec".format(time.time() - t0))
        self.archive_tx_clean(check_ok)

    def do_retry_transfer(self):
        """ Execute a put or get retry operation.

        Raises:
            IOError: When an IO operation fails.
        """
        t0 = time.time()
        indx_dir = 0
        err_entry = None
        tx_ok, meta_ok = True, True
        found_checkpoint = False  # flag set when reaching recovery entry

        # Get the first corrupted entry and the type of corruption
        (tx_ok, meta_ok, lst_failed) = self.check_previous_tx()

        if not tx_ok or not meta_ok:
            err_entry = lst_failed[0]

        # Create directories
        for dentry in self.archive.dirs():
            # Search for the recovery checkpoint
            if not found_checkpoint:
                if dentry != err_entry:
                    indx_dir += 1
                    continue
                else:
                    found_checkpoint = True

            indx_dir += 1
            self.archive.mkdir(dentry)
            msg = "create dir {0}/{1}".format(indx_dir,
                                              self.archive.header['num_dirs'])
            self.set_status(msg)

        if not tx_ok:
            # For GET issue the Prepare2Get for all the files on tape
            self.prepare2get(err_entry, found_checkpoint)

            # Copy files
            self.copy_files(err_entry, found_checkpoint)

            # For GET set file ownership and permissions for all entries
            self.update_file_access(err_entry, found_checkpoint)

        else:
            # For GET metadata errors set file ownership and permissions only
            # for entries after the first corrupted one
            self.update_file_access()

        # Verify the transferred entries
        self.set_status("verifying")
        check_ok, __ = self.archive.verify(False)

        # For PUT operations wait until all the files are on tape
        # TODO: enable this when we run with XRootD 4.* and have the
        # BACKUP_EXISTS flag
        # if self.archive.d2t:
        #    self.set_status("wait_on_tape")
        #    self.wait_on_tape()

        self.set_status("cleaning")
        self.logger.info("TIMING_transfer={0} sec".format(time.time() - t0))
        self.archive_tx_clean(check_ok)

    def tx_clean(self, check_ok):
        """ Clean a backup/archive transfer depending on its type.
        """
        if self.oper == self.config.BACKUP_OP:
            self.backup_tx_clean()
        else:
            self.archive_tx_clean(check_ok)

    def backup_tx_clean(self):
        """ Clean after a backup tranfer by copying the log file in the same
        directory as the destiantion of the backup.
        """
        # Copy local log file to EOS directory
        eos_log = ''.join(
            [self.efile_root, ".sys.b#.backup.log?eos.ruid=0&eos.rgid=0"])
        self.logger.debug("Copy log:{0} to {1}".format(self.config.LOG_FILE,
                                                       eos_log))
        self.config.handler.flush()
        cp_client = client.FileSystem(self.efile_full.encode("utf-8"))
        st, __ = cp_client.copy(self.config.LOG_FILE, eos_log, force=True)

        if not st.ok:
            self.logger.error(("Failed to copy log file {0} to EOS at {1}"
                               "").format(self.config.LOG_FILE, eos_log))
        else:
            # Delete log file if it was successfully copied to EOS
            try:
                os.remove(self.config.LOG_FILE)
            except OSError as __:
                pass

        # Delete all local files associated with this transfer
        try:
            os.remove(self.tx_file)
        except OSError as __:
            pass

        # Join async status thread
        self.thread_status.do_finish()
        self.thread_status.join()

    def archive_tx_clean(self, check_ok):
        """ Clean the transfer by renaming the archive file in EOS adding the
        following extensions:
        .done - the transfer was successful
        .err  - there were errors during the transfer. These are logged in the
             file .archive.log in the same directory.

        Args:
            check_ok (bool): True if no error occurred during transfer,
                otherwise false.
        """
        # Rename arch file in EOS to reflect the status
        if not check_ok:
            eosf_rename = ''.join(
                [self.efile_root, self.config.ARCH_FN, ".", self.oper, ".err"])
        else:
            eosf_rename = ''.join([
                self.efile_root, self.config.ARCH_FN, ".", self.oper, ".done"
            ])

        old_url = client.URL(self.efile_full.encode("utf-8"))
        new_url = client.URL(eosf_rename.encode("utf-8"))
        frename = ''.join([
            old_url.protocol, "://", old_url.hostid, "//proc/user/?",
            "mgm.cmd=file&mgm.subcmd=rename&mgm.path=", old_url.path,
            "&mgm.file.source=", old_url.path, "&mgm.file.target=",
            new_url.path
        ])
        (status, __, stderr) = exec_cmd(frename)

        if not status:
            err_msg = ("Failed to rename {0} to {1}, msg={2}"
                       "").format(self.efile_full, eosf_rename, stderr)
            self.logger.error(err_msg)
            # TODO: raise IOError
        else:
            # For successful delete operations remove also the archive file
            if self.oper == self.config.DELETE_OP and check_ok:
                fs = client.FileSystem(self.efile_full.encode("utf-8"))
                st_rm, __ = fs.rm(new_url.path + "?eos.ruid=0&eos.rgid=0")

                if not st_rm.ok:
                    warn_msg = "Failed to delete archive {0}".format(
                        new_url.path)
                    self.logger.warning(warn_msg)

        # Copy local log file back to EOS directory and set the ownership to the
        # identity of the client who triggered the archive
        dir_root = self.efile_root[self.efile_root.rfind('//') + 1:]
        eos_log = ''.join([
            old_url.protocol, "://", old_url.hostid, "/", dir_root,
            self.config.ARCH_FN, ".log?eos.ruid=0&eos.rgid=0"
        ])

        self.logger.debug("Copy log:{0} to {1}".format(self.config.LOG_FILE,
                                                       eos_log))
        self.config.handler.flush()
        cp_client = client.FileSystem(self.efile_full.encode("utf-8"))
        st, __ = cp_client.copy(self.config.LOG_FILE, eos_log, force=True)

        if not st.ok:
            self.logger.error(("Failed to copy log file {0} to EOS at {1}"
                               "").format(self.config.LOG_FILE, eos_log))
        else:
            # User triggering archive operation owns the log file
            eos_log_url = client.URL(eos_log)
            fs = client.FileSystem(eos_log.encode("utf-8"))
            arg = ''.join([
                eos_log_url.path, "?eos.ruid=0&eos.rgid=0&mgm.pcmd=chown&uid=",
                self.uid, "&gid=", self.gid
            ])
            xrd_st, __ = fs.query(QueryCode.OPAQUEFILE, arg.encode("utf-8"))

            if not xrd_st.ok:
                err_msg = ("Failed setting ownership of the log file in"
                           " EOS: {0}").format(eos_log)
                self.logger.error(err_msg)
                raise IOError(err_msg)
            else:
                # Delete log if successfully copied to EOS and changed ownership
                try:
                    os.remove(self.config.LOG_FILE)
                except OSError as __:
                    pass

        # Delete all local files associated with this transfer
        try:
            os.remove(self.tx_file)
        except OSError as __:
            pass

        # Join async status thread
        self.thread_status.do_finish()
        self.thread_status.join()

    def copy_files(self, err_entry=None, found_checkpoint=False):
        """ Copy files.

        Note that when doing PUT the layout is not conserved. Therefore, a file
        with 3 replicas will end up as just a simple file in the new location.

        Args:
            err_entry (list): Entry record from the archive file corresponding
                 to the first file/dir that was corrupted.
            found_checkpoint (boolean): If True it means the checkpoint was
                 already found and we don't need to search for it.

        Raises:
            IOError: Copy request failed.
        """
        indx_file = 0
        # For the initial PUT also copy the archive file to tape
        if self.init_put:
            __, dst = self.archive.get_endpoints(self.config.ARCH_INIT)
            self.list_jobs.append((self.efile_full + "?eos.ruid=0&eos.rgid=0" +
                                   "&eos.app=archive", dst))

        # Copy files
        for fentry in self.archive.files():
            # Search for the recovery checkpoint
            if self.do_retry and not found_checkpoint:
                if fentry != err_entry:
                    indx_file += 1
                    continue
                else:
                    found_checkpoint = True

            indx_file += 1
            msg = "copy file {0}/{1}".format(indx_file,
                                             self.archive.header['num_files'])
            self.set_status(msg)
            src, dst = self.archive.get_endpoints(fentry[1])
            dfile = dict(zip(self.archive.header['file_meta'], fentry[2:]))

            # Copy file
            if not self.archive.d2t:
                # For GET we also have the dictionary with the metadata

                dst = ''.join([
                    dst, "?eos.ctime=", dfile['ctime'], "&eos.mtime=",
                    dfile['mtime'], "&eos.bookingsize=", dfile['size'],
                    "&eos.targetsize=", dfile['size'],
                    "&eos.ruid=0&eos.rgid=0&eos.app=archive"
                ])

                # If checksum 0 don't enforce it
                if dfile['xs'] != "0":
                    dst = ''.join([dst, "&eos.checksum=", dfile['xs']])

                # For backup we try to read as root from the source
                if self.oper == self.config.BACKUP_OP:
                    if '?' in src:
                        src = ''.join(
                            [src, "&eos.ruid=0&eos.rgid=0&eos.app=archive"])
                    else:
                        src = ''.join(
                            [src, "?eos.ruid=0&eos.rgid=0&eos.app=archive"])

                    # If this is a version file we save it as a 2-replica layout
                    if is_version_file(fentry[1]):
                        dst = ''.join([
                            dst, "&eos.layout.checksum=", dfile['xstype'],
                            "&eos.layout.type=replica&eos.layout.nstripes=2"
                        ])

                    # If time window specified then select only the matching entries
                    if (self.archive.header['twindow_type']
                            and self.archive.header['twindow_val']):
                        twindow_sec = int(self.archive.header['twindow_val'])
                        tentry_sec = int(
                            float(dfile[self.archive.header['twindow_type']]))

                        if tentry_sec < twindow_sec:
                            continue
            else:
                # For PUT read the files from EOS as root
                src = ''.join([src, "?eos.ruid=0&eos.rgid=0&eos.app=archive"])

            self.logger.info("Copying from {0} to {1}".format(src, dst))
            self.list_jobs.append((src, dst, dfile['size']))

            if len(self.list_jobs) >= self.config.BATCH_SIZE:
                st = self.flush_files(False)

                # For archives we fail immediately, for backups it's best-effort
                if not st and self.oper != self.config.BACKUP_OP:
                    err_msg = "Failed to flush files"
                    self.logger.error(err_msg)
                    raise IOError(err_msg)

        # Flush all pending copies and set metadata info for GET operation
        st = self.flush_files(True)

        if not st and self.oper != self.config.BACKUP_OP:
            err_msg = "Failed to flush files"
            self.logger.error(err_msg)
            raise IOError(err_msg)

    def flush_files(self, wait_all):
        """ Flush all pending transfers from the list of jobs.

        Args:
            wait_all (bool): If true wait and collect the status from all
                executing threads.

        Returns:
            True if files flushed successfully, otherwise false.
        """
        status = True

        # Wait until a thread from the pool gets freed if we reached the maximum
        # allowed number of running threads
        while len(self.threads) >= self.config.MAX_THREADS:
            remove_indx, retry_threads = [], []

            for indx, thread in enumerate(self.threads):
                thread.join(self.config.JOIN_TIMEOUT)

                # If thread finished get the status and mark it for removal
                if not thread.isAlive():
                    # If failed then attempt a retry
                    if (not thread.xrd_status.ok
                            and thread.retries <= self.config.MAX_RETRIES):

                        self.logger.log(
                            logging.INFO,
                            ("Thread={0} failed, retries={1}").format(
                                thread.ident, thread.retries))
                        rthread = ThreadJob(thread.lst_jobs, thread.retries)
                        rthread.start()
                        retry_threads.append(rthread)
                        remove_indx.append(indx)
                        self.logger.log(
                            logging.INFO,
                            ("New thread={0} doing a retry").format(
                                rthread.ident))
                        continue

                    status = status and thread.xrd_status.ok
                    log_level = logging.INFO if thread.xrd_status.ok else logging.ERROR
                    self.logger.log(
                        log_level, ("Thread={0} status={1} msg={2}").format(
                            thread.ident, thread.xrd_status.ok,
                            thread.xrd_status.message.decode("utf-8")))
                    remove_indx.append(indx)
                    break

            # Remove old/finished threads and add retry ones. For removal we
            # need to start with big indexes first.
            remove_indx.reverse()

            for indx in remove_indx:
                del self.threads[indx]

            self.threads.extend(retry_threads)
            del retry_threads[:]
            del remove_indx[:]

        # If we still have jobs and previous archive jobs were successful or this
        # is a backup operation (best-effort even if we have failed transfers)
        if (self.list_jobs
                and ((self.oper != self.config.BACKUP_OP and status) or
                     (self.oper == self.config.BACKUP_OP))):
            thread = ThreadJob(self.list_jobs)
            thread.start()
            self.threads.append(thread)
            del self.list_jobs[:]

        # If a previous archive job failed or we need to wait for all jobs to
        # finish then join the threads and collect their status
        if (self.oper != self.config.BACKUP_OP and not status) or wait_all:
            remove_indx, retry_threads = [], []

            while self.threads:
                for indx, thread in enumerate(self.threads):
                    thread.join()

                    # If failed then attempt a retry
                    if (not thread.xrd_status.ok
                            and thread.retries <= self.config.MAX_RETRIES):

                        self.logger.log(
                            logging.INFO,
                            ("Thread={0} failed, retries={1}").format(
                                thread.ident, thread.retries))
                        rthread = ThreadJob(thread.lst_jobs, thread.retries)
                        rthread.start()
                        retry_threads.append(rthread)
                        remove_indx.append(indx)
                        self.logger.log(
                            logging.INFO,
                            ("New thread={0} doing a retry").format(
                                rthread.ident))
                        continue

                    status = status and thread.xrd_status.ok
                    log_level = logging.INFO if thread.xrd_status.ok else logging.ERROR
                    self.logger.log(
                        log_level, ("Thread={0} status={1} msg={2}").format(
                            thread.ident, thread.xrd_status.ok,
                            thread.xrd_status.message.decode("utf-8")))
                    remove_indx.append(indx)

                # Remove old/finished threads and add retry ones. For removal we
                # need to start with big indexes first.
                remove_indx.reverse()

                for indx in remove_indx:
                    del self.threads[indx]

                self.threads.extend(retry_threads)
                del retry_threads[:]
                del remove_indx[:]

        return status
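
    # The loop above behaves like a bounded pool: join each thread briefly,
    # reap the ones that finished (retrying failed batches up to
    # MAX_RETRIES), and only then queue new ThreadJob batches. Stripped of
    # the retry bookkeeping, the reaping step is roughly:
    #
    #     while len(self.threads) >= self.config.MAX_THREADS:
    #         for thread in list(self.threads):
    #             thread.join(self.config.JOIN_TIMEOUT)
    #             if not thread.isAlive():
    #                 self.threads.remove(thread)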

    def update_file_access(self, err_entry=None, found_checkpoint=False):
        """ Set the ownership and the permissions for the files copied to EOS.
        This is done only for GET operation i.e. self.archive.d2t == False.

        Args:
           err_entry (list): Entry record from the archive file corresponding
               to the first file/dir that was corrupted.
           found_checkpoint (boolean): If True, it means the checkpoint was
                 already found and we don't need to search for it i.e. the
                 corrupted entry is a directory.

        Raises:
            IOError: chown or chmod operations failed
        """
        if self.archive.d2t:
            return

        self.set_status("updating file access")
        t0 = time.time()
        oper = 'query'
        metahandler = MetaHandler()
        fs = self.archive.fs_src

        for fentry in self.archive.files():
            # If backup operation and time window specified then update only matching ones
            if self.oper == self.config.BACKUP_OP:
                if self.archive.header['twindow_type'] and self.archive.header[
                        'twindow_val']:
                    dfile = dict(
                        zip(self.archive.header['file_meta'], fentry[2:]))
                    twindow_sec = int(self.archive.header['twindow_val'])
                    tentry_sec = int(
                        float(dfile[self.archive.header['twindow_type']]))

                    if tentry_sec < twindow_sec:
                        continue

            # Search for the recovery checkpoint
            if err_entry and not found_checkpoint:
                if fentry != err_entry:
                    continue
                else:
                    found_checkpoint = True

            __, surl = self.archive.get_endpoints(fentry[1])
            url = client.URL(surl.encode("utf-8"))
            dict_meta = dict(zip(self.archive.header['file_meta'], fentry[2:]))

            # Send the chown async request
            arg = ''.join([
                url.path, "?eos.ruid=0&eos.rgid=0&mgm.pcmd=chown&uid=",
                dict_meta['uid'], "&gid=", dict_meta['gid']
            ])
            xrd_st = fs.query(QueryCode.OPAQUEFILE,
                              arg.encode("utf-8"),
                              callback=metahandler.register(oper, surl))

            if not xrd_st.ok:
                __ = metahandler.wait(oper)
                err_msg = "Failed query chown for path={0}".format(surl)
                self.logger.error(err_msg)
                raise IOError(err_msg)

            # Send the chmod async request
            mode = int(dict_meta['mode'], 8)  # mode is saved in octal format
            arg = ''.join([
                url.path, "?eos.ruid=0&eos.rgid=0&mgm.pcmd=chmod&mode=",
                str(mode)
            ])
            xrd_st = fs.query(QueryCode.OPAQUEFILE,
                              arg.encode("utf-8"),
                              callback=metahandler.register(oper, surl))

            if not xrd_st.ok:
                __ = metahandler.wait(oper)
                err_msg = "Failed query chmod for path={0}".format(surl)
                self.logger.error(err_msg)
                raise IOError(err_msg)

            # Send the utime async request to set the mtime
            mtime = dict_meta['mtime']
            mtime_sec, mtime_nsec = mtime.split('.', 1)
            ctime = dict_meta['ctime']
            ctime_sec, ctime_nsec = ctime.split('.', 1)
            arg = ''.join([
                url.path, "?eos.ruid=0&eos.rgid=0&mgm.pcmd=utimes",
                "&tv1_sec=", ctime_sec, "&tv1_nsec=", ctime_nsec, "&tv2_sec=",
                mtime_sec, "&tv2_nsec=", mtime_nsec
            ])
            xrd_st = fs.query(QueryCode.OPAQUEFILE,
                              arg.encode("utf-8"),
                              callback=metahandler.register(oper, surl))

            if not xrd_st.ok:
                __ = metahandler.wait(oper)
                err_msg = "Failed query utimes for path={0}".format(surl)
                self.logger.error(err_msg)
                raise IOError(err_msg)

        status = metahandler.wait(oper)

        if status:
            t1 = time.time()
            self.logger.info("TIMING_update_file_access={0} sec".format(t1 -
                                                                        t0))
        else:
            err_msg = "Failed update file access"
            self.logger.error(err_msg)
            raise IOError(err_msg)

    def check_previous_tx(self):
        """ Find checkpoint for a previous run. There are two types of checks
        being done:
        - transfer check = verify that the files exist and have the correct
                           size and checksum
        - metadata check = verify that all the entries have the correct meta-
                           data values set

        Returns:
           (tx_ok, meta_ok, lst_failed): Tuple holding the status of the
           different checks and the list of corrupted entries.
        """
        msg = "verify last run"
        self.set_status(msg)
        meta_ok = False
        # Check for existence, file size and checksum
        tx_ok, lst_failed = self.archive.verify(False, True)

        if tx_ok:
            meta_ok, lst_failed = self.archive.verify(False, False)

            if meta_ok:
                self.do_retry = False
                raise NoErrorException()

        # Delete the corrupted entry if this is a real transfer error
        if not tx_ok:
            err_entry = lst_failed[0]
            is_dir = (err_entry[0] == 'd')
            self.logger.info("Delete corrupted entry={0}".format(err_entry))

            if is_dir:
                self.archive.del_subtree(err_entry[1], None)
            else:
                self.archive.del_entry(err_entry[1], False, None)

        return (tx_ok, meta_ok, lst_failed)

    def prepare2get(self, err_entry=None, found_checkpoint=False):
        """This method is only executed for GET operations and its purpose is
        to issue the Prepare2Get commands for the files in the archive which
        will later on be copied back to EOS.

        Args:
            err_entry (list): Entry record from the archive file corresponding
                 to the first file/dir that was corrupted.
            found_checkpoint (bool): If True it means the checkpoint was
                 already found and we don't need to search for it.

        Raises:
            IOError: The Prepare2Get request failed.
        """
        if self.archive.d2t:
            return

        count = 0
        limit = 50  # max files per prepare request
        oper = 'prepare'
        self.set_status("prepare2get")
        t0 = time.time()
        lpaths = []
        status = True
        metahandler = MetaHandler()

        for fentry in self.archive.files():
            # Find error checkpoint if not already found
            if err_entry and not found_checkpoint:
                if fentry != err_entry:
                    continue
                else:
                    found_checkpoint = True

            count += 1
            surl, __ = self.archive.get_endpoints(fentry[1])
            lpaths.append(surl[surl.rfind('//') + 1:])

            if len(lpaths) == limit:
                xrd_st = self.archive.fs_dst.prepare(
                    lpaths,
                    PrepareFlags.STAGE,
                    callback=metahandler.register(oper, surl))

                if not xrd_st.ok:
                    __ = metahandler.wait(oper)
                    err_msg = "Failed prepare2get for path={0}".format(surl)
                    self.logger.error(err_msg)
                    raise IOError(err_msg)

                # Wait for batch to be executed
                del lpaths[:]
                status = status and metahandler.wait(oper)
                self.logger.debug(
                    ("Prepare2get done count={0}/{1}"
                     "").format(count, self.archive.header['num_files']))

                if not status:
                    break

        # Send the remaining requests
        if lpaths and status:
            xrd_st = self.archive.fs_dst.prepare(lpaths,
                                                 PrepareFlags.STAGE,
                                                 callback=metahandler.register(
                                                     oper, surl))

            if not xrd_st.ok:
                __ = metahandler.wait(oper)
                err_msg = "Failed prepare2get"
                self.logger.error(err_msg)
                raise IOError(err_msg)

            # Wait for batch to be executed
            del lpaths[:]
            status = status and metahandler.wait(oper)

        if status:
            t1 = time.time()
            self.logger.info("TIMING_prepare2get={0} sec".format(t1 - t0))
        else:
            err_msg = "Failed prepare2get"
            self.logger.error(err_msg)
            raise IOError(err_msg)
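
    # For reference, a minimal synchronous form of one stage batch (the path
    # is a placeholder; the code above uses async callbacks instead):
    #
    #     from XRootD.client.flags import PrepareFlags
    #     st, _ = self.archive.fs_dst.prepare(["/eos/demo/f1.dat"],
    #                                         PrepareFlags.STAGE)
    #     if not st.ok:
    #         raise IOError(st.message)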

    def wait_on_tape(self):
        """ Check and wait that all the files are on tape, which in our case
        means checking the "m" bit. If file is not on tape then suspend the
        current thread for a period between 1 and 10 minutes depending on the
        index of the failed file.
        """
        min_timeout, max_timeout = 5, 1

        while True:
            indx = 0  # index of the first file not on tape
            all_on_tape = True

            for fentry in self.archive.files():
                indx += 1
                __, dst = self.archive.get_endpoints(fentry[1])
                url = client.URL(dst.encode("utf-8"))
                st_stat, resp_stat = self.archive.fs_dst.stat(
                    url.path.encode("utf-8"))

                if not st_stat.ok:
                    err_msg = "Error stat entry={0}".format(dst)
                    self.logger.error(err_msg)
                    raise IOError(err_msg)

                # Check file is on tape
                if not resp_stat.flags & StatInfoFlags.BACKUP_EXISTS:
                    self.logger.debug(
                        "File {0} is not yet on tape".format(dst))
                    all_on_tape = False
                    break

            if all_on_tape:
                break
            else:
                # Set timeout value
                ratio = indx / int(self.archive.header['num_files'])
                timeout = int(max_timeout * (1 - ratio))

                if timeout < min_timeout:
                    timeout = min_timeout

                self.logger.info(
                    "Going to sleep for {0} seconds".format(timeout))
                sleep(timeout)

    def backup_prepare(self):
        """ Prepare requested backup operation.

        Raises:
            IOError: Failed to transfer backup file.
        """
        # Copy backup file from EOS to the local disk
        self.logger.info(("Prepare backup copy from {0} to {1}"
                          "").format(self.efile_full, self.tx_file))
        eos_fs = client.FileSystem(self.efile_full.encode("utf-8"))
        st, _ = eos_fs.copy(
            (self.efile_full + "?eos.ruid=0&eos.rgid=0").encode("utf-8"),
            self.tx_file.encode("utf-8"), True)

        if not st.ok:
            err_msg = (
                "Failed to copy backup file={0} to local disk at={1} err_msg={2}"
                "").format(self.efile_full, self.tx_file, st.message)
            self.logger.error(err_msg)
            raise IOError(err_msg)

        # Create the ArchiveFile object for the backup which is similar to a
        # tape to disk transfer
        self.archive = ArchiveFile(self.tx_file, False)

        # Check that the destination directory exists and has mode 777, if
        # forced then skip checks
        if not self.force:
            surl = self.archive.header['dst']
            url = client.URL(surl.encode("utf-8"))
            fs = self.archive.get_fs(surl)
            st_stat, resp_stat = fs.stat(
                (url.path + "?eos.ruid=0&eos.rgid=0").encode("utf-8"))

            if not st_stat.ok:
                err_msg = ("Failed to stat backup destination url={0}"
                           "").format(surl)
                self.logger.error(err_msg)
                raise IOError(err_msg)

            if resp_stat.flags != (client.StatInfoFlags.IS_READABLE
                                   | client.StatInfoFlags.IS_WRITABLE):
                err_msg = ("Backup destination url={0} must have move 777"
                           ).format(surl)
                self.logger.error(err_msg)
                raise IOError(err_msg)

    def do_backup(self):
        """ Perform a backup operation using the provided backup file.
        """
        t0 = time.time()
        indx_dir = 0

        # Root owns the .sys.b#.backup.file
        fs = client.FileSystem(self.efile_full.encode("utf-8"))
        efile_url = client.URL(self.efile_full.encode("utf-8"))
        arg = ''.join([
            efile_url.path, "?eos.ruid=0&eos.rgid=0&mgm.pcmd=chown&uid=0&gid=0"
        ])
        xrd_st, __ = fs.query(QueryCode.OPAQUEFILE, arg.encode("utf-8"))

        if not xrd_st.ok:
            err_msg = "Failed setting ownership of the backup file: {0}".format(
                self.efile_full)
            self.logger.error(err_msg)
            raise IOError(err_msg)

        # Create directories
        for dentry in self.archive.dirs():
            # Do special checks for root directory
            #if dentry[1] == "./":
            #    self.archive.check_root_dir()

            indx_dir += 1
            self.archive.mkdir(dentry)
            msg = "create dir {0}/{1}".format(indx_dir,
                                              self.archive.header['num_dirs'])
            self.set_status(msg)

        # Copy files and set metadata information
        self.copy_files()
        self.update_file_access()

        self.set_status("verifying")
        check_ok, lst_failed = self.archive.verify(True)
        self.backup_write_status(lst_failed, check_ok)

        self.set_status("cleaning")
        self.logger.info("TIMING_transfer={0} sec".format(time.time() - t0))
        self.backup_tx_clean()

    def backup_write_status(self, lst_failed, check_ok):
        """ Create backup status file which constains the list of failed files
            to transfer.

            Args:
               lst_failed (list): List of failed file transfers
               check_ok (boolean): True if verification successful, otherwise
                   false
        """
        if not check_ok:
            self.logger.error("Failed verification for {0} entries".format(
                len(lst_failed)))
            fn_status = ''.join([
                self.efile_root, ".sys.b#.backup.err.",
                str(len(lst_failed)), "?eos.ruid=0&eos.rgid=0"
            ])
        else:
            self.logger.info("Backup successful - no errors detected")
            fn_status = ''.join(
                [self.efile_root, ".sys.b#.backup.done?eos.ruid=0&eos.rgid=0"])

        with client.File() as f:
            f.open(fn_status.encode("utf-8"),
                   OpenFlags.UPDATE | OpenFlags.DELETE)
            offset = 0

            for entry in lst_failed:
                buff = "Failed entry={0}\n".format(entry).encode("utf-8")
                f.write(buff, offset, len(buff))
                offset += len(buff)
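
Both archive_tx_clean() and do_backup() above change file ownership through an
opaque-file query against the MGM (mgm.pcmd=chown). A minimal synchronous
sketch of that call shape, assuming the XRootD bindings; host, path, uid and
gid are illustrative placeholders:

# Minimal sketch of an EOS chown query via the XRootD bindings.
from XRootD import client
from XRootD.client.flags import QueryCode

fs = client.FileSystem("root://eos.example.org")
arg = ("/eos/demo/file.log?eos.ruid=0&eos.rgid=0"
       "&mgm.pcmd=chown&uid=1001&gid=1001")
xrd_st, response = fs.query(QueryCode.OPAQUEFILE, arg)

if not xrd_st.ok:
    raise IOError("chown query failed: {0}".format(xrd_st.message))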
Example #8
class Transfer(object):
    """ Trasfer archive object

    Attributes:
        req_json (JSON): Command received from the EOS MGM. Needs to contain
            the following entries: cmd, src, opt, uid, gid
        threads (list): List of threads doing partial transfers (CopyProcess jobs)
    """
    def __init__(self, req_json, config):
        self.config = config
        self.oper = req_json['cmd']
        self.uid, self.gid = req_json['uid'], req_json['gid']
        self.do_retry = (req_json['opt'] == self.config.OPT_RETRY)
        self.efile_full = req_json['src']
        self.efile_root = self.efile_full[:-(len(self.efile_full) - self.efile_full.rfind('/') - 1)]
        self.root_dir = self.efile_root[self.efile_root.rfind('//') + 1:]
        self.uuid = sha256(self.root_dir).hexdigest()
        local_file = join(self.config.DIR[self.oper], self.uuid)
        self.tx_file = local_file + ".tx"
        self.list_jobs, self.threads = [], []
        self.pid = os.getpid()
        self.archive = None
        # Special case for the initial PUT as we also need to copy the archive file
        self.init_put = self.efile_full.endswith(self.config.ARCH_INIT)
        self.status = "initializing"
        self.lock_status = threading.Lock()
        self.timestamp = time.time()
        self.logger = logging.getLogger("transfer")
        self.thread_status = ThreadStatus(self)

    def get_status(self):
        """ Get current status

        Returns:
            String representing the status
        """
        self.lock_status.acquire()
        ret = self.status
        self.lock_status.release()
        return ret

    def set_status(self, msg):
        """ Set current status

        Args:
            msg (string): New status
        """
        self.lock_status.acquire()
        self.status = msg
        self.lock_status.release()

    def run(self):
        """ Run requested operation - fist call prepare

        Raises:
            IOError
        """
        self.thread_status.start()

        if self.oper in [self.config.PUT_OP, self.config.GET_OP]:
            self.archive_prepare()

            if self.do_retry:
                self.do_retry_transfer()
            else:
                self.do_transfer()
        elif self.oper in [self.config.PURGE_OP, self.config.DELETE_OP]:
            self.archive_prepare()
            self.do_delete((self.oper == self.config.DELETE_OP))
        elif self.oper == self.config.BACKUP_OP:
            self.backup_prepare()
            self.do_backup()

    def archive_prepare(self):
        """ Prepare requested archive operation.

        Raises:
            IOError: Failed to rename or transfer archive file.
        """
        # Rename archive file in EOS
        efile_url = client.URL(self.efile_full.encode("utf-8"))
        eosf_rename = ''.join([self.efile_root, self.config.ARCH_FN, ".", self.oper, ".err"])
        rename_url = client.URL(eosf_rename.encode("utf-8"))
        frename = ''.join([rename_url.protocol, "://", rename_url.hostid,
                           "//proc/user/?mgm.cmd=file&mgm.subcmd=rename"
                           "&mgm.path=", efile_url.path,
                           "&mgm.file.source=", efile_url.path,
                           "&mgm.file.target=", rename_url.path])
        (status, __, stderr) = exec_cmd(frename)

        if not status:
            err_msg = ("Failed to rename archive file {0} to {1}, msg={2}"
                       "").format(self.efile_full, rename_url, stderr)
            self.logger.error(err_msg)
            raise IOError(err_msg)

        # Copy archive file from EOS to the local disk
        self.efile_full = eosf_rename
        eos_fs = client.FileSystem(self.efile_full.encode("utf-8"))
        st, _ = eos_fs.copy(self.efile_full + "?eos.ruid=0&eos.rgid=0",
                            self.tx_file, True)

        if not st.ok:
            err_msg = ("Failed to copy archive file={0} to local disk at={1}"
                       "").format(self.efile_full, self.tx_file)
            self.logger.error(err_msg)
            raise IOError(err_msg)

        # Create the ArchiveFile object
        d2t = (self.oper == self.config.PUT_OP)
        self.archive = ArchiveFile(self.tx_file, d2t)

    def do_delete(self, tape_delete):
        """ Delete archive either from disk (purge) or from tape (delete)

        Args:
            tape_delete (boolean): If true delete data from tape, otherwise
            from disk.

        Raises:
            IOError: Failed to delete an entry.
        """
        del_dirs = []
        self.logger.info("Do delete with tape_delete={0}".format(tape_delete))
        # Delete also the archive file saved on tape
        if tape_delete:
            self.archive.del_entry(self.config.ARCH_INIT, False, tape_delete)

        # First remove all the files and then the directories
        for fentry in self.archive.files():
            # d2t is false for both purge and deletion
            self.archive.del_entry(fentry[1], False, tape_delete)

        for dentry in self.archive.dirs():
            # Don't remove the root directory when purging
            if not tape_delete and dentry[1] == './':
                continue

            del_dirs.append(dentry[1])

        # Remove the directories from bottom up
        while len(del_dirs):
            dpath = del_dirs.pop()
            self.archive.del_entry(dpath, True, tape_delete)

        # Remove immutable flag from the EOS sub-tree
        if tape_delete:
            self.archive.make_mutable()

        self.archive_tx_clean(True)

    def do_transfer(self):
        """ Execute a put or get operation.

        Raises:
            IOError: When an IO operation fails.
        """
        t0 = time.time()
        indx_dir = 0

        # Create directories
        for dentry in self.archive.dirs():
            if dentry[1] == "./":
                self.archive.check_root_dir()

            indx_dir += 1
            self.archive.mkdir(dentry)
            msg = "create dir {0}/{1}".format(indx_dir, self.archive.header['num_dirs'])
            self.set_status(msg)

        # For GET issue the Prepare2Get for all the files on tape
        self.prepare2get()

        # Copy files
        self.copy_files()

        # For GET set file ownership and permissions
        self.update_file_access()

        # Verify the transferred entries
        self.set_status("verifying")
        check_ok, __ = self.archive.verify(False)

        # For PUT operations wait until all the files are on tape
        # TODO: enable this when we run with XRootD 4.* and have the
        # BACKUP_EXISTS flag
        # if self.archive.d2t:
        #    self.set_status("wait_on_tape")
        #    self.wait_on_tape()

        self.set_status("cleaning")
        self.logger.info("TIMING_transfer={0} sec".format(time.time() - t0))
        self.archive_tx_clean(check_ok)

    def do_retry_transfer(self):
        """ Execute a put or get retry operation.

        Raises:
            IOError: When an IO operation fails.
        """
        t0 = time.time()
        indx_dir = 0
        err_entry = None
        tx_ok, meta_ok = True, True
        found_checkpoint = False  # flag set when reaching recovery entry

        # Get the first corrupted entry and the type of corruption
        (tx_ok, meta_ok, lst_failed) = self.check_previous_tx()

        if not tx_ok or not meta_ok:
            err_entry = lst_failed[0]

        # Create directories
        for dentry in self.archive.dirs():
            # Search for the recovery checkpoint
            if not found_checkpoint:
                if dentry != err_entry:
                    indx_dir += 1
                    continue
                else:
                    found_checkpoint = True

            indx_dir += 1
            self.archive.mkdir(dentry)
            msg = "create dir {0}/{1}".format(indx_dir, self.archive.header['num_dirs'])
            self.set_status(msg)

        if not tx_ok:
            # For GET issue the Prepare2Get for all the files on tape
            self.prepare2get(err_entry, found_checkpoint)

            # Copy files
            self.copy_files(err_entry, found_checkpoint)

            # For GET set file ownership and permissions for all entries
            self.update_file_access(err_entry, found_checkpoint)

        else:
            # For GET metadata errors re-apply the file ownership and
            # permissions for all the entries
            self.update_file_access()

        # Verify the transferred entries
        self.set_status("verifying")
        check_ok, __ = self.archive.verify(False)

        # For PUT operations wait until all the files are on tape
        # TODO: enable this when we run with XRootD 4.* and have the
        # BACKUP_EXISTS flag
        # if self.archive.d2t:
        #    self.set_status("wait_on_tape")
        #    self.wait_on_tape()

        self.set_status("cleaning")
        self.logger.info("TIMING_transfer={0} sec".format(time.time() - t0))
        self.archive_tx_clean(check_ok)

    def tx_clean(self, check_ok):
        """ Clean a backup/archive transfer depending on its type.
        """
        if self.oper == self.config.BACKUP_OP:
            self.backup_tx_clean()
        else:
            self.archive_tx_clean(check_ok)

    def backup_tx_clean(self):
        """ Clean after a backup tranfer by copying the log file in the same
        directory as the destiantion of the backup.
        """
        # Copy local log file to EOS directory
        eos_log = ''.join([self.efile_root, ".sys.b#.backup.log?eos.ruid=0&eos.rgid=0"])
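        # Illustrative only (hypothetical path): eos_log expands to something
        # like root://eos-host//eos/some/dir/.sys.b#.backup.log?eos.ruid=0&eos.rgid=0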
        self.logger.debug("Copy log:{0} to {1}".format(self.config.LOG_FILE, eos_log))
        self.config.handler.flush()
        cp_client = client.FileSystem(self.efile_full.encode("utf-8"))
        st, __ = cp_client.copy(self.config.LOG_FILE, eos_log, force=True)

        if not st.ok:
            self.logger.error(("Failed to copy log file {0} to EOS at {1}"
                               "").format(self.config.LOG_FILE, eos_log))
        else:
            # Delete log file if it was successfully copied to EOS
            try:
                os.remove(self.config.LOG_FILE)
            except OSError as __:
                pass

        # Delete all local files associated with this transfer
        try:
            os.remove(self.tx_file)
        except OSError as __:
            pass

        # Join async status thread
        self.thread_status.do_finish()
        self.thread_status.join()

    def archive_tx_clean(self, check_ok):
        """ Clean the transfer by renaming the archive file in EOS adding the
        following extensions:
        .done - the transfer was successful
        .err  - there were errors during the transfer. These are logged in the
             file .archive.log in the same directory.

        Args:
            check_ok (bool): True if no error occurred during transfer,
                otherwise false.
        """
        # Rename arch file in EOS to reflect the status
        if not check_ok:
            eosf_rename = ''.join([self.efile_root, self.config.ARCH_FN, ".", self.oper, ".err"])
        else:
            eosf_rename = ''.join([self.efile_root, self.config.ARCH_FN, ".", self.oper, ".done"])

        old_url = client.URL(self.efile_full.encode("utf-8"))
        new_url = client.URL(eosf_rename.encode("utf-8"))
        frename = ''.join([old_url.protocol, "://", old_url.hostid, "//proc/user/?",
                           "mgm.cmd=file&mgm.subcmd=rename&mgm.path=", old_url.path,
                           "&mgm.file.source=", old_url.path,
                           "&mgm.file.target=", new_url.path])
        (status, __, stderr) = exec_cmd(frename)

        if not status:
            err_msg = ("Failed to rename {0} to {1}, msg={2}"
                       "").format(self.efile_full, eosf_rename, stderr)
            self.logger.error(err_msg)
            # TODO: raise IOError
        else:
            # For successful delete operations remove also the archive file
            if self.oper == self.config.DELETE_OP and check_ok:
                fs = client.FileSystem(self.efile_full.encode("utf-8"))
                st_rm, __ = fs.rm(new_url.path + "?eos.ruid=0&eos.rgid=0")

                if not st_rm.ok:
                    warn_msg = "Failed to delete archive {0}".format(new_url.path)
                    self.logger.warning(warn_msg)

        # Copy local log file back to EOS directory and set the ownership to the
        # identity of the client who triggered the archive
        dir_root = self.efile_root[self.efile_root.rfind('//') + 1:]
        eos_log = ''.join([old_url.protocol, "://", old_url.hostid, "/",
                           dir_root, self.config.ARCH_FN, ".log?eos.ruid=0&eos.rgid=0"])

        self.logger.debug("Copy log:{0} to {1}".format(self.config.LOG_FILE, eos_log))
        self.config.handler.flush()
        cp_client = client.FileSystem(self.efile_full.encode("utf-8"))
        st, __ = cp_client.copy(self.config.LOG_FILE, eos_log, force=True)

        if not st.ok:
            self.logger.error(("Failed to copy log file {0} to EOS at {1}"
                               "").format(self.config.LOG_FILE, eos_log))
        else:
            # User triggering archive operation owns the log file
            eos_log_url = client.URL(eos_log)
            fs = client.FileSystem(eos_log.encode("utf-8"))
            arg = ''.join([eos_log_url.path, "?eos.ruid=0&eos.rgid=0&mgm.pcmd=chown&uid=",
                           self.uid, "&gid=", self.gid])
            xrd_st, __ = fs.query(QueryCode.OPAQUEFILE, arg.encode("utf-8"))

            if not xrd_st.ok:
                err_msg = ("Failed setting ownership of the log file in"
                           " EOS: {0}").format(eos_log)
                self.logger.error(err_msg)
                raise IOError(err_msg)
            else:
                # Delete log if successfully copied to EOS and changed ownership
                try:
                    os.remove(self.config.LOG_FILE)
                except OSError as __:
                    pass

        # Delete all local files associated with this transfer
        try:
            os.remove(self.tx_file)
        except OSError as __:
            pass

        # Join async status thread
        self.thread_status.do_finish()
        self.thread_status.join()

    def copy_files(self, err_entry=None, found_checkpoint=False):
        """ Copy files.

        Note that when doing PUT the layout is not preserved. Therefore, a file
        with 3 replicas will end up as just a simple file in the new location.

        Args:
            err_entry (list): Entry record from the archive file corresponding
                 to the first file/dir that was corrupted.
            found_checkpoint (boolean): If True it means the checkpoint was
                 already found and we don't need to search for it.

        Raises:
            IOError: Copy request failed.
        """
        indx_file = 0
        # For the initial PUT also copy the archive file to tape
        if self.init_put:
            __, dst = self.archive.get_endpoints(self.config.ARCH_INIT)
            self.list_jobs.append((self.efile_full + "?eos.ruid=0&eos.rgid=0", dst))

        # Copy files
        for fentry in self.archive.files():
            # Search for the recovery checkpoint
            if self.do_retry and not found_checkpoint:
                if fentry != err_entry:
                    indx_file += 1
                    continue
                else:
                    found_checkpoint = True

            indx_file += 1
            msg = "copy file {0}/{1}".format(indx_file, self.archive.header['num_files'])
            self.set_status(msg)
            src, dst = self.archive.get_endpoints(fentry[1])

            # Copy file
            if not self.archive.d2t:
                # For GET we also have the dictionary with the metadata
                dfile = dict(zip(self.archive.header['file_meta'], fentry[2:]))
                dst = ''.join([dst, "?eos.ctime=", dfile['ctime'],
                               "&eos.mtime=", dfile['mtime'],
                               "&eos.bookingsize=", dfile['size'],
                               "&eos.targetsize=", dfile['size'],
                               "&eos.ruid=0&eos.rgid=0"])

                # If the checksum is 0 don't enforce it
                if dfile['xs'] != "0":
                    dst = ''.join([dst, "&eos.checksum=", dfile['xs']])

                # For backup we try to read as root from the source
                if self.oper == self.config.BACKUP_OP:
                    if '?' in src:
                        src = ''.join([src, "&eos.ruid=0&eos.rgid=0"])
                    else:
                        src = ''.join([src, "?eos.ruid=0&eos.rgid=0"])

                    # If this is a version file we save it as a 2-replica layout
                    if is_version_file(fentry[1]):
                        dst = ''.join([dst, "&eos.layout.checksum=", dfile['xstype'],
                                       "&eos.layout.type=replica&eos.layout.nstripes=2"])

                    # If time window specified then select only the matching entries
                    if (self.archive.header['twindow_type'] and
                            self.archive.header['twindow_val']):
                        twindow_sec = int(self.archive.header['twindow_val'])
                        tentry_sec = int(float(dfile[self.archive.header['twindow_type']]))

                        if tentry_sec < twindow_sec:
                            continue
            else:
                # For PUT read the files from EOS as root
                src = ''.join([src, "?eos.ruid=0&eos.rgid=0"])

            self.logger.info("Copying from {0} to {1}".format(src, dst))
            self.list_jobs.append((src, dst))

            if len(self.list_jobs) >= self.config.BATCH_SIZE:
                st = self.flush_files(False)

                # For archives we fail immediately, for backups it's best-effort
                if not st and self.oper != self.config.BACKUP_OP:
                    err_msg = "Failed to flush files"
                    self.logger.error(err_msg)
                    raise IOError(err_msg)

        # Flush all pending copies and set metadata info for GET operation
        st = self.flush_files(True)

        if not st and self.oper != self.config.BACKUP_OP:
            err_msg = "Failed to flush files"
            self.logger.error(err_msg)
            raise IOError(err_msg)

    def flush_files(self, wait_all):
        """ Flush all pending transfers from the list of jobs.

        Args:
            wait_all (bool): If true wait and collect the status from all
                executing threads.

        Returns:
            True if files flushed successfully, otherwise false.
        """
        status = True

        # Wait until a thread from the pool gets freed if we reached the maximum
        # allowed number of running threads
        while len(self.threads) >= self.config.MAX_THREADS:
            for indx, thread in enumerate(self.threads):
                thread.join(self.config.JOIN_TIMEOUT)

                # If thread finished get the status and mark it for removal
                if not thread.is_alive():
                    status = status and thread.xrd_status.ok
                    log_level = logging.INFO if thread.xrd_status.ok else logging.ERROR
                    self.logger.log(log_level,
                                    ("Thread={0} status={1} msg={2}"
                                     "").format(thread.ident, thread.xrd_status.ok,
                                                thread.xrd_status.message.decode("utf-8")))
                    del self.threads[indx]
                    break

        # If we still have jobs and previous archive jobs were successful or this
        # is a backup operation (best-effort even if we have failed transfers)
        if (self.list_jobs and ((self.oper != self.config.BACKUP_OP and status) or
                                (self.oper == self.config.BACKUP_OP))):
            proc = client.CopyProcess()

            for job in self.list_jobs:
                # TODO: use the parallel mode starting with XRootD 4.1
                proc.add_job(job[0].encode("utf-8"), job[1].encode("utf-8"),
                             force=self.do_retry, thirdparty=True)

            del self.list_jobs[:]
            thread = ThreadJob(proc)
            thread.start()
            self.threads.append(thread)
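            # At most MAX_THREADS ThreadJob workers run concurrently, each
            # driving one CopyProcess with up to BATCH_SIZE queued copy jobs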

        # If a previous archive job failed or we need to wait for all jobs to
        # finish then join the threads and collect their status
        if (self.oper != self.config.BACKUP_OP and not status) or wait_all:
            for thread in self.threads:
                thread.join()
                status = status and thread.xrd_status.ok
                log_level = logging.INFO if thread.xrd_status.ok else logging.ERROR
                self.logger.log(log_level,
                                ("Thread={0} status={1} msg={2}"
                                 "").format(thread.ident, thread.xrd_status.ok,
                                            thread.xrd_status.message.decode("utf-8")))

            del self.threads[:]

        return status

    def update_file_access(self, err_entry=None, found_checkpoint=False):
        """ Set the ownership and the permissions for the files copied to EOS.
        This is done only for GET operations, i.e. self.archive.d2t == False.

        Args:
           err_entry (list): Entry record from the archive file corresponding
               to the first file/dir that was corrupted.
           found_checkpoint (boolean): If True, it means the checkpoint was
                 already found and we don't need to search for it i.e. the
                 corrupted entry is a directory.

        Raises:
            IOError: chown, chmod or utimes operation failed.
        """
        if self.archive.d2t:
            return

        self.set_status("updating file access")
        t0 = time.time()
        oper = 'query'
        metahandler = MetaHandler()
        fs = self.archive.fs_src

        for fentry in self.archive.files():
            # If backup operation and time window specified then update only matching ones
            if self.oper == self.config.BACKUP_OP:
                if self.archive.header['twindow_type'] and self.archive.header['twindow_val']:
                    dfile = dict(zip(self.archive.header['file_meta'], fentry[2:]))
                    twindow_sec = int(self.archive.header['twindow_val'])
                    tentry_sec = int(float(dfile[self.archive.header['twindow_type']]))

                    if tentry_sec < twindow_sec:
                        continue

            # Search for the recovery checkpoint
            if err_entry and not found_checkpoint:
                if fentry != err_entry:
                    continue
                else:
                    found_checkpoint = True

            __, surl = self.archive.get_endpoints(fentry[1])
            url = client.URL(surl.encode("utf-8"))
            dict_meta = dict(zip(self.archive.header['file_meta'], fentry[2:]))

            # Send the chown async request
            arg = ''.join([url.path, "?eos.ruid=0&eos.rgid=0&mgm.pcmd=chown&uid=",
                           dict_meta['uid'], "&gid=", dict_meta['gid']])
            xrd_st = fs.query(QueryCode.OPAQUEFILE, arg.encode("utf-8"),
                              callback=metahandler.register(oper, surl))

            if not xrd_st.ok:
                __ = metahandler.wait(oper)
                err_msg = "Failed query chown for path={0}".format(surl)
                self.logger.error(err_msg)
                raise IOError(err_msg)

            # Send the chmod async request
            mode = int(dict_meta['mode'], 8) # mode is saved in octal format
            arg = ''.join([url.path, "?eos.ruid=0&eos.rgid=0&mgm.pcmd=chmod&mode=",
                           str(mode)])
            xrd_st = fs.query(QueryCode.OPAQUEFILE, arg.encode("utf-8"),
                              callback=metahandler.register(oper, surl))

            if not xrd_st.ok:
                __ = metahandler.wait(oper)
                err_msg = "Failed query chmod for path={0}".format(surl)
                self.logger.error(err_msg)
                raise IOError(err_msg)

            # Send the utime async request to set the mtime
            mtime = dict_meta['mtime']
            mtime_sec, mtime_nsec = mtime.split('.', 1)
            arg = ''.join([url.path, "?eos.ruid=0&eos.rgid=0&mgm.pcmd=utimes",
                           "&tv1_sec=0&tv1_nsec=0&tv2_sec=", mtime_sec,
                           "&tv2_nsec=", mtime_nsec])
            xrd_st = fs.query(QueryCode.OPAQUEFILE, arg.encode("utf-8"),
                              callback=metahandler.register(oper, surl))

            if not xrd_st.ok:
                __ = metahandler.wait(oper)
                err_msg = "Failed query chmod for path={0}".format(surl)
                self.logger.error(err_msg)
                raise IOError(err_msg)

        status = metahandler.wait(oper)

        if status:
            t1 = time.time()
            self.logger.info("TIMING_update_file_access={0} sec".format(t1 - t0))
        else:
            err_msg = "Failed update file access"
            self.logger.error(err_msg)
            raise IOError(err_msg)

    def check_previous_tx(self):
        """ Find checkpoint for a previous run. There are two types of checks
        being done:
        - transfer check = verify that the files exist and have the correct
                           size and checksum
        - metadata check = verify that all the entries have the correct meta-
                           data values set

        Returns:
           (tx_ok, meta_ok, lst_failed): Tuple holding the status of the
           different checks and the list of corrupted entries.
        """
        msg = "verify last run"
        self.set_status(msg)
        meta_ok = False
        # Check for existence, file size and checksum
        tx_ok, lst_failed = self.archive.verify(False, True)

        if tx_ok:
            meta_ok, lst_failed = self.archive.verify(False, False)

            if meta_ok:
                self.do_retry = False
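                # Nothing left to retry - the previous run completed cleanly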
                raise NoErrorException()

        # Delete the corrupted entry if this is a real transfer error
        if not tx_ok:
            err_entry = lst_failed[0]
            is_dir = (err_entry[0] == 'd')
            self.logger.info("Delete corrupted entry={0}".format(err_entry))

            if is_dir:
                self.archive.del_subtree(err_entry[1], None)
            else:
                self.archive.del_entry(err_entry[1], False, None)

        return (tx_ok, meta_ok, lst_failed)

    def prepare2get(self, err_entry=None, found_checkpoint=False):
        """This method is only executed for GET operations and it's purpose is
        to issue the Prepapre2Get commands for the files in the archive which
        will later on be copied back to EOS.

        Args:
            err_entry (list): Entry record from the archive file corresponding
                 to the first file/dir that was corrupted.
            found_checkpoint (bool): If True it means the checkpoint was
                 already found and we don't need to search for it.

        Raises:
            IOError: The Prepare2Get request failed.
        """
        if self.archive.d2t:
            return

        count = 0
        limit = 50  # max files per prepare request
        oper = 'prepare'
        self.set_status("prepare2get")
        t0 = time.time()
        lpaths = []
        status = True
        metahandler = MetaHandler()

        for fentry in self.archive.files():
            # Find error checkpoint if not already found
            if err_entry and not found_checkpoint:
                if fentry != err_entry:
                    continue
                else:
                    found_checkpoint = True

            count += 1
            surl, __ = self.archive.get_endpoints(fentry[1])
            lpaths.append(surl[surl.rfind('//') + 1:])
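            # Strip protocol and host, keeping only the absolute path, e.g.
            # (hypothetical) root://eos-host:1094//eos/dir/file -> /eos/dir/file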

            if len(lpaths) == limit:
                xrd_st = self.archive.fs_dst.prepare(lpaths, PrepareFlags.STAGE,
                                                     callback=metahandler.register(oper, surl))

                if not xrd_st.ok:
                    __ = metahandler.wait(oper)
                    err_msg = "Failed prepare2get for path={0}".format(surl)
                    self.logger.error(err_msg)
                    raise IOError(err_msg)

                # Wait for batch to be executed
                del lpaths[:]
                status = status and metahandler.wait(oper)
                self.logger.debug(("Prepare2get done count={0}/{1}"
                                   "").format(count, self.archive.header['num_files']))

                if not status:
                    break

        # Send the remaining requests
        if lpaths and status:
            xrd_st = self.archive.fs_dst.prepare(lpaths, PrepareFlags.STAGE,
                                                 callback=metahandler.register(oper, surl))

            if not xrd_st.ok:
                __ = metahandler.wait(oper)
                err_msg = "Failed prepare2get"
                self.logger.error(err_msg)
                raise IOError(err_msg)

            # Wait for batch to be executed
            del lpaths[:]
            status = status and metahandler.wait(oper)

        if status:
            t1 = time.time()
            self.logger.info("TIMING_prepare2get={0} sec".format(t1 - t0))
        else:
            err_msg = "Failed prepare2get"
            self.logger.error(err_msg)
            raise IOError(err_msg)

    def wait_on_tape(self):
        """ Check and wait that all the files are on tape, which in our case
        means checking the "m" bit. If file is not on tape then suspend the
        current thread for a period between 1 and 10 minutes depending on the
        index of the failed file.
        """
        min_timeout, max_timeout = 60, 600  # i.e. between 1 and 10 minutes

        while True:
            indx = 0 # index of the first file not on tape
            all_on_tape = True

            for fentry in self.archive.files():
                indx += 1
                __, dst = self.archive.get_endpoints(fentry[1])
                url = client.URL(dst.encode("utf-8"))
                st_stat, resp_stat = self.archive.fs_dst.stat(url.path.encode("utf-8"))

                if not st_stat.ok:
                    err_msg = "Error stat entry={0}".format(dst)
                    self.logger.error(err_msg)
                    raise IOError(err_msg)

                # Check file is on tape
                if not resp_stat.flags & StatInfoFlags.BACKUP_EXISTS:
                    self.logger.debug("File {0} is not yet on tape".format(dst))
                    all_on_tape = False
                    break

            if all_on_tape:
                break
            else:
                # Set timeout value
                ratio = float(indx) / int(self.archive.header['num_files'])
                timeout = int(max_timeout * (1 - ratio))

                if timeout < min_timeout:
                    timeout = min_timeout
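                # e.g. (illustrative): with num_files=100 and indx=75, ratio
                # is 0.75 and timeout = int(600 * 0.25) = 150 seconds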

                self.logger.info("Going to sleep for {0} seconds".format(timeout))
                sleep(timeout)

    def backup_prepare(self):
        """ Prepare requested backup operation.

        Raises:
            IOError: Failed to transfer backup file.
        """
        # Copy backup file from EOS to the local disk
        self.logger.info(("Prepare backup copy from {0} to {1}"
                          "").format(self.efile_full, self.tx_file))
        eos_fs = client.FileSystem(self.efile_full.encode("utf-8"))
        st, _ = eos_fs.copy((self.efile_full + "?eos.ruid=0&eos.rgid=0").encode("utf-8"),
                            self.tx_file.encode("utf-8"), True)

        if not st.ok:
            err_msg = ("Failed to copy backup file={0} to local disk at={1}"
                       "").format(self.efile_full, self.tx_file)
            self.logger.error(err_msg)
            raise IOError(err_msg)

        # Create the ArchiveFile object for the backup which is similar to a
        # tape to disk transfer
        self.archive = ArchiveFile(self.tx_file, False)

    def do_backup(self):
        """ Perform a backup operation using the provided backup file.
        """
        t0 = time.time()
        indx_dir = 0

        # Root owns the .sys.b#.backup.file
        fs = client.FileSystem(self.efile_full)
        efile_url = client.URL(self.efile_full)
        arg = ''.join([efile_url.path, "?eos.ruid=0&eos.rgid=0&mgm.pcmd=chown&uid=0&gid=0"])
        xrd_st, __ = fs.query(QueryCode.OPAQUEFILE, arg.encode("utf-8"))

        if not xrd_st.ok:
            err_msg = "Failed setting ownership of the backup file: {0}".format(self.efile_full)
            self.logger.error(err_msg)
            raise IOError(err_msg)

        # Create directories
        for dentry in self.archive.dirs():
            # Do special checks for root directory
            if dentry[1] == "./":
                self.archive.check_root_dir()

            indx_dir += 1
            self.archive.mkdir(dentry)
            msg = "create dir {0}/{1}".format(indx_dir, self.archive.header['num_dirs'])
            self.set_status(msg)

        # Copy files and set metadata information
        self.copy_files()
        self.update_file_access()

        self.set_status("verifying")
        check_ok, lst_failed = self.archive.verify(True)
        self.backup_write_status(lst_failed, check_ok)

        # Delete empty dirs if this was a backup with a time window
        if self.archive.header['twindow_type'] and self.archive.header['twindow_val']:
            self.archive.del_empty_dirs()

        self.set_status("cleaning")
        self.logger.info("TIMING_transfer={0} sec".format(time.time() - t0))
        self.backup_tx_clean()

    def backup_write_status(self, lst_failed, check_ok):
        """ Create backup status file which constains the list of failed files
            to transfer.

            Args:
               lst_failed (list): List of failed file transfers
               check_ok (boolean): True if verification successful, otherwise
                   false
        """
        if not check_ok:
            self.logger.error("Failed verification for {0} entries".format(len(lst_failed)))
            fn_status = ''.join([self.efile_root, ".sys.b#.backup.err.", str(len(lst_failed)),
                                 "?eos.ruid=0&eos.rgid=0"])
        else:
            self.logger.info("Backup successful - no errors detected")
            fn_status = ''.join([self.efile_root, ".sys.b#.backup.done?eos.ruid=0&eos.rgid=0"])

        with client.File() as f:
            f.open(fn_status.encode("utf-8"), OpenFlags.UPDATE | OpenFlags.DELETE)
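            # OpenFlags.DELETE opens the file even if it already exists by
            # deleting it first, so any previous status file is replaced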
            offset = 0

            for entry in lst_failed:
                buff = "Failed entry={0}\n".format(entry).encode("utf-8")
                f.write(buff, offset, len(buff))
                offset += len(buff)