Ejemplo n.º 1
0
    def write_bytes(self, rfd, wfd, work):
        os.lseek(rfd, work.offset, os.SEEK_SET)
        os.lseek(wfd, work.offset, os.SEEK_SET)

        m = None
        if self.verify:
            m = hashlib.sha1()

        remaining = work.length
        while remaining != 0:
            if remaining >= self.blocksize:
                self.read_then_write(rfd, wfd, work, self.blocksize, m)
                remaining -= self.blocksize
            else:
                self.read_then_write(rfd, wfd, work, remaining, m)
                remaining = 0

        if self.verify:
            # use src path here
            ck = ChunkSum(work.dest, offset=work.offset, length=work.length,
                          digest=m.hexdigest())

            if len(self.chunksums_mem) < G.memitem_threshold:
                self.chunksums_mem.append(ck)
            else:
                self.chunksums_buf.append(ck)
                if len(self.chunksums_buf) == G.DB_BUFSIZE:
                    if self.use_store == False:
                        self.workdir = os.getcwd()
                        self.chunksums_dbname = "%s/chunksums.%s" % (G.tempdir, self.circle.rank)
                        self.chunksums_db = DbStore(dbname=self.chunksums_dbname)
                        self.use_store = True
                    self.chunksums_db.mput(self.chunksums_buf)
                    del self.chunksums_buf[:]
Ejemplo n.º 2
0
    def append_fitem(self, fitem):
        """
        if G.use_store:
            self.flist_buf.append(fitem)
            if len(self.flist_buf) == G.DB_BUFSIZE:
                self.flist.mput(self.flist_buf)
                del self.flist_buf[:]

        else:
            self.flist.append(fitem)
        """
        if len(self.flist) < G.memitem_threshold:
            self.flist.append(fitem)
        else:
            self.flist_buf.append(fitem)
            if len(self.flist_buf) == G.DB_BUFSIZE:
                if self.use_store == False:
                    self.dbname = "%s/fwalk.%s" % (G.tempdir, self.circle.rank)
                    self.flist_db = DbStore(self.dbname)
                    self.use_store = True
                self.flist_db.mput(self.flist_buf)
                del self.flist_buf[:]
Ejemplo n.º 3
0
    def __init__(self, circle, src, dest=None, preserve=False, force=False):
        BaseTask.__init__(self, circle)

        self.d = {"rank": "rank %s" % circle.rank}
        self.circle = circle
        self.src = src
        self.dest = dest
        self.force = force
        self.interval = 10  # progress report

        # For now, I am setting the option
        # TODO: should user allowed to meddle this?
        self.sizeonly = False
        self.checksum = False

        # to be fixed
        self.optlist = []  # files
        self.opt_dir_list = []  # dirs

        self.sym_links = 0
        self.follow_sym_links = False

        self.workdir = os.getcwd()
        self.tempdir = os.path.join(self.workdir, ".pcircle")
        if not os.path.exists(self.tempdir):
            os.mkdir(self.tempdir)

        if G.use_store:
            self.dbname = "%s/fwalk.%s" % (self.tempdir, circle.rank)
            self.flist = DbStore(self.dbname)
            self.flist_buf = []
        else:
            self.flist = []
        self.src_flist = self.flist

        # hold unlinkable dest directories
        # we have to do the --fix-opt at the end
        self.dest_dirs = []

        self.cnt_dirs = 0
        self.cnt_files = 0
        self.cnt_filesize = 0
        self.last_cnt = 0
        self.skipped = 0
        self.last_reduce_time = MPI.Wtime()

        # reduce
        self.reduce_items = 0

        self.time_started = MPI.Wtime()
        self.time_ended = None
Ejemplo n.º 4
0
class FWalk(BaseTask):

    def __init__(self, circle, src, dest=None, preserve=False, force=False):
        BaseTask.__init__(self, circle)

        self.d = {"rank": "rank %s" % circle.rank}
        self.circle = circle
        self.src = src
        self.dest = dest
        self.force = force
        self.interval = 10  # progress report

        # For now, I am setting the option
        # TODO: should user allowed to meddle this?
        self.sizeonly = False
        self.checksum = False

        # to be fixed
        self.optlist = []  # files
        self.opt_dir_list = []  # dirs

        self.sym_links = 0
        self.follow_sym_links = False

        self.workdir = os.getcwd()
        self.tempdir = os.path.join(self.workdir, ".pcircle")
        if not os.path.exists(self.tempdir):
            os.mkdir(self.tempdir)

        if G.use_store:
            self.dbname = "%s/fwalk.%s" % (self.tempdir, circle.rank)
            self.flist = DbStore(self.dbname)
            self.flist_buf = []
        else:
            self.flist = []
        self.src_flist = self.flist

        # hold unlinkable dest directories
        # we have to do the --fix-opt at the end
        self.dest_dirs = []

        self.cnt_dirs = 0
        self.cnt_files = 0
        self.cnt_filesize = 0
        self.last_cnt = 0
        self.skipped = 0
        self.last_reduce_time = MPI.Wtime()

        # reduce
        self.reduce_items = 0

        self.time_started = MPI.Wtime()
        self.time_ended = None

    def create(self):
        if self.circle.rank == 0:
            for ele in self.src:
                self.circle.enq(ele)
            print("\nAnalyzing workload ...")

    def copy_xattr(self, src, dest):
        attrs = xattr.listxattr(src)
        for k in attrs:
            try:
                val = xattr.getxattr(src, k)
                xattr.setxattr(dest, k, val)
            except IOError as e:
                log.warn(e, extra=self.d)

    def flushdb(self):
        if len(self.flist_buf) != 0:
            self.flist.mput(self.flist_buf)

    def process_dir(self, fitem, st):
        """ i_dir should be absolute path
        st is the stat object associated with the directory
        """
        i_dir = fitem.path

        if self.dest:
            # we create destination directory
            # but first we check if we need to change mode for it to work
            o_dir = destpath(fitem, self.dest)
            mode = st.st_mode
            if not (st.st_mode & stat.S_IWUSR):
                mode = st.st_mode | stat.S_IWUSR
                self.opt_dir_list.append((o_dir, st))
            try:
                os.mkdir(o_dir, mode)
            except OSError as e:
                log.debug("mkdir(): %s" % e, extra=self.d)

            if G.preserve:
                self.copy_xattr(i_dir, o_dir)

        last_report = MPI.Wtime()
        count = 0
        try:
            entries = scandir(i_dir)
        except OSError as e:
            log.warn(e, extra=self.d)
            self.skipped += 1
        else:
            for entry in entries:
                elefi = FileItem(entry.path)
                if fitem.dirname:
                    elefi.dirname = fitem.dirname
                self.circle.enq(elefi)

                count += 1
                if (MPI.Wtime() - last_report) > self.interval:
                    print("Rank %s : Scanning [%s] at %s" % (self.circle.rank, i_dir, count))
                    last_report = MPI.Wtime()
            log.info("Finish scan of [%s], count=%s" % (i_dir, count), extra=self.d)

    def do_metadata_preserve(self, src_file, dest_file, st):
        """ create file node, copy attribute if needed."""
        if sys.platform == "darwin":  # Mac OS mknod() not permitted
            return

        try:
            mode = st.st_mode
            if not (st.st_mode & stat.S_IWUSR):
                # owner can't write, we will change mode first
                # then put it in optlist to fix
                mode = st.st_mode | stat.S_IWUSR
                self.optlist.append((dest_file,st))
            os.mknod(dest_file, mode)  # -r-r-r special
        except OSError as e:
            log.warn("mknod(): for %s, %s" % (dest_file, e), extra=self.d)
            return

        if G.preserve:
            self.copy_xattr(src_file, dest_file)

    def check_dest_exists(self, src_file, dest_file):
        """ return True if dest exists and checksum verified correct
            return False if (1) no overwrite (2) destination doesn't exist
        """
        if not self.force:
            return False

        if not os.path.exists(dest_file):
            return False

        # well, destination exists, now we have to check
        if self.sizeonly:
            if os.path.getsize(src_file) == os.path.getsize(dest_file):
                log.warn("Check sizeonly Okay: src: %s, dest=%s" % (src_file, dest_file),
                                 extra=self.d)
                return True
        elif self.checksum:
            raise NotImplementedError("Checksum comparison")

        try:
            os.unlink(dest_file)
        except OSError as e:
            log.warn("Can't unlink %s" % dest_file, extra=self.d)
        else:
            log.info("Retransfer: %s" % src_file, extra=self.d)

        return False

    def append_fitem(self, fitem):
        if G.use_store:
            self.flist_buf.append(fitem)
            if len(self.flist_buf) == G.DB_BUFSIZE:
                self.flist.mput(self.flist_buf)
                del self.flist_buf[:]

        else:
            self.flist.append(fitem)

    def process(self):
        """ process a work unit, spath, dpath refers to
            source and destination respectively """

        fitem = self.circle.deq()
        spath = fitem.path
        if spath:
            try:
                st = os.lstat(spath)
            except OSError as e:
                log.warn(e, extra=self.d)
                self.skipped += 1
                return False

            fitem.st_mode, fitem.st_size, fitem.st_uid, fitem.st_gid = st.st_mode, st.st_size, st.st_uid, st.st_gid
            self.reduce_items += 1

            if os.path.islink(spath):
                self.append_fitem(fitem)
                self.sym_links += 1
                # if not self.follow_sym_links:
                # NOT TO FOLLOW SYM LINKS SHOULD BE THE DEFAULT
                return

            if stat.S_ISREG(st.st_mode):

                if not self.dest:
                    # fwalk without destination, simply add to process list
                    self.append_fitem(fitem)
                else:
                    # self.dest specified, need to check if it is there
                    dpath = destpath(fitem, self.dest)
                    flag = self.check_dest_exists(spath, dpath)
                    if flag:
                        return
                    else:
                        # if src and dest not the same
                        # including the case dest is not there
                        # then we do the following
                        self.append_fitem(fitem)
                        self.do_metadata_preserve(spath, dpath, st)
                self.cnt_files += 1
                self.cnt_filesize += fitem.st_size

            elif stat.S_ISDIR(st.st_mode):
                self.cnt_dirs += 1
                self.process_dir(fitem, st)
                # END OF if spath

    def tally(self, t):
        """ t is a tuple element of flist """
        if stat.S_ISDIR(t[1]):
            self.cnt_dirs += 1
        elif stat.S_ISREG(t[1]):
            self.cnt_files += 1
            self.cnt_filesize += t[2]

    def summarize(self):
        map(self.tally, self.flist)

    def reduce_init(self, buf):
        buf['cnt_files'] = self.cnt_files
        buf['cnt_dirs'] = self.cnt_dirs
        buf['cnt_filesize'] = self.cnt_filesize
        buf['reduce_items'] = self.reduce_items

    def reduce(self, buf1, buf2):
        buf1['cnt_dirs'] += buf2['cnt_dirs']
        buf1['cnt_files'] += buf2['cnt_files']
        buf1['cnt_filesize'] += buf2['cnt_filesize']
        buf1['reduce_items'] += buf2['reduce_items']
        return buf1

    def reduce_report(self, buf):
        # progress report
        # rate = (buf['cnt_files'] - self.last_cnt)/(MPI.Wtime() - self.last_reduce_time)
        # print("Processed objects: %s, estimated processing rate: %d/s" % (buf['cnt_files'], rate))
        # self.last_cnt = buf['cnt_files']

        rate = (buf['reduce_items'] - self.last_cnt) / (MPI.Wtime() - self.last_reduce_time)
        print("Processed objects: %s, estimated processing rate: %d/s" % (buf['reduce_items'], rate))
        self.last_cnt = buf['reduce_items']
        self.last_reduce_time = MPI.Wtime()

    def reduce_finish(self, buf):
        # get result of reduction
        pass

    def total_tally(self):
        global taskloads
        total_dirs = self.circle.comm.reduce(self.cnt_dirs, op=MPI.SUM)
        total_files = self.circle.comm.reduce(self.cnt_files, op=MPI.SUM)
        total_filesize = self.circle.comm.reduce(self.cnt_filesize, op=MPI.SUM)
        total_symlinks = self.circle.comm.reduce(self.sym_links, op=MPI.SUM)
        total_skipped = self.circle.comm.reduce(self.skipped, op=MPI.SUM)
        taskloads = self.circle.comm.gather(self.reduce_items)
        return total_dirs, total_files, total_filesize, total_symlinks, total_skipped

    def epilogue(self):
        total_dirs, total_files, total_filesize, total_symlinks, total_skipped = self.total_tally()
        self.time_ended = MPI.Wtime()

        if self.circle.rank == 0:
            print("\nFWALK Epilogue:\n")
            print("\t{:<20}{:<20}".format("Directory count:", total_dirs))
            print("\t{:<20}{:<20}".format("Sym Links count:", total_symlinks))
            print("\t{:<20}{:<20}".format("File count:", total_files))
            print("\t{:<20}{:<20}".format("Skipped count:", total_skipped))
            print("\t{:<20}{:<20}".format("Total file size:", bytes_fmt(total_filesize)))
            if total_files != 0:
                print("\t{:<20}{:<20}".format("Avg file size:", bytes_fmt(total_filesize/float(total_files))))
            print("\t{:<20}{:<20}".format("Tree talk time:", utils.conv_time(self.time_ended - self.time_started)))
            print("\tFWALK Loads: %s" % taskloads)
            print("")

        return total_filesize

    def cleanup(self):
        if G.use_store:
            self.flist.cleanup()
Ejemplo n.º 5
0
class FCP(BaseTask):
    def __init__(self, circle, src, dest,
                 treewalk=None,
                 totalsize=0,
                 hostcnt=0,
                 prune=False,
                 verify=False,
                 resume=False,
                 workq=None):
        BaseTask.__init__(self, circle)
        self.circle = circle
        self.treewalk = treewalk
        self.totalsize = totalsize
        self.prune = prune
        self.workq = workq
        self.resume = resume
        self.checkpoint_file = None
        self.checkpoint_db = None
        self.src = src
        self.dest = os.path.abspath(dest)

        self.rfd_cache = LRU(oflimit, callback = self.cb_close_fd)
        self.wfd_cache = LRU(oflimit, callback = self.cb_close_fd)

        self.cnt_filesize_prior = 0
        self.cnt_filesize = 0

        self.blocksize = 1024 * 1024
        self.chunksize = 1024 * 1024

        # debug
        self.d = {"rank": "rank %s" % circle.rank}
        self.wtime_started = MPI.Wtime()
        self.wtime_ended = None
        self.workcnt = 0  # this is the cnt for the enqued items
        self.reduce_items = 0  # this is the cnt for processed items
        if self.treewalk:
            log.debug("treewalk files = %s" % treewalk.flist, extra=self.d)

        # fini_check
        self.fini_cnt = Counter()

        # verify
        self.verify = verify
        self.use_store = False
        if self.verify:
            self.chunksums_mem = []
            self.chunksums_buf = []

        # checkpointing
        self.checkpoint_interval = sys.maxsize
        self.checkpoint_last = MPI.Wtime()

        if self.circle.rank == 0:
            print("Start copying process ...")

    def cb_close_fd(self, k, v):
        try:
            os.close(v)
        except:
            pass

    def set_fixed_chunksize(self, sz):
        self.chunksize = sz

    def set_adaptive_chunksize(self, totalsz):
        self.chunksize = utils.calc_chunksize(totalsz)
        if self.circle.rank == 0:
            print("Adaptive chunksize: %s" % bytes_fmt(self.chunksize))

    def cleanup(self):

        self.rfd_cache.clear()
        self.wfd_cache.clear()

        # remove checkpoint file
        if self.checkpoint_file and os.path.exists(self.checkpoint_file):
            os.remove(self.checkpoint_file)
        if self.checkpoint_db and os.path.exists(self.checkpoint_db):
            os.remove(self.checkpoint_db)

        # remove provided checkpoint file
        if G.resume and G.chk_file and os.path.exists(G.chk_file):
            os.remove(G.chk_file)
        if G.resume and G.chk_file_db and os.path.exists(G.chk_file_db):
            os.remove(G.chk_file_db)

        # remove chunksums file
        if self.verify:
            if hasattr(self, "chunksums_db"):
                self.chunksums_db.cleanup()

        # we need to do this because if last job didn't finish cleanly
        # the fwalk files can be found as leftovers
        # and if fcp cleanup has a chance, it should clean up that
        """
        fwalk = "%s/fwalk.%s" % (G.tempdir, self.circle.rank)
        if os.path.exists(fwalk):
            os.remove(fwalk)
        """

    def new_fchunk(self, fitem):
        fchunk = FileChunk()  # default cmd = copy
        fchunk.src = fitem.path
        fchunk.dest = destpath(fitem, self.dest)
        return fchunk

    def enq_file(self, fi):
        """ Process a single file, represented by "fi" - FileItem
        It involves chunking this file and equeue all chunks. """

        chunks = fi.st_size // self.chunksize
        remaining = fi.st_size % self.chunksize

        workcnt = 0

        if fi.st_size == 0:  # empty file
            fchunk = self.new_fchunk(fi)
            fchunk.offset = 0
            fchunk.length = 0
            self.enq(fchunk)
            workcnt += 1
        else:
            for i in range(chunks):
                fchunk = self.new_fchunk(fi)
                fchunk.offset = i * self.chunksize
                fchunk.length = self.chunksize
                self.enq(fchunk)
            workcnt += chunks

        if remaining > 0:
            # send remainder
            fchunk = self.new_fchunk(fi)
            fchunk.offset = chunks * self.chunksize
            fchunk.length = remaining
            self.enq(fchunk)
            workcnt += 1

        # save work cnt
        self.workcnt += workcnt

        log.debug("enq_file(): %s, size = %s, workcnt = %s" % (fi.path, fi.st_size, workcnt),
                     extra=self.d)

    def handle_fitem(self, fi):
        if os.path.islink(fi.path):
            dest = destpath(fi, self.dest)
            linkto = os.readlink(fi.path)
            try:
                os.symlink(linkto, dest)
            except Exception as e:
                log.debug("%s, skipping sym link %s" % (e, fi.path), extra=self.d)
        elif stat.S_ISREG(fi.st_mode):
            self.enq_file(fi)  # where chunking takes place

    def create(self):
        """ Each task has one create(), which is invoked by circle ONCE.
        For FCP, each task will handle_fitem() -> enq_file()
        to process each file gathered during the treewalk stage. """

        if not G.use_store and self.workq:  # restart
            self.setq(self.workq)
            return

        if self.resume:
            return

        # construct and enable all copy operations
        # we batch operation hard-coded
        log.info("create() starts, flist length = %s" % len(self.treewalk.flist),
                    extra=self.d)

        # flist in memory
        if len(self.treewalk.flist) > 0:
            for fi in self.treewalk.flist:
                self.handle_fitem(fi)

        # flist in buf
        if len(self.treewalk.flist_buf) > 0:
            for fi in self.treewalk.flist_buf:
                self.handle_fitem(fi)

        # flist in database
        if self.treewalk.use_store:
            while self.treewalk.flist_db.qsize > 0:
                fitems, _ = self.treewalk.flist_db.mget(G.DB_BUFSIZE)
                for fi in fitems:
                    self.handle_fitem(fi)
                self.treewalk.flist_db.mdel(G.DB_BUFSIZE)

        # both memory and databse checkpoint
        if self.checkpoint_file:
            self.do_no_interrupt_checkpoint()
            self.checkpoint_last = MPI.Wtime()

        # gather total_chunks
        self.circle.comm.barrier()
        G.total_chunks = self.circle.comm.allreduce(self.workcnt, op=MPI.SUM)
        #G.total_chunks = self.circle.comm.bcast(G.total_chunks)
        #print("Total chunks: ",G.total_chunks)


    def do_open2(self, k, d, flag):
        """ d is fd cache (either read or write)
        open path 'k' with 'flags' """
        if d.has_key(k):
            return d.get(k)

        fd = -1

        try:
            fd = os.open(k, flag)
        except OSError as e:
            if e.errno == 28:  # no space left
                log.error("Critical error: %s, exit!" % e, extra=self.d)
                self.circle.exit(0)  # should abort
            else:
                log.error("OSError({0}):{1}, skipping {2}".format(e.errno, e.strerror, k), extra=self.d)
        else:
            if fd > 0:
                d.set(k, fd)
        finally:
            return fd

    @staticmethod
    def do_mkdir(work):
        dest = work.dest
        if not os.path.exists(dest):
            os.makedirs(dest)

    def do_copy(self, work):
        src = work.src
        dest = work.dest

        basedir = os.path.dirname(dest)
        if not os.path.exists(basedir):
            os.makedirs(basedir)

        rfd = self.do_open2(src, self.rfd_cache, os.O_RDONLY)
        if rfd < 0:
            return False
        wfd = self.do_open2(dest, self.wfd_cache, os.O_WRONLY | os.O_CREAT)
        if wfd < 0:
            if args.force:
                try:
                    os.unlink(dest)
                except OSError as e:
                    log.error("Failed to unlink %s, %s " % (dest, e), extra=self.d)
                    return False
                else:
                    wfd = self.do_open2(dest, self.wfd_cache, os.O_WRONLY)
            else:
                log.error("Failed to create output file %s" % dest, extra=self.d)
                return False

        # do the actual copy
        self.write_bytes(rfd, wfd, work)

        # update tally
        self.cnt_filesize += work.length

        if G.verbosity > 2:
            log.debug("Transferred %s bytes from:\n\t [%s] to [%s]" %
                         (self.cnt_filesize, src, dest), extra=self.d)

        return True

    def do_no_interrupt_checkpoint(self):
        a = Thread(target=self.do_checkpoint)
        a.start()
        a.join()
        if G.verbosity > 0:
            print("Checkpoint: %s" % self.checkpoint_file)

    def do_checkpoint(self):
        tmp_file = self.checkpoint_file + ".part"
        with open(tmp_file, "wb") as f:
            self.circle.workq.extend(self.circle.workq_buf)
            self.circle.workq_buf.clear()
            cobj = Checkpoint(self.src, self.dest, self.get_workq(), self.totalsize)
            pickle.dump(cobj, f, pickle.HIGHEST_PROTOCOL)
        # POSIX requires rename to be atomic
        os.rename(tmp_file, self.checkpoint_file)

        # copy workq_db database file
        if hasattr(self.circle, "workq_db") and len(self.circle.workq_db) > 0:
            self.checkpoint_db = self.checkpoint_file + ".db"
            if not G.resume:
                shutil.copy2(self.circle.dbname, self.checkpoint_db)
            else:
                # in resume mode, make a copy of current workq db file, which is provided checkpoint db file
                self.workdir = os.getcwd()
                existingCheckpoint = os.path.join(self.workdir,".pcp_workq.%s.%s.db" % (G.rid, self.circle.rank))
                shutil.copy2(existingCheckpoint,self.checkpoint_db)

    def process(self):
        """
        The only work is "copy"
        TODO: clean up other actions such as mkdir/fini_check
        """
        if not G.use_store:
            curtime = MPI.Wtime()
            if curtime - self.checkpoint_last > self.checkpoint_interval:
                self.do_no_interrupt_checkpoint()
                log.info("Checkpointing done ...", extra=self.d)
                self.checkpoint_last = curtime

        work = self.deq()
        self.reduce_items += 1
        if isinstance(work, FileChunk):
            self.do_copy(work)
        else:
            log.warn("Unknown work object: %s" % work, extra=self.d)
            err_and_exit("Not a correct workq format")

    def reduce_init(self, buf):
        buf['cnt_filesize'] = self.cnt_filesize
        if sys.platform == 'darwin':
            buf['mem_snapshot'] = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        else:
            buf['mem_snapshot'] = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss * 1024

    def reduce(self, buf1, buf2):
        buf1['cnt_filesize'] += buf2['cnt_filesize']
        buf1['mem_snapshot'] += buf2['mem_snapshot']
        return buf1

    def reduce_report(self, buf):
        out = ""
        if self.totalsize != 0:
            out += "%.2f %% finished, " % (100 * float(buf['cnt_filesize']) // self.totalsize)

        out += "%s copied" % bytes_fmt(buf['cnt_filesize'])

        if self.circle.reduce_time_interval != 0:
            rate = float(buf['cnt_filesize'] - self.cnt_filesize_prior) // self.circle.reduce_time_interval
            self.cnt_filesize_prior = buf['cnt_filesize']
            out += ", estimated transfer rate: %s/s" % bytes_fmt(rate)

        out += ", memory usage: %s" % bytes_fmt(buf['mem_snapshot'])
        print(out)

    def reduce_finish(self, buf):
        # self.reduce_report(buf)
        pass

    def epilogue(self):
        global taskloads
        self.wtime_ended = MPI.Wtime()
        taskloads = self.circle.comm.gather(self.reduce_items)
        if self.circle.rank == 0:
            if self.totalsize == 0:
                print("\nZero filesize detected, done.\n")
                return
            tlapse = self.wtime_ended - self.wtime_started
            rate = float(self.totalsize) / tlapse
            print("\nFCP Epilogue:\n")
            print("\t{:<20}{:<20}".format("Ending at:", utils.current_time()))
            print("\t{:<20}{:<20}".format("Completed in:", utils.conv_time(tlapse)))
            print("\t{:<20}{:<20}".format("Transfer Rate:", "%s/s" % bytes_fmt(rate)))
            print("\t{:<20}{:<20}".format("Use store chunksums:", "%s" % self.use_store))
            print("\t{:<20}{:<20}".format("Use store workq:", "%s" % self.circle.use_store))
            print("\t{:<20}{:<20}".format("FCP Loads:", "%s" % taskloads))

    def read_then_write(self, rfd, wfd, work, num_of_bytes, m):
        """ core entry point for copy action: first read then write.

        @param num_of_bytes: the exact amount of bytes we will copy
        @return: False if unsuccessful.

        """
        buf = None
        try:
            buf = readn(rfd, num_of_bytes)
        except IOError:
            self.logger.error("Failed to read %s", work.src, extra=self.d)
            return False

        try:
            writen(wfd, buf)
        except IOError:
            self.logger.error("Failed to write %s", work.dest, extra=self.d)
            return False

        if m:
            m.update(buf)

        return True

    def write_bytes(self, rfd, wfd, work):
        os.lseek(rfd, work.offset, os.SEEK_SET)
        os.lseek(wfd, work.offset, os.SEEK_SET)

        m = None
        if self.verify:
            m = hashlib.sha1()

        remaining = work.length
        while remaining != 0:
            if remaining >= self.blocksize:
                self.read_then_write(rfd, wfd, work, self.blocksize, m)
                remaining -= self.blocksize
            else:
                self.read_then_write(rfd, wfd, work, remaining, m)
                remaining = 0

        if self.verify:
            # use src path here
            ck = ChunkSum(work.dest, offset=work.offset, length=work.length,
                          digest=m.hexdigest())

            if len(self.chunksums_mem) < G.memitem_threshold:
                self.chunksums_mem.append(ck)
            else:
                self.chunksums_buf.append(ck)
                if len(self.chunksums_buf) == G.DB_BUFSIZE:
                    if self.use_store == False:
                        self.workdir = os.getcwd()
                        self.chunksums_dbname = "%s/chunksums.%s" % (G.tempdir, self.circle.rank)
                        self.chunksums_db = DbStore(dbname=self.chunksums_dbname)
                        self.use_store = True
                    self.chunksums_db.mput(self.chunksums_buf)
                    del self.chunksums_buf[:]
Ejemplo n.º 6
0
class FWalk(BaseTask):
    def __init__(self, circle, src, dest=None, preserve=False, force=False):
        BaseTask.__init__(self, circle)

        self.d = {"rank": "rank %s" % circle.rank}
        self.circle = circle
        self.src = src
        self.dest = dest
        self.force = force
        self.use_store = False
        self.interval = 10  # progress report

        # For now, I am setting the option
        # TODO: should user allowed to meddle this?
        self.sizeonly = False
        self.checksum = False

        # to be fixed
        self.optlist = []  # files
        self.opt_dir_list = []  # dirs

        self.sym_links = 0
        self.follow_sym_links = False

        self.flist = []
        self.flist_buf = []

        # hold unlinkable dest directories
        # we have to do the --fix-opt at the end
        self.dest_dirs = []

        self.cnt_dirs = 0
        self.cnt_files = 0
        self.cnt_filesize = 0
        self.last_cnt = 0
        self.skipped = 0
        self.last_reduce_time = MPI.Wtime()

        # reduce
        self.reduce_items = 0

        self.time_started = MPI.Wtime()
        self.time_ended = None

    def create(self):
        if self.circle.rank == 0:
            for ele in self.src:
                self.circle.enq(ele)
            print("\nAnalyzing workload ...")

    def copy_xattr(self, src, dest):
        attrs = xattr.listxattr(src)
        for k in attrs:
            try:
                val = xattr.getxattr(src, k)
                xattr.setxattr(dest, k, val)
            except IOError as e:
                log.warn(e, extra=self.d)

    def flushdb(self):
        if len(self.flist_buf) != 0:
            self.flist_db.mput(self.flist_buf)

    def process_dir(self, fitem, st):
        """ i_dir should be absolute path
        st is the stat object associated with the directory
        """
        i_dir = fitem.path

        if self.dest:
            # we create destination directory
            # but first we check if we need to change mode for it to work
            o_dir = destpath(fitem, self.dest)
            mode = st.st_mode
            if not (st.st_mode & stat.S_IWUSR):
                mode = st.st_mode | stat.S_IWUSR
                self.opt_dir_list.append((o_dir, st))
            try:
                os.mkdir(o_dir, mode)
            except OSError as e:
                log.debug("mkdir(): %s" % e, extra=self.d)

            if G.preserve:
                self.copy_xattr(i_dir, o_dir)

        last_report = MPI.Wtime()
        count = 0
        try:
            entries = scandir(i_dir)
        except OSError as e:
            log.warn(e, extra=self.d)
            self.skipped += 1
        else:
            for entry in entries:
                elefi = FileItem(entry.path)
                if fitem.dirname:
                    elefi.dirname = fitem.dirname
                self.circle.enq(elefi)

                count += 1
                if (MPI.Wtime() - last_report) > self.interval:
                    print("Rank %s : Scanning [%s] at %s" %
                          (self.circle.rank, i_dir, count))
                    last_report = MPI.Wtime()
            log.info("Finish scan of [%s], count=%s" % (i_dir, count),
                     extra=self.d)

    def do_metadata_preserve(self, src_file, dest_file, st):
        """ create file node, copy attribute if needed."""
        if sys.platform == "darwin":  # Mac OS mknod() not permitted
            return

        try:
            mode = st.st_mode
            if not (st.st_mode & stat.S_IWUSR):
                # owner can't write, we will change mode first
                # then put it in optlist to fix
                mode = st.st_mode | stat.S_IWUSR
                self.optlist.append((dest_file, st))
            os.mknod(dest_file, mode)  # -r-r-r special
        except OSError as e:
            log.warn("mknod(): for %s, %s" % (dest_file, e), extra=self.d)
            return

        if G.preserve:
            self.copy_xattr(src_file, dest_file)

    def check_dest_exists(self, src_file, dest_file):
        """ return True if dest exists and checksum verified correct
            return False if (1) no overwrite (2) destination doesn't exist
        """
        if not self.force:
            return False

        if not os.path.exists(dest_file):
            return False

        # well, destination exists, now we have to check
        if self.sizeonly:
            if os.path.getsize(src_file) == os.path.getsize(dest_file):
                log.warn("Check sizeonly Okay: src: %s, dest=%s" %
                         (src_file, dest_file),
                         extra=self.d)
                return True
        elif self.checksum:
            raise NotImplementedError("Checksum comparison")

        try:
            os.unlink(dest_file)
        except OSError as e:
            log.warn("Can't unlink %s" % dest_file, extra=self.d)
        else:
            log.info("Retransfer: %s" % src_file, extra=self.d)

        return False

    def append_fitem(self, fitem):
        """
        if G.use_store:
            self.flist_buf.append(fitem)
            if len(self.flist_buf) == G.DB_BUFSIZE:
                self.flist.mput(self.flist_buf)
                del self.flist_buf[:]

        else:
            self.flist.append(fitem)
        """
        if len(self.flist) < G.memitem_threshold:
            self.flist.append(fitem)
        else:
            self.flist_buf.append(fitem)
            if len(self.flist_buf) == G.DB_BUFSIZE:
                if self.use_store == False:
                    self.dbname = "%s/fwalk.%s" % (G.tempdir, self.circle.rank)
                    self.flist_db = DbStore(self.dbname)
                    self.use_store = True
                self.flist_db.mput(self.flist_buf)
                del self.flist_buf[:]

    def process(self):
        """ process a work unit, spath, dpath refers to
            source and destination respectively """

        fitem = self.circle.deq()
        spath = fitem.path
        if spath:
            try:
                st = os.lstat(spath)
            except OSError as e:
                log.warn(e, extra=self.d)
                self.skipped += 1
                return False

            fitem.st_mode, fitem.st_size, fitem.st_uid, fitem.st_gid = st.st_mode, st.st_size, st.st_uid, st.st_gid
            self.reduce_items += 1

            if os.path.islink(spath):
                self.append_fitem(fitem)
                self.sym_links += 1
                # if not self.follow_sym_links:
                # NOT TO FOLLOW SYM LINKS SHOULD BE THE DEFAULT
                return

            if stat.S_ISREG(st.st_mode):

                if not self.dest:
                    # fwalk without destination, simply add to process list
                    self.append_fitem(fitem)
                else:
                    # self.dest specified, need to check if it is there
                    dpath = destpath(fitem, self.dest)
                    flag = self.check_dest_exists(spath, dpath)
                    if flag:
                        return
                    else:
                        # if src and dest not the same
                        # including the case dest is not there
                        # then we do the following
                        self.append_fitem(fitem)
                        self.do_metadata_preserve(spath, dpath, st)
                self.cnt_files += 1
                self.cnt_filesize += fitem.st_size

            elif stat.S_ISDIR(st.st_mode):
                self.cnt_dirs += 1
                self.process_dir(fitem, st)
                # END OF if spath

    def tally(self, t):
        """ t is a tuple element of flist """
        if stat.S_ISDIR(t[1]):
            self.cnt_dirs += 1
        elif stat.S_ISREG(t[1]):
            self.cnt_files += 1
            self.cnt_filesize += t[2]

    def summarize(self):
        map(self.tally, self.flist)

    def reduce_init(self, buf):
        buf['cnt_files'] = self.cnt_files
        buf['cnt_dirs'] = self.cnt_dirs
        buf['cnt_filesize'] = self.cnt_filesize
        buf['reduce_items'] = self.reduce_items

    def reduce(self, buf1, buf2):
        buf1['cnt_dirs'] += buf2['cnt_dirs']
        buf1['cnt_files'] += buf2['cnt_files']
        buf1['cnt_filesize'] += buf2['cnt_filesize']
        buf1['reduce_items'] += buf2['reduce_items']
        return buf1

    def reduce_report(self, buf):
        # progress report
        # rate = (buf['cnt_files'] - self.last_cnt)/(MPI.Wtime() - self.last_reduce_time)
        # print("Processed objects: %s, estimated processing rate: %d/s" % (buf['cnt_files'], rate))
        # self.last_cnt = buf['cnt_files']

        rate = (buf['reduce_items'] - self.last_cnt) / (MPI.Wtime() -
                                                        self.last_reduce_time)
        print("Processed objects: %s, estimated processing rate: %d/s" %
              (buf['reduce_items'], rate))
        self.last_cnt = buf['reduce_items']
        self.last_reduce_time = MPI.Wtime()

    def reduce_finish(self, buf):
        # get result of reduction
        pass

    def total_tally(self):
        global taskloads
        T.total_dirs = self.circle.comm.allreduce(self.cnt_dirs, op=MPI.SUM)
        T.total_files = self.circle.comm.allreduce(self.cnt_files, op=MPI.SUM)
        T.total_filesize = self.circle.comm.allreduce(self.cnt_filesize,
                                                      op=MPI.SUM)
        T.total_symlinks = self.circle.comm.allreduce(self.sym_links,
                                                      op=MPI.SUM)
        T.total_skipped = self.circle.comm.allreduce(self.skipped, op=MPI.SUM)
        taskloads = self.circle.comm.gather(self.reduce_items)

    def epilogue(self):
        self.total_tally()
        self.time_ended = MPI.Wtime()

        if self.circle.rank == 0:
            print("\nFWALK Epilogue:\n")
            print("\t{:<20}{:<20}".format("Directory count:", T.total_dirs))
            print("\t{:<20}{:<20}".format("Sym Links count:",
                                          T.total_symlinks))
            print("\t{:<20}{:<20}".format("File count:", T.total_files))
            print("\t{:<20}{:<20}".format("Skipped count:", T.total_skipped))
            print("\t{:<20}{:<20}".format("Total file size:",
                                          bytes_fmt(T.total_filesize)))
            if T.total_files != 0:
                print("\t{:<20}{:<20}".format(
                    "Avg file size:",
                    bytes_fmt(T.total_filesize / float(T.total_files))))
            print("\t{:<20}{:<20}".format(
                "Tree talk time:",
                utils.conv_time(self.time_ended - self.time_started)))
            print("\t{:<20}{:<20}".format("Use store flist:",
                                          "%s" % self.use_store))
            print("\t{:<20}{:<20}".format("Use store workq:",
                                          "%s" % self.circle.use_store))
            print("\tFWALK Loads: %s" % taskloads)
            print("")

    def cleanup(self):
        if hasattr(self, "flist_db"):
            self.flist_db.cleanup()