Example #1
    def write_bytes(self, rfd, wfd, work):
        os.lseek(rfd, work.offset, os.SEEK_SET)
        os.lseek(wfd, work.offset, os.SEEK_SET)

        m = None
        if self.verify:
            m = hashlib.sha1()

        remaining = work.length
        while remaining != 0:
            if remaining >= self.blocksize:
                self.read_then_write(rfd, wfd, work, self.blocksize, m)
                remaining -= self.blocksize
            else:
                self.read_then_write(rfd, wfd, work, remaining, m)
                remaining = 0

        if self.verify:
            # the chunk checksum is recorded against the destination path
            ck = ChunkSum(work.dest, offset=work.offset, length=work.length,
                          digest=m.hexdigest())

            if len(self.chunksums_mem) < G.memitem_threshold:
                self.chunksums_mem.append(ck)
            else:
                self.chunksums_buf.append(ck)
                if len(self.chunksums_buf) == G.DB_BUFSIZE:
                    if not self.use_store:
                        self.workdir = os.getcwd()
                        self.chunksums_dbname = "%s/chunksums.%s" % (G.tempdir, self.circle.rank)
                        self.chunksums_db = DbStore(dbname=self.chunksums_dbname)
                        self.use_store = True
                    self.chunksums_db.mput(self.chunksums_buf)
                    del self.chunksums_buf[:]
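The readn()/writen() helpers used by read_then_write() are not shown in this
example. A minimal sketch of their assumed contract: loop until the full byte
count has been transferred, since a single os.read()/os.write() may move fewer
bytes than requested.

import os

def readn(fd, count):
    """Read exactly `count` bytes from fd; raise IOError on premature EOF."""
    parts = []
    remaining = count
    while remaining > 0:
        buf = os.read(fd, remaining)
        if not buf:  # EOF before we got everything
            raise IOError("short read: %d bytes missing" % remaining)
        parts.append(buf)
        remaining -= len(buf)
    return b"".join(parts)

def writen(fd, buf):
    """Write all of `buf` to fd, retrying after partial writes."""
    view = memoryview(buf)
    while len(view) > 0:
        n = os.write(fd, view)
        view = view[n:]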
Example #2
    def append_fitem(self, fitem):
        """
        if G.use_store:
            self.flist_buf.append(fitem)
            if len(self.flist_buf) == G.DB_BUFSIZE:
                self.flist.mput(self.flist_buf)
                del self.flist_buf[:]

        else:
            self.flist.append(fitem)
        """
        if len(self.flist) < G.memitem_threshold:
            self.flist.append(fitem)
        else:
            self.flist_buf.append(fitem)
            if len(self.flist_buf) == G.DB_BUFSIZE:
                if not self.use_store:
                    self.dbname = "%s/fwalk.%s" % (G.tempdir, self.circle.rank)
                    self.flist_db = DbStore(self.dbname)
                    self.use_store = True
                self.flist_db.mput(self.flist_buf)
                del self.flist_buf[:]
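The pattern above is a three-tier overflow: the first G.memitem_threshold
items stay in a plain in-memory list, later items accumulate in flist_buf,
and every G.DB_BUFSIZE of those is flushed to an on-disk DbStore that is
created lazily on the first flush. A hypothetical trace with small limits,
assuming memitem_threshold = 3 and DB_BUFSIZE = 2:

# item 1..3 -> flist          (in-memory list is now at the threshold)
# item 4    -> flist_buf      (overflow buffer, len 1)
# item 5    -> flist_buf      (len == DB_BUFSIZE: DbStore is created on this
#                              first flush, both items mput(), buffer cleared)
# item 6..7 -> flist_buf, then flushed to the existing DbStore the same way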
Example #3
    def __init__(self, circle, src, dest=None, preserve=False, force=False):
        BaseTask.__init__(self, circle)

        self.d = {"rank": "rank %s" % circle.rank}
        self.circle = circle
        self.src = src
        self.dest = dest
        self.force = force
        self.interval = 10  # progress report

        # For now, these options are hard-coded
        # TODO: should the user be allowed to override them?
        self.sizeonly = False
        self.checksum = False

        # entries whose permissions must be restored at the end (--fix-opt)
        self.optlist = []  # files
        self.opt_dir_list = []  # dirs

        self.sym_links = 0
        self.follow_sym_links = False

        self.workdir = os.getcwd()
        self.tempdir = os.path.join(self.workdir, ".pcircle")
        if not os.path.exists(self.tempdir):
            os.mkdir(self.tempdir)

        if G.use_store:
            self.dbname = "%s/fwalk.%s" % (self.tempdir, circle.rank)
            self.flist = DbStore(self.dbname)
            self.flist_buf = []
        else:
            self.flist = []
        self.src_flist = self.flist

        # hold unlinkable dest directories
        # we have to do the --fix-opt at the end
        self.dest_dirs = []

        self.cnt_dirs = 0
        self.cnt_files = 0
        self.cnt_filesize = 0
        self.last_cnt = 0
        self.skipped = 0
        self.last_reduce_time = MPI.Wtime()

        # reduce
        self.reduce_items = 0

        self.time_started = MPI.Wtime()
        self.time_ended = None
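DbStore itself is not shown in these examples. From its call sites (mput,
mget, mdel, the qsize attribute, cleanup) an approximate interface can be
inferred; the class below is an illustrative stand-in based only on those
calls, backed by a plain list where the real store is disk-backed:

class DbStoreSketch(object):
    """Hypothetical stand-in inferred from call sites in these examples."""

    def __init__(self, dbname):
        self.dbname = dbname  # path of the backing database file
        self._items = []

    @property
    def qsize(self):
        # read as an attribute: `while flist_db.qsize > 0`
        return len(self._items)

    def mput(self, items):
        # bulk insert, used to flush flist_buf / chunksums_buf
        self._items.extend(items)

    def mget(self, n):
        # fetch up to n items; callers unpack `fitems, _ = db.mget(n)`
        return self._items[:n], None

    def mdel(self, n):
        # drop the n items just consumed
        del self._items[:n]

    def cleanup(self):
        # remove the backing file(s); a no-op for this in-memory stand-in
        self._items = []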
Example #4
class FWalk(BaseTask):

    def __init__(self, circle, src, dest=None, preserve=False, force=False):
        BaseTask.__init__(self, circle)

        self.d = {"rank": "rank %s" % circle.rank}
        self.circle = circle
        self.src = src
        self.dest = dest
        self.force = force
        self.interval = 10  # progress report

        # For now, these options are hard-coded
        # TODO: should the user be allowed to override them?
        self.sizeonly = False
        self.checksum = False

        # entries whose permissions must be restored at the end (--fix-opt)
        self.optlist = []  # files
        self.opt_dir_list = []  # dirs

        self.sym_links = 0
        self.follow_sym_links = False

        self.workdir = os.getcwd()
        self.tempdir = os.path.join(self.workdir, ".pcircle")
        if not os.path.exists(self.tempdir):
            os.mkdir(self.tempdir)

        if G.use_store:
            self.dbname = "%s/fwalk.%s" % (self.tempdir, circle.rank)
            self.flist = DbStore(self.dbname)
            self.flist_buf = []
        else:
            self.flist = []
        self.src_flist = self.flist

        # hold unlinkable dest directories
        # we have to do the --fix-opt at the end
        self.dest_dirs = []

        self.cnt_dirs = 0
        self.cnt_files = 0
        self.cnt_filesize = 0
        self.last_cnt = 0
        self.skipped = 0
        self.last_reduce_time = MPI.Wtime()

        # reduce
        self.reduce_items = 0

        self.time_started = MPI.Wtime()
        self.time_ended = None

    def create(self):
        if self.circle.rank == 0:
            for ele in self.src:
                self.circle.enq(ele)
            print("\nAnalyzing workload ...")

    def copy_xattr(self, src, dest):
        attrs = xattr.listxattr(src)
        for k in attrs:
            try:
                val = xattr.getxattr(src, k)
                xattr.setxattr(dest, k, val)
            except IOError as e:
                log.warn(e, extra=self.d)

    def flushdb(self):
        if len(self.flist_buf) != 0:
            self.flist.mput(self.flist_buf)

    def process_dir(self, fitem, st):
        """ i_dir should be absolute path
        st is the stat object associated with the directory
        """
        i_dir = fitem.path

        if self.dest:
            # we create destination directory
            # but first we check if we need to change mode for it to work
            o_dir = destpath(fitem, self.dest)
            mode = st.st_mode
            if not (st.st_mode & stat.S_IWUSR):
                mode = st.st_mode | stat.S_IWUSR
                self.opt_dir_list.append((o_dir, st))
            try:
                os.mkdir(o_dir, mode)
            except OSError as e:
                log.debug("mkdir(): %s" % e, extra=self.d)

            if G.preserve:
                self.copy_xattr(i_dir, o_dir)

        last_report = MPI.Wtime()
        count = 0
        try:
            entries = scandir(i_dir)
        except OSError as e:
            log.warn(e, extra=self.d)
            self.skipped += 1
        else:
            for entry in entries:
                elefi = FileItem(entry.path)
                if fitem.dirname:
                    elefi.dirname = fitem.dirname
                self.circle.enq(elefi)

                count += 1
                if (MPI.Wtime() - last_report) > self.interval:
                    print("Rank %s : Scanning [%s] at %s" % (self.circle.rank, i_dir, count))
                    last_report = MPI.Wtime()
            log.info("Finish scan of [%s], count=%s" % (i_dir, count), extra=self.d)

    def do_metadata_preserve(self, src_file, dest_file, st):
        """ create file node, copy attribute if needed."""
        if sys.platform == "darwin":  # Mac OS mknod() not permitted
            return

        try:
            mode = st.st_mode
            if not (st.st_mode & stat.S_IWUSR):
                # owner can't write, we will change mode first
                # then put it in optlist to fix
                mode = st.st_mode | stat.S_IWUSR
                self.optlist.append((dest_file, st))
            os.mknod(dest_file, mode)  # -r-r-r special
        except OSError as e:
            log.warn("mknod(): for %s, %s" % (dest_file, e), extra=self.d)
            return

        if G.preserve:
            self.copy_xattr(src_file, dest_file)

    def check_dest_exists(self, src_file, dest_file):
        """ return True if dest exists and checksum verified correct
            return False if (1) no overwrite (2) destination doesn't exist
        """
        if not self.force:
            return False

        if not os.path.exists(dest_file):
            return False

        # well, destination exists, now we have to check
        if self.sizeonly:
            if os.path.getsize(src_file) == os.path.getsize(dest_file):
                log.warn("Check sizeonly Okay: src: %s, dest=%s" % (src_file, dest_file),
                                 extra=self.d)
                return True
        elif self.checksum:
            raise NotImplementedError("Checksum comparison")

        try:
            os.unlink(dest_file)
        except OSError as e:
            log.warn("Can't unlink %s" % dest_file, extra=self.d)
        else:
            log.info("Retransfer: %s" % src_file, extra=self.d)

        return False

    def append_fitem(self, fitem):
        if G.use_store:
            self.flist_buf.append(fitem)
            if len(self.flist_buf) == G.DB_BUFSIZE:
                self.flist.mput(self.flist_buf)
                del self.flist_buf[:]

        else:
            self.flist.append(fitem)

    def process(self):
        """ process a work unit, spath, dpath refers to
            source and destination respectively """

        fitem = self.circle.deq()
        spath = fitem.path
        if spath:
            try:
                st = os.lstat(spath)
            except OSError as e:
                log.warn(e, extra=self.d)
                self.skipped += 1
                return False

            fitem.st_mode, fitem.st_size, fitem.st_uid, fitem.st_gid = \
                st.st_mode, st.st_size, st.st_uid, st.st_gid
            self.reduce_items += 1

            if os.path.islink(spath):
                self.append_fitem(fitem)
                self.sym_links += 1
                # not following symlinks is the default behavior
                return

            if stat.S_ISREG(st.st_mode):

                if not self.dest:
                    # fwalk without destination, simply add to process list
                    self.append_fitem(fitem)
                else:
                    # self.dest specified, need to check if it is there
                    dpath = destpath(fitem, self.dest)
                    flag = self.check_dest_exists(spath, dpath)
                    if flag:
                        return
                    else:
                        # if src and dest not the same
                        # including the case dest is not there
                        # then we do the following
                        self.append_fitem(fitem)
                        self.do_metadata_preserve(spath, dpath, st)
                self.cnt_files += 1
                self.cnt_filesize += fitem.st_size

            elif stat.S_ISDIR(st.st_mode):
                self.cnt_dirs += 1
                self.process_dir(fitem, st)
                # END OF if spath

    def tally(self, t):
        """ t is a tuple element of flist """
        if stat.S_ISDIR(t[1]):
            self.cnt_dirs += 1
        elif stat.S_ISREG(t[1]):
            self.cnt_files += 1
            self.cnt_filesize += t[2]

    def summarize(self):
        # map() is lazy in Python 3, so iterate explicitly to run tally()
        for t in self.flist:
            self.tally(t)

    def reduce_init(self, buf):
        buf['cnt_files'] = self.cnt_files
        buf['cnt_dirs'] = self.cnt_dirs
        buf['cnt_filesize'] = self.cnt_filesize
        buf['reduce_items'] = self.reduce_items

    def reduce(self, buf1, buf2):
        buf1['cnt_dirs'] += buf2['cnt_dirs']
        buf1['cnt_files'] += buf2['cnt_files']
        buf1['cnt_filesize'] += buf2['cnt_filesize']
        buf1['reduce_items'] += buf2['reduce_items']
        return buf1

    def reduce_report(self, buf):
        # progress report
        # rate = (buf['cnt_files'] - self.last_cnt)/(MPI.Wtime() - self.last_reduce_time)
        # print("Processed objects: %s, estimated processing rate: %d/s" % (buf['cnt_files'], rate))
        # self.last_cnt = buf['cnt_files']

        rate = (buf['reduce_items'] - self.last_cnt) / (MPI.Wtime() - self.last_reduce_time)
        print("Processed objects: %s, estimated processing rate: %d/s" % (buf['reduce_items'], rate))
        self.last_cnt = buf['reduce_items']
        self.last_reduce_time = MPI.Wtime()

    def reduce_finish(self, buf):
        # get result of reduction
        pass

    def total_tally(self):
        global taskloads
        total_dirs = self.circle.comm.reduce(self.cnt_dirs, op=MPI.SUM)
        total_files = self.circle.comm.reduce(self.cnt_files, op=MPI.SUM)
        total_filesize = self.circle.comm.reduce(self.cnt_filesize, op=MPI.SUM)
        total_symlinks = self.circle.comm.reduce(self.sym_links, op=MPI.SUM)
        total_skipped = self.circle.comm.reduce(self.skipped, op=MPI.SUM)
        taskloads = self.circle.comm.gather(self.reduce_items)
        return total_dirs, total_files, total_filesize, total_symlinks, total_skipped

    def epilogue(self):
        total_dirs, total_files, total_filesize, total_symlinks, total_skipped = self.total_tally()
        self.time_ended = MPI.Wtime()

        if self.circle.rank == 0:
            print("\nFWALK Epilogue:\n")
            print("\t{:<20}{:<20}".format("Directory count:", total_dirs))
            print("\t{:<20}{:<20}".format("Sym Links count:", total_symlinks))
            print("\t{:<20}{:<20}".format("File count:", total_files))
            print("\t{:<20}{:<20}".format("Skipped count:", total_skipped))
            print("\t{:<20}{:<20}".format("Total file size:", bytes_fmt(total_filesize)))
            if total_files != 0:
                print("\t{:<20}{:<20}".format("Avg file size:", bytes_fmt(total_filesize/float(total_files))))
            print("\t{:<20}{:<20}".format("Tree talk time:", utils.conv_time(self.time_ended - self.time_started)))
            print("\tFWALK Loads: %s" % taskloads)
            print("")

        return total_filesize

    def cleanup(self):
        if G.use_store:
            self.flist.cleanup()
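destpath(fitem, dest) is used throughout but not defined in these examples.
It evidently rebases a source path under the destination root; the sketch
below is a hypothetical version of that mapping, assuming fitem.dirname
carries the root of the original walk:

import os

def destpath_sketch(fitem, dest):
    """Hypothetical rebasing helper: take fitem.path relative to the walk
    root and graft the remainder onto the destination root."""
    root = fitem.dirname or os.path.dirname(fitem.path)
    rel = os.path.relpath(fitem.path, root)
    return os.path.join(dest, rel)

# e.g. path=/src/a/b.txt, dirname=/src, dest=/backup -> /backup/a/b.txt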
Example #5
class FCP(BaseTask):
    def __init__(self, circle, src, dest,
                 treewalk=None,
                 totalsize=0,
                 hostcnt=0,
                 prune=False,
                 verify=False,
                 resume=False,
                 workq=None):
        BaseTask.__init__(self, circle)
        self.circle = circle
        self.treewalk = treewalk
        self.totalsize = totalsize
        self.prune = prune
        self.workq = workq
        self.resume = resume
        self.checkpoint_file = None
        self.checkpoint_db = None
        self.src = src
        self.dest = os.path.abspath(dest)

        # oflimit (the max number of cached open file descriptors) is assumed
        # to be defined at module scope; evicted entries are closed in cb_close_fd()
        self.rfd_cache = LRU(oflimit, callback=self.cb_close_fd)
        self.wfd_cache = LRU(oflimit, callback=self.cb_close_fd)

        self.cnt_filesize_prior = 0
        self.cnt_filesize = 0

        self.blocksize = 1024 * 1024
        self.chunksize = 1024 * 1024

        # debug
        self.d = {"rank": "rank %s" % circle.rank}
        self.wtime_started = MPI.Wtime()
        self.wtime_ended = None
        self.workcnt = 0  # count of enqueued work items
        self.reduce_items = 0  # count of processed work items
        if self.treewalk:
            log.debug("treewalk files = %s" % treewalk.flist, extra=self.d)

        # fini_check
        self.fini_cnt = Counter()

        # verify
        self.verify = verify
        self.use_store = False
        if self.verify:
            self.chunksums_mem = []
            self.chunksums_buf = []

        # checkpointing
        self.checkpoint_interval = sys.maxsize
        self.checkpoint_last = MPI.Wtime()

        if self.circle.rank == 0:
            print("Start copying process ...")

    def cb_close_fd(self, k, v):
        try:
            os.close(v)
        except OSError:
            pass

    def set_fixed_chunksize(self, sz):
        self.chunksize = sz

    def set_adaptive_chunksize(self, totalsz):
        self.chunksize = utils.calc_chunksize(totalsz)
        if self.circle.rank == 0:
            print("Adaptive chunksize: %s" % bytes_fmt(self.chunksize))

    def cleanup(self):

        self.rfd_cache.clear()
        self.wfd_cache.clear()

        # remove checkpoint file
        if self.checkpoint_file and os.path.exists(self.checkpoint_file):
            os.remove(self.checkpoint_file)
        if self.checkpoint_db and os.path.exists(self.checkpoint_db):
            os.remove(self.checkpoint_db)

        # remove provided checkpoint file
        if G.resume and G.chk_file and os.path.exists(G.chk_file):
            os.remove(G.chk_file)
        if G.resume and G.chk_file_db and os.path.exists(G.chk_file_db):
            os.remove(G.chk_file_db)

        # remove chunksums file
        if self.verify:
            if hasattr(self, "chunksums_db"):
                self.chunksums_db.cleanup()

        # if the previous job didn't finish cleanly, leftover fwalk files may
        # remain; fcp's cleanup could remove them, but this is currently
        # disabled:
        """
        fwalk = "%s/fwalk.%s" % (G.tempdir, self.circle.rank)
        if os.path.exists(fwalk):
            os.remove(fwalk)
        """

    def new_fchunk(self, fitem):
        fchunk = FileChunk()  # default cmd = copy
        fchunk.src = fitem.path
        fchunk.dest = destpath(fitem, self.dest)
        return fchunk

    def enq_file(self, fi):
        """ Process a single file, represented by "fi" - FileItem
        It involves chunking this file and equeue all chunks. """

        chunks = fi.st_size // self.chunksize
        remaining = fi.st_size % self.chunksize

        workcnt = 0

        if fi.st_size == 0:  # empty file
            fchunk = self.new_fchunk(fi)
            fchunk.offset = 0
            fchunk.length = 0
            self.enq(fchunk)
            workcnt += 1
        else:
            for i in range(chunks):
                fchunk = self.new_fchunk(fi)
                fchunk.offset = i * self.chunksize
                fchunk.length = self.chunksize
                self.enq(fchunk)
            workcnt += chunks

        if remaining > 0:
            # send remainder
            fchunk = self.new_fchunk(fi)
            fchunk.offset = chunks * self.chunksize
            fchunk.length = remaining
            self.enq(fchunk)
            workcnt += 1

        # save work cnt
        self.workcnt += workcnt

        log.debug("enq_file(): %s, size = %s, workcnt = %s" % (fi.path, fi.st_size, workcnt),
                     extra=self.d)

    def handle_fitem(self, fi):
        if os.path.islink(fi.path):
            dest = destpath(fi, self.dest)
            linkto = os.readlink(fi.path)
            try:
                os.symlink(linkto, dest)
            except Exception as e:
                log.debug("%s, skipping sym link %s" % (e, fi.path), extra=self.d)
        elif stat.S_ISREG(fi.st_mode):
            self.enq_file(fi)  # where chunking takes place

    def create(self):
        """ Each task has one create(), which is invoked by circle ONCE.
        For FCP, each task will handle_fitem() -> enq_file()
        to process each file gathered during the treewalk stage. """

        if not G.use_store and self.workq:  # restart
            self.setq(self.workq)
            return

        if self.resume:
            return

        # construct and enqueue all copy operations
        # (batching is hard-coded)
        log.info("create() starts, flist length = %s" % len(self.treewalk.flist),
                    extra=self.d)

        # flist in memory
        if len(self.treewalk.flist) > 0:
            for fi in self.treewalk.flist:
                self.handle_fitem(fi)

        # flist in buf
        if len(self.treewalk.flist_buf) > 0:
            for fi in self.treewalk.flist_buf:
                self.handle_fitem(fi)

        # flist in database
        if self.treewalk.use_store:
            while self.treewalk.flist_db.qsize > 0:
                fitems, _ = self.treewalk.flist_db.mget(G.DB_BUFSIZE)
                for fi in fitems:
                    self.handle_fitem(fi)
                self.treewalk.flist_db.mdel(G.DB_BUFSIZE)

        # checkpoint both memory and database state
        if self.checkpoint_file:
            self.do_no_interrupt_checkpoint()
            self.checkpoint_last = MPI.Wtime()

        # gather total_chunks
        self.circle.comm.barrier()
        G.total_chunks = self.circle.comm.allreduce(self.workcnt, op=MPI.SUM)
        #G.total_chunks = self.circle.comm.bcast(G.total_chunks)
        #print("Total chunks: ",G.total_chunks)


    def do_open2(self, k, d, flag):
        """ d is an fd cache (either read or write);
        open path 'k' with 'flag' and cache the resulting descriptor """
        if d.has_key(k):
            return d.get(k)

        fd = -1

        try:
            fd = os.open(k, flag)
        except OSError as e:
            if e.errno == 28:  # no space left
                log.error("Critical error: %s, exit!" % e, extra=self.d)
                self.circle.exit(0)  # should abort
            else:
                log.error("OSError({0}):{1}, skipping {2}".format(e.errno, e.strerror, k), extra=self.d)
        else:
            if fd > 0:
                d.set(k, fd)
        # (do not return from a finally block: it would swallow exceptions)
        return fd

    @staticmethod
    def do_mkdir(work):
        dest = work.dest
        if not os.path.exists(dest):
            os.makedirs(dest)

    def do_copy(self, work):
        src = work.src
        dest = work.dest

        basedir = os.path.dirname(dest)
        if not os.path.exists(basedir):
            os.makedirs(basedir)

        rfd = self.do_open2(src, self.rfd_cache, os.O_RDONLY)
        if rfd < 0:
            return False
        wfd = self.do_open2(dest, self.wfd_cache, os.O_WRONLY | os.O_CREAT)
        if wfd < 0:
            if args.force:
                try:
                    os.unlink(dest)
                except OSError as e:
                    log.error("Failed to unlink %s, %s " % (dest, e), extra=self.d)
                    return False
                else:
                    wfd = self.do_open2(dest, self.wfd_cache, os.O_WRONLY)
            else:
                log.error("Failed to create output file %s" % dest, extra=self.d)
                return False

        # do the actual copy
        self.write_bytes(rfd, wfd, work)

        # update tally
        self.cnt_filesize += work.length

        if G.verbosity > 2:
            log.debug("Transferred %s bytes from:\n\t [%s] to [%s]" %
                         (self.cnt_filesize, src, dest), extra=self.d)

        return True

    def do_no_interrupt_checkpoint(self):
        a = Thread(target=self.do_checkpoint)
        a.start()
        a.join()
        if G.verbosity > 0:
            print("Checkpoint: %s" % self.checkpoint_file)

    def do_checkpoint(self):
        tmp_file = self.checkpoint_file + ".part"
        with open(tmp_file, "wb") as f:
            self.circle.workq.extend(self.circle.workq_buf)
            self.circle.workq_buf.clear()
            cobj = Checkpoint(self.src, self.dest, self.get_workq(), self.totalsize)
            pickle.dump(cobj, f, pickle.HIGHEST_PROTOCOL)
        # POSIX requires rename to be atomic
        os.rename(tmp_file, self.checkpoint_file)

        # copy workq_db database file
        if hasattr(self.circle, "workq_db") and len(self.circle.workq_db) > 0:
            self.checkpoint_db = self.checkpoint_file + ".db"
            if not G.resume:
                shutil.copy2(self.circle.dbname, self.checkpoint_db)
            else:
                # in resume mode, snapshot the current workq db file, i.e.
                # the checkpoint db file that was provided
                self.workdir = os.getcwd()
                existing_checkpoint = os.path.join(
                    self.workdir, ".pcp_workq.%s.%s.db" % (G.rid, self.circle.rank))
                shutil.copy2(existing_checkpoint, self.checkpoint_db)
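
    # Restoring is assumed to be the inverse (attribute names hypothetical):
    #   with open(self.checkpoint_file, "rb") as f:
    #       cobj = pickle.load(f)  # the Checkpoint(src, dest, workq, totalsize)
    #   followed by re-seeding the queue on restart, e.g. self.setq(cobj.workq).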

    def process(self):
        """
        The only work is "copy"
        TODO: clean up other actions such as mkdir/fini_check
        """
        if not G.use_store:
            curtime = MPI.Wtime()
            if curtime - self.checkpoint_last > self.checkpoint_interval:
                self.do_no_interrupt_checkpoint()
                log.info("Checkpointing done ...", extra=self.d)
                self.checkpoint_last = curtime

        work = self.deq()
        self.reduce_items += 1
        if isinstance(work, FileChunk):
            self.do_copy(work)
        else:
            log.warn("Unknown work object: %s" % work, extra=self.d)
            err_and_exit("Not a correct workq format")

    def reduce_init(self, buf):
        buf['cnt_filesize'] = self.cnt_filesize
        if sys.platform == 'darwin':
            buf['mem_snapshot'] = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        else:
            buf['mem_snapshot'] = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss * 1024

    def reduce(self, buf1, buf2):
        buf1['cnt_filesize'] += buf2['cnt_filesize']
        buf1['mem_snapshot'] += buf2['mem_snapshot']
        return buf1

    def reduce_report(self, buf):
        out = ""
        if self.totalsize != 0:
            out += "%.2f %% finished, " % (100 * float(buf['cnt_filesize']) // self.totalsize)

        out += "%s copied" % bytes_fmt(buf['cnt_filesize'])

        if self.circle.reduce_time_interval != 0:
            rate = float(buf['cnt_filesize'] - self.cnt_filesize_prior) / self.circle.reduce_time_interval
            self.cnt_filesize_prior = buf['cnt_filesize']
            out += ", estimated transfer rate: %s/s" % bytes_fmt(rate)

        out += ", memory usage: %s" % bytes_fmt(buf['mem_snapshot'])
        print(out)

    def reduce_finish(self, buf):
        # self.reduce_report(buf)
        pass

    def epilogue(self):
        global taskloads
        self.wtime_ended = MPI.Wtime()
        taskloads = self.circle.comm.gather(self.reduce_items)
        if self.circle.rank == 0:
            if self.totalsize == 0:
                print("\nZero filesize detected, done.\n")
                return
            tlapse = self.wtime_ended - self.wtime_started
            rate = float(self.totalsize) / tlapse
            print("\nFCP Epilogue:\n")
            print("\t{:<20}{:<20}".format("Ending at:", utils.current_time()))
            print("\t{:<20}{:<20}".format("Completed in:", utils.conv_time(tlapse)))
            print("\t{:<20}{:<20}".format("Transfer Rate:", "%s/s" % bytes_fmt(rate)))
            print("\t{:<20}{:<20}".format("Use store chunksums:", "%s" % self.use_store))
            print("\t{:<20}{:<20}".format("Use store workq:", "%s" % self.circle.use_store))
            print("\t{:<20}{:<20}".format("FCP Loads:", "%s" % taskloads))

    def read_then_write(self, rfd, wfd, work, num_of_bytes, m):
        """ core entry point for copy action: first read then write.

        @param num_of_bytes: the exact amount of bytes we will copy
        @return: False if unsuccessful.

        """
        buf = None
        try:
            buf = readn(rfd, num_of_bytes)
        except IOError:
            self.logger.error("Failed to read %s", work.src, extra=self.d)
            return False

        try:
            writen(wfd, buf)
        except IOError:
            self.logger.error("Failed to write %s", work.dest, extra=self.d)
            return False

        if m:
            m.update(buf)

        return True

    def write_bytes(self, rfd, wfd, work):
        os.lseek(rfd, work.offset, os.SEEK_SET)
        os.lseek(wfd, work.offset, os.SEEK_SET)

        m = None
        if self.verify:
            m = hashlib.sha1()

        remaining = work.length
        while remaining != 0:
            if remaining >= self.blocksize:
                self.read_then_write(rfd, wfd, work, self.blocksize, m)
                remaining -= self.blocksize
            else:
                self.read_then_write(rfd, wfd, work, remaining, m)
                remaining = 0

        if self.verify:
            # the chunk checksum is recorded against the destination path
            ck = ChunkSum(work.dest, offset=work.offset, length=work.length,
                          digest=m.hexdigest())

            if len(self.chunksums_mem) < G.memitem_threshold:
                self.chunksums_mem.append(ck)
            else:
                self.chunksums_buf.append(ck)
                if len(self.chunksums_buf) == G.DB_BUFSIZE:
                    if not self.use_store:
                        self.workdir = os.getcwd()
                        self.chunksums_dbname = "%s/chunksums.%s" % (G.tempdir, self.circle.rank)
                        self.chunksums_db = DbStore(dbname=self.chunksums_dbname)
                        self.use_store = True
                    self.chunksums_db.mput(self.chunksums_buf)
                    del self.chunksums_buf[:]
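enq_file() turns one file into size // chunksize full chunks plus at most one
remainder chunk. A worked example of the arithmetic, using a hypothetical
2.5 MB file and the default 1 MB chunksize:

st_size = 2621440                # hypothetical 2.5 MB file
chunksize = 1048576              # 1 MB default
chunks = st_size // chunksize    # 2 full chunks
remaining = st_size % chunksize  # 524288 bytes left over

# FileChunks enqueued as (offset, length):
#   (0,       1048576)
#   (1048576, 1048576)
#   (2097152,  524288)           <- remainder chunk
# workcnt == 3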
Example #6
class FWalk(BaseTask):
    def __init__(self, circle, src, dest=None, preserve=False, force=False):
        BaseTask.__init__(self, circle)

        self.d = {"rank": "rank %s" % circle.rank}
        self.circle = circle
        self.src = src
        self.dest = dest
        self.force = force
        self.use_store = False
        self.interval = 10  # progress report

        # For now, these options are hard-coded
        # TODO: should the user be allowed to override them?
        self.sizeonly = False
        self.checksum = False

        # entries whose permissions must be restored at the end (--fix-opt)
        self.optlist = []  # files
        self.opt_dir_list = []  # dirs

        self.sym_links = 0
        self.follow_sym_links = False

        self.flist = []
        self.flist_buf = []

        # hold unlinkable dest directories
        # we have to do the --fix-opt at the end
        self.dest_dirs = []

        self.cnt_dirs = 0
        self.cnt_files = 0
        self.cnt_filesize = 0
        self.last_cnt = 0
        self.skipped = 0
        self.last_reduce_time = MPI.Wtime()

        # reduce
        self.reduce_items = 0

        self.time_started = MPI.Wtime()
        self.time_ended = None

    def create(self):
        if self.circle.rank == 0:
            for ele in self.src:
                self.circle.enq(ele)
            print("\nAnalyzing workload ...")

    def copy_xattr(self, src, dest):
        attrs = xattr.listxattr(src)
        for k in attrs:
            try:
                val = xattr.getxattr(src, k)
                xattr.setxattr(dest, k, val)
            except IOError as e:
                log.warn(e, extra=self.d)

    def flushdb(self):
        if len(self.flist_buf) != 0:
            if not self.use_store:  # store may not exist if the buffer never filled
                self.flist_db = DbStore("%s/fwalk.%s" % (G.tempdir, self.circle.rank))
                self.use_store = True
            self.flist_db.mput(self.flist_buf)

    def process_dir(self, fitem, st):
        """ i_dir should be absolute path
        st is the stat object associated with the directory
        """
        i_dir = fitem.path

        if self.dest:
            # we create destination directory
            # but first we check if we need to change mode for it to work
            o_dir = destpath(fitem, self.dest)
            mode = st.st_mode
            if not (st.st_mode & stat.S_IWUSR):
                mode = st.st_mode | stat.S_IWUSR
                self.opt_dir_list.append((o_dir, st))
            try:
                os.mkdir(o_dir, mode)
            except OSError as e:
                log.debug("mkdir(): %s" % e, extra=self.d)

            if G.preserve:
                self.copy_xattr(i_dir, o_dir)

        last_report = MPI.Wtime()
        count = 0
        try:
            entries = scandir(i_dir)
        except OSError as e:
            log.warn(e, extra=self.d)
            self.skipped += 1
        else:
            for entry in entries:
                elefi = FileItem(entry.path)
                if fitem.dirname:
                    elefi.dirname = fitem.dirname
                self.circle.enq(elefi)

                count += 1
                if (MPI.Wtime() - last_report) > self.interval:
                    print("Rank %s : Scanning [%s] at %s" %
                          (self.circle.rank, i_dir, count))
                    last_report = MPI.Wtime()
            log.info("Finish scan of [%s], count=%s" % (i_dir, count),
                     extra=self.d)

    def do_metadata_preserve(self, src_file, dest_file, st):
        """ create file node, copy attribute if needed."""
        if sys.platform == "darwin":  # Mac OS mknod() not permitted
            return

        try:
            mode = st.st_mode
            if not (st.st_mode & stat.S_IWUSR):
                # owner can't write, we will change mode first
                # then put it in optlist to fix
                mode = st.st_mode | stat.S_IWUSR
                self.optlist.append((dest_file, st))
            os.mknod(dest_file, mode)  # -r-r-r special
        except OSError as e:
            log.warn("mknod(): for %s, %s" % (dest_file, e), extra=self.d)
            return

        if G.preserve:
            self.copy_xattr(src_file, dest_file)

    def check_dest_exists(self, src_file, dest_file):
        """ return True if dest exists and checksum verified correct
            return False if (1) no overwrite (2) destination doesn't exist
        """
        if not self.force:
            return False

        if not os.path.exists(dest_file):
            return False

        # well, destination exists, now we have to check
        if self.sizeonly:
            if os.path.getsize(src_file) == os.path.getsize(dest_file):
                log.warn("Check sizeonly Okay: src: %s, dest=%s" %
                         (src_file, dest_file),
                         extra=self.d)
                return True
        elif self.checksum:
            raise NotImplementedError("Checksum comparison")

        try:
            os.unlink(dest_file)
        except OSError as e:
            log.warn("Can't unlink %s" % dest_file, extra=self.d)
        else:
            log.info("Retransfer: %s" % src_file, extra=self.d)

        return False

    def append_fitem(self, fitem):
        """
        if G.use_store:
            self.flist_buf.append(fitem)
            if len(self.flist_buf) == G.DB_BUFSIZE:
                self.flist.mput(self.flist_buf)
                del self.flist_buf[:]

        else:
            self.flist.append(fitem)
        """
        if len(self.flist) < G.memitem_threshold:
            self.flist.append(fitem)
        else:
            self.flist_buf.append(fitem)
            if len(self.flist_buf) == G.DB_BUFSIZE:
                if not self.use_store:
                    self.dbname = "%s/fwalk.%s" % (G.tempdir, self.circle.rank)
                    self.flist_db = DbStore(self.dbname)
                    self.use_store = True
                self.flist_db.mput(self.flist_buf)
                del self.flist_buf[:]

    def process(self):
        """ process a work unit, spath, dpath refers to
            source and destination respectively """

        fitem = self.circle.deq()
        spath = fitem.path
        if spath:
            try:
                st = os.lstat(spath)
            except OSError as e:
                log.warn(e, extra=self.d)
                self.skipped += 1
                return False

            fitem.st_mode, fitem.st_size, fitem.st_uid, fitem.st_gid = \
                st.st_mode, st.st_size, st.st_uid, st.st_gid
            self.reduce_items += 1

            if os.path.islink(spath):
                self.append_fitem(fitem)
                self.sym_links += 1
                # not following symlinks is the default behavior
                return

            if stat.S_ISREG(st.st_mode):

                if not self.dest:
                    # fwalk without destination, simply add to process list
                    self.append_fitem(fitem)
                else:
                    # self.dest specified, need to check if it is there
                    dpath = destpath(fitem, self.dest)
                    flag = self.check_dest_exists(spath, dpath)
                    if flag:
                        return
                    else:
                        # if src and dest not the same
                        # including the case dest is not there
                        # then we do the following
                        self.append_fitem(fitem)
                        self.do_metadata_preserve(spath, dpath, st)
                self.cnt_files += 1
                self.cnt_filesize += fitem.st_size

            elif stat.S_ISDIR(st.st_mode):
                self.cnt_dirs += 1
                self.process_dir(fitem, st)
                # END OF if spath

    def tally(self, t):
        """ t is a tuple element of flist """
        if stat.S_ISDIR(t[1]):
            self.cnt_dirs += 1
        elif stat.S_ISREG(t[1]):
            self.cnt_files += 1
            self.cnt_filesize += t[2]

    def summarize(self):
        # map() is lazy in Python 3, so iterate explicitly to run tally()
        for t in self.flist:
            self.tally(t)

    def reduce_init(self, buf):
        buf['cnt_files'] = self.cnt_files
        buf['cnt_dirs'] = self.cnt_dirs
        buf['cnt_filesize'] = self.cnt_filesize
        buf['reduce_items'] = self.reduce_items

    def reduce(self, buf1, buf2):
        buf1['cnt_dirs'] += buf2['cnt_dirs']
        buf1['cnt_files'] += buf2['cnt_files']
        buf1['cnt_filesize'] += buf2['cnt_filesize']
        buf1['reduce_items'] += buf2['reduce_items']
        return buf1

    def reduce_report(self, buf):
        # progress report
        # rate = (buf['cnt_files'] - self.last_cnt)/(MPI.Wtime() - self.last_reduce_time)
        # print("Processed objects: %s, estimated processing rate: %d/s" % (buf['cnt_files'], rate))
        # self.last_cnt = buf['cnt_files']

        rate = (buf['reduce_items'] - self.last_cnt) / (MPI.Wtime() -
                                                        self.last_reduce_time)
        print("Processed objects: %s, estimated processing rate: %d/s" %
              (buf['reduce_items'], rate))
        self.last_cnt = buf['reduce_items']
        self.last_reduce_time = MPI.Wtime()

    def reduce_finish(self, buf):
        # get result of reduction
        pass

    def total_tally(self):
        global taskloads
        T.total_dirs = self.circle.comm.allreduce(self.cnt_dirs, op=MPI.SUM)
        T.total_files = self.circle.comm.allreduce(self.cnt_files, op=MPI.SUM)
        T.total_filesize = self.circle.comm.allreduce(self.cnt_filesize,
                                                      op=MPI.SUM)
        T.total_symlinks = self.circle.comm.allreduce(self.sym_links,
                                                      op=MPI.SUM)
        T.total_skipped = self.circle.comm.allreduce(self.skipped, op=MPI.SUM)
        taskloads = self.circle.comm.gather(self.reduce_items)

    def epilogue(self):
        self.total_tally()
        self.time_ended = MPI.Wtime()

        if self.circle.rank == 0:
            print("\nFWALK Epilogue:\n")
            print("\t{:<20}{:<20}".format("Directory count:", T.total_dirs))
            print("\t{:<20}{:<20}".format("Sym Links count:",
                                          T.total_symlinks))
            print("\t{:<20}{:<20}".format("File count:", T.total_files))
            print("\t{:<20}{:<20}".format("Skipped count:", T.total_skipped))
            print("\t{:<20}{:<20}".format("Total file size:",
                                          bytes_fmt(T.total_filesize)))
            if T.total_files != 0:
                print("\t{:<20}{:<20}".format(
                    "Avg file size:",
                    bytes_fmt(T.total_filesize / float(T.total_files))))
            print("\t{:<20}{:<20}".format(
                "Tree talk time:",
                utils.conv_time(self.time_ended - self.time_started)))
            print("\t{:<20}{:<20}".format("Use store flist:",
                                          "%s" % self.use_store))
            print("\t{:<20}{:<20}".format("Use store workq:",
                                          "%s" % self.circle.use_store))
            print("\tFWALK Loads: %s" % taskloads)
            print("")

    def cleanup(self):
        if hasattr(self, "flist_db"):
            self.flist_db.cleanup()
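
With --verify on, write_bytes() records one ChunkSum (path, offset, length,
SHA1 digest) per chunk. Verification would then recompute the same digest
from the file on disk and compare; a minimal sketch of such a recomputation,
assuming the same 1 MB blocksize that write_bytes() uses:

import hashlib
import os

def chunk_digest(path, offset, length, blocksize=1024 * 1024):
    """Hypothetical helper: recompute the SHA1 of one chunk, feeding it
    block by block the way write_bytes() feeds hashlib.sha1()."""
    m = hashlib.sha1()
    fd = os.open(path, os.O_RDONLY)
    try:
        os.lseek(fd, offset, os.SEEK_SET)
        remaining = length
        while remaining > 0:
            buf = os.read(fd, min(blocksize, remaining))
            if not buf:  # unexpected EOF
                break
            m.update(buf)
            remaining -= len(buf)
    finally:
        os.close(fd)
    return m.hexdigest()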