Example #1
0
class FWalk(BaseTask):

    def __init__(self, circle, src, dest=None, preserve=False, force=False):
        BaseTask.__init__(self, circle)

        self.d = {"rank": "rank %s" % circle.rank}
        self.circle = circle
        self.src = src
        self.dest = dest
        self.force = force
        self.interval = 10  # progress report

        # For now, I am setting the option
        # TODO: should user allowed to meddle this?
        self.sizeonly = False
        self.checksum = False

        # to be fixed
        self.optlist = []  # files
        self.opt_dir_list = []  # dirs

        self.sym_links = 0
        self.follow_sym_links = False

        self.workdir = os.getcwd()
        self.tempdir = os.path.join(self.workdir, ".pcircle")
        if not os.path.exists(self.tempdir):
            os.mkdir(self.tempdir)

        if G.use_store:
            self.dbname = "%s/fwalk.%s" % (self.tempdir, circle.rank)
            self.flist = DbStore(self.dbname)
            self.flist_buf = []
        else:
            self.flist = []
        self.src_flist = self.flist

        # hold unlinkable dest directories
        # we have to do the --fix-opt at the end
        self.dest_dirs = []

        self.cnt_dirs = 0
        self.cnt_files = 0
        self.cnt_filesize = 0
        self.last_cnt = 0
        self.skipped = 0
        self.last_reduce_time = MPI.Wtime()

        # reduce
        self.reduce_items = 0

        self.time_started = MPI.Wtime()
        self.time_ended = None

    def create(self):
        if self.circle.rank == 0:
            for ele in self.src:
                self.circle.enq(ele)
            print("\nAnalyzing workload ...")

    def copy_xattr(self, src, dest):
        attrs = xattr.listxattr(src)
        for k in attrs:
            try:
                val = xattr.getxattr(src, k)
                xattr.setxattr(dest, k, val)
            except IOError as e:
                log.warn(e, extra=self.d)

    def flushdb(self):
        if len(self.flist_buf) != 0:
            self.flist.mput(self.flist_buf)

    def process_dir(self, fitem, st):
        """ i_dir should be absolute path
        st is the stat object associated with the directory
        """
        i_dir = fitem.path

        if self.dest:
            # we create destination directory
            # but first we check if we need to change mode for it to work
            o_dir = destpath(fitem, self.dest)
            mode = st.st_mode
            if not (st.st_mode & stat.S_IWUSR):
                mode = st.st_mode | stat.S_IWUSR
                self.opt_dir_list.append((o_dir, st))
            try:
                os.mkdir(o_dir, mode)
            except OSError as e:
                log.debug("mkdir(): %s" % e, extra=self.d)

            if G.preserve:
                self.copy_xattr(i_dir, o_dir)

        last_report = MPI.Wtime()
        count = 0
        try:
            entries = scandir(i_dir)
        except OSError as e:
            log.warn(e, extra=self.d)
            self.skipped += 1
        else:
            for entry in entries:
                elefi = FileItem(entry.path)
                if fitem.dirname:
                    elefi.dirname = fitem.dirname
                self.circle.enq(elefi)

                count += 1
                if (MPI.Wtime() - last_report) > self.interval:
                    print("Rank %s : Scanning [%s] at %s" % (self.circle.rank, i_dir, count))
                    last_report = MPI.Wtime()
            log.info("Finish scan of [%s], count=%s" % (i_dir, count), extra=self.d)

    def do_metadata_preserve(self, src_file, dest_file, st):
        """ create file node, copy attribute if needed."""
        if sys.platform == "darwin":  # Mac OS mknod() not permitted
            return

        try:
            mode = st.st_mode
            if not (st.st_mode & stat.S_IWUSR):
                # owner can't write, we will change mode first
                # then put it in optlist to fix
                mode = st.st_mode | stat.S_IWUSR
                self.optlist.append((dest_file,st))
            os.mknod(dest_file, mode)  # -r-r-r special
        except OSError as e:
            log.warn("mknod(): for %s, %s" % (dest_file, e), extra=self.d)
            return

        if G.preserve:
            self.copy_xattr(src_file, dest_file)

    def check_dest_exists(self, src_file, dest_file):
        """ return True if dest exists and checksum verified correct
            return False if (1) no overwrite (2) destination doesn't exist
        """
        if not self.force:
            return False

        if not os.path.exists(dest_file):
            return False

        # well, destination exists, now we have to check
        if self.sizeonly:
            if os.path.getsize(src_file) == os.path.getsize(dest_file):
                log.warn("Check sizeonly Okay: src: %s, dest=%s" % (src_file, dest_file),
                                 extra=self.d)
                return True
        elif self.checksum:
            raise NotImplementedError("Checksum comparison")

        try:
            os.unlink(dest_file)
        except OSError as e:
            log.warn("Can't unlink %s" % dest_file, extra=self.d)
        else:
            log.info("Retransfer: %s" % src_file, extra=self.d)

        return False

    def append_fitem(self, fitem):
        if G.use_store:
            self.flist_buf.append(fitem)
            if len(self.flist_buf) == G.DB_BUFSIZE:
                self.flist.mput(self.flist_buf)
                del self.flist_buf[:]

        else:
            self.flist.append(fitem)

    def process(self):
        """ process a work unit, spath, dpath refers to
            source and destination respectively """

        fitem = self.circle.deq()
        spath = fitem.path
        if spath:
            try:
                st = os.lstat(spath)
            except OSError as e:
                log.warn(e, extra=self.d)
                self.skipped += 1
                return False

            fitem.st_mode, fitem.st_size, fitem.st_uid, fitem.st_gid = st.st_mode, st.st_size, st.st_uid, st.st_gid
            self.reduce_items += 1

            if os.path.islink(spath):
                self.append_fitem(fitem)
                self.sym_links += 1
                # if not self.follow_sym_links:
                # NOT TO FOLLOW SYM LINKS SHOULD BE THE DEFAULT
                return

            if stat.S_ISREG(st.st_mode):

                if not self.dest:
                    # fwalk without destination, simply add to process list
                    self.append_fitem(fitem)
                else:
                    # self.dest specified, need to check if it is there
                    dpath = destpath(fitem, self.dest)
                    flag = self.check_dest_exists(spath, dpath)
                    if flag:
                        return
                    else:
                        # if src and dest not the same
                        # including the case dest is not there
                        # then we do the following
                        self.append_fitem(fitem)
                        self.do_metadata_preserve(spath, dpath, st)
                self.cnt_files += 1
                self.cnt_filesize += fitem.st_size

            elif stat.S_ISDIR(st.st_mode):
                self.cnt_dirs += 1
                self.process_dir(fitem, st)
                # END OF if spath

    def tally(self, t):
        """ t is a tuple element of flist """
        if stat.S_ISDIR(t[1]):
            self.cnt_dirs += 1
        elif stat.S_ISREG(t[1]):
            self.cnt_files += 1
            self.cnt_filesize += t[2]

    def summarize(self):
        map(self.tally, self.flist)

    def reduce_init(self, buf):
        buf['cnt_files'] = self.cnt_files
        buf['cnt_dirs'] = self.cnt_dirs
        buf['cnt_filesize'] = self.cnt_filesize
        buf['reduce_items'] = self.reduce_items

    def reduce(self, buf1, buf2):
        buf1['cnt_dirs'] += buf2['cnt_dirs']
        buf1['cnt_files'] += buf2['cnt_files']
        buf1['cnt_filesize'] += buf2['cnt_filesize']
        buf1['reduce_items'] += buf2['reduce_items']
        return buf1

    def reduce_report(self, buf):
        # progress report
        # rate = (buf['cnt_files'] - self.last_cnt)/(MPI.Wtime() - self.last_reduce_time)
        # print("Processed objects: %s, estimated processing rate: %d/s" % (buf['cnt_files'], rate))
        # self.last_cnt = buf['cnt_files']

        rate = (buf['reduce_items'] - self.last_cnt) / (MPI.Wtime() - self.last_reduce_time)
        print("Processed objects: %s, estimated processing rate: %d/s" % (buf['reduce_items'], rate))
        self.last_cnt = buf['reduce_items']
        self.last_reduce_time = MPI.Wtime()

    def reduce_finish(self, buf):
        # get result of reduction
        pass

    def total_tally(self):
        global taskloads
        total_dirs = self.circle.comm.reduce(self.cnt_dirs, op=MPI.SUM)
        total_files = self.circle.comm.reduce(self.cnt_files, op=MPI.SUM)
        total_filesize = self.circle.comm.reduce(self.cnt_filesize, op=MPI.SUM)
        total_symlinks = self.circle.comm.reduce(self.sym_links, op=MPI.SUM)
        total_skipped = self.circle.comm.reduce(self.skipped, op=MPI.SUM)
        taskloads = self.circle.comm.gather(self.reduce_items)
        return total_dirs, total_files, total_filesize, total_symlinks, total_skipped

    def epilogue(self):
        total_dirs, total_files, total_filesize, total_symlinks, total_skipped = self.total_tally()
        self.time_ended = MPI.Wtime()

        if self.circle.rank == 0:
            print("\nFWALK Epilogue:\n")
            print("\t{:<20}{:<20}".format("Directory count:", total_dirs))
            print("\t{:<20}{:<20}".format("Sym Links count:", total_symlinks))
            print("\t{:<20}{:<20}".format("File count:", total_files))
            print("\t{:<20}{:<20}".format("Skipped count:", total_skipped))
            print("\t{:<20}{:<20}".format("Total file size:", bytes_fmt(total_filesize)))
            if total_files != 0:
                print("\t{:<20}{:<20}".format("Avg file size:", bytes_fmt(total_filesize/float(total_files))))
            print("\t{:<20}{:<20}".format("Tree talk time:", utils.conv_time(self.time_ended - self.time_started)))
            print("\tFWALK Loads: %s" % taskloads)
            print("")

        return total_filesize

    def cleanup(self):
        if G.use_store:
            self.flist.cleanup()
Example #2
0
class FWalk(BaseTask):
    def __init__(self, circle, src, dest=None, preserve=False, force=False):
        BaseTask.__init__(self, circle)

        self.d = {"rank": "rank %s" % circle.rank}
        self.circle = circle
        self.src = src
        self.dest = dest
        self.force = force
        self.use_store = False
        self.interval = 10  # progress report

        # For now, I am setting the option
        # TODO: should user allowed to meddle this?
        self.sizeonly = False
        self.checksum = False

        # to be fixed
        self.optlist = []  # files
        self.opt_dir_list = []  # dirs

        self.sym_links = 0
        self.follow_sym_links = False

        self.flist = []
        self.flist_buf = []

        # hold unlinkable dest directories
        # we have to do the --fix-opt at the end
        self.dest_dirs = []

        self.cnt_dirs = 0
        self.cnt_files = 0
        self.cnt_filesize = 0
        self.last_cnt = 0
        self.skipped = 0
        self.last_reduce_time = MPI.Wtime()

        # reduce
        self.reduce_items = 0

        self.time_started = MPI.Wtime()
        self.time_ended = None

    def create(self):
        if self.circle.rank == 0:
            for ele in self.src:
                self.circle.enq(ele)
            print("\nAnalyzing workload ...")

    def copy_xattr(self, src, dest):
        attrs = xattr.listxattr(src)
        for k in attrs:
            try:
                val = xattr.getxattr(src, k)
                xattr.setxattr(dest, k, val)
            except IOError as e:
                log.warn(e, extra=self.d)

    def flushdb(self):
        if len(self.flist_buf) != 0:
            self.flist_db.mput(self.flist_buf)

    def process_dir(self, fitem, st):
        """ i_dir should be absolute path
        st is the stat object associated with the directory
        """
        i_dir = fitem.path

        if self.dest:
            # we create destination directory
            # but first we check if we need to change mode for it to work
            o_dir = destpath(fitem, self.dest)
            mode = st.st_mode
            if not (st.st_mode & stat.S_IWUSR):
                mode = st.st_mode | stat.S_IWUSR
                self.opt_dir_list.append((o_dir, st))
            try:
                os.mkdir(o_dir, mode)
            except OSError as e:
                log.debug("mkdir(): %s" % e, extra=self.d)

            if G.preserve:
                self.copy_xattr(i_dir, o_dir)

        last_report = MPI.Wtime()
        count = 0
        try:
            entries = scandir(i_dir)
        except OSError as e:
            log.warn(e, extra=self.d)
            self.skipped += 1
        else:
            for entry in entries:
                elefi = FileItem(entry.path)
                if fitem.dirname:
                    elefi.dirname = fitem.dirname
                self.circle.enq(elefi)

                count += 1
                if (MPI.Wtime() - last_report) > self.interval:
                    print("Rank %s : Scanning [%s] at %s" %
                          (self.circle.rank, i_dir, count))
                    last_report = MPI.Wtime()
            log.info("Finish scan of [%s], count=%s" % (i_dir, count),
                     extra=self.d)

    def do_metadata_preserve(self, src_file, dest_file, st):
        """ create file node, copy attribute if needed."""
        if sys.platform == "darwin":  # Mac OS mknod() not permitted
            return

        try:
            mode = st.st_mode
            if not (st.st_mode & stat.S_IWUSR):
                # owner can't write, we will change mode first
                # then put it in optlist to fix
                mode = st.st_mode | stat.S_IWUSR
                self.optlist.append((dest_file, st))
            os.mknod(dest_file, mode)  # -r-r-r special
        except OSError as e:
            log.warn("mknod(): for %s, %s" % (dest_file, e), extra=self.d)
            return

        if G.preserve:
            self.copy_xattr(src_file, dest_file)

    def check_dest_exists(self, src_file, dest_file):
        """ return True if dest exists and checksum verified correct
            return False if (1) no overwrite (2) destination doesn't exist
        """
        if not self.force:
            return False

        if not os.path.exists(dest_file):
            return False

        # well, destination exists, now we have to check
        if self.sizeonly:
            if os.path.getsize(src_file) == os.path.getsize(dest_file):
                log.warn("Check sizeonly Okay: src: %s, dest=%s" %
                         (src_file, dest_file),
                         extra=self.d)
                return True
        elif self.checksum:
            raise NotImplementedError("Checksum comparison")

        try:
            os.unlink(dest_file)
        except OSError as e:
            log.warn("Can't unlink %s" % dest_file, extra=self.d)
        else:
            log.info("Retransfer: %s" % src_file, extra=self.d)

        return False

    def append_fitem(self, fitem):
        """
        if G.use_store:
            self.flist_buf.append(fitem)
            if len(self.flist_buf) == G.DB_BUFSIZE:
                self.flist.mput(self.flist_buf)
                del self.flist_buf[:]

        else:
            self.flist.append(fitem)
        """
        if len(self.flist) < G.memitem_threshold:
            self.flist.append(fitem)
        else:
            self.flist_buf.append(fitem)
            if len(self.flist_buf) == G.DB_BUFSIZE:
                if self.use_store == False:
                    self.dbname = "%s/fwalk.%s" % (G.tempdir, self.circle.rank)
                    self.flist_db = DbStore(self.dbname)
                    self.use_store = True
                self.flist_db.mput(self.flist_buf)
                del self.flist_buf[:]

    def process(self):
        """ process a work unit, spath, dpath refers to
            source and destination respectively """

        fitem = self.circle.deq()
        spath = fitem.path
        if spath:
            try:
                st = os.lstat(spath)
            except OSError as e:
                log.warn(e, extra=self.d)
                self.skipped += 1
                return False

            fitem.st_mode, fitem.st_size, fitem.st_uid, fitem.st_gid = st.st_mode, st.st_size, st.st_uid, st.st_gid
            self.reduce_items += 1

            if os.path.islink(spath):
                self.append_fitem(fitem)
                self.sym_links += 1
                # if not self.follow_sym_links:
                # NOT TO FOLLOW SYM LINKS SHOULD BE THE DEFAULT
                return

            if stat.S_ISREG(st.st_mode):

                if not self.dest:
                    # fwalk without destination, simply add to process list
                    self.append_fitem(fitem)
                else:
                    # self.dest specified, need to check if it is there
                    dpath = destpath(fitem, self.dest)
                    flag = self.check_dest_exists(spath, dpath)
                    if flag:
                        return
                    else:
                        # if src and dest not the same
                        # including the case dest is not there
                        # then we do the following
                        self.append_fitem(fitem)
                        self.do_metadata_preserve(spath, dpath, st)
                self.cnt_files += 1
                self.cnt_filesize += fitem.st_size

            elif stat.S_ISDIR(st.st_mode):
                self.cnt_dirs += 1
                self.process_dir(fitem, st)
                # END OF if spath

    def tally(self, t):
        """ t is a tuple element of flist """
        if stat.S_ISDIR(t[1]):
            self.cnt_dirs += 1
        elif stat.S_ISREG(t[1]):
            self.cnt_files += 1
            self.cnt_filesize += t[2]

    def summarize(self):
        map(self.tally, self.flist)

    def reduce_init(self, buf):
        buf['cnt_files'] = self.cnt_files
        buf['cnt_dirs'] = self.cnt_dirs
        buf['cnt_filesize'] = self.cnt_filesize
        buf['reduce_items'] = self.reduce_items

    def reduce(self, buf1, buf2):
        buf1['cnt_dirs'] += buf2['cnt_dirs']
        buf1['cnt_files'] += buf2['cnt_files']
        buf1['cnt_filesize'] += buf2['cnt_filesize']
        buf1['reduce_items'] += buf2['reduce_items']
        return buf1

    def reduce_report(self, buf):
        # progress report
        # rate = (buf['cnt_files'] - self.last_cnt)/(MPI.Wtime() - self.last_reduce_time)
        # print("Processed objects: %s, estimated processing rate: %d/s" % (buf['cnt_files'], rate))
        # self.last_cnt = buf['cnt_files']

        rate = (buf['reduce_items'] - self.last_cnt) / (MPI.Wtime() -
                                                        self.last_reduce_time)
        print("Processed objects: %s, estimated processing rate: %d/s" %
              (buf['reduce_items'], rate))
        self.last_cnt = buf['reduce_items']
        self.last_reduce_time = MPI.Wtime()

    def reduce_finish(self, buf):
        # get result of reduction
        pass

    def total_tally(self):
        global taskloads
        T.total_dirs = self.circle.comm.allreduce(self.cnt_dirs, op=MPI.SUM)
        T.total_files = self.circle.comm.allreduce(self.cnt_files, op=MPI.SUM)
        T.total_filesize = self.circle.comm.allreduce(self.cnt_filesize,
                                                      op=MPI.SUM)
        T.total_symlinks = self.circle.comm.allreduce(self.sym_links,
                                                      op=MPI.SUM)
        T.total_skipped = self.circle.comm.allreduce(self.skipped, op=MPI.SUM)
        taskloads = self.circle.comm.gather(self.reduce_items)

    def epilogue(self):
        self.total_tally()
        self.time_ended = MPI.Wtime()

        if self.circle.rank == 0:
            print("\nFWALK Epilogue:\n")
            print("\t{:<20}{:<20}".format("Directory count:", T.total_dirs))
            print("\t{:<20}{:<20}".format("Sym Links count:",
                                          T.total_symlinks))
            print("\t{:<20}{:<20}".format("File count:", T.total_files))
            print("\t{:<20}{:<20}".format("Skipped count:", T.total_skipped))
            print("\t{:<20}{:<20}".format("Total file size:",
                                          bytes_fmt(T.total_filesize)))
            if T.total_files != 0:
                print("\t{:<20}{:<20}".format(
                    "Avg file size:",
                    bytes_fmt(T.total_filesize / float(T.total_files))))
            print("\t{:<20}{:<20}".format(
                "Tree talk time:",
                utils.conv_time(self.time_ended - self.time_started)))
            print("\t{:<20}{:<20}".format("Use store flist:",
                                          "%s" % self.use_store))
            print("\t{:<20}{:<20}".format("Use store workq:",
                                          "%s" % self.circle.use_store))
            print("\tFWALK Loads: %s" % taskloads)
            print("")

    def cleanup(self):
        if hasattr(self, "flist_db"):
            self.flist_db.cleanup()