def handle_file_or_dir(self, spath, st):
    if stat.S_ISREG(st.st_mode):
        incr_local_histogram(st.st_size)

        if args.gpfs_block_alloc:
            inodesz = utils.conv_unit(args.inodesz)
            gpfs_block_update(st.st_size, inodesz)

        if args.top:
            update_topn(TopFile(st.st_size, spath))

        if self.outfile:
            self.fszlst.append(st.st_size)
            if len(self.fszlst) >= FSZMAX:
                for ele in self.fszlst:
                    self.outfile.write("%d\n" % ele)
                self.fszlst = []

        self.cnt_files += 1
        self.cnt_filesize += st.st_size

        if args.profdev and utils.is_dev_file(spath):
            self.devfile_cnt += 1
            self.devfile_sz += st.st_size

        # check hard links
        if st.st_nlink > 1:
            self.nlinks += st.st_nlink
            self.nlinked_files += 1

    elif stat.S_ISDIR(st.st_mode):
        self.cnt_dirs += 1
        self.process_dir(spath, st)
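
# incr_local_histogram() above is defined elsewhere in this codebase; a minimal
# sketch of what it is assumed to do, using hypothetical module-level bins
# (bisect finds the bucket a file size falls into):

import bisect

_BINS = [1024, 1024**2, 1024**3]    # hypothetical bucket bounds: 1 KB, 1 MB, 1 GB
_HIST = [0] * (len(_BINS) + 1)      # one extra bucket for sizes past the last bound

def _incr_local_histogram_sketch(fsize):
    # bisect_right returns the index of the first bound strictly greater
    # than fsize, which is exactly the bucket this size belongs in
    _HIST[bisect.bisect_right(_BINS, fsize)] += 1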
def main():
    global args, comm
    signal.signal(signal.SIGINT, sig_handler)
    args = parse_and_bcast(comm, gen_parser)
    try:
        G.src = utils.check_src(args.path)
    except ValueError as e:
        err_and_exit("Error: %s not accessible" % e)

    G.loglevel = args.loglevel
    # G.use_store = args.use_store
    G.reduce_interval = args.interval
    G.memitem_threshold = args.item

    hosts_cnt = tally_hosts()

    circle = Circle()
    if circle.rank == 0:
        print("Running Parameters:\n")
        print("\t{:<20}{:<20}".format("FSUM version:", __version__))
        print("\t{:<20}{:<20}".format("Num of hosts:", hosts_cnt))
        print("\t{:<20}{:<20}".format("Num of processes:", MPI.COMM_WORLD.Get_size()))
        print("\t{:<20}{:<20}".format("Root path:", utils.choplist(G.src)))
        print("\t{:<20}{:<20}".format("Items in memory:", G.memitem_threshold))

    fwalk = FWalk(circle, G.src)
    circle.begin(fwalk)
    if G.use_store:
        fwalk.flushdb()

    fwalk.epilogue()
    circle.finalize()

    # by default, we use adaptive chunksize
    chunksize = utils.calc_chunksize(T.total_filesize)
    if args.chunksize:
        chunksize = conv_unit(args.chunksize)

    if circle.rank == 0:
        print("Chunksize = ", chunksize)

    circle = Circle()
    fcheck = Checksum(circle, fwalk, chunksize, T.total_filesize, T.total_files)
    circle.begin(fcheck)
    circle.finalize()

    if circle.rank == 0:
        sys.stdout.write("\nAggregating ... ")

    """
    chunkl = circle.comm.gather(fcheck.chunkq)

    if circle.rank == 0:
        chunks = [item for sublist in chunkl for item in sublist]
        chunks.sort()
        sys.stdout.write("%s chunks\n" % len(chunks))
        sha1val = do_checksum(chunks)
        with open(args.output, "w") as f:
            f.write("sha1: %s\n" % sha1val)
            f.write("chunksize: %s\n" % chunksize)
            f.write("fwalk version: %s\n" % __version__)
            f.write("src: %s\n" % utils.choplist(G.src))
            f.write("date: %s\n" % utils.current_time())
            f.write("totalsize: %s\n" % T.total_filesize)

        print("\nSHA1: %s" % sha1val)
        print("Signature file: [%s]" % args.output)
        if args.export_block_signatures:
            export_checksum2(chunks, args.output)
            print("Exporting block signatures ... \n")
    """

    if circle.rank > 0:
        circle.comm.send(fcheck.bfsign.bitarray, dest=0)
    else:
        for p in xrange(1, circle.comm.size):
            other_bitarray = circle.comm.recv(source=p)
            fcheck.bfsign.or_bf(other_bitarray)

    circle.comm.Barrier()

    if circle.comm.rank == 0:
        sha1val = fcheck.bfsign.gen_signature()
        with open(args.output, "w") as f:
            f.write("sha1: %s\n" % sha1val)
            f.write("chunksize: %s\n" % chunksize)
            f.write("fwalk version: %s\n" % __version__)
            f.write("src: %s\n" % utils.choplist(G.src))
            f.write("date: %s\n" % utils.current_time())
            f.write("totalsize: %s\n" % T.total_filesize)

        print("\nSHA1: %s" % sha1val)
        print("Signature file: [%s]" % args.output)

    fcheck.epilogue()

    if circle.comm.rank == 0:
        if os.path.exists(G.tempdir):
            shutil.rmtree(G.tempdir, ignore_errors=True)
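
# The send/recv loop above reduces every worker's bloom-filter signature into
# one bitarray on rank 0 with a logical OR. A minimal standalone sketch of the
# same pattern; the function name and the generic local_bits operand are
# hypothetical, and any object supporting "|" (e.g. a bitarray) would work:

from mpi4py import MPI

def _merge_bitarrays_sketch(comm, local_bits):
    # non-root ranks ship their local filter to rank 0;
    # rank 0 ORs everything together, so the result is order-independent
    if comm.rank > 0:
        comm.send(local_bits, dest=0)
        return None
    merged = local_bits
    for p in xrange(1, comm.size):
        merged = merged | comm.recv(source=p)
    return merged

# usage (illustrative): merged = _merge_bitarrays_sketch(MPI.COMM_WORLD, bits)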
def main():
    global args, comm
    signal.signal(signal.SIGINT, sig_handler)
    args = parse_and_bcast(comm, gen_parser)
    try:
        G.src = utils.check_src(args.path)
    except ValueError as e:
        err_and_exit("Error: %s not accessible" % e)

    G.loglevel = args.loglevel
    G.use_store = args.use_store
    G.reduce_interval = args.interval

    hosts_cnt = tally_hosts()

    circle = Circle()
    if circle.rank == 0:
        print("Running Parameters:\n")
        print("\t{:<20}{:<20}".format("FSUM version:", __version__))
        print("\t{:<20}{:<20}".format("Num of hosts:", hosts_cnt))
        print("\t{:<20}{:<20}".format("Num of processes:", MPI.COMM_WORLD.Get_size()))
        print("\t{:<20}{:<20}".format("Root path:", utils.choplist(G.src)))

    fwalk = FWalk(circle, G.src)
    circle.begin(fwalk)
    if G.use_store:
        fwalk.flushdb()

    totalsize = fwalk.epilogue()
    circle.finalize()

    # by default, we use adaptive chunksize
    chunksize = utils.calc_chunksize(totalsize)
    if args.chunksize:
        chunksize = conv_unit(args.chunksize)

    if circle.rank == 0:
        print("Chunksize = ", chunksize)

    circle = Circle()
    fcheck = Checksum(circle, fwalk, chunksize, totalsize)
    circle.begin(fcheck)
    circle.finalize()

    if circle.rank == 0:
        sys.stdout.write("\nAggregating ... ")

    chunkl = circle.comm.gather(fcheck.chunkq)

    if circle.rank == 0:
        chunks = [item for sublist in chunkl for item in sublist]
        chunks.sort()
        sys.stdout.write("%s chunks\n" % len(chunks))
        sha1val = do_checksum(chunks)
        with open(args.output, "w") as f:
            f.write("sha1: %s\n" % sha1val)
            f.write("chunksize: %s\n" % chunksize)
            f.write("fwalk version: %s\n" % __version__)
            f.write("src: %s\n" % utils.choplist(G.src))
            f.write("date: %s\n" % utils.current_time())
            f.write("totalsize: %s\n" % totalsize)

        print("\nSHA1: %s" % sha1val)
        print("Signature file: [%s]" % args.output)
        if args.export_block_signatures:
            export_checksum2(chunks, args.output)
            print("Exporting block signatures ... \n")

    fcheck.epilogue()
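
# do_checksum() above is assumed to fold the gathered, sorted per-chunk
# digests into a single SHA1. A minimal sketch under that assumption (taking
# the digest strings directly is a simplification; the real chunk objects are
# defined elsewhere in this codebase):

import hashlib

def _do_checksum_sketch(chunk_digests):
    # sorting first makes the aggregate signature independent of which
    # rank happened to process which chunk
    h = hashlib.sha1()
    for d in sorted(chunk_digests):
        h.update(d)
    return h.hexdigest()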
def set_chunksize(pcp, tsz):
    if args.adaptive:
        pcp.set_adaptive_chunksize(tsz)
    else:
        pcp.set_fixed_chunksize(utils.conv_unit(args.chunksize))
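
# The adaptive path above scales the chunk size with the total data volume. A
# plausible sketch of such a policy; the thresholds and return values below
# are illustrative assumptions, not the shipped ones:

def _calc_chunksize_sketch(totalsize):
    # larger datasets get larger chunks, bounding the total chunk count
    # so per-chunk bookkeeping stays manageable
    MB, GB, TB = 1024**2, 1024**3, 1024**4
    if totalsize < 512 * GB:
        return 4 * MB
    elif totalsize < 10 * TB:
        return 16 * MB
    else:
        return 64 * MB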
def main():
    global comm, args, stripe_out, DIR_BINS, DIR_HIST

    fpipe.listen()

    args = parse_and_bcast(comm, gen_parser)

    try:
        G.src = utils.check_src2(args.path)
    except ValueError as e:
        err_and_exit("Error: %s not accessible" % e)

    G.memitem_threshold = args.item
    G.loglevel = args.loglevel

    hosts_cnt = tally_hosts()

    # doing directory profiling?
    if args.dirprof:
        # check the input
        if args.dirbins is None:
            # err_and_exit("Error: missing directory bin parameters: a sorted integer list\n")
            args.dirbins = [0, 10, 100, 1000, 10**4, 10**5, 10**6, 10**7, 10**8]
        else:
            myList = sorted(set(args.dirbins))
            if myList != args.dirbins:
                err_and_exit("Error: duplicated, or unsorted bins: %s\n" % args.dirbins)

        DIR_BINS = args.dirbins
        DIR_HIST = [0] * (len(DIR_BINS) + 1)

    # Doing stripe analysis? lfs is not really a bullet-proof way;
    # we might need a better way of doing the fstype check.
    if args.lustre_stripe:
        G.lfs_bin = lfs.check_lfs()
        G.stripe_threshold = utils.conv_unit(args.stripe_threshold)
        try:
            stripe_out = os.open(args.stripe_output,
                                 os.O_CREAT | os.O_WRONLY | os.O_APPEND)
        except:
            err_and_exit("Error: can't create stripe output: %s" % args.stripe_output)

    if args.exclude:
        process_exclude_file()

    if comm.rank == 0:
        print("Running Parameters:\n")
        print("\t{0:<20}{1:<20}".format("fprof version:", __version__))
        print("\t{0:<20}{1:<20}".format("Full rev id:", __revid__))
        print("\t{0:<20}{1:<20}".format("Num of hosts:", hosts_cnt))
        print("\t{0:<20}{1:<20}".format("Num of processes:", MPI.COMM_WORLD.Get_size()))
        if args.syslog:
            print("\t{0:<20}{1:<20}".format("Syslog report: ", "yes"))
        else:
            print("\t{0:<20}{1:<20}".format("Syslog report: ", "no"))
        if args.dirprof:
            print("\t{0:<20}{1:<20}".format("Dir bins: ", args.dirbins))
        if args.lustre_stripe:
            print("\t{0:<20}{1:<20}".format("Stripe analysis: ", "yes"))
            print("\t{0:<20}{1:<20}".format("Stripe threshold: ", args.stripe_threshold))
        else:
            print("\t{0:<20}{1:<20}".format("Stripe analysis: ", "no"))
        print("\t{0:<20}{1:<20}".format("Root path:", G.src))

        if args.exclude:
            print("\nExclusions:\n")
            for ele in EXCLUDE:
                print("\t %s" % ele)

    circle = Circle()
    if args.perprocess:
        circle.report_enabled = True
    else:
        circle.report_enabled = False

    if args.progress:
        circle.report_enabled = False
        circle.reduce_enabled = True

    treewalk = ProfileWalk(circle, G.src, perfile=args.perfile)
    circle.begin(treewalk)

    # we need the total file size to calculate GPFS efficiency
    total_file_size = treewalk.epilogue()

    msg1, msg2 = gen_histogram(total_file_size)
    if args.dirprof:
        gen_directory_histogram()

    if comm.rank == 0 and args.syslog:
        sendto_syslog("fprof.filecount.hist", msg1)
        sendto_syslog("fprof.fsize_perc.hist", msg2)

    if args.topn_files:
        topfiles = gather_topfiles()
        if comm.rank == 0:
            print("\nTop N File Report:\n")
            # edge case: not enough files (< args.topn_files)
            totaln = args.topn_files if len(topfiles) > args.topn_files else len(topfiles)
            for index in xrange(totaln):
                size, path = topfiles[index]
                print("\t%s: %s (%s)" % (index + 1, path, utils.bytes_fmt(size)))
            print("")

    if args.topn_dirs:
        topdirs = gather_topdirs()
        if comm.rank == 0:
            print("\nTop N Directory Report:\n")
            totaln = args.topn_dirs if len(topdirs) > args.topn_dirs else len(topdirs)
            for index in xrange(totaln):
                size, path = topdirs[index]
                print("\t{0:}: {1:} ({2:,} items)".format(index + 1, path, size))
            print("")

    if args.gpfs_block_alloc:
        gpfs_blocks = gather_gpfs_blocks()
        gather_gpfs_dii()
        if comm.rank == 0:
            print("\nGPFS Block Alloc Report:\n")
            print("\t{0:<15}{1:<4}".format("inode size:", args.inodesz))
print("\t{0:<25}{1:>15,}".format("DII (data-in-inode) count:", DII_COUNT)) print("\tSubblocks: %s\n" % gpfs_blocks) fmt_msg = "\tBlocksize: {0:<6} Estimated Space: {1:<20s} Efficiency: {2:>6.2%}" for idx, bsz in enumerate(G.gpfs_block_size): gpfs_file_size = gpfs_blocks[idx] * G.gpfs_subs[idx] if gpfs_file_size != 0: print( fmt_msg.format(bsz, bytes_fmt(gpfs_file_size), total_file_size / float(gpfs_file_size))) else: print(fmt_msg.format(bsz, bytes_fmt(gpfs_file_size), 0)) treewalk.cleanup() circle.finalize() if args.lustre_stripe and stripe_out: os.close(stripe_out) sp_workload = comm.gather(Tally.spcnt) if comm.rank == 0: print("Stripe workload total: %s, distribution: %s" % (sum(sp_workload), sp_workload))
def handle_file_or_dir(self, spath, st):
    if stat.S_ISREG(st.st_mode):
        # check sparse file
        # TODO: check why st_blksize * st_blocks is wrong.
        fsize = st.st_size
        if st.st_size == 0:
            self.cnt_0byte += 1
            if args.verbose == 2:
                self.logger.info("ZERO-byte file: %s" % spath, extra=self.d)

        # check compression saving
        if args.cpr:
            self.cnt_blocks += st.st_blocks
            if args.cpr_per_file:
                uncompressed = float(st.st_size)
                compressed = float(st.st_blocks * 512)
                if st.st_size != 0:
                    ratio = uncompressed / compressed
                    self.logger.info(
                        "Compression: %s: (nblocks: %s, fsize: %s, ratio: %0.2f)"
                        % (spath, st.st_blocks, st.st_size, ratio), extra=self.d)

        # if stat filesize is not crazy, we count it as uncompressed filesize;
        # part of this is due to LLNL's sparse EB file, which skews the result
        if st.st_size <= G.FSZ_BOUND:
            self.cnt_stat_filesize += st.st_size

        if st.st_blocks * 512 < st.st_size:
            self.sparse_cnt += 1
            fsize = st.st_blocks * 512
            if args.sparse:
                print("\tSparse file:\t %s" % spath)
                print("\t\t\t st_blocks: %s, st_size: %s" % (st.st_blocks, st.st_size))

        incr_local_histogram(fsize)

        if args.gpfs_block_alloc:
            if args.dii:
                inodesz = utils.conv_unit(args.inodesz)
            else:
                inodesz = 0
            gpfs_block_update(fsize, inodesz)

        if args.topn_files:
            update_topn_files(TopFile(fsize, spath))

        if self.outfile:
            self.fszlst.append(fsize)
            if len(self.fszlst) >= FSZMAX:
                for ele in self.fszlst:
                    self.outfile.write("%d\n" % ele)
                self.fszlst = []

        self.cnt_files += 1
        self.cnt_filesize += fsize

        if args.profdev and utils.is_dev_file(spath):
            self.devfile_cnt += 1
            self.devfile_sz += fsize

        # check hard links
        if st.st_nlink > 1:
            self.nlinks += st.st_nlink
            self.nlinked_files += 1

        # stripe analysis
        if args.lustre_stripe and fsize > G.stripe_threshold:
            # path, size, stripe_count
            try:
                with timeout(seconds=5):
                    stripe_count = lfs.lfs_get_stripe(G.lfs_bin, spath)
            except OSError as e:
                self.logger.warn(e, extra=self.d)
            except TimeoutError as e:
                self.logger.error("%s when lfs getstripe on %s" % (e, spath),
                                  extra=self.d)
            else:
                if stripe_count:
                    os.write(stripe_out,
                             "%-4s, %-10s, %s\n" % (stripe_count, fsize, spath))
                    Tally.spcnt += 1
                else:
                    self.logger.error("Failed to read stripe info: %s" % spath,
                                      extra=self.d)

    elif stat.S_ISDIR(st.st_mode):
        self.cnt_dirs += 1
        self.process_dir(spath, st)
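
# The timeout() context manager used above guards a potentially hanging
# "lfs getstripe" call. A minimal SIGALRM-based sketch of such a guard; it
# assumes a single-threaded caller on a Unix host, and the class name is
# hypothetical:

import signal

try:
    TimeoutError
except NameError:               # Python 2 has no builtin TimeoutError
    class TimeoutError(Exception):
        pass

class _timeout_sketch(object):
    def __init__(self, seconds=5):
        self.seconds = seconds

    def _handler(self, signum, frame):
        raise TimeoutError("timed out after %ss" % self.seconds)

    def __enter__(self):
        # arm an alarm; the handler fires if the body runs too long
        signal.signal(signal.SIGALRM, self._handler)
        signal.alarm(self.seconds)

    def __exit__(self, exc_type, exc_val, exc_tb):
        # always disarm so a late alarm can't hit unrelated code
        signal.alarm(0)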