import copy
import shelve

import utils


def find_cross_duplicates(mtree_shelvename, cmtree_shelvename,
                          write_rm_list=None):
    """locate all of the checksums on one volume in a comparison volume

    write_rm_list optionally prints rm commands to delete anything that
    exists somewhere on the comparison volume. rm_list is only files, but
    to prune empty directories, issue:
    find <parent-dir> -depth -type d -empty -exec rmdir -v {} \;
    """
    # load the tree and leaf (file) records for the volume to be checked
    mtree_shelve = shelve.open(mtree_shelvename, 'r')
    tree = mtree_shelve['tree']
    leaves = mtree_shelve['leaves']
    mtree_shelve.close()

    parent_tree = utils.make_parent_tree(tree)
    volhash = utils.make_hash_index(parent_tree, leaves)

    # load the tree and leaf records for the comparison volume
    cmtree_shelve = shelve.open(cmtree_shelvename, 'r')
    ctree = cmtree_shelve['tree']
    cleaves = cmtree_shelve['leaves']
    cmtree_shelve.close()

    parent_ctree = utils.make_parent_tree(ctree)
    cvolhash = utils.make_hash_index(parent_ctree, cleaves)

    if write_rm_list:
        rmlistfile = open(write_rm_list, 'w')

    # any checksum present in both volumes is a cross-volume duplicate
    for key, filelist in volhash.iteritems():
        if key in cvolhash:
            cfilelist = cvolhash[key]

            print key + "-" * 48
            for filename in filelist:
                print "%s" % filename

            print "has duplicate file(s) in the comparison volume: "
            for filename in cfilelist:
                print "%s" % filename

            if write_rm_list:
                for filename in filelist:
                    rmlistfile.write("rm -fv %s\n" % filename)

    if write_rm_list:
        rmlistfile.close()
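
# Example usage (a sketch; the shelve names below are hypothetical, and each
# shelve is assumed to already contain the 'tree' and 'leaves' records written
# by the indexing step):
#
#   find_cross_duplicates("photos_backup.shelve", "photos_master.shelve",
#                         write_rm_list="remove_backup_dupes.sh")
#
# Review the generated script before running it, e.g.: sh remove_backup_dupes.sh
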
def find_duplicates(mtree_shelvename):
    """print every set of files on the volume that share a checksum"""
    mtree_shelve = shelve.open(mtree_shelvename, 'r')
    tree = mtree_shelve['tree']
    leaves = mtree_shelve['leaves']
    mtree_shelve.close()

    parent_tree = utils.make_parent_tree(tree)
    volhash = utils.make_hash_index(parent_tree, leaves)

    for key, filelist in volhash.iteritems():
        # only checksums shared by more than one file are duplicates
        if len(filelist) > 1:
            print key + "-" * 48
            for filename in filelist:
                print "%s" % filename
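
# Example usage (hypothetical shelve name), printing each checksum shared by
# more than one file on the indexed volume:
#
#   find_duplicates("photos_master.shelve")
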
def find_largest_common_directories(mtree_shelvename, print_size_only=False,
                                    exclude_list=None):
    """find the largest directories that share the same checksum of all
    data under them
    """
    if exclude_list is None:
        exclude_list = []

    mtree_shelve = shelve.open(mtree_shelvename, 'r')
    tree = mtree_shelve['tree']
    leaves = mtree_shelve['leaves']
    mtree_shelve.close()

    parent_tree = utils.make_parent_tree(tree)
    md5dict = utils.make_hash_index(parent_tree, leaves, entry_type="dir")

    # record the tree size associated with each directory checksum; every
    # directory with the same checksum should report the same size
    md5_size_dict = {}
    for md5_key in md5dict.keys():
        file_size = []
        for entry in md5dict[md5_key]:
            file_size.append(entry["tree_size"])

        file_size = list(set(file_size))
        if (len(file_size) > 1):
            print "ERROR: accounting for size of directories " + \
                  "with hash %s failed" % md5_key

        file_size = file_size[0]
        md5_size_dict[md5_key] = file_size

    # if a directory is duplicated, everything under it is also duplicated;
    # cut those entries so only the largest common directories remain
    cutcount = 0
    reduced_md5_size_dict = copy.deepcopy(md5_size_dict)
    for md5_key in md5_size_dict:
        if (len(md5dict[md5_key]) > 1):
            md5cutlist = []
            for entry in md5dict[md5_key]:
                md5under_path = utils.hashes_under_tree(tree, leaves,
                                                        entry["leaf_number"])

                md5cutlist.append(set(md5under_path))

            # only cut checksums that appear under every copy of the directory
            combined_cutlist = md5cutlist[0]
            for md5list in md5cutlist:
                combined_cutlist = combined_cutlist.intersection(md5list)

            combined_cutlist = list(combined_cutlist)
            for cutmd5 in combined_cutlist:
                if cutmd5 in reduced_md5_size_dict:
                    cutcount += 1
                    del reduced_md5_size_dict[cutmd5]

    print "number of trees under a duplicate tree cut: %d" % (cutcount)

    # report duplicated directories from largest to smallest
    total_duplicated_size = 0
    for key, value in sorted(reduced_md5_size_dict.iteritems(),
                             key=lambda (k, v): (v, k), reverse=True):
        if (len(md5dict[key]) > 1):
            total_duplicated_size += (len(md5dict[key]) - 1) * value

            if print_size_only:
                print value
            else:
                print "-" * 80
                print "%s: %d" % (key, value)
                for entry in md5dict[key]:
                    full_pathname = utils.reconstruct_pathname(
                        parent_tree, leaves, int(entry["leaf_number"]))

                    if not any(excluded in full_pathname
                               for excluded in exclude_list):
                        print full_pathname
                    #else:
                    #    print "#: " + full_pathname

    print "data volume in duplicated directories %d" % total_duplicated_size
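
# Example usage (hypothetical names; exclude_list entries are matched as
# substrings of the reconstructed directory paths):
#
#   find_largest_common_directories("photos_master.shelve",
#                                   exclude_list=["/lost+found"])
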