def test_md5sum(self):
    """Verify that file_hash.md5sum returns the expected message digest
    for each of the known sample files stored in DATA_DIRNAME."""

    # (file name inside DATA_DIRNAME, expected MD5 hex digest).
    # The entries are checked in order and the test stops at the first
    # mismatch.
    known_digests = (
        ("test_file.txt", "335387a52cfc8f1d3fda510b12dfc200"),
        ("test_file.bin", "8c1379aa4207f5b4b801a49b75e826c8"),
        ("test_file.empty", "d41d8cd98f00b204e9800998ecf8427e"),
    )

    for data_file_name, expected_digest in known_digests:
        actual_digest = file_hash.md5sum(os.path.join(DATA_DIRNAME, data_file_name))
        self.assertEqual(actual_digest, expected_digest)
def walk(root_path, db):
    """Walk the tree starting from "root_path" and build the
    {path: md5, ...} dictionaries of its files and directories.

    The tree is visited bottom-up so that the digest of every sub-directory
    is known before the digest of its parent is computed. Symbolic links
    (to files or to directories) are silently ignored.

    Args:
        root_path: path of the directory to walk.
        db: optional cache, or None to disable caching. It is used as a
            mapping {absolute file path: "<mtime> <size> <md5>"}; only
            ``in``, ``db[key]`` and ``db[key] = value`` are required.
            When a file's current mtime and size match its record, the
            stored MD5 is reused instead of being recomputed. Records of
            new or modified files are (re)written in place.

    Returns:
        A tuple ``(local_file_dict, local_dir_dict)``:
        - local_file_dict maps the absolute path of each regular file to
          its MD5;
        - local_dir_dict maps the absolute path of each visited directory
          to the MD5 of the sorted MD5 strings of its direct children
          (files and sub-directories).
    """
    local_file_dict = {}   # dict = {path: md5, ...}
    local_dir_dict = {}    # dict = {path: md5, ...}

    # current_dir_path = a string, the path to the directory.
    # dir_names        = the names of the subdirectories in current_dir_path
    #                    (excluding '.' and '..').
    # file_names       = the names of the non-directory files in
    #                    current_dir_path.
    for current_dir_path, dir_names, file_names in os.walk(root_path, topdown=False, followlinks=False):

        # ABSOLUTE PATH OF current_dir_path
        current_dir_path = os.path.abspath(current_dir_path)

        # MD5 LIST OF CURRENT_DIR'S CONTENT (REQUIRED TO COMPUTE
        # CURRENT_DIR'S MD5)
        current_dir_md5_list = []

        # CHILD FILES
        for file_name in file_names:
            file_path = os.path.join(current_dir_path, file_name)

            if os.path.islink(file_path):
                continue  # links are ignored

            file_mtime = os.path.getmtime(file_path)
            file_size = os.path.getsize(file_path)

            file_md5 = None

            if db is not None and file_path in db:
                db_record = db[file_path]
                # NOTE(review): presumably db may be a dbm-style mapping
                # that returns bytes values; decode so the text comparison
                # below works in that case too -- confirm against callers.
                if isinstance(db_record, bytes):
                    db_record = db_record.decode('utf-8', 'replace')
                db_fields = db_record.split()

                # The record holds *text*: compare it with the values
                # formatted exactly as they are when written below.
                # Comparing the float mtime / int size directly with these
                # strings is always False and defeats the cache.
                # A malformed record (not 3 fields) is treated as a miss.
                if (len(db_fields) == 3
                        and db_fields[0] == "{0}".format(file_mtime)
                        and db_fields[1] == "{0}".format(file_size)):
                    # The file is known and hasn't changed since the last
                    # walk => don't compute the MD5, use the one in db.
                    file_md5 = db_fields[2]

            if file_md5 is None:
                # md5sum is defined outside this function.
                file_md5 = md5sum(file_path)
                if db is not None:
                    db[file_path] = "{0} {1} {2}".format(file_mtime, file_size, file_md5)

            local_file_dict[file_path] = file_md5
            current_dir_md5_list.append(file_md5)

        # CHILD DIRECTORIES
        for dir_name in dir_names:
            dir_path = os.path.join(current_dir_path, dir_name)

            if os.path.islink(dir_path):
                continue  # links are ignored

            try:
                # "local_dir_dict[dir_path]" should exist as we are doing a
                # bottom-up tree walk; it is missing when os.walk yielded
                # no entry for that sub-directory.
                dir_md5 = local_dir_dict[dir_path]
            except KeyError:
                warnings.warn("can't access " + dir_path, UserWarning)
            else:
                current_dir_md5_list.append(dir_md5)

        # CURRENT_DIRECTORY'S MD5
        current_dir_md5_generator = hashlib.md5()

        # current_dir_md5_list has to be sorted because even for an
        # identical set of items, a different order implies a different MD5
        current_dir_md5_list.sort()

        for item in current_dir_md5_list:
            current_dir_md5_generator.update(bytes(item, 'utf-8'))

        local_dir_dict[current_dir_path] = current_dir_md5_generator.hexdigest()

    return local_file_dict, local_dir_dict
def walk(root_path, db):
    """Walk the tree starting from "root_path" and build the
    {path: md5, ...} dictionaries of its files and directories.

    The tree is visited bottom-up so that the digest of every sub-directory
    is known before the digest of its parent is computed. Symbolic links
    (to files or to directories) are silently ignored.

    Args:
        root_path: path of the directory to walk.
        db: optional cache, or None to disable caching. It is used as a
            mapping {absolute file path: "<mtime> <size> <md5>"}; only
            ``in``, ``db[key]`` and ``db[key] = value`` are required.
            When a file's current mtime and size match its record, the
            stored MD5 is reused instead of being recomputed. Records of
            new or modified files are (re)written in place.

    Returns:
        A tuple ``(local_file_dict, local_dir_dict)``:
        - local_file_dict maps the absolute path of each regular file to
          its MD5;
        - local_dir_dict maps the absolute path of each visited directory
          to the MD5 of the sorted MD5 strings of its direct children
          (files and sub-directories).
    """
    local_file_dict = {}   # dict = {path: md5, ...}
    local_dir_dict = {}    # dict = {path: md5, ...}

    # current_dir_path = a string, the path to the directory.
    # dir_names        = the names of the subdirectories in current_dir_path
    #                    (excluding '.' and '..').
    # file_names       = the names of the non-directory files in
    #                    current_dir_path.
    for current_dir_path, dir_names, file_names in os.walk(
            root_path, topdown=False, followlinks=False):

        # ABSOLUTE PATH OF current_dir_path
        current_dir_path = os.path.abspath(current_dir_path)

        # MD5 LIST OF CURRENT_DIR'S CONTENT (REQUIRED TO COMPUTE
        # CURRENT_DIR'S MD5)
        current_dir_md5_list = []

        # CHILD FILES
        for file_name in file_names:
            file_path = os.path.join(current_dir_path, file_name)

            if os.path.islink(file_path):
                continue  # links are ignored

            file_mtime = os.path.getmtime(file_path)
            file_size = os.path.getsize(file_path)

            file_md5 = None

            if db is not None and file_path in db:
                db_record = db[file_path]
                # NOTE(review): presumably db may be a dbm-style mapping
                # that returns bytes values; decode so the text comparison
                # below works in that case too -- confirm against callers.
                if isinstance(db_record, bytes):
                    db_record = db_record.decode('utf-8', 'replace')
                db_fields = db_record.split()

                # The record holds *text*: compare it with the values
                # formatted exactly as they are when written below.
                # Comparing the float mtime / int size directly with these
                # strings is always False and defeats the cache.
                # A malformed record (not 3 fields) is treated as a miss.
                if (len(db_fields) == 3
                        and db_fields[0] == "{0}".format(file_mtime)
                        and db_fields[1] == "{0}".format(file_size)):
                    # The file is known and hasn't changed since the last
                    # walk => don't compute the MD5, use the one in db.
                    file_md5 = db_fields[2]

            if file_md5 is None:
                # md5sum is defined outside this function.
                file_md5 = md5sum(file_path)
                if db is not None:
                    db[file_path] = "{0} {1} {2}".format(
                        file_mtime, file_size, file_md5)

            local_file_dict[file_path] = file_md5
            current_dir_md5_list.append(file_md5)

        # CHILD DIRECTORIES
        for dir_name in dir_names:
            dir_path = os.path.join(current_dir_path, dir_name)

            if os.path.islink(dir_path):
                continue  # links are ignored

            try:
                # "local_dir_dict[dir_path]" should exist as we are doing a
                # bottom-up tree walk; it is missing when os.walk yielded
                # no entry for that sub-directory.
                dir_md5 = local_dir_dict[dir_path]
            except KeyError:
                warnings.warn("can't access " + dir_path, UserWarning)
            else:
                current_dir_md5_list.append(dir_md5)

        # CURRENT_DIRECTORY'S MD5
        current_dir_md5_generator = hashlib.md5()

        # current_dir_md5_list has to be sorted because even for an
        # identical set of items, a different order implies a different MD5
        current_dir_md5_list.sort()

        for item in current_dir_md5_list:
            current_dir_md5_generator.update(bytes(item, 'utf-8'))

        local_dir_dict[current_dir_path] = current_dir_md5_generator.hexdigest()

    return local_file_dict, local_dir_dict