def test_fileinfo(tmp_path):
    """
    Test the dataclass that stores filename/hash data.
    """
    file_size = 123
    subdir, files, dupe_groups = tempfiles(tmp_path, file_size)
    dupes = set(itertools.chain.from_iterable(dupe_groups))
    dupe_sizes = set(len(files[name]) for name in dupes)
    dirinfo = DirInfo(subdir)
    dirinfo.populate(no_empty=True, fast=True)

    with_hash = without_hash = False
    for file in files:
        info = dirinfo.get_relative(file)
        if info.size in dupe_sizes:
            with_hash = True
            # Fields are tested by check_everything. Here we're interested in
            # the types.
            assert isinstance(info, FileStats) and isinstance(info, FileInfo)
        else:
            without_hash = True
            assert isinstance(info, FileStats) and not isinstance(info, FileInfo)
            # Test the addition of hash data to basic file stats.
            new_info = FileInfo.add_hash(info, b'hash')
            assert isinstance(new_info, FileInfo)
            assert new_info.hash == b'hash'
            now = datetime.now(tz=timezone.utc)
            assert now + timedelta(1) >= new_info.when >= now - timedelta(1)
            new_info = FileInfo.add_hash(info, b'hash', datetime(2000, 1, 1))
            assert isinstance(new_info, FileInfo)
            assert new_info.hash == b'hash'
            assert new_info.when == datetime(2000, 1, 1)
    assert with_hash
    assert without_hash


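# The dataclasses exercised above are defined in the package under test. As a
# rough, hypothetical sketch of their shape: only the fields and the add_hash
# hook asserted on in these tests are implied, everything else is assumed.
#
#     @dataclass(frozen=True)
#     class FileStats:
#         basepath: Path    # root directory of the scan
#         relpath: Path     # path relative to basepath
#         size: int         # file size in bytes
#
#     @dataclass(frozen=True)
#     class FileInfo(FileStats):
#         hash: bytes       # SHA-256 digest of the file contents
#         when: datetime    # when the hash was computed (UTC)
#
#         @classmethod
#         def add_hash(cls, stats, hashcode, when=None):
#             """Promote FileStats to FileInfo; `when` defaults to now (UTC)."""
#             ...

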
def test_progress_bar(tmp_path, no_empty, fast, progress):
    """
    User can provide a progress bar factory, in particular tqdm.
    """
    file_size = 123
    subdir, files, dupe_groups = tempfiles(tmp_path, file_size)
    dirinfo = DirInfo(subdir)

    # We use a real tqdm here, because it's more important to test that the
    # integration with tqdm is correct than UI details like the descriptions
    # being useful. We also test with DummyBar, to check the claim that it
    # demonstrates the required interface for user-defined progress (sketched
    # below, after this test).
    bars = []

    def tqdm_bar(*args, **kwargs):
        print(args, kwargs)
        bar = progress(*args, **kwargs)
        bars.append(bar)
        return bar

    dirinfo.populate(progress=tqdm_bar, no_empty=no_empty, fast=fast)

    # Make sure the progress bar was actually used: one progress bar for
    # reading the files, and one for hashing them.
    assert len(bars) == 2
    if progress is tqdm:
        # Progress bar shows all files, not just those hashed.
        assert bars[0].n == len(files)
        # Upper bound on time is quite arbitrary and could be relaxed.
        assert 0 <= time.time() - bars[0].start_t <= 1
        assert 0 <= time.time() - bars[1].start_t <= 1
    check_everything(
        file_size, subdir, files, dupe_groups, dirinfo, no_empty, fast)


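# The `progress` argument exercised above is a tqdm-like factory: populate()
# calls it twice (one bar for reading file stats, one for hashing) and
# advances the bar it returns. The exact calls populate() makes are not pinned
# down by this test, so the class below is only a hypothetical sketch of a
# tqdm-compatible surface; DummyBar in this test suite is the authoritative
# minimal example.
#
#     class MinimalBar:
#         def __init__(self, iterable=None, total=None, **kwargs):
#             self.iterable, self.n = iterable, 0
#         def __iter__(self):
#             for item in self.iterable:
#                 self.n += 1
#                 yield item
#         def update(self, n=1):
#             self.n += n
#         def close(self):
#             pass

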
def test_dir_info(tmp_path):
    """
    We can get a list of files in the directory, containing suitable info.

    Unless otherwise specified, this finds and hashes everything.
    """
    file_size = 12345
    subdir, files, dupe_groups = tempfiles(tmp_path, file_size)
    dirinfo = DirInfo(subdir)
    dirinfo.populate()
    check_everything(file_size, subdir, files, dupe_groups, dirinfo)


def test_dir_info_optimisation(tmp_path, no_empty, fast):
    """
    We can reduce the number of hashes computed by excluding empty files,
    and/or only hashing files whose size matches another file's.
    """
    file_size = 123
    subdir, files, dupe_groups = tempfiles(tmp_path, file_size)
    dirinfo = DirInfo(subdir)
    dirinfo.populate(no_empty=no_empty, fast=fast)
    check_everything(
        file_size, subdir, files, dupe_groups, dirinfo, no_empty, fast)


def compare(old_dirname: str, new_dirname: str, list_identical: bool,
            list_changed: bool, list_vanished: bool, list_added: bool,
            list_all: bool) -> None:
    """
    Summarise changes from OLD_DIRNAME to NEW_DIRNAME.

    Both directories must previously have been scanned (by the 'info' command
    or otherwise). The names OLD and NEW assume that you're comparing two
    snapshots of "the same" collection of files, for example two backups on
    different dates.
    """
    identical = set()
    changed = set()
    vanished = set()
    old_dir = DirInfo.load(old_dirname)
    new_dir = DirInfo.load(new_dirname)
    for old_file in old_dir:
        rel_str = old_file._rel_str
        try:
            new_file = new_dir.get_relative(rel_str)
        except KeyError:
            vanished.add(rel_str)
            continue
        if not isinstance(old_file, FileInfo):
            raise Exception(f'No hash for {old_file.fullpath}')
        if not isinstance(new_file, FileInfo):
            raise Exception(f'No hash for {new_file.fullpath}')
        if new_file.hash == old_file.hash:
            identical.add(rel_str)
        else:
            changed.add(rel_str)
    added = ({file._rel_str for file in new_dir}
             .difference(file._rel_str for file in old_dir))

    print('old:', old_dir.file_count)
    print('new:', new_dir.file_count)
    print('identical:', len(identical))
    print('changed: ', len(changed))
    print('vanished:', len(vanished))
    print('added:', len(added))
    if list_identical or list_all:
        print('\nidentical files:\n ', end='')
        print('\n '.join(sorted(identical)))
    if list_changed or list_all:
        print('\nchanged files:\n ', end='')
        print('\n '.join(sorted(changed)))
    if list_vanished or list_all:
        print('\nvanished files:\n ', end='')
        print('\n '.join(sorted(vanished)))
    if list_added or list_all:
        print('\nadded files:\n ', end='')
        print('\n '.join(sorted(added)))


def dupes(dirinfo: DirInfo) -> None:
    """Show groups of duplicate files in directory."""
    for group in dirinfo.dupe_groups():
        example = next(iter(group))
        hashcode = binascii.hexlify(example.hash).decode('ascii')
        print(
            f'{len(group)} duplicates with size {example.size}, hash {hashcode}'
        )
        for name in sorted(str(file.relpath) for file in group):
            print(f' {name}')


def decorated(
    dirname: str,
    populate: bool,
    hash_empty: bool,
    fast: bool,
    resume: bool,
    threads: int,
):
    dirname = os.path.abspath(dirname)
    if resume or not populate:
        dirinfo = DirInfo.cached(dirname)
        if dirinfo.file_count == 0:
            populate = True
    else:
        dirinfo = DirInfo(dirname)
    if populate:
        dirinfo.populate(
            no_empty=not hash_empty,
            fast=fast,
            resume=resume,
            threads=threads,
            progress=tqdm,
        )
        filename = dirinfo.save()
        tqdm.write(f'Written {filename}')
    return func(dirinfo)


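# `decorated` above closes over `func`: in the full source it presumably sits
# inside a decorator that wraps each CLI command, so the command body receives
# a ready-made DirInfo rather than raw options. A rough outline of that
# enclosing wrapper (the name `needs_dirinfo` is hypothetical, not taken from
# the project):
#
#     def needs_dirinfo(func):
#         @functools.wraps(func)
#         def decorated(dirname, populate, hash_empty, fast, resume, threads):
#             ...  # body as above, ending with `return func(dirinfo)`
#         return decorated

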
def set_up(path, fast=False):
    path.mkdir()
    (path / 'dupe1.txt').write_bytes(b'equal')
    (path / 'dupe2.txt').write_bytes(b'equal')
    (path / 'unequal.txt').write_bytes(str(path).encode('utf8'))
    info = DirInfo(path)
    info.populate(fast=fast)
    info.save()
    return path


def set_up(path, extra_files=()):
    path.mkdir()
    (path / 'equal1.txt').write_bytes(b'equal1')
    (path / 'equal2.txt').write_bytes(b'equal2')
    (path / 'unequal.txt').write_bytes(str(path).encode('utf8'))
    for extra in extra_files:
        (path / extra).write_bytes(b'extra')
    info = DirInfo(path)
    info.populate()
    info.save()
    return path


def check_everything(file_size, subdir, files, dupes, info, no_empty=False,
                     fast=False, is_copy=False):
    def skip_hash(duplicated_sizes, actual_size):
        # Optionally some files aren't hashed.
        return ((no_empty and actual_size == 0)
                or (fast and actual_size not in duplicated_sizes))

    assert info.file_count == len(files) > 0
    for rel_path_str, content in files.items():
        record = info.get_relative(rel_path_str)
        assert record.basepath == subdir
        assert isinstance(record.relpath, Path)
        assert os.fspath(record.relpath) == rel_path_str
        assert record.size == len(content)
        if skip_hash([file_size], record.size):
            assert not hasattr(record, 'hash'), rel_path_str
        else:
            assert record.hash == hashlib.sha256(content).digest()
            # 10 seconds is arbitrary, but it *shouldn't* be that slow.
            assert record.when >= datetime.now(tz=timezone.utc) - timedelta(
                seconds=10)
    assert sorted(files.keys()) == sorted(file._rel_str for file in info)

    # Must notice that two files have the same hash.
    dupe_groups = tuple(info.dupe_groups())
    assert len(dupe_groups) == len(dupes)
    sets = tuple(
        set(os.fspath(stats.relpath) for stats in group)
        for group in dupe_groups)
    assert sets == dupes

    if not is_copy:
        info.save()
        clone = DirInfo.load(subdir)
        check_everything(
            file_size, subdir, files, dupes, clone, no_empty, fast,
            is_copy=True)


def test_serialisation(tmp_path):
    """
    Test failure modes. Success is tested in check_everything.
    """
    subdir = (tmp_path / 'sub')
    jsonfile = (tmp_path / 'sub.dirinfo.json')
    (subdir / 'dir').mkdir(parents=True)
    dirinfo = DirInfo(subdir)
    assert dirinfo.save() == os.fspath(jsonfile)
    # Not exactly a requirement, but for the tests to work we need this.
    assert jsonfile.exists()
    # If this fails, then testing that the bad cases fail is kind of pointless.
    assert DirInfo.load(subdir).base == os.fspath(subdir)

    # Make sure the encoder isn't accidentally used for something it can't
    # handle.
    with pytest.raises(TypeError):
        json.dumps(object(), cls=Encoder)

    # Make sure bad json file contents are rejected.
    def bad_jsonfile(jsondata):
        with open(jsonfile, 'w', encoding='utf8') as outfile:
            json.dump(jsondata, outfile)
        with pytest.raises(ValueError):
            DirInfo.load(subdir)

    bad_jsonfile({'foo': 'bar'})
    bad_jsonfile(None)

    # If the serialised base doesn't match the actual location, then something
    # is wrong and we should refuse to load it.
    assert dirinfo.save() == os.fspath(jsonfile)
    with open(jsonfile, 'r', encoding='utf8') as infile:
        jsondata = json.load(infile)
    jsondata['base'] += 'X'
    bad_jsonfile(jsondata)

    # If there's no data then load() fails, but cached() succeeds.
    jsonfile.unlink()
    with pytest.raises(FileNotFoundError):
        DirInfo.load(subdir)
    assert DirInfo.cached(subdir).base == subdir


def test_resume(tmp_path):
    """
    The 'resume' option to populate() avoids re-calculating hashes.
    """
    file_size = 123
    subdir, files, dupe_groups = tempfiles(tmp_path, file_size)
    empty_files = set(
        file for file, content in files.items() if len(content) == 0)
    assert 0 < len(empty_files) < len(files)
    non_empty_files = set(files) - empty_files

    dirinfo = DirInfo(subdir)
    dirinfo.populate(no_empty=True)
    # Remember the results.
    first_results = {file: dirinfo.get_relative(file) for file in files}
    # Resume populating, with different params to include the empty file.
    dirinfo.populate(resume=True)
    for file in empty_files:
        assert dirinfo.get_relative(file) is not first_results[file], file
    for file in non_empty_files:
        assert dirinfo.get_relative(file) is first_results[file], file

    # Now delete files, and make sure they disappear. Cover the cases where
    # the deleted file has and hasn't been hashed.
    missing_files = ('big', 'unequal')
    dirinfo.populate(no_empty=True, fast=True)
    for filename in missing_files:
        (subdir / filename).unlink()
    dirinfo.populate(resume=True, fast=True)
    for filename in missing_files:
        with pytest.raises(KeyError):
            dirinfo.get_relative(filename)


def info(dirinfo: DirInfo) -> None:
    """Read and summarise directory info."""
    files = plural(dirinfo.file_count, 'file')
    groups = plural(sum(1 for _ in dirinfo.dupe_groups()), 'dupe group')
    tqdm.write(f'Found {files} and {groups}')