def test_progress_bar(tmp_path, no_empty, fast, progress):
    """
    The user can provide a progress bar factory, in particular tqdm.
    """
    file_size = 123
    subdir, files, dupe_groups = tempfiles(tmp_path, file_size)
    dirinfo = DirInfo(subdir)

    # We use a real tqdm here, because it's more important to test that the
    # integration with tqdm is correct than UI details like whether the
    # descriptions are useful. We also test with DummyBar, to check the claim
    # that it demonstrates the required interface for user-defined progress.
    bars = []

    def tqdm_bar(*args, **kwargs):
        # Echo the factory arguments, so pytest shows them on failure.
        print(args, kwargs)
        bar = progress(*args, **kwargs)
        bars.append(bar)
        return bar

    dirinfo.populate(progress=tqdm_bar, no_empty=no_empty, fast=fast)
    # Make sure the progress bar was actually used: one progress bar for
    # reading the files, and one for hashing them.
    assert len(bars) == 2
    if progress is tqdm:
        # Progress bar shows all files, not just those hashed.
        assert bars[0].n == len(files)
        # Upper bound on time is quite arbitrary and could be relaxed.
        assert 0 <= time.time() - bars[0].start_t <= 1
        assert 0 <= time.time() - bars[1].start_t <= 1
    check_everything(
        file_size, subdir, files, dupe_groups, dirinfo, no_empty, fast,
    )
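
# For orientation, populate() only needs a small tqdm-like surface from
# whatever the factory returns. The sketch below is an assumption about that
# interface, inferred from this test; it is not the package's real DummyBar,
# and the actual method set may differ.
class _BarInterfaceSketch:
    def __init__(self, *args, total=None, **kwargs):
        self.n = 0              # tqdm-compatible count of completed items
        self.total = total      # expected number of items, if known

    def update(self, n=1):
        # Advance the bar as work completes.
        self.n += n

    def close(self):
        # Release the bar once populate() is finished with it.
        pass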

def test_fileinfo(tmp_path):
    """
    Test the dataclass that stores filename/hash data.
    """
    file_size = 123
    subdir, files, dupe_groups = tempfiles(tmp_path, file_size)
    dupes = set(itertools.chain.from_iterable(dupe_groups))
    dupe_sizes = {len(files[name]) for name in dupes}
    dirinfo = DirInfo(subdir)
    dirinfo.populate(no_empty=True, fast=True)
    with_hash = without_hash = False
    for file in files:
        info = dirinfo.get_relative(file)
        if info.size in dupe_sizes:
            with_hash = True
            # Fields are tested by check_everything. Here we're interested
            # in the types.
            assert isinstance(info, FileStats) and isinstance(info, FileInfo)
        else:
            without_hash = True
            assert isinstance(info, FileStats)
            assert not isinstance(info, FileInfo)
            # Test the addition of hash data to basic file stats.
            new_info = FileInfo.add_hash(info, b'hash')
            assert isinstance(new_info, FileInfo)
            assert new_info.hash == b'hash'
            now = datetime.now(tz=timezone.utc)
            assert now + timedelta(1) >= new_info.when >= now - timedelta(1)
            new_info = FileInfo.add_hash(info, b'hash', datetime(2000, 1, 1))
            assert isinstance(new_info, FileInfo)
            assert new_info.hash == b'hash'
            assert new_info.when == datetime(2000, 1, 1)
    assert with_hash
    assert without_hash
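
# The assertions above pin down the assumed relationship between the two
# classes: FileInfo extends FileStats with hash data, and add_hash() defaults
# the timestamp to "now" in UTC. Below is a minimal sketch consistent with
# this test; field names other than size, hash and when are guesses, not the
# real definitions.
from dataclasses import dataclass

@dataclass(frozen=True)
class _FileStatsSketch:
    size: int
    mtime: float    # assumed extra stat field, purely illustrative

@dataclass(frozen=True)
class _FileInfoSketch(_FileStatsSketch):
    hash: bytes
    when: datetime

    @classmethod
    def add_hash(cls, stats, hash, when=None):
        # Copy the basic stats and attach the hash, timestamping the result.
        if when is None:
            when = datetime.now(tz=timezone.utc)
        return cls(stats.size, stats.mtime, hash, when)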

def decorated(
    dirname: str,
    populate: bool,
    hash_empty: bool,
    fast: bool,
    resume: bool,
    threads: int,
):
    dirname = os.path.abspath(dirname)
    if resume or not populate:
        # Start from the cached results, but fall back to a fresh scan if
        # there's nothing useful in the cache.
        dirinfo = DirInfo.cached(dirname)
        if dirinfo.file_count == 0:
            populate = True
    else:
        dirinfo = DirInfo(dirname)
    if populate:
        dirinfo.populate(
            no_empty=not hash_empty,
            fast=fast,
            resume=resume,
            threads=threads,
            progress=tqdm,
        )
        filename = dirinfo.save()
        tqdm.write(f'Written {filename}')
    return func(dirinfo)
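
# decorated() closes over 'func', so it can only be the inner function of a
# decorator. A minimal sketch of the assumed enclosing shape, with a heavily
# simplified body; 'with_dirinfo' is an illustrative name, not the project's.
import functools

def with_dirinfo(func):
    @functools.wraps(func)
    def decorated(dirname, populate, hash_empty, fast, resume, threads):
        # Simplified stand-in for the real body above.
        dirinfo = DirInfo.cached(os.path.abspath(dirname))
        return func(dirinfo)
    return decorated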

def set_up(path, fast=False):
    """
    Create a directory with one duplicate pair and one unique file, then
    populate and save its DirInfo.
    """
    path.mkdir()
    (path / 'dupe1.txt').write_bytes(b'equal')
    (path / 'dupe2.txt').write_bytes(b'equal')
    # Writing the path itself guarantees the content differs per directory.
    (path / 'unequal.txt').write_bytes(str(path).encode('utf8'))
    info = DirInfo(path)
    info.populate(fast=fast)
    info.save()
    return path

def test_dir_info(tmp_path):
    """
    We can get a list of files in the directory, containing suitable info.
    Unless otherwise specified, this finds and hashes everything.
    """
    file_size = 12345
    subdir, files, dupe_groups = tempfiles(tmp_path, file_size)
    dirinfo = DirInfo(subdir)
    dirinfo.populate()
    check_everything(file_size, subdir, files, dupe_groups, dirinfo)

def test_dir_info_optimisation(tmp_path, no_empty, fast):
    """
    We can reduce the number of hashes computed by excluding empty files,
    and/or only hashing when file size matches.
    """
    file_size = 123
    subdir, files, dupe_groups = tempfiles(tmp_path, file_size)
    dirinfo = DirInfo(subdir)
    dirinfo.populate(no_empty=no_empty, fast=fast)
    check_everything(
        file_size, subdir, files, dupe_groups, dirinfo, no_empty, fast,
    )
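
# A sketch of the optimisation being tested, assuming populate()'s per-file
# decision looks roughly like this; should_hash and size_counts are
# illustrative names, not the real implementation.
def should_hash(size, size_counts, no_empty, fast):
    if no_empty and size == 0:
        # Empty files are all trivially equal, so hashing adds nothing.
        return False
    if fast and size_counts[size] < 2:
        # A file whose size is unique cannot have a duplicate.
        return False
    return True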

def set_up(path, extra_files=()):
    """
    Create a directory with two files of equal size but different content,
    one unique file, and any requested extras, then populate and save its
    DirInfo.
    """
    path.mkdir()
    (path / 'equal1.txt').write_bytes(b'equal1')
    (path / 'equal2.txt').write_bytes(b'equal2')
    (path / 'unequal.txt').write_bytes(str(path).encode('utf8'))
    for extra in extra_files:
        (path / extra).write_bytes(b'extra')
    info = DirInfo(path)
    info.populate()
    info.save()
    return path

def test_resume(tmp_path):
    """
    The 'resume' option to populate() avoids re-calculating hashes.
    """
    file_size = 123
    subdir, files, dupe_groups = tempfiles(tmp_path, file_size)
    empty_files = {file for file, content in files.items() if not content}
    assert 0 < len(empty_files) < len(files)
    non_empty_files = set(files) - empty_files
    dirinfo = DirInfo(subdir)
    dirinfo.populate(no_empty=True)
    # Remember the results of the first pass.
    first_results = {file: dirinfo.get_relative(file) for file in files}
    # Resume populating, with different params so the empty files are now
    # included. Only they should be recomputed.
    dirinfo.populate(resume=True)
    for file in empty_files:
        assert dirinfo.get_relative(file) is not first_results[file], file
    for file in non_empty_files:
        assert dirinfo.get_relative(file) is first_results[file], file
    # Now delete files, and make sure they disappear from the results. Cover
    # both cases: a deleted file that has been hashed and one that hasn't.
    missing_files = ('big', 'unequal')
    dirinfo.populate(no_empty=True, fast=True)
    for filename in missing_files:
        (subdir / filename).unlink()
    dirinfo.populate(resume=True, fast=True)
    for filename in missing_files:
        with pytest.raises(KeyError):
            dirinfo.get_relative(filename)
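
# A sketch of the resume semantics this test relies on, assuming populate()
# merges old and new state roughly like this; resume_merge, needs_hash and
# compute_info are illustrative names, not the real implementation.
def resume_merge(old_entries, current_files, needs_hash, compute_info):
    merged = {}
    for path, stats in current_files.items():
        previous = old_entries.get(path)
        if previous is not None and not needs_hash(previous):
            # Reuse the old entry object, preserving identity (the 'is'
            # assertions above depend on this).
            merged[path] = previous
        else:
            # Newly included or changed files get (re)computed.
            merged[path] = compute_info(stats)
    # Deleted files drop out automatically: only current_files is iterated.
    return merged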