Beispiel #1
0
def test_fileinfo(tmp_path):
    """
    Check the record types returned by DirInfo, and FileInfo.add_hash().

    After a ``fast`` populate, a record whose size equals the size of some
    duplicated file must be a FileInfo; any other record must be a plain
    FileStats, which add_hash() can then upgrade to a FileInfo.
    """
    file_size = 123
    subdir, files, dupe_groups = tempfiles(tmp_path, file_size)
    dupe_names = set(itertools.chain.from_iterable(dupe_groups))
    hashed_sizes = {len(files[name]) for name in dupe_names}
    dirinfo = DirInfo(subdir)
    dirinfo.populate(no_empty=True, fast=True)
    seen_hashed = seen_unhashed = False
    for name in files:
        info = dirinfo.get_relative(name)
        # Field values are covered by check_everything; only the types are
        # of interest here.
        assert isinstance(info, FileStats)
        if info.size in hashed_sizes:
            seen_hashed = True
            assert isinstance(info, FileInfo)
            continue
        seen_unhashed = True
        assert not isinstance(info, FileInfo)
        # Upgrade basic stats with a hash. Without an explicit timestamp,
        # `when` has to land within a day either side of the current time.
        upgraded = FileInfo.add_hash(info, b'hash')
        assert isinstance(upgraded, FileInfo)
        assert upgraded.hash == b'hash'
        now = datetime.now(tz=timezone.utc)
        one_day = timedelta(1)
        assert now + one_day >= upgraded.when
        assert upgraded.when >= now - one_day
        # An explicit timestamp must be stored unchanged.
        stamp = datetime(2000, 1, 1)
        upgraded = FileInfo.add_hash(info, b'hash', stamp)
        assert isinstance(upgraded, FileInfo)
        assert upgraded.hash == b'hash'
        assert upgraded.when == stamp
    # Both kinds of record must have been exercised, or the loop proved
    # nothing.
    assert seen_hashed
    assert seen_unhashed
Beispiel #2
0
def test_progress_bar(tmp_path, no_empty, fast, progress):
    """
    populate() accepts a caller-supplied progress bar factory, e.g. tqdm.

    The factory under test arrives via the ``progress`` parameter. A real
    tqdm is used rather than a stand-in, because verifying the integration
    matters more than UI details such as useful descriptions. Other
    factories (the original notes mention DummyBar) check that the minimal
    interface is sufficient.
    """
    file_size = 123
    subdir, files, dupe_groups = tempfiles(tmp_path, file_size)
    dirinfo = DirInfo(subdir)
    created = []

    def recording_factory(*args, **kwargs):
        # Delegate to the real factory, remembering every bar it produces.
        print(args, kwargs)
        created.append(progress(*args, **kwargs))
        return created[-1]

    dirinfo.populate(progress=recording_factory, no_empty=no_empty, fast=fast)
    # The factory must really have been used: two bars are expected (per the
    # original notes, one for reading the files and one for hashing them).
    assert len(created) == 2
    if progress is tqdm:
        first_bar, second_bar = created
        # The first bar counts all files, not just those hashed.
        assert first_bar.n == len(files)
        # Upper bound on time is quite arbitrary and could be relaxed.
        assert 0 <= time.time() - first_bar.start_t <= 1
        assert 0 <= time.time() - second_bar.start_t <= 1
    check_everything(
        file_size, subdir, files, dupe_groups, dirinfo, no_empty, fast)
Beispiel #3
0
def test_dir_info(tmp_path):
    """
    A default populate() yields a record for every file in the directory.

    No optimisation flags are passed here, so check_everything runs with its
    defaults and expects every file to carry a hash.
    """
    size = 12345
    subdir, files, dupe_groups = tempfiles(tmp_path, size)
    scanned = DirInfo(subdir)
    scanned.populate()
    check_everything(size, subdir, files, dupe_groups, scanned)
Beispiel #4
0
def test_dir_info_optimisation(tmp_path, no_empty, fast):
    """
    Fewer hashes are computed when empty files are excluded (``no_empty``)
    and/or hashing is limited to files whose size matches (``fast``).

    The same two flags are forwarded to check_everything so that it knows
    which records are allowed to lack a hash.
    """
    size = 123
    subdir, files, dupe_groups = tempfiles(tmp_path, size)
    scanned = DirInfo(subdir)
    scanned.populate(no_empty=no_empty, fast=fast)
    check_everything(
        size,
        subdir,
        files,
        dupe_groups,
        scanned,
        no_empty=no_empty,
        fast=fast,
    )
Beispiel #5
0
def compare(old_dirname: str, new_dirname: str, list_identical: bool,
            list_changed: bool, list_vanished: bool, list_added: bool,
            list_all: bool) -> None:
    """
    Summarise changes from OLD_DIRNAME to NEW_DIRNAME. Both directories must
    previously have been scanned (by the 'info' command or otherwise).

    The names OLD and NEW assume that you're comparing two snapshots of "the
    same" collection of files, for example two backups on different dates.
    """
    old_dir = DirInfo.load(old_dirname)
    new_dir = DirInfo.load(new_dirname)
    identical, changed, vanished = set(), set(), set()
    # Classify every old path by what became of it in the new snapshot.
    for old_file in old_dir:
        name = old_file._rel_str
        try:
            new_file = new_dir.get_relative(name)
        except KeyError:
            vanished.add(name)
            continue
        # A path present on both sides can only be compared by hash, so a
        # record without one (old side checked first) is an error.
        for candidate in (old_file, new_file):
            if not isinstance(candidate, FileInfo):
                raise Exception(f'No hash for {candidate.fullpath}')
        bucket = identical if new_file.hash == old_file.hash else changed
        bucket.add(name)
    # Whatever exists only in the new snapshot has been added.
    new_names = {entry._rel_str for entry in new_dir}
    added = new_names - {entry._rel_str for entry in old_dir}

    # Headline counts, always printed.
    summary = (
        ('old:', old_dir.file_count),
        ('new:', new_dir.file_count),
        ('identical:', len(identical)),
        ('changed: ', len(changed)),
        ('vanished:', len(vanished)),
        ('added:', len(added)),
    )
    for label, count in summary:
        print(label, count)

    # Optional per-category listings; list_all switches every one of them on.
    listings = (
        (list_identical, '\nidentical files:\n  ', identical),
        (list_changed, '\nchanged files:\n  ', changed),
        (list_vanished, '\nvanished files:\n  ', vanished),
        (list_added, '\nadded files:\n  ', added),
    )
    for wanted, heading, names in listings:
        if wanted or list_all:
            print(heading, end='')
            print('\n  '.join(sorted(names)))
Beispiel #6
0
def dupes(dirinfo: DirInfo) -> None:
    """Show groups of duplicate files in directory."""
    for group in dirinfo.dupe_groups():
        # The first member supplies the size and hash shown in the header
        # (presumably shared by every member of the group).
        representative = next(iter(group))
        hex_digest = binascii.hexlify(representative.hash).decode('ascii')
        member_count = len(group)
        header = (f'{member_count} duplicates with size '
                  f'{representative.size}, hash {hex_digest}')
        print(header)
        # Members are listed by relative path, in sorted order.
        member_names = sorted(str(member.relpath) for member in group)
        for member_name in member_names:
            print(f'  {member_name}')
Beispiel #7
0
 def decorated(
     dirname: str,
     populate: bool,
     hash_empty: bool,
     fast: bool,
     resume: bool,
     threads: int,
 ):
     # Build a DirInfo for `dirname` (scanning and saving it when required)
     # and hand it to the wrapped `func`, returning whatever `func` returns.
     dirname = os.path.abspath(dirname)
     # Previously cached data is wanted both when resuming a scan and when
     # no scan was requested at all.
     use_cache = resume or not populate
     dirinfo = DirInfo.cached(dirname) if use_cache else DirInfo(dirname)
     if use_cache and dirinfo.file_count == 0:
         # Nothing usable came back from the cache: scan regardless.
         populate = True
     if populate:
         dirinfo.populate(
             no_empty=not hash_empty,
             fast=fast,
             resume=resume,
             threads=threads,
             progress=tqdm,
         )
         saved_to = dirinfo.save()
         tqdm.write(f'Written {saved_to}')
     return func(dirinfo)
Beispiel #8
0
 def set_up(path, fast=False):
     """
     Create directory `path` holding two byte-identical files and one file
     whose content is the directory's own path, then scan it (forwarding
     `fast` to populate()) and save the result. Returns `path`.
     """
     path.mkdir()
     contents = {
         'dupe1.txt': b'equal',
         'dupe2.txt': b'equal',
         'unequal.txt': str(path).encode('utf8'),
     }
     for name, data in contents.items():
         (path / name).write_bytes(data)
     scanned = DirInfo(path)
     scanned.populate(fast=fast)
     scanned.save()
     return path
Beispiel #9
0
 def set_up(path, extra_files=()):
     """
     Create directory `path` with a fixed set of files plus `extra_files`,
     then scan it and save the result. Returns `path`.

     'unequal.txt' contains the directory's own path, so its content differs
     between directories created at different locations. Every name listed
     in `extra_files` is written with the content b'extra'.
     """
     path.mkdir()
     fixed = (
         ('equal1.txt', b'equal1'),
         ('equal2.txt', b'equal2'),
         ('unequal.txt', str(path).encode('utf8')),
     )
     for name, data in fixed:
         (path / name).write_bytes(data)
     for name in extra_files:
         (path / name).write_bytes(b'extra')
     scanned = DirInfo(path)
     scanned.populate()
     scanned.save()
     return path
Beispiel #10
0
def check_everything(file_size,
                     subdir,
                     files,
                     dupes,
                     info,
                     no_empty=False,
                     fast=False,
                     is_copy=False):
    """
    Assert that `info` describes exactly the files in `files`.

    `files` maps relative path strings to file contents and `dupes` is the
    expected sequence of duplicate groups (sets of relative path strings).
    `no_empty` and `fast` name the optimisations used while populating, and
    so decide which records are expected to lack a hash. Unless `is_copy` is
    set, `info` is also saved and reloaded, and the reloaded copy is checked
    once in the same way.
    """
    def expect_unhashed(sizes_with_dupes, size):
        # Optionally some files aren't hashed.
        if no_empty and size == 0:
            return True
        return fast and size not in sizes_with_dupes

    expected_count = len(files)
    assert info.file_count == expected_count > 0
    for rel_name, data in files.items():
        entry = info.get_relative(rel_name)
        assert entry.basepath == subdir
        assert isinstance(entry.relpath, Path)
        assert os.fspath(entry.relpath) == rel_name
        assert entry.size == len(data)
        if expect_unhashed([file_size], entry.size):
            assert not hasattr(entry, 'hash'), rel_name
            continue
        assert entry.hash == hashlib.sha256(data).digest()
        # 10 seconds is arbitrary, but it *shouldn't* be that slow.
        oldest_allowed = datetime.now(tz=timezone.utc) - timedelta(seconds=10)
        assert entry.when >= oldest_allowed
    # Iterating `info` must produce the same set of names as `files`.
    assert sorted(files.keys()) == sorted(item._rel_str for item in info)
    # Must notice that two files have the same hash
    found_groups = tuple(info.dupe_groups())
    assert len(found_groups) == len(dupes)
    found_names = tuple({os.fspath(member.relpath)
                         for member in group} for group in found_groups)
    assert found_names == dupes
    if is_copy:
        return
    # Round-trip through save/load and verify the reloaded copy; is_copy
    # stops the recursion after this one extra level.
    info.save()
    reloaded = DirInfo.load(subdir)
    check_everything(file_size,
                     subdir,
                     files,
                     dupes,
                     reloaded,
                     no_empty=no_empty,
                     fast=fast,
                     is_copy=True)
Beispiel #11
0
def test_serialisation(tmp_path):
    """
    Exercise the failure modes of saving and loading a DirInfo.

    The success path is covered by check_everything.
    """
    subdir = tmp_path / 'sub'
    jsonfile = tmp_path / 'sub.dirinfo.json'
    (subdir / 'dir').mkdir(parents=True)
    dirinfo = DirInfo(subdir)
    expected_json_path = os.fspath(jsonfile)
    assert dirinfo.save() == expected_json_path
    # Not exactly a requirement, but for the tests to work we need this.
    assert jsonfile.exists()
    # If this fails, then testing that the bad cases fail is kind of pointless.
    assert DirInfo.load(subdir).base == os.fspath(subdir)
    # Make sure the encoder isn't accidentally used for something it can't
    # handle.
    with pytest.raises(TypeError):
        json.dumps(object(), cls=Encoder)

    def assert_rejected(payload):
        # Replace the saved file with `payload`; load() must refuse it.
        jsonfile.write_text(json.dumps(payload), encoding='utf8')
        with pytest.raises(ValueError):
            DirInfo.load(subdir)

    # Make sure bad json file contents are rejected
    for payload in ({'foo': 'bar'}, None):
        assert_rejected(payload)

    # If the serialised base doesn't match the actual location, then something
    # is wrong and we should refuse to load it.
    assert dirinfo.save() == expected_json_path
    tampered = json.loads(jsonfile.read_text(encoding='utf8'))
    tampered['base'] = tampered['base'] + 'X'
    assert_rejected(tampered)

    # If there's no data then load() fails, but cached() succeeds.
    jsonfile.unlink()
    with pytest.raises(FileNotFoundError):
        DirInfo.load(subdir)
    # NOTE(review): this compares `base` with the Path itself, whereas the
    # load() check above compares with os.fspath(subdir) - confirm that the
    # difference is intended.
    assert DirInfo.cached(subdir).base == subdir
Beispiel #12
0
 def bad_jsonfile(jsondata):
     """
     Overwrite the JSON file with `jsondata` and require that loading the
     directory info then fails with ValueError.
     """
     with open(jsonfile, 'w', encoding='utf8') as handle:
         handle.write(json.dumps(jsondata))
     with pytest.raises(ValueError):
         DirInfo.load(subdir)
Beispiel #13
0
def test_resume(tmp_path):
    """
    The 'resume' option to populate() avoids re-calculating hashes.

    A record that already existed with a hash must survive a resumed
    populate() as the very same object, while a record that was skipped the
    first time (here: the empty files, because of ``no_empty``) must be
    replaced. Files deleted from disk must disappear on resume.
    """
    file_size = 123
    subdir, files, _ = tempfiles(tmp_path, file_size)
    empty_files = {name for name, content in files.items() if not content}
    # The test only means something if both kinds of file are present.
    assert 0 < len(empty_files) < len(files)
    non_empty_files = set(files) - empty_files
    dirinfo = DirInfo(subdir)
    dirinfo.populate(no_empty=True)
    # Remember the results
    first_results = {name: dirinfo.get_relative(name) for name in files}
    # Resume populating, with different params to include the empty file
    dirinfo.populate(resume=True)
    for name in empty_files:
        assert dirinfo.get_relative(name) is not first_results[name], name
    for name in non_empty_files:
        assert dirinfo.get_relative(name) is first_results[name], name
    # Now delete a file, and make sure it disappears. Cover the cases where
    # the deleted file has and hasn't been hashed.
    missing_files = ('big', 'unequal')
    dirinfo.populate(no_empty=True, fast=True)
    for filename in missing_files:
        (subdir / filename).unlink()
    dirinfo.populate(resume=True, fast=True)
    for filename in missing_files:
        # Only the lookup belongs inside the raises() block; the original
        # wrapped it in a throwaway ``(result, filename)`` tuple.
        with pytest.raises(KeyError):
            dirinfo.get_relative(filename)
Beispiel #14
0
def info(dirinfo: DirInfo) -> None:
    """Read and summarize directory info."""
    file_summary = plural(dirinfo.file_count, 'file')
    # dupe_groups() is only iterated here, so the groups are counted one by
    # one rather than relying on the result having a length.
    group_total = sum(1 for _ in dirinfo.dupe_groups())
    group_summary = plural(group_total, 'dupe group')
    # Written through tqdm rather than print().
    tqdm.write(f'Found {file_summary} and {group_summary}')