Example #1
    @classmethod
    def from_file(
        cls: Type[PF],
        source_path: Union[str, PathLike],
        algorithm: HashAlgorithm = DEFAULT_HASH_ALGO,
        tz_default: Optional[tzinfo] = None,
        priority: int = 10,
    ) -> PF:
        """Create a PhotoFile for a given file

        :param source_path: The path to the file
        :param algorithm: The hashing algorithm to use
        :param tz_default: The time zone to use if none is set
            (defaults to local time)
        :param priority: The photo's priority
        """
        photo_hash = file_checksum(source_path, algorithm)
        dt_str = get_media_datetime(source_path)
        dt = datetime_str_to_object(dt_str, tz_default=tz_default)
        tz = dt.utcoffset().total_seconds() if dt.tzinfo is not None else None
        timestamp = dt.timestamp()
        file_size = getsize(source_path)
        return cls(
            chk=photo_hash,
            src=str(source_path),
            dt=dt_str,
            ts=timestamp,
            fsz=file_size,
            sto="",
            prio=priority,
            tzo=tz,
        )
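
For context, a minimal usage sketch follows; it is hypothetical, assuming PhotoFile
is the class exposing this classmethod and that the import path matches your
installation:

# Hypothetical usage sketch; adjust the import path to your installation.
from photomanager.photofile import PhotoFile  # assumed location of PhotoFile

pf = PhotoFile.from_file("/photos/IMG_0001.jpg")
print(pf.chk, pf.ts, pf.fsz)  # checksum, POSIX timestamp, file size in bytes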
Example #2
    @classmethod
    def from_file_cached(
        cls: Type[PF],
        source_path: str,
        checksum_cache: dict[str, str],
        datetime_cache: dict[str, str],
        algorithm: HashAlgorithm = DEFAULT_HASH_ALGO,
        tz_default: Optional[tzinfo] = None,
        priority: int = 10,
    ) -> PF:
        """Create a PhotoFile for a given file

        If source_path is present in the checksum or datetime cache, the cached
        value is used instead of reading from the file.

        :param source_path: The path to the file
        :param checksum_cache: A mapping of source paths to known checksums
        :param datetime_cache: A mapping of source paths to datetime strings
        :param algorithm: The hashing algorithm to use for new checksums
        :param tz_default: The time zone to use if none is set
            (defaults to local time)
        :param priority: The photo's priority
        """
        photo_hash = (
            checksum_cache[source_path]
            if source_path in checksum_cache
            else file_checksum(source_path, algorithm)
        )
        dt_str = (
            datetime_cache[source_path]
            if source_path in datetime_cache
            else get_media_datetime(source_path)
        )
        dt = datetime_str_to_object(dt_str, tz_default=tz_default)
        tz = dt.utcoffset().total_seconds() if dt.tzinfo else None
        timestamp = dt.timestamp()
        file_size = getsize(source_path)
        return cls(
            chk=photo_hash,
            src=str(source_path),
            dt=dt_str,
            ts=timestamp,
            fsz=file_size,
            sto="",
            prio=priority,
            tzo=tz,
        )
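
A sketch of how the caches might be primed (the path and datetime string are
hypothetical; the checksum is the b"test" digest from Example #5). The cache keys
must be the exact source-path strings, since membership is tested with "in":

# Hypothetical sketch: pre-populated caches let from_file_cached skip both
# hashing and metadata extraction for files that were already scanned.
checksum_cache = {
    "/photos/a.jpg": "928b20366943e2afd11ebc0eae2e53a93bf177a4fcf35bcc64d503704e65e202"
}
datetime_cache = {"/photos/a.jpg": "2015:08:27 04:09:36.50"}  # assumed EXIF-style format
pf = PhotoFile.from_file_cached(
    "/photos/a.jpg",
    checksum_cache=checksum_cache,
    datetime_cache=datetime_cache,
)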
Example #3
def make_hash_map(
    database: Database,
    new_algo: HashAlgorithm,
    hash_map: Optional[dict[str, str]] = None,
    destination: Optional[Union[str, PathLike]] = None,
) -> dict[str, str]:  # pragma: no cover
    """Make a map of file checksums in order to migrate hashing algorithms.

    Checks each source file's hash with the old algorithm to make sure the new
    hashes are correct. If a source file's hash is incorrect, its checksum is not
    remapped; the mapped value is set to '{old_checksum}:{old_algorithm}' instead.

    Note: This method is not accessed by the CLI or covered by testing
    and is intended to be used interactively, i.e. in a Jupyter notebook
    or other environment, with spot checking of the output hash map.

    :param database: the Database
    :param new_algo: the new algorithm to use
    :param hash_map: a map from old hashes to new hashes; will be updated with
        new mappings as they are found
    :param destination: the library storage destination
    :return: the hash map
    """
    if hash_map is None:
        hash_map = {}
    old_algo = database.hash_algorithm
    print(f"Converting {old_algo} to {new_algo}")
    num_correct_photos = num_incorrect_photos = 0
    num_missing_photos = num_skipped_photos = 0
    all_photos = [
        photo for photos in database.photo_db.values() for photo in photos
    ]
    for photo in tqdm(all_photos):
        if photo.chk in hash_map:
            num_skipped_photos += 1
        elif exists(photo.src):
            if photo.chk == file_checksum(photo.src, old_algo):
                hash_map[photo.chk] = file_checksum(photo.src, new_algo)
                num_correct_photos += 1
            else:
                tqdm.write(f"Incorrect checksum: {photo.src}")
                hash_map[photo.chk] = photo.chk + f":{old_algo}"
                num_incorrect_photos += 1
        elif destination:
            sto_path = Path(destination).expanduser().resolve() / photo.sto
            if exists(sto_path) and photo.chk == file_checksum(sto_path, old_algo):
                hash_map[photo.chk] = file_checksum(sto_path, new_algo)
                num_correct_photos += 1
            else:
                num_missing_photos += 1
        else:
            num_missing_photos += 1

    print(f"Mapped {num_correct_photos} items")
    if num_skipped_photos:
        print(f"Skipped {num_skipped_photos} items")
    if num_incorrect_photos or num_missing_photos:
        print(f"Found {num_incorrect_photos} incorrect and "
              f"{num_missing_photos} missing items")
    return hash_map
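
As the docstring notes, make_hash_map is meant for interactive use; a hypothetical
notebook session might look like this (Database.from_file is an assumed loader,
and the file name is illustrative):

# Hypothetical interactive sketch: build the old-to-new checksum map, then
# spot-check entries before committing to the migration.
db = Database.from_file("photos.json")  # assumed loader
hash_map = make_hash_map(db, new_algo=HashAlgorithm.BLAKE2B_256)
list(hash_map.items())[:5]  # spot-check a few mappings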
Example #4
def update_stored_filename_hashes(
    database: Database,
    destination: Union[str, PathLike],
    verify: bool = True,
    dry_run: bool = False,
) -> dict[str, str]:  # pragma: no cover
    """Updates filenames to match checksums.

    Run after mapping hashes to a new algorithm with make_hash_map().
    Skips files whose filename checksum matches the stored checksum.

    Note: This method is not accessed by the CLI or covered by testing
    and is intended to be used interactively, i.e. in a Jupyter notebook
    or other environment, with dry runs and spot checking of proposed
    changes before they are performed. Use at your own risk.

    :param database: the Database
    :param destination: the photo storage directory
    :param verify: if True, verify that file checksums match
    :param dry_run: if True, perform a dry run and do not move photos
    :return: the mapping of files moved
    """
    num_correct_photos = num_skipped_photos = 0
    num_incorrect_photos = num_missing_photos = 0
    destination = Path(destination).expanduser().resolve()
    stored_photos = [
        photo for photos in database.photo_db.values() for photo in photos
        if photo.sto
    ]
    total_file_size = sum(photo.fsz for photo in stored_photos)
    print(f"Updating {len(stored_photos)} filename hashes")
    print(f"Total file size: {sizeof_fmt(total_file_size)}")
    logger = logging.getLogger()
    file_map = {}
    for photo in tqdm(stored_photos):
        abs_store_path = destination / photo.sto
        new_store_path = f"{photo.sto[:32]}{photo.chk[:7]}{photo.sto[39:]}"
        new_abs_store_path = destination / new_store_path
        if new_abs_store_path.exists():
            num_skipped_photos += 1
        elif not abs_store_path.exists():
            tqdm.write(f"Missing photo: {abs_store_path}")
            num_missing_photos += 1
        elif photo.sto[32:39] == photo.chk[:7]:
            num_skipped_photos += 1
        elif (
            not verify
            or file_checksum(abs_store_path, database.hash_algorithm) == photo.chk
        ):
            if logger.isEnabledFor(logging.DEBUG):
                tqdm.write(
                    f"{'Will move' if dry_run else 'Moving'} {abs_store_path} "
                    f"to {new_abs_store_path}")
            file_map[str(abs_store_path)] = str(new_abs_store_path)
            if not dry_run:
                rename(abs_store_path, new_abs_store_path)
                photo.sto = new_store_path
            num_correct_photos += 1
        else:
            tqdm.write(f"Incorrect checksum: {abs_store_path}")
            num_incorrect_photos += 1
    print(f"{'Would move' if dry_run else 'Moved'} {num_correct_photos} items")
    if num_skipped_photos:
        print(f"Skipped {num_skipped_photos} items")
    if num_incorrect_photos or num_missing_photos:
        print(f"Found {num_incorrect_photos} incorrect and "
              f"{num_missing_photos} missing items")
    return file_map
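
Per the docstring's warning, a dry run first is prudent; a hypothetical session,
continuing from the sketch under Example #3:

# Hypothetical sketch: preview the proposed renames, inspect them, then apply.
proposed = update_stored_filename_hashes(db, "~/photo-library", dry_run=True)
for old_path, new_path in list(proposed.items())[:5]:
    print(old_path, "->", new_path)
file_map = update_stored_filename_hashes(db, "~/photo-library", dry_run=False)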
Example #5
import random
from io import BytesIO

import pytest
from photomanager.hasher import (
    AsyncFileHasher,
    HashAlgorithm,
    HasherException,
    file_checksum,
)

checksums = [
    (b"", "0e5751c026e543b2e8ab2eb06099daa1d1e5df47778f7787faab45cdf12fe3a8"),
    (
        b"\xff\xd8\xff\xe0",
        "7d13007a8afed521cfc13306cbd6747bbc59556e3ca9514c8d94f900fbb56230",
    ),
    (b"test", "928b20366943e2afd11ebc0eae2e53a93bf177a4fcf35bcc64d503704e65e202"),
]
for _ in range(100):
    st = bytes([random.randint(0, 255) for _ in range(1000)])
    with BytesIO(st) as fd:
        ck = file_checksum(fd, algorithm=HashAlgorithm.BLAKE2B_256)
    checksums.append((st, ck))


def test_file_hasher(tmpdir):
    files = []
    for i, (s, c) in enumerate(checksums):
        filename = tmpdir / f"{i}.bin"
        with open(filename, "wb") as f:
            f.write(s)
        files.append(filename)
    checksum_cache = AsyncFileHasher(
        algorithm=HashAlgorithm.BLAKE2B_256, use_async=False
    ).check_files(files, pbar_unit="B")
    print(checksum_cache)
    assert len(checksum_cache) == len(checksums)
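
The assertion above only checks the count. Assuming check_files keys its results
by the string form of each input path, the values could also be verified by
appending these lines inside test_file_hasher:

    # Hypothetical follow-up assertions; assumes string-path keys in the result.
    for i, (_, expected) in enumerate(checksums):
        assert checksum_cache[str(tmpdir / f"{i}.bin")] == expected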
Example #6
def test_file_checksum_bad_algorithm():
    with pytest.raises(HasherException):
        file_checksum("asdf.txt", algorithm="md5")
Example #7
def test_file_checksum_path(checksum, tmpdir):
    with open(tmpdir / "test.bin", "wb") as f:
        f.write(checksum["bytes"])
    assert (
        file_checksum(tmpdir / "test.bin", algorithm=checksum["algorithm"])
        == checksum["checksum"]
    )
Example #8
def test_file_checksum_fd(checksum):
    with BytesIO(checksum["bytes"]) as f:
        assert (
            file_checksum(f, algorithm=checksum["algorithm"]) == checksum["checksum"]
        )
        assert not f.closed
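
Examples #7 and #8 rely on a checksum fixture that is not shown above; a plausible
parametrized sketch, modeled on the known-answer table from Example #5 (the fixture's
structure is hypothetical, but the digest shown is the b"test" value from that table):

# Hypothetical fixture sketch: each param bundles the input bytes, the
# algorithm, and the expected digest under the keys the tests index into.
import pytest
from photomanager.hasher import HashAlgorithm

@pytest.fixture(
    params=[
        {
            "algorithm": HashAlgorithm.BLAKE2B_256,
            "bytes": b"test",
            "checksum": "928b20366943e2afd11ebc0eae2e53a93bf177a4fcf35bcc64d503704e65e202",
        },
    ]
)
def checksum(request):
    return request.param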