def test_copy_photos(self, datafiles, caplog):
    """
    copy_photos copies each supplied PhotoFile into the destination folder.

    A photo is stored under the explicitly supplied relative store path
    when one is given, and under PhotoFile.sto otherwise; photos whose
    source file is missing are counted as errors.
    """
    caplog.set_level(logging.DEBUG)
    # Each entry is (photo, explicit relative store path or None).
    photos_to_copy = [
        (
            PhotoFile(
                chk="deadbeef",
                src="B/img2.jpg",
                sto="2015/08/2015-08-01_img2.jpg",
                dt="2015:08:01 18:28:36.99",
                ts=1438468116.99,
                fsz=789,
                tzo=-14400.0,
            ),
            None,
        ),
        (
            PhotoFile(
                chk="deadbeef",
                src="B/img4.jpg",
                dt="2018:08:01 20:28:36",
                ts=1533169716.0,
                fsz=777,
                tzo=-14400.0,
            ),
            "2018/08/2018-08-01_img4.jpg",
        ),
        (
            PhotoFile(
                chk="deadbeef",
                src="B/img_missing.jpg",
                sto="2018/08/2018-08-08_img_missing.jpg",
                dt="2018:08:01 20:28:36",
                ts=1533169716.0,
                fsz=777,
                tzo=-14400.0,
            ),
            None,
        ),
    ]
    # Resolve the relative source paths against the fixture directory.
    for photo, _ in photos_to_copy:
        photo.src = str(datafiles / photo.src)
    num_copied, total_size, num_errors = fileops.copy_photos(
        datafiles / "dest", photos_to_copy
    )
    print(num_copied, total_size, num_errors)
    assert num_copied == 2
    assert total_size == 789 + 777
    assert num_errors == 1
    assert os.listdir(datafiles / "dest/2015/08") == ["2015-08-01_img2.jpg"]
    assert os.listdir(datafiles / "dest/2018/08") == ["2018-08-01_img4.jpg"]
def test_photofile_from_file(datafiles):
    """
    PhotoFile.from_file reproduces each of the expected indexing results
    for the fixture images.
    """
    with ExifTool():
        for expected in photofile_expected_results:
            # Round-trip through a dict to get a private copy, so the
            # shared fixture list is not mutated.
            expected = PhotoFile.from_dict(expected.to_dict())
            expected.src = str(datafiles / expected.src)
            result = PhotoFile.from_file(
                expected.src,
                tz_default=timezone(timedelta(seconds=expected.tzo)),
            )
            assert result == expected
def generate_test_database(num_uids=10000, r_seed=42):
    """
    Create a Database populated with deterministic pseudo-random photos.

    :param num_uids: number of uids to generate (1-3 photos each)
    :param r_seed: seed for the random generator, so output is repeatable
    :return: the populated Database
    """
    random.seed(r_seed, version=2)
    database = Database()
    for _ in range(num_uids):
        uid = "".join(random.choices(database.UID_ALPHABET, k=8))
        photos = []
        database.photo_db[uid] = photos
        for _ in range(random.randint(1, 3)):
            checksum = "".join(random.choices(string.hexdigits, k=64))
            # Timestamps span roughly 2002-2021, with microsecond precision.
            timestamp = random.randint(1037750179000000, 1637750179000000) / 1000000
            dt = datetime.datetime.fromtimestamp(timestamp).astimezone(
                datetime.timezone(datetime.timedelta(hours=random.randint(-12, 12)))
            )
            ts_str = dt.strftime("%Y-%m-%d %H:%M:%S%z")
            img_num = random.randint(0, 9999)
            source_path = f"/path/to/photo/{dt.year}/IMG_{img_num:04d}.JPG"
            # Roughly half the photos are "stored"; the rest have no store path.
            if random.randint(0, 1):
                store_path = ""
            else:
                store_path = f"{dt.year}/{source_path.rsplit('/', 1)[-1]}"
            filesize = random.randint(100000, 100000000)
            photos.append(
                PhotoFile(
                    chk=checksum,
                    src=source_path,
                    ts=timestamp,
                    dt=ts_str,
                    fsz=filesize,
                    sto=store_path,
                )
            )
    return database
def test_database_add_photo_same_source_new_checksum(caplog):
    """
    Adding a photo whose source_path is already in the database but whose
    checksum differs still adds the photo, and a warning is logged.
    """
    caplog.set_level(logging.DEBUG)
    db = Database.from_json(example_database_json_data2)
    changed_photo = PhotoFile(
        chk="not_a_match",
        src="/a/b/c.jpg",
        dt="2015:08:27 04:09:36.50",
        ts=1440662976.5,
        fsz=1024,
        sto="",
        prio=10,
    )
    uid = db.add_photo(changed_photo, uid="uid1")
    print([(record.levelname, record) for record in caplog.records])
    print(uid)
    assert uid == "uid1"
    # Both the old and new checksums now map to the same uid.
    assert db.hash_to_uid["not_a_match"] == "uid1"
    assert db.hash_to_uid["deadbeef"] == "uid1"
    print(db.photo_db["uid1"])
    assert len(db.photo_db["uid1"]) == 2
    print([(record.levelname, record) for record in caplog.records])
    assert any(record.levelname == "WARNING" for record in caplog.records)
    assert any(
        "Checksum of previously-indexed source photo has changed" in record.msg
        for record in caplog.records
    )
def db(self, db: dict):
    """
    Set the Database parameters from a dict.

    Upgrades legacy database dicts in place: fills in defaults for keys
    that did not exist in old versions, re-keys version<3 photo entries to
    the short encoded attribute names, converts photo dicts to PhotoFile
    instances, and rebuilds the in-memory lookup indexes.

    :param db: the database dict; normalized and stored on ``self._db``
    :raises DatabaseException: if the dict's version is newer than this
        version of PhotoManager supports
    """
    db.setdefault("version", 1)  # legacy dbs are version 1
    db.setdefault("hash_algorithm", "sha256")  # legacy dbs use sha256
    db.setdefault("timezone_default", "local")  # legacy dbs are in local time
    db["version"] = int(db["version"])
    if db["version"] > self.VERSION:
        raise DatabaseException(
            "Database version too new for this version of PhotoManager."
        )
    if db["version"] < 3:
        # Version <3 databases stored photos under the long attribute
        # names (e.g. "checksum"); re-key each photo dict to the short
        # encoded names (e.g. "chk") via NAME_MAP_ENC.
        for uid in db["photo_db"].keys():
            photos = db["photo_db"][uid]
            for i in range(len(photos)):
                photos[i] = {NAME_MAP_ENC[k]: v for k, v in photos[i].items()}
    # Normalize key order and convert stored values to their proper types.
    db = {k: db[k] for k in self.DB_KEY_ORDER}
    db["hash_algorithm"] = HashAlgorithm(db["hash_algorithm"])
    for uid in db["photo_db"].keys():
        db["photo_db"][uid] = [PhotoFile.from_dict(d) for d in db["photo_db"][uid]]
    db["version"] = self.VERSION
    self._db = db
    # Rebuild the checksum -> uid and timestamp -> uids indexes from the
    # freshly converted photo entries.
    for uid, photos in self.photo_db.items():
        for photo in photos:
            self.hash_to_uid[photo.chk] = uid
            if photo.ts in self.timestamp_to_uids:
                self.timestamp_to_uids[photo.ts][uid] = None
            else:
                self.timestamp_to_uids[photo.ts] = {uid: None}
    self.reset_saved()
def test_database_add_photo_sort(caplog):
    """
    Photos added to a uid are inserted in sorted order (lowest priority
    value first, stable with respect to existing entries).
    """
    caplog.set_level(logging.DEBUG)
    db = Database.from_json(example_database_json_data)
    additions = [
        ("/x/y/c.jpg", 20),
        ("/z/y/c.jpg", 11),
        ("/0/1/c.jpg", 10),
    ]
    uid = None
    for src, prio in additions:
        new_uid = db.add_photo(
            PhotoFile(
                chk="deadbeef",
                src=src,
                dt="2015:08:27 04:09:36.50",
                ts=1440662976.5,
                fsz=1024,
                sto="",
                prio=prio,
            ),
            uid=None,
        )
        # All additions resolve to the same uid; remember the first.
        if uid is None:
            uid = new_uid
    assert [p.src for p in db.photo_db[uid]] == [
        "/0/1/c.jpg",
        "/a/b/c.jpg",
        "/z/y/c.jpg",
        "/x/y/c.jpg",
    ]
def test_remove_photos(self, datafiles, caplog):
    """
    remove_photos deletes each stored PhotoFile that exists on disk and
    counts the photos whose files are missing.
    """
    caplog.set_level(logging.DEBUG)
    photos_to_remove = [
        PhotoFile(
            chk="deadbeef",
            src="B/img2.jpg",
            dt="2015:08:01 18:28:36.99",
            ts=1438468116.99,
            fsz=789,
            tzo=-14400.0,
        ),
        PhotoFile(
            chk="deadbeef",
            src="B/img4.jpg",
            dt="2018:08:01 20:28:36",
            ts=1533169716.0,
            fsz=777,
            tzo=-14400.0,
        ),
        PhotoFile(
            chk="deadbeef",
            src="B/img_missing.jpg",
            dt="2018:08:01 20:28:36",
            ts=1533169716.0,
            fsz=777,
            tzo=-14400.0,
        ),
    ]
    # Point each store path at the fixture file location.
    for photo in photos_to_remove:
        photo.sto = str(datafiles / photo.src)
    num_removed, num_missing = fileops.remove_photos(
        directory=datafiles, photos=photos_to_remove
    )
    assert num_removed == 2
    assert num_missing == 1
    assert os.listdir(datafiles / "B") == ["img1.jpg"]
def test_database_add_photo_already_present(caplog):
    """
    When a photo is already in the database, add_photo does not add it
    again and returns None.
    """
    caplog.set_level(logging.DEBUG)
    db = Database.from_json(example_database_json_data2)
    duplicate = PhotoFile(
        chk="deadbeef",
        src="/a/b/c.jpg",
        dt="2015:08:27 04:09:36.50",
        ts=1440662976.5,
        fsz=1024,
        sto="",
        prio=10,
    )
    uid = db.add_photo(duplicate, uid="uid1")
    print([(record.levelname, record) for record in caplog.records])
    print(uid)
    assert uid is None
def test_database_add_photo_wrong_uid(caplog):
    """
    When the photo's checksum already belongs to a different uid,
    add_photo does not add it and returns None.
    """
    caplog.set_level(logging.DEBUG)
    db = Database.from_json(example_database_json_data2)
    mismatched = PhotoFile(
        chk="deadbeef",
        src="/x/y/c.jpg",
        dt="2015:08:27 04:09:36.50",
        ts=1440662976.5,
        fsz=1024,
        sto="",
        prio=10,
    )
    uid = db.add_photo(mismatched, uid="uid2")
    print([(record.levelname, record) for record in caplog.records])
    print(uid)
    assert uid is None
def test_database_find_photo_ambiguous(caplog):
    """
    With no checksum match, find_photo falls back to timestamp+source
    matching; an ambiguous match returns the first candidate and logs
    a warning.
    """
    caplog.set_level(logging.DEBUG)
    db = Database.from_json(example_database_json_data2)
    query = PhotoFile(
        chk="not_a_match",
        src="/x/y/c.jpg",
        dt="2015:08:27 04:09:36.50",
        ts=1440662976.5,
        fsz=1024,
        sto="",
        prio=10,
    )
    uid = db.find_photo(query)
    print([(record.levelname, record) for record in caplog.records])
    print(uid)
    assert any(record.levelname == "WARNING" for record in caplog.records)
    assert any(
        "ambiguous timestamp+name match" in record.msg for record in caplog.records
    )
    assert uid == "uid1"
def test_hash_stored_photos(self, datafiles, caplog):
    """
    hash_stored_photos returns a source-path -> checksum mapping for the
    stored photos whose files exist; missing files are omitted.
    """
    caplog.set_level(logging.DEBUG)
    # (checksum, filename, datetime string, timestamp, file size)
    specs = [
        (
            "d090ce7023b57925e7e94fc80372e3434fb1897e00b4452a25930dd1b83648fb",
            "img1.jpg",
            "2015:08:01 18:28:36.90",
            1438468116.9,
            771,
        ),
        (
            "1e10df2e3abe4c810551525b6cb2eb805886de240e04cc7c13c58ae208cabfb9",
            "img1.png",
            "2015:08:01 18:28:36.90",
            1438468116.9,
            382,
        ),
        (
            "3b39f47d51f63e54c76417ee6e04c34bd3ff5ac47696824426dca9e200f03666",
            "img2.jpg",
            "2015:08:01 18:28:36.99",
            1438468116.99,
            771,
        ),
        (
            "79ac4a89fb3d81ab1245b21b11ff7512495debca60f6abf9afbb1e1fbfe9d98c",
            "img4.jpg",
            "2018:08:01 20:28:36",
            1533169716.0,
            759,
        ),
        (
            "79ac4a89fb3d81ab1245b21b11ff7512495debca60f6abf9afbb1e1fbfe9d98c",
            "img_nonexistent.jpg",
            "2018:08:01 20:28:36",
            1533169716.0,
            759,
        ),
    ]
    stored_photos = [
        PhotoFile(
            chk=chk,
            src=str(datafiles / "A" / name),
            dt=dt,
            ts=ts,
            fsz=fsz,
            sto=f"A/{name}",
            prio=11,
            tzo=None,
        )
        for chk, name, dt, ts, fsz in specs
    ]
    expected_hashes = {
        pf.src: pf.chk for pf in stored_photos if "nonexistent" not in pf.src
    }
    photos = fileops.hash_stored_photos(
        photos=stored_photos,
        directory=datafiles,
        hash_algorithm=HashAlgorithm.BLAKE2B_256,
    )
    print(photos)
    assert photos == expected_hashes
def test_index_photos(self, datafiles, caplog):
    """
    index_photos builds a PhotoFile for each supplied path and yields
    None for paths that do not exist.
    """
    caplog.set_level(logging.DEBUG)
    files = [
        str(datafiles / "A" / "img1.jpg"),
        Path(datafiles / "A" / "img1.png"),
        str(datafiles / "A" / "img_nonexistent.jpg"),
        str(datafiles / "A" / "img4.jpg"),
        str(datafiles / "A" / "img2.jpg"),
    ]
    # (checksum, filename, datetime string, timestamp, file size), or None
    # for the nonexistent entry.
    specs = [
        (
            "d090ce7023b57925e7e94fc80372e3434fb1897e00b4452a25930dd1b83648fb",
            "img1.jpg",
            "2015:08:01 18:28:36.90",
            1438468116.9,
            771,
        ),
        (
            "1e10df2e3abe4c810551525b6cb2eb805886de240e04cc7c13c58ae208cabfb9",
            "img1.png",
            "2015:08:01 18:28:36.90",
            1438468116.9,
            382,
        ),
        None,
        (
            "79ac4a89fb3d81ab1245b21b11ff7512495debca60f6abf9afbb1e1fbfe9d98c",
            "img4.jpg",
            "2018:08:01 20:28:36",
            1533169716.0,
            759,
        ),
        (
            "3b39f47d51f63e54c76417ee6e04c34bd3ff5ac47696824426dca9e200f03666",
            "img2.jpg",
            "2015:08:01 18:28:36.99",
            1438468116.99,
            771,
        ),
    ]
    expected_photos = [
        None
        if spec is None
        else PhotoFile(
            chk=spec[0],
            src=str(datafiles / "A" / spec[1]),
            dt=spec[2],
            ts=spec[3],
            fsz=spec[4],
            sto="",
            prio=11,
            tzo=-14400.0,
        )
        for spec in specs
    ]
    photos = fileops.index_photos(
        files=files,
        priority=11,
        tz_default=timezone(timedelta(seconds=-14400.0)),
    )
    print(photos)
    print(len(photos))
    assert photos == expected_photos
def test_database_load_version_1():
    """
    Loading a version-1 database upgrades it in place: the legacy long
    photo keys are mapped to the short names, missing defaults are filled
    in, and the version is bumped to the current one.
    """
    json_data = b"""{
    "version": 1,
    "hash_algorithm": "sha256",
    "photo_db": {
        "d239210f00534b76a2b215e073f75832": [
            {
                "checksum": "deadbeef",
                "source_path": "/a/b/c.jpg",
                "datetime": "2015:08:27 04:09:36.50",
                "timestamp": 1440662976.5,
                "file_size": 1024,
                "store_path": "/d/e/f.jpg",
                "priority": 11
            },
            {
                "checksum": "deadbeef",
                "source_path": "/g/b/c.jpg",
                "datetime": "2015:08:27 04:09:36.50",
                "timestamp": 1440662976.5,
                "file_size": 1024,
                "store_path": "",
                "priority": 20,
                "tz_offset": -14400
            }
        ]
    },
    "command_history": {
        "2021-03-08_23-56-00Z": "photomanager create --db test.json",
        "2021-03-08_23-57-00Z": "photomanager import --db test.json test.jpg"
    }
}"""
    db = Database.from_json(json_data)
    print(db.db)
    assert db.version == Database.VERSION
    assert db.hash_algorithm == HashAlgorithm.SHA256
    assert db.db["timezone_default"] == "local"
    assert db.timezone_default is None
    expected_photo_db = {
        "d239210f00534b76a2b215e073f75832": [
            PhotoFile.from_dict(
                {
                    "chk": "deadbeef",
                    "src": "/a/b/c.jpg",
                    "dt": "2015:08:27 04:09:36.50",
                    "ts": 1440662976.5,
                    "fsz": 1024,
                    "sto": "/d/e/f.jpg",
                    "prio": 11,
                }
            ),
            PhotoFile.from_dict(
                {
                    "chk": "deadbeef",
                    "src": "/g/b/c.jpg",
                    "dt": "2015:08:27 04:09:36.50",
                    "ts": 1440662976.5,
                    "fsz": 1024,
                    "sto": "",
                    "prio": 20,
                    "tzo": -14400,
                }
            ),
        ]
    }
    expected_command_history = {
        "2021-03-08_23-56-00Z": "photomanager create --db test.json",
        "2021-03-08_23-57-00Z": "photomanager import --db test.json test.jpg",
    }
    expected_db = {
        "version": Database.VERSION,
        "hash_algorithm": HashAlgorithm.SHA256,
        "timezone_default": "local",
        "photo_db": expected_photo_db,
        "command_history": expected_command_history,
    }
    assert db.photo_db == expected_photo_db
    assert db.command_history == expected_command_history
    # The serialized db differs from the input because it was upgraded.
    assert orjson.loads(db.json) != orjson.loads(json_data)
    assert db.db == expected_db
    assert db == Database.from_dict(orjson.loads(json_data))
    assert db.get_stats() == (1, 2, 1, 1024)
def test_database_load_version_3():
    """
    A version-3 database loads without any legacy key migration and keeps
    its photo data intact.
    """
    # NOTE: a previous revision appended
    # `.replace(b"VERSION", f"{Database.VERSION}".encode())` to this
    # literal, but the template contains no uppercase "VERSION" token, so
    # the call never matched anything; it has been removed as dead code.
    json_data = b"""{
    "version": 3,
    "hash_algorithm": "sha256",
    "photo_db": {
        "QKEsTn2X": [
            {
                "chk": "deadbeef",
                "src": "/a/b/c.jpg",
                "dt": "2015:08:27 04:09:36.50",
                "ts": 1440662976.5,
                "fsz": 1024,
                "sto": "/d/e/f.jpg",
                "prio": 11,
                "tzo": null
            },
            {
                "chk": "deadbeef",
                "src": "/g/b/c.jpg",
                "dt": "2015:08:27 04:09:36.50",
                "ts": 1440662976.5,
                "fsz": 1024,
                "sto": "",
                "prio": 20,
                "tzo": -14400
            }
        ]
    },
    "command_history": {
        "2021-03-08_23-56-00Z": "photomanager create --db test.json",
        "2021-03-08_23-57-00Z": "photomanager import --db test.json test.jpg"
    }
}"""
    db = Database.from_json(json_data)
    print(db.db)
    assert db.version == Database.VERSION
    assert db.hash_algorithm == HashAlgorithm.SHA256
    assert db.db["timezone_default"] == "local"
    assert db.timezone_default is None
    photo_db_expected = {
        "QKEsTn2X": [
            PhotoFile.from_dict(
                {
                    "chk": "deadbeef",
                    "src": "/a/b/c.jpg",
                    "dt": "2015:08:27 04:09:36.50",
                    "ts": 1440662976.5,
                    "fsz": 1024,
                    "sto": "/d/e/f.jpg",
                    "prio": 11,
                }
            ),
            PhotoFile.from_dict(
                {
                    "chk": "deadbeef",
                    "src": "/g/b/c.jpg",
                    "dt": "2015:08:27 04:09:36.50",
                    "ts": 1440662976.5,
                    "fsz": 1024,
                    "sto": "",
                    "prio": 20,
                    "tzo": -14400,
                }
            ),
        ]
    }
    command_history_expected = {
        "2021-03-08_23-56-00Z": "photomanager create --db test.json",
        "2021-03-08_23-57-00Z": "photomanager import --db test.json test.jpg",
    }
    db_expected = {
        "version": Database.VERSION,
        "hash_algorithm": HashAlgorithm.SHA256,
        "timezone_default": "local",
        "photo_db": photo_db_expected,
        "command_history": command_history_expected,
    }
    assert db.photo_db == photo_db_expected
    assert db.command_history == command_history_expected
    # The serialized db adds timezone_default, so it differs from the input.
    assert orjson.loads(db.json) != orjson.loads(json_data)
    assert db.db == db_expected
    assert db == Database.from_dict(orjson.loads(json_data))
    assert db.get_stats() == (1, 2, 1, 1024)
def index_photos(
    files: Iterable[Union[str, PathLike]],
    priority: int = 10,
    hash_algorithm: HashAlgorithm = DEFAULT_HASH_ALGO,
    tz_default: Optional[tzinfo] = None,
    storage_type: str = "HDD",
) -> list[Optional[PhotoFile]]:
    """
    Indexes photo files

    :param files: the photo file paths to index
    :param priority: the photos' priority
    :param hash_algorithm: The hashing algorithm to use for file checksums
    :param tz_default: The time zone to use if none is set
        (defaults to local time)
    :param storage_type: the storage type being indexed
        (uses more async if SSD)
    :return: a list of PhotoFiles, with None entries for errors
    """
    logger = logging.getLogger(__name__)
    # Materialize the iterable once: it is consumed three times below
    # (hashing, exif datetime batch, and the indexing loop), so a
    # one-shot generator would silently yield nothing on later passes.
    files = list(files)
    if storage_type in ("SSD", "RAID"):
        async_hashes = True
        async_exif = cpu_count()
    else:
        # concurrent reads of sequential files can lead to thrashing
        async_hashes = False
        # exiftool is partially CPU-bound and benefits from async
        async_exif = min(4, cpu_count())
    logger.info("Collecting media hashes")
    checksum_cache = AsyncFileHasher(
        algorithm=hash_algorithm, use_async=async_hashes
    ).check_files(files, pbar_unit="B")
    logger.info("Collecting media dates and times")
    datetime_cache = AsyncExifTool(
        num_workers=async_exif
    ).get_best_datetime_batch(files)
    logger.info("Indexing media")
    photos = []
    exiftool = ExifTool()
    exiftool.start()
    try:
        for current_file in tqdm(files):
            if logger.isEnabledFor(logging.DEBUG):
                tqdm.write(f"Indexing {current_file}")
            try:
                pf = PhotoFile.from_file_cached(
                    current_file,
                    checksum_cache=checksum_cache,
                    datetime_cache=datetime_cache,
                    algorithm=hash_algorithm,
                    tz_default=tz_default,
                    priority=priority,
                )
                photos.append(pf)
            except Exception as e:
                tqdm.write(f"Error indexing {current_file}", file=sys.stderr)
                # Positional arguments: the `etype` keyword was removed
                # from traceback.format_exception in Python 3.10.
                tb_str = "".join(
                    traceback.format_exception(type(e), e, e.__traceback__)
                )
                tqdm.write(tb_str, file=sys.stderr)
                photos.append(None)
    finally:
        # Always stop the exiftool subprocess, even if the loop is
        # interrupted (e.g. KeyboardInterrupt).
        exiftool.terminate()
    return photos
import pytest from photomanager.pyexiftool import ExifTool from photomanager.photofile import PhotoFile FIXTURE_DIR = Path(__file__).resolve().parent.parent / "test_files" ALL_IMG_DIRS = pytest.mark.datafiles( FIXTURE_DIR / "A", FIXTURE_DIR / "B", FIXTURE_DIR / "C", keep_top_dir=True, ) photofile_expected_results = [ PhotoFile( chk="d090ce7023b57925e7e94fc80372e3434fb1897e00b4452a25930dd1b83648fb", src="A/img1.jpg", dt="2015:08:01 18:28:36.90", ts=1438468116.9, fsz=771, tzo=-14400.0, ), PhotoFile( chk="3b39f47d51f63e54c76417ee6e04c34bd3ff5ac47696824426dca9e200f03666", src="A/img2.jpg", dt="2015:08:01 18:28:36.99", ts=1438450116.99, fsz=771, tzo=3600.0, ), PhotoFile( chk="1e10df2e3abe4c810551525b6cb2eb805886de240e04cc7c13c58ae208cabfb9", src="A/img1.png", dt="2015:08:01 18:28:36.90",