async def save_file(
    db_client: DBClient,
    namespace: Namespace,
    path: StrOrPath,
    content: IO[bytes],
) -> File:
    """
    Save a file to both the storage backend and the database.

    If the target name is already taken, the file is stored under the next
    available name instead. For example, if 'f.txt' exists, the file is
    saved as 'f (1).txt'.

    Args:
        db_client (DBClient): Database client.
        namespace (Namespace): Namespace where a file should be saved.
        path (StrOrPath): Path where a file will be saved.
        content (IO): Actual file.

    Raises:
        NotADirectory: If one of the path parents is not a folder.

    Returns:
        File: Saved file.
    """
    parent_dir = os.path.normpath(os.path.dirname(path))

    # Make sure the parent folder exists before writing anything.
    if not await crud.file.exists(db_client, namespace.path, parent_dir):
        await create_folder(db_client, namespace, parent_dir)

    # Resolve name conflicts by picking the next free path for the target.
    target_path = await crud.file.next_path(db_client, namespace.path, path)
    saved = await storage.save(namespace.path, target_path, content)

    media_type = mediatypes.guess(target_path, content)
    fingerprint = hashes.dhash(content, mediatype=media_type)

    # Create the file row and, when a fingerprint was computed, its
    # fingerprint row in a single transaction.
    async for tx in db_client.transaction():  # pragma: no branch
        async with tx:
            file = await crud.file.create(
                tx,
                namespace.path,
                target_path,
                size=saved.size,
                mediatype=media_type,
            )
            if fingerprint is not None:
                await crud.fingerprint.create(tx, file.id, fp=fingerprint)
    return file
async def __call__(
    self,
    ns_path: StrOrPath,
    path: StrOrPath | None = None,
    content: bytes | BytesIO = b"I'm Dummy File!",
) -> File:
    """
    Create a file in the storage and in the database.

    Args:
        ns_path (StrOrPath): Namespace path where the file should be created.
        path (StrOrPath | None): File path. If None, a unique random
            '*.txt' file name is generated.
        content (bytes | BytesIO): File content. Defaults to a dummy text.

    Returns:
        File: Created file.
    """
    # Fix: the default is None, so the annotation must be `StrOrPath | None`
    # (the signature already uses `bytes | BytesIO`, so 3.10+ unions are fine).
    path = path or fake.unique.file_name(category="text", extension="txt")
    parent = os.path.normpath(os.path.dirname(path))
    await storage.makedirs(ns_path, parent)
    if not await crud.file.exists(self._db_conn, ns_path, parent):
        await crud.file.create_folder(self._db_conn, ns_path, parent)

    if isinstance(content, bytes):
        content = BytesIO(content)

    file = await storage.save(ns_path, path, content=content)
    # NOTE(review): `content` has already been consumed by storage.save here,
    # so guessing by content presumably falls back to the filename — confirm
    # whether the stream should be rewound with content.seek(0) first.
    return await crud.file.create(
        self._db_conn,
        ns_path,
        path,
        size=file.size,
        mediatype=mediatypes.guess(path, content),
    )
def test_guess_based_on_file_content():
    # First bytes of a JPEG stream (SOI + APP0 markers); the name alone
    # ("image", no suffix) gives no hint, so content must decide.
    header = b'\xff\xd8\xff\xe0\x00\x10'
    assert mediatypes.guess("image", file=header) == "image/jpeg"
def test_guess_but_filename_does_not_have_suffix():
    # With no extension and no content, the generic fallback type is used.
    result = mediatypes.guess("f")
    assert result == mediatypes.OCTET_STREAM
def test_guess_based_on_file_content_with_fallback_to_filename():
    # Content is not recognizable, so the '.txt' extension wins.
    result = mediatypes.guess("f.txt", file=b"dummy")
    assert result == "text/plain"
def test_guess_based_on_filename():
    # Guessing from the extension alone, no content given.
    expected = "text/plain"
    assert mediatypes.guess("f.txt") == expected
async def reconcile(db_client: DBClient, namespace: Namespace) -> None:
    """
    Create files that are missing in the database, but present in the storage
    and remove files that are present in the database, but missing in the
    storage.

    Args:
        db_client (DBClient): Database client.
        namespace (Namespace): Namespace where file will be reconciled.

    Raises:
        errors.FileNotFound: If path to a folder does not exists.
        errors.NotADirectory: If path to a folder is not a directory.
    """
    ns_path = str(namespace.path)
    # BFS over the storage tree, starting at the namespace root.
    folders = deque(["."])
    missing = []
    # Image files get a perceptual fingerprint computed in a separate pass.
    to_fingerprint = []

    # For now, it is faster to re-create all files from scratch
    # than iterating through large directories looking for one missing/dangling file
    await crud.file.delete_all(db_client, ns_path)
    await crud.file.create_home_folder(db_client, ns_path)

    # Walk storage, collecting every entry as a File row to be (re)created.
    while True:
        try:
            folder = folders.pop()
        except IndexError:
            # Queue exhausted — the whole tree has been visited.
            break

        for file in await storage.iterdir(ns_path, folder):
            if file.is_dir():
                folders.append(file.path)
                size = 0
                mediatype = mediatypes.FOLDER
            else:
                size = file.size
                mediatype = mediatypes.guess(file.name)
                if mediatypes.is_image(mediatype):
                    to_fingerprint.append(file.path)

            missing.append(
                File(
                    id=None,  # type: ignore
                    name=file.name,
                    path=file.path,
                    size=size,
                    mtime=file.mtime,
                    mediatype=mediatype,
                ))

    # Register any mediatypes not yet known to the database.
    mediatype_names = set(file.mediatype for file in missing)
    await crud.mediatype.create_missing(db_client, names=mediatype_names)

    # Insert files concurrently in chunks of up to 500 using the zip_longest
    # "grouper" idiom (the same iterator repeated chunk_size times).
    # NOTE(review): zip_longest pads the last chunk with None entries —
    # presumably crud.file.create_batch skips them; confirm.
    chunk_size = min(len(missing), 500)
    await asyncio.gather(
        *(crud.file.create_batch(db_client, ns_path, files=chunk)
          for chunk in itertools.zip_longest(*[iter(missing)] * chunk_size)))

    # Folder sizes were not set during insertion; recompute them from files.
    await crud.file.restore_all_folders_size(db_client, ns_path)

    # Fingerprinting is CPU-bound, so fan it out to a process pool.
    loop = asyncio.get_running_loop()
    with concurrent.futures.ProcessPoolExecutor() as executor:
        tasks = [
            loop.run_in_executor(executor, _reconcile_calc_fp, storage, ns_path, path)
            for path in to_fingerprint
        ]
        # Results come back in the same order as to_fingerprint.
        fingerprints = await asyncio.gather(*tasks)

    # Same grouper idiom over (path, fingerprint) pairs: a single shared
    # zip iterator repeated chunk_size times, then zip_longest regroups it
    # into chunks of chunk_size (again None-padded on the last chunk).
    chunk_size = min(len(to_fingerprint), 500)
    chunks = [zip(to_fingerprint, fingerprints)] * chunk_size
    await asyncio.gather(*(crud.fingerprint.create_batch(
        db_client, namespace=ns_path, fingerprints=chunk)
        for chunk in itertools.zip_longest(*chunks)))