def load_metadata(
    self,
    full_file_name: Optional[str] = None,
    raw_bytes: Optional[bytearray] = None,
    app1_segment: Optional[bytearray] = None,
    et_process: exiftool.ExifTool = None,
    force_exiftool: Optional[bool] = False,
) -> bool:
    """
    Use GExiv2 or ExifTool to read the photograph's metadata.

    On success the metadata object is stored in ``self.metadata``. When the
    non-ExifTool reader raises, ``self.metadata_failure`` is set to True and a
    warning naming ``self.full_file_name`` is logged.

    :param full_file_name: full path of the file from which to read the
     metadata.
    :param raw_bytes: portion of a non-jpeg file from which the metadata can be
     extracted
    :param app1_segment: the app1 segment of a jpeg file, from which the
     metadata can be read
    :param et_process: optional daemon ExifTool process
    :param force_exiftool: whether ExifTool must be used to load the metadata
    :return: True if successful, False otherwise
    """

    if force_exiftool or fileformats.use_exiftool_on_photo(
        self.extension, preview_extraction_irrelevant=True
    ):
        self.metadata = metadataexiftool.MetadataExiftool(
            full_file_name=full_file_name,
            et_process=et_process,
            file_type=self.file_type,
        )
        return True

    try:
        self.metadata = metadataphoto.MetaData(
            full_file_name=full_file_name,
            raw_bytes=raw_bytes,
            app1_segment=app1_segment,
            et_process=et_process,
        )
    except GLib.GError as e:
        logging.warning(
            "Could not read metadata from %s. %s", self.full_file_name, e
        )
        self.metadata_failure = True
        return False
    except Exception:
        # Deliberately broad: any failure while reading metadata is treated as
        # "no metadata". Exception (rather than a bare except) lets
        # KeyboardInterrupt and SystemExit propagate instead of being
        # misreported as a metadata failure.
        logging.warning("Could not read metadata from %s", self.full_file_name)
        self.metadata_failure = True
        return False

    return True
def generate_thumbnails(self) -> None:
    """
    Run phase 1 of thumbnail generation for one device.

    The work description is unpickled from ``self.content``. For every file
    the thumbnail caches are consulted first; if nothing is found, an
    extraction task is chosen depending on whether the file is on a camera
    and on its type. Files that are bypassed are reported straight to the
    sink; all other determined tasks are pickled and pushed over the
    ``self.frontend`` ZeroMQ PUSH socket.

    If a camera is specified but cannot be initialized, a "camera removed"
    result is sent and the process exits via ``sys.exit(0)``.
    """

    self.camera = None
    # NOTE(review): pickle.loads on self.content — presumably this always
    # originates from a trusted sibling process; confirm.
    arguments = pickle.loads(self.content)  # type: GenerateThumbnailsArguments
    self.device_name = arguments.name

    logging.info(
        "Generating %s thumbnails for %s", len(arguments.rpd_files), arguments.name
    )
    if arguments.log_gphoto2:
        self.gphoto2_logging = gphoto2_python_logging()

    # Socket over which extraction tasks are pushed (see end of the main loop)
    self.frontend = self.context.socket(zmq.PUSH)
    self.frontend.connect("tcp://localhost:{}".format(arguments.frontend_port))

    self.prefs = Preferences()

    # Whether we must use ExifTool to read photo metadata
    force_exiftool = self.prefs.force_exiftool

    # If the entire photo or video is required to extract the thumbnail, which is
    # determined when extracting sample metadata from a photo or video during the
    # device scan
    entire_photo_required = arguments.entire_photo_required
    entire_video_required = arguments.entire_video_required

    # Access and generate Rapid Photo Downloader thumbnail cache
    use_thumbnail_cache = self.prefs.use_thumbnail_cache

    thumbnail_caches = GetThumbnailFromCache(use_thumbnail_cache=use_thumbnail_cache)

    # NOTE(review): these two locals are never reassigned in this function (the
    # directories created below are stored on self.photo_cache_dir and
    # self.video_cache_dir), so the "is not None" cleanup checks at the end of
    # the function never pass — confirm whether that is intended.
    photo_cache_dir = video_cache_dir = None
    # Forcing ExifTool implies files must be cached from the camera
    cache_file_from_camera = force_exiftool

    rpd_files = arguments.rpd_files

    # with open('tests/thumbnail_data_medium_no_tiff', 'wb') as f:
    #     pickle.dump(rpd_files, f)

    # Must sort files by modification time prior to temporal analysis needed to figure out
    # which thumbnails to prioritize
    rpd_files = sorted(rpd_files, key=attrgetter("modification_time"))

    time_span = arguments.proximity_seconds

    # Reordered copy of rpd_files: the "gaps" first, then the "sequences" in the
    # order given by split_indexes()
    rpd_files2 = []

    if rpd_files:
        gaps, sequences = get_temporal_gaps_and_sequences(rpd_files, time_span)

        rpd_files2.extend(gaps)

        indexes = split_indexes(len(sequences))
        rpd_files2.extend([sequences[idx] for idx in indexes])

    # The reordering must neither drop nor duplicate files
    assert len(rpd_files) == len(rpd_files2)
    rpd_files = rpd_files2

    if arguments.camera is not None:
        self.camera = Camera(
            model=arguments.camera,
            port=arguments.port,
            is_mtp_device=arguments.is_mtp_device,
            specific_folders=self.prefs.folders_to_scan,
        )

        if not self.camera.camera_initialized:
            # There is nothing to do here: exit!
            logging.debug(
                "Prematurely exiting thumbnail generation due to lack of access to "
                "camera %s",
                arguments.camera,
            )
            self.content = pickle.dumps(
                GenerateThumbnailsResults(
                    scan_id=arguments.scan_id,
                    camera_removed=True,
                ),
                pickle.HIGHEST_PROTOCOL,
            )
            self.send_message_to_sink()
            self.disconnect_logging()
            self.send_finished_command()
            sys.exit(0)

        if not cache_file_from_camera:
            # A single file that needs ExifTool is enough to require caching
            for rpd_file in rpd_files:
                if use_exiftool_on_photo(
                    rpd_file.extension, preview_extraction_irrelevant=False
                ):
                    cache_file_from_camera = True
                    break

        must_make_cache_dirs = (
            not self.camera.can_fetch_thumbnails or cache_file_from_camera
        )

        if (
            must_make_cache_dirs
            or arguments.need_video_cache_dir
            or arguments.need_photo_cache_dir
        ):
            # If downloading complete copy of the files to
            # generate previews, then may as well cache them to speed up
            # the download process
            self.photo_cache_dir = create_temp_dir(
                folder=arguments.cache_dirs.photo_cache_dir,
                prefix=cache_dir_name(self.device_name),
            )
            self.video_cache_dir = create_temp_dir(
                folder=arguments.cache_dirs.video_cache_dir,
                prefix=cache_dir_name(self.device_name),
            )
            # Tell the sink which cache directories were created
            cache_dirs = CacheDirs(self.photo_cache_dir, self.video_cache_dir)
            self.content = pickle.dumps(
                GenerateThumbnailsResults(
                    scan_id=arguments.scan_id, cache_dirs=cache_dirs
                ),
                pickle.HIGHEST_PROTOCOL,
            )
            self.send_message_to_sink()

    # Counters used only for the summary log messages at the end
    from_thumb_cache = 0
    from_fdo_cache = 0

    if self.camera:
        # Rescan the camera and continue with the file list the rescan produced
        rescan = RescanCamera(camera=self.camera, prefs=self.prefs)
        rescan.rescan_camera(rpd_files)
        rpd_files = rescan.rpd_files
        if rescan.missing_rpd_files:
            logging.error(
                "%s files could not be relocated on %s",
                len(rescan.missing_rpd_files),
                self.camera.display_name,
            )
            # Report each missing file to the sink with no thumbnail
            for rpd_file in rescan.missing_rpd_files:  # type: RPDFile
                self.content = pickle.dumps(
                    GenerateThumbnailsResults(
                        rpd_file=rpd_file, thumbnail_bytes=None
                    ),
                    pickle.HIGHEST_PROTOCOL,
                )
                self.send_message_to_sink()

    for rpd_file in rpd_files:  # type: RPDFile
        # Check to see if the process has received a command
        self.check_for_controller_directive()

        # Per-file state handed to the extractor
        exif_buffer = None
        file_to_work_on_is_temporary = False
        secondary_full_file_name = ""
        processing = set()  # type: Set[ExtractionProcessing]

        # Attempt to get thumbnail from Thumbnail Cache
        # (see cache.py for definitions of various caches)

        cache_search = thumbnail_caches.get_from_cache(rpd_file)
        task, thumbnail_bytes, full_file_name_to_work_on, origin = cache_search
        if task != ExtractionTask.undetermined:
            if origin == ThumbnailCacheOrigin.thumbnail_cache:
                from_thumb_cache += 1
            else:
                assert origin == ThumbnailCacheOrigin.fdo_cache
                logging.debug(
                    "Thumbnail for %s found in large FDO cache",
                    rpd_file.full_file_name,
                )
                from_fdo_cache += 1
                processing.add(ExtractionProcessing.resize)
                if not rpd_file.mdatatime:
                    # Since we're extracting the thumbnail from the FDO cache,
                    # need to grab its metadata too.
                    # Reassign the task
                    task = ExtractionTask.load_file_directly_metadata_from_secondary
                    # It's not being downloaded from a camera, so nothing
                    # special to do except assign the name of the file from which
                    # to extract the metadata
                    secondary_full_file_name = rpd_file.full_file_name
                    logging.debug(
                        "Although thumbnail found in the cache, tasked to extract "
                        "file time recorded in metadata from %s",
                        secondary_full_file_name,
                    )

        if task == ExtractionTask.undetermined:
            # Thumbnail was not found in any cache: extract it
            if self.camera:  # type: Camera
                if rpd_file.file_type == FileType.photo:
                    if rpd_file.is_heif():
                        # Load HEIF / HEIC using entire file.
                        # We are assuming that there is no tool to extract a
                        # preview image from an HEIF / HEIC, or the file simply
                        # does not have one to extract.
                        if self.cache_full_size_file_from_camera(rpd_file):
                            task = ExtractionTask.load_heif_and_exif_directly
                            processing.add(ExtractionProcessing.resize)
                            full_file_name_to_work_on = (
                                rpd_file.cache_full_file_name
                            )
                            # For now, do not orient, as it seems pyheif or libheif does
                            # that automatically.
                            # processing.add(ExtractionProcessing.orient)
                    elif self.camera.can_fetch_thumbnails:
                        task = ExtractionTask.load_from_bytes
                        if rpd_file.is_jpeg_type():
                            # gPhoto2 knows how to get jpeg thumbnails
                            try:
                                thumbnail_bytes = self.camera.get_thumbnail(
                                    rpd_file.path, rpd_file.name
                                )
                            except CameraProblemEx as e:
                                # TODO handle error?
                                thumbnail_bytes = None
                        else:
                            if force_exiftool or use_exiftool_on_photo(
                                rpd_file.extension,
                                preview_extraction_irrelevant=False,
                            ):
                                (
                                    task,
                                    full_file_name_to_work_on,
                                    file_to_work_on_is_temporary,
                                ) = self.extract_photo_video_from_camera(
                                    rpd_file,
                                    entire_photo_required,
                                    full_file_name_to_work_on,
                                    True,
                                )
                                if (
                                    task
                                    == ExtractionTask.load_from_bytes_metadata_from_temp_extract
                                ):
                                    # The extracted file is used only as the
                                    # metadata source
                                    secondary_full_file_name = (
                                        full_file_name_to_work_on
                                    )
                                    file_to_work_on_is_temporary = False

                            else:
                                # gPhoto2 does not know how to get RAW thumbnails,
                                # so we do that part ourselves
                                if rpd_file.extension == "crw":
                                    # Could cache this file, since reading its
                                    # entirety But does anyone download a CRW file
                                    # from the camera these days?!
                                    bytes_to_read = rpd_file.size
                                else:
                                    # 500 is the fallback when the extension has
                                    # no entry in orientation_offset
                                    bytes_to_read = min(
                                        rpd_file.size,
                                        orientation_offset.get(
                                            rpd_file.extension, 500
                                        ),
                                    )
                                exif_buffer = self.camera.get_exif_extract(
                                    rpd_file.path, rpd_file.name, bytes_to_read
                                )
                            try:
                                thumbnail_bytes = self.camera.get_thumbnail(
                                    rpd_file.path, rpd_file.name
                                )
                            except CameraProblemEx as e:
                                # TODO report error
                                thumbnail_bytes = None
                        processing.add(ExtractionProcessing.strip_bars_photo)
                        processing.add(ExtractionProcessing.orient)
                    else:
                        # Many (all?) jpegs from phones don't include jpeg previews,
                        # so need to render from the entire jpeg itself. Slow!

                        # For raw, extract merely a part of phone's raw format, and
                        # try to extract the jpeg preview from it (which probably
                        # doesn't exist!). This is fast.

                        if not rpd_file.is_jpeg():
                            bytes_to_read = thumbnail_offset.get(rpd_file.extension)
                            if bytes_to_read:
                                exif_buffer = self.camera.get_exif_extract(
                                    rpd_file.path, rpd_file.name, bytes_to_read
                                )
                                task = ExtractionTask.load_from_exif_buffer
                                processing.add(ExtractionProcessing.orient)
                        if (
                            task == ExtractionTask.undetermined
                            and self.cache_full_size_file_from_camera(rpd_file)
                        ):
                            if rpd_file.is_jpeg():
                                task = ExtractionTask.load_file_and_exif_directly
                                processing.add(ExtractionProcessing.resize)
                                processing.add(ExtractionProcessing.orient)
                            else:
                                task = ExtractionTask.load_from_exif
                                processing.add(ExtractionProcessing.resize)
                                processing.add(ExtractionProcessing.orient)
                            full_file_name_to_work_on = (
                                rpd_file.cache_full_file_name
                            )
                        else:
                            # Failed to generate thumbnail
                            # NOTE(review): as written, this branch also runs
                            # when task was already set to
                            # load_from_exif_buffer above, replacing it with
                            # bypass — confirm that is intended.
                            task = ExtractionTask.bypass
                else:
                    # video from camera
                    if rpd_file.thm_full_name is not None:
                        # Fortunately, we have a special video thumbnail file
                        # Still need to get metadata time, however.

                        if entire_video_required:
                            offset = rpd_file.size
                        else:
                            offset = datetime_offset.get(rpd_file.extension)
                            # If there is no offset, there is no point trying to
                            # extract the metadata time from part of the video. It's
                            # not ideal, but if this is from a camera on which there
                            # were any other files we can assume we've got a
                            # somewhat accurate date time for it from the
                            # modification time. The only exception is if the video
                            # file is not that big, in which case it's worth reading
                            # in its entirety:
                            if offset is None and rpd_file.size < 4000000:
                                offset = rpd_file.size

                        if rpd_file.mdatatime or not offset:
                            task = ExtractionTask.load_from_bytes
                        elif self.cache_file_chunk_from_camera(rpd_file, offset):
                            task = (
                                ExtractionTask.load_from_bytes_metadata_from_temp_extract
                            )
                            secondary_full_file_name = (
                                rpd_file.temp_cache_full_file_chunk
                            )
                        else:
                            # For some reason was unable to download part of the
                            # video file
                            task = ExtractionTask.load_from_bytes

                        try:
                            thumbnail_bytes = self.camera.get_THM_file(
                                rpd_file.thm_full_name
                            )
                        except CameraProblemEx as e:
                            # TODO report error
                            thumbnail_bytes = None
                        processing.add(ExtractionProcessing.strip_bars_video)
                        processing.add(ExtractionProcessing.add_film_strip)
                    else:
                        # No THM file: extract from the video itself
                        (
                            task,
                            full_file_name_to_work_on,
                            file_to_work_on_is_temporary,
                        ) = self.extract_photo_video_from_camera(
                            rpd_file,
                            entire_video_required,
                            full_file_name_to_work_on,
                            False,
                        )
            else:
                # File is not on a camera
                task = preprocess_thumbnail_from_disk(
                    rpd_file=rpd_file, processing=processing
                )
                if task != ExtractionTask.bypass:
                    if rpd_file.thm_full_name is not None:
                        full_file_name_to_work_on = rpd_file.thm_full_name
                        if (
                            task
                            == ExtractionTask.load_file_directly_metadata_from_secondary
                        ):
                            secondary_full_file_name = rpd_file.full_file_name
                    else:
                        full_file_name_to_work_on = rpd_file.full_file_name

        if task == ExtractionTask.bypass:
            # Nothing to extract: report directly to the sink
            self.content = pickle.dumps(
                GenerateThumbnailsResults(
                    rpd_file=rpd_file, thumbnail_bytes=thumbnail_bytes
                ),
                pickle.HIGHEST_PROTOCOL,
            )
            self.send_message_to_sink()

        elif task != ExtractionTask.undetermined:
            # Send data to load balancer, which will send to one of its
            # workers

            self.content = pickle.dumps(
                ThumbnailExtractorArgument(
                    rpd_file=rpd_file,
                    task=task,
                    processing=processing,
                    full_file_name_to_work_on=full_file_name_to_work_on,
                    secondary_full_file_name=secondary_full_file_name,
                    exif_buffer=exif_buffer,
                    thumbnail_bytes=thumbnail_bytes,
                    use_thumbnail_cache=use_thumbnail_cache,
                    file_to_work_on_is_temporary=file_to_work_on_is_temporary,
                    write_fdo_thumbnail=False,
                    send_thumb_to_main=True,
                    force_exiftool=force_exiftool,
                ),
                pickle.HIGHEST_PROTOCOL,
            )
            self.frontend.send_multipart([b"data", self.content])
        # A task still undetermined at this point results in no message at all

    if arguments.camera:
        self.camera.free_camera()
        # Delete our temporary cache directories if they are empty
        # NOTE(review): the locals tested here are still None at this point
        # (see their initialization above), so neither rmdir is reached.
        if photo_cache_dir is not None:
            if not os.listdir(self.photo_cache_dir):
                os.rmdir(self.photo_cache_dir)
        if video_cache_dir is not None:
            if not os.listdir(self.video_cache_dir):
                os.rmdir(self.video_cache_dir)

    logging.debug(
        "Finished phase 1 of thumbnail generation for %s", self.device_name
    )
    if from_thumb_cache:
        logging.info(
            "{} of {} thumbnails for {} came from thumbnail cache".format(
                from_thumb_cache, len(rpd_files), self.device_name
            )
        )
    if from_fdo_cache:
        logging.info(
            "{} of {} thumbnails of for {} came from Free Desktop cache".format(
                from_fdo_cache, len(rpd_files), self.device_name
            )
        )

    self.disconnect_logging()
    self.send_finished_command()
def scan(
    folder: str,
    disk_cach_cleared: bool,
    scan_types: List[str],
    errors: bool,
    outfile: str,
    keep_file_names: bool,
    analyze_previews: bool,
) -> Tuple[List[PhotoAttributes], List[VideoAttributes]]:
    """
    Walk a folder and collect attributes of the photos and videos in it.

    Phase 1 selects the files to analyze; unless the disk cache is treated as
    cleared, files reported by vmtouch as having cached bytes are excluded and
    the user is asked whether to exit. Phase 2 builds an attributes object for
    each selected file.

    :param folder: top-level folder to walk
    :param disk_cach_cleared: if True, files are analyzed without checking
     whether they are in the kernel disk cache
    :param scan_types: file extensions to include in the scan
    :param errors: if True, run phase 2 inside show_errors() instead of
     redirecting stderr to os.devnull; also disables the progress bar
    :param outfile: if not None, path to which the results are pickled
    :param keep_file_names: if False, file_name is set to None on every
     result before pickling to outfile
    :param analyze_previews: passed through to the photo attribute objects;
     when True the disk cache is treated as cleared
    :return: the photo attributes and the video attributes collected
    """

    global stop
    global kill

    # File names to skip. This is a tuple so that the membership test below
    # matches whole file names; with a plain string, "in" is a substring test
    # and would also skip any file whose name is a fragment of this one
    # (e.g. "M8.DNG").
    problematic_files = ("RAW_LEICA_M8.DNG",)

    stop = kill = False

    pbs = progress_bar_scanning()
    pbs.start()

    test_files = []
    not_tested = []

    # Phase 1
    # Determine which files are safe to test i.e. are not cached

    if analyze_previews:
        disk_cach_cleared = True

    for dir_name, _subdirs, filenames in walk(folder):
        for filename in filenames:
            if filename in problematic_files:
                continue
            ext = extract_extension(filename)
            if ext not in scan_types:
                continue
            full_file_name = os.path.join(dir_name, filename)
            if disk_cach_cleared:
                test_files.append((full_file_name, ext.upper()))
            else:
                bytes_cached, _total, _in_memory = vmtouch_output(full_file_name)
                if bytes_cached == 0:
                    test_files.append((full_file_name, ext.upper()))
                else:
                    not_tested.append(full_file_name)

    # Signal the scanning progress indicator to finish, and wait for it
    stop = True
    pbs.join()

    if not_tested:
        print()
        if len(not_tested) > 20:
            for line in textwrap.wrap(
                "WARNING: {:,} files will not be analyzed because they are already in the "
                "kernel disk cache.".format(len(not_tested)),
                width=80,
            ):
                print(line)
        else:
            print(
                "WARNING: these files will not be analyzed because they are already in the "
                "kernel disk cache:"
            )
            for name in not_tested:
                print(name)
        print()
        for line in textwrap.wrap(
            "Run this script as super user and use command line option -c or --clear to safely "
            "clear the disk cache.",
            width=80,
        ):
            print(line)

        if confirm(prompt="\nDo you want to exit?", resp=True):
            sys.exit(0)

    photos = []
    videos = []

    # The progress bar is shown only when error output is not being displayed
    show_progress = have_progressbar and not errors

    if test_files:
        print("\nAnalyzing {:,} files:".format(len(test_files)))
        if show_progress:
            bar = pyprind.ProgBar(
                iterations=len(test_files), stream=1, track_time=False, width=80
            )
    else:
        print("\nNothing to analyze")

    # Phase 2
    # Get info from files

    if errors:
        context = show_errors()
    else:
        # Redirect stderr, hiding error output from exiv2
        context = stdchannel_redirected(sys.stderr, os.devnull)

    metadata_fail = []

    with context:
        with ExifTool() as exiftool_process:
            for full_file_name, ext in test_files:
                ext_lower = ext.lower()
                if ext_lower in VIDEO_EXTENSIONS:
                    va = VideoAttributes(full_file_name, ext, exiftool_process)
                    videos.append(va)
                else:
                    # TODO think about how to handle HEIF files!
                    if use_exiftool_on_photo(
                        ext_lower, preview_extraction_irrelevant=False
                    ):
                        pa = ExifToolPhotoAttributes(
                            full_file_name, ext, exiftool_process, analyze_previews
                        )
                        pa.process(analyze_previews)
                        photos.append(pa)
                    else:
                        try:
                            metadata = mp.MetaData(
                                full_file_name=full_file_name,
                                et_process=exiftool_process,
                            )
                        except Exception:
                            # Any failure to read metadata is recorded and
                            # reported after the scan. Exception (not a bare
                            # except) lets Ctrl-C and SystemExit abort the
                            # scan instead of being logged as a bad file.
                            metadata_fail.append(full_file_name)
                        else:
                            pa = PhotoAttributes(
                                full_file_name, ext, exiftool_process, analyze_previews
                            )
                            pa.metadata = metadata
                            pa.process(analyze_previews)
                            photos.append(pa)

                if show_progress:
                    bar.update()

    if metadata_fail:
        print()
        for full_file_name in metadata_fail:
            print("Could not read metadata from {}".format(full_file_name))

    if outfile is not None:
        if not keep_file_names:
            # Strip file names from the results before saving them
            for pa in photos:
                pa.file_name = None
            for va in videos:
                va.file_name = None

        with open(outfile, "wb") as save_to:
            pickle.dump((photos, videos), save_to, pickle.HIGHEST_PROTOCOL)

    return photos, videos