Example #1
    def load_metadata(
        self,
        full_file_name: Optional[str] = None,
        raw_bytes: Optional[bytearray] = None,
        app1_segment: Optional[bytearray] = None,
        et_process: Optional[exiftool.ExifTool] = None,
        force_exiftool: Optional[bool] = False,
    ) -> bool:
        """
        Use GExiv2 or ExifTool to read the photograph's metadata.

        :param full_file_name: full path of the file from which to read
         the metadata.
        :param raw_bytes: portion of a non-jpeg file from which the
         metadata can be extracted
        :param app1_segment: the app1 segment of a jpeg file, from which
         the metadata can be read
        :param et_process: optional daemon ExifTool process
        :param force_exiftool: whether ExifTool must be used to load the
         metadata
        :return: True if successful, False otherwise
        """

        if force_exiftool or fileformats.use_exiftool_on_photo(
            self.extension, preview_extraction_irrelevant=True
        ):

            self.metadata = metadataexiftool.MetadataExiftool(
                full_file_name=full_file_name,
                et_process=et_process,
                file_type=self.file_type,
            )
            return True
        else:
            try:
                self.metadata = metadataphoto.MetaData(
                    full_file_name=full_file_name,
                    raw_bytes=raw_bytes,
                    app1_segment=app1_segment,
                    et_process=et_process,
                )
            except GLib.GError as e:
                logging.warning(
                    "Could not read metadata from %s. %s", self.full_file_name, e
                )
                self.metadata_failure = True
                return False
            except Exception:
                logging.warning("Could not read metadata from %s", self.full_file_name)
                self.metadata_failure = True
                return False
            else:
                return True
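
    # Editor's note: a hypothetical usage sketch for load_metadata() above; it is not
    # part of the original example. It assumes an RPDFile-like object `photo` exposing
    # this method, plus a daemon ExifTool process started as in scan() further below.
    #
    #     with ExifTool() as et_process:
    #         if photo.load_metadata(
    #             full_file_name=photo.full_file_name,
    #             et_process=et_process,
    #             force_exiftool=False,
    #         ):
    #             ...  # photo.metadata is a MetaData or MetadataExiftool instance
    #         else:
    #             ...  # photo.metadata_failure has been set to True
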
    def generate_thumbnails(self) -> None:
        self.camera = None
        arguments = pickle.loads(self.content)  # type: GenerateThumbnailsArguments
        self.device_name = arguments.name
        logging.info("Generating %s thumbnails for %s",
                     len(arguments.rpd_files), arguments.name)
        if arguments.log_gphoto2:
            self.gphoto2_logging = gphoto2_python_logging()

        self.frontend = self.context.socket(zmq.PUSH)
        self.frontend.connect("tcp://localhost:{}".format(
            arguments.frontend_port))

        self.prefs = Preferences()

        # Whether we must use ExifTool to read photo metadata
        force_exiftool = self.prefs.force_exiftool

        # Whether the entire photo or video is required to extract the thumbnail,
        # which is determined when extracting sample metadata from a photo or video
        # during the device scan
        entire_photo_required = arguments.entire_photo_required
        entire_video_required = arguments.entire_video_required

        # Access and generate Rapid Photo Downloader thumbnail cache
        use_thumbnail_cache = self.prefs.use_thumbnail_cache

        thumbnail_caches = GetThumbnailFromCache(
            use_thumbnail_cache=use_thumbnail_cache)

        # Temporary cache directories, created later only if needed
        self.photo_cache_dir = self.video_cache_dir = None
        cache_file_from_camera = force_exiftool

        rpd_files = arguments.rpd_files

        # with open('tests/thumbnail_data_medium_no_tiff', 'wb') as f:
        #     pickle.dump(rpd_files, f)

        # Must sort files by modification time prior to temporal analysis needed to figure out
        # which thumbnails to prioritize
        rpd_files = sorted(rpd_files, key=attrgetter("modification_time"))

        time_span = arguments.proximity_seconds

        rpd_files2 = []

        if rpd_files:
            gaps, sequences = get_temporal_gaps_and_sequences(
                rpd_files, time_span)

            rpd_files2.extend(gaps)

            indexes = split_indexes(len(sequences))
            rpd_files2.extend([sequences[idx] for idx in indexes])

        assert len(rpd_files) == len(rpd_files2)
        rpd_files = rpd_files2

        if arguments.camera is not None:
            self.camera = Camera(
                model=arguments.camera,
                port=arguments.port,
                is_mtp_device=arguments.is_mtp_device,
                specific_folders=self.prefs.folders_to_scan,
            )

            if not self.camera.camera_initialized:
                # There is nothing to do here: exit!
                logging.debug(
                    "Prematurely exiting thumbnail generation due to lack of access to "
                    "camera %s",
                    arguments.camera,
                )
                self.content = pickle.dumps(
                    GenerateThumbnailsResults(
                        scan_id=arguments.scan_id,
                        camera_removed=True,
                    ),
                    pickle.HIGHEST_PROTOCOL,
                )
                self.send_message_to_sink()
                self.disconnect_logging()
                self.send_finished_command()
                sys.exit(0)

            if not cache_file_from_camera:
                for rpd_file in rpd_files:
                    if use_exiftool_on_photo(
                            rpd_file.extension,
                            preview_extraction_irrelevant=False):
                        cache_file_from_camera = True
                        break

            must_make_cache_dirs = (not self.camera.can_fetch_thumbnails
                                    or cache_file_from_camera)

            if (must_make_cache_dirs or arguments.need_video_cache_dir
                    or arguments.need_photo_cache_dir):
                # If complete copies of the files are being downloaded to generate
                # previews, then we may as well cache them to speed up the download
                # process
                self.photo_cache_dir = create_temp_dir(
                    folder=arguments.cache_dirs.photo_cache_dir,
                    prefix=cache_dir_name(self.device_name),
                )
                self.video_cache_dir = create_temp_dir(
                    folder=arguments.cache_dirs.video_cache_dir,
                    prefix=cache_dir_name(self.device_name),
                )
                cache_dirs = CacheDirs(self.photo_cache_dir,
                                       self.video_cache_dir)
                self.content = pickle.dumps(
                    GenerateThumbnailsResults(scan_id=arguments.scan_id,
                                              cache_dirs=cache_dirs),
                    pickle.HIGHEST_PROTOCOL,
                )
                self.send_message_to_sink()

        from_thumb_cache = 0
        from_fdo_cache = 0

        if self.camera:
            rescan = RescanCamera(camera=self.camera, prefs=self.prefs)
            rescan.rescan_camera(rpd_files)
            rpd_files = rescan.rpd_files
            if rescan.missing_rpd_files:
                logging.error(
                    "%s files could not be relocated on %s",
                    len(rescan.missing_rpd_files),
                    self.camera.display_name,
                )
                for rpd_file in rescan.missing_rpd_files:  # type: RPDFile
                    self.content = pickle.dumps(
                        GenerateThumbnailsResults(rpd_file=rpd_file,
                                                  thumbnail_bytes=None),
                        pickle.HIGHEST_PROTOCOL,
                    )
                    self.send_message_to_sink()

        for rpd_file in rpd_files:  # type: RPDFile
            # Check to see if the process has received a command
            self.check_for_controller_directive()

            exif_buffer = None
            file_to_work_on_is_temporary = False
            secondary_full_file_name = ""
            processing = set()  # type: Set[ExtractionProcessing]

            # Attempt to get thumbnail from Thumbnail Cache
            # (see cache.py for definitions of various caches)

            cache_search = thumbnail_caches.get_from_cache(rpd_file)
            task, thumbnail_bytes, full_file_name_to_work_on, origin = cache_search
            if task != ExtractionTask.undetermined:
                if origin == ThumbnailCacheOrigin.thumbnail_cache:
                    from_thumb_cache += 1
                else:
                    assert origin == ThumbnailCacheOrigin.fdo_cache
                    logging.debug(
                        "Thumbnail for %s found in large FDO cache",
                        rpd_file.full_file_name,
                    )
                    from_fdo_cache += 1
                    processing.add(ExtractionProcessing.resize)
                    if not rpd_file.mdatatime:
                        # Since we're extracting the thumbnail from the FDO cache,
                        # need to grab its metadata too.
                        # Reassign the task
                        task = ExtractionTask.load_file_directly_metadata_from_secondary
                        # It's not being downloaded from a camera, so nothing
                        # special to do except assign the name of the file from which
                        # to extract the metadata
                        secondary_full_file_name = rpd_file.full_file_name
                        logging.debug(
                            "Although the thumbnail was found in the cache, tasked "
                            "to extract the file time recorded in the metadata from %s",
                            secondary_full_file_name,
                        )
            if task == ExtractionTask.undetermined:
                # Thumbnail was not found in any cache: extract it
                if self.camera:  # type: Camera
                    if rpd_file.file_type == FileType.photo:
                        if rpd_file.is_heif():
                            # Load HEIF / HEIC using entire file.
                            # We are assuming that there is no tool to extract a
                            # preview image from an HEIF / HEIC, or the file simply
                            # does not have one to extract.
                            if self.cache_full_size_file_from_camera(rpd_file):
                                task = ExtractionTask.load_heif_and_exif_directly
                                processing.add(ExtractionProcessing.resize)
                                full_file_name_to_work_on = (
                                    rpd_file.cache_full_file_name)
                                # For now, do not orient, as it seems pyheif or libheif does
                                # that automatically.
                                # processing.add(ExtractionProcessing.orient)

                        elif self.camera.can_fetch_thumbnails:
                            task = ExtractionTask.load_from_bytes
                            if rpd_file.is_jpeg_type():
                                # gPhoto2 knows how to get jpeg thumbnails
                                try:
                                    thumbnail_bytes = self.camera.get_thumbnail(
                                        rpd_file.path, rpd_file.name)
                                except CameraProblemEx as e:
                                    # TODO handle error?
                                    thumbnail_bytes = None
                            else:
                                if force_exiftool or use_exiftool_on_photo(
                                        rpd_file.extension,
                                        preview_extraction_irrelevant=False,
                                ):
                                    (
                                        task,
                                        full_file_name_to_work_on,
                                        file_to_work_on_is_temporary,
                                    ) = self.extract_photo_video_from_camera(
                                        rpd_file,
                                        entire_photo_required,
                                        full_file_name_to_work_on,
                                        True,
                                    )
                                    if (
                                        task
                                        == ExtractionTask.load_from_bytes_metadata_from_temp_extract
                                    ):
                                        secondary_full_file_name = (
                                            full_file_name_to_work_on
                                        )
                                        file_to_work_on_is_temporary = False

                                else:
                                    # gPhoto2 does not know how to get RAW thumbnails,
                                    # so we do that part ourselves
                                    if rpd_file.extension == "crw":
                                        # We could cache this file, since we're reading
                                        # it in its entirety. But does anyone download
                                        # a CRW file from the camera these days?!
                                        bytes_to_read = rpd_file.size
                                    else:
                                        bytes_to_read = min(
                                            rpd_file.size,
                                            orientation_offset.get(
                                                rpd_file.extension, 500),
                                        )
                                    exif_buffer = self.camera.get_exif_extract(
                                        rpd_file.path, rpd_file.name,
                                        bytes_to_read)
                                try:
                                    thumbnail_bytes = self.camera.get_thumbnail(
                                        rpd_file.path, rpd_file.name)
                                except CameraProblemEx as e:
                                    # TODO report error
                                    thumbnail_bytes = None
                            processing.add(
                                ExtractionProcessing.strip_bars_photo)
                            processing.add(ExtractionProcessing.orient)
                        else:
                            # Many (all?) jpegs from phones don't include jpeg
                            # previews, so we need to render from the entire jpeg
                            # itself. Slow!

                            # For raw files, extract merely a part of the phone's raw
                            # format and try to extract the jpeg preview from it
                            # (which probably doesn't exist!). This is fast.

                            if not rpd_file.is_jpeg():
                                bytes_to_read = thumbnail_offset.get(
                                    rpd_file.extension)
                                if bytes_to_read:
                                    exif_buffer = self.camera.get_exif_extract(
                                        rpd_file.path, rpd_file.name,
                                        bytes_to_read)
                                    task = ExtractionTask.load_from_exif_buffer
                                    processing.add(ExtractionProcessing.orient)
                            if (
                                task == ExtractionTask.undetermined
                                and self.cache_full_size_file_from_camera(rpd_file)
                            ):
                                if rpd_file.is_jpeg():
                                    task = ExtractionTask.load_file_and_exif_directly
                                    processing.add(ExtractionProcessing.resize)
                                    processing.add(ExtractionProcessing.orient)
                                else:
                                    task = ExtractionTask.load_from_exif
                                    processing.add(ExtractionProcessing.resize)
                                    processing.add(ExtractionProcessing.orient)
                                full_file_name_to_work_on = (
                                    rpd_file.cache_full_file_name)
                            else:
                                # Failed to generate thumbnail
                                task = ExtractionTask.bypass
                    else:
                        # video from camera
                        if rpd_file.thm_full_name is not None:
                            # Fortunately, we have a special video thumbnail file.
                            # Still need to get the metadata time, however.

                            if entire_video_required:
                                offset = rpd_file.size
                            else:
                                offset = datetime_offset.get(
                                    rpd_file.extension)
                                # If there is no offset, there is no point trying to
                                # extract the metadata time from part of the video. It's
                                # not ideal, but if this is from a camera on which there
                                # were any other files we can assume we've got a
                                # somewhat accurate date time for it from the
                                # modification time. The only exception is if the video
                                # file is not that big, in which case it's worth reading
                                # in its entirety:
                                if offset is None and rpd_file.size < 4000000:
                                    offset = rpd_file.size

                            if rpd_file.mdatatime or not offset:
                                task = ExtractionTask.load_from_bytes
                            elif self.cache_file_chunk_from_camera(rpd_file, offset):
                                task = (
                                    ExtractionTask.load_from_bytes_metadata_from_temp_extract
                                )
                                secondary_full_file_name = (
                                    rpd_file.temp_cache_full_file_chunk
                                )
                            else:
                                # For some reason was unable to download part of the
                                # video file
                                task = ExtractionTask.load_from_bytes

                            try:
                                thumbnail_bytes = self.camera.get_THM_file(
                                    rpd_file.thm_full_name)
                            except CameraProblemEx as e:
                                # TODO report error
                                thumbnail_bytes = None
                            processing.add(
                                ExtractionProcessing.strip_bars_video)
                            processing.add(ExtractionProcessing.add_film_strip)
                        else:
                            (
                                task,
                                full_file_name_to_work_on,
                                file_to_work_on_is_temporary,
                            ) = self.extract_photo_video_from_camera(
                                rpd_file,
                                entire_video_required,
                                full_file_name_to_work_on,
                                False,
                            )
                else:
                    # File is not on a camera
                    task = preprocess_thumbnail_from_disk(
                        rpd_file=rpd_file, processing=processing)
                    if task != ExtractionTask.bypass:
                        if rpd_file.thm_full_name is not None:
                            full_file_name_to_work_on = rpd_file.thm_full_name
                            if (
                                task
                                == ExtractionTask.load_file_directly_metadata_from_secondary
                            ):
                                secondary_full_file_name = rpd_file.full_file_name
                        else:
                            full_file_name_to_work_on = rpd_file.full_file_name

            if task == ExtractionTask.bypass:
                self.content = pickle.dumps(
                    GenerateThumbnailsResults(rpd_file=rpd_file,
                                              thumbnail_bytes=thumbnail_bytes),
                    pickle.HIGHEST_PROTOCOL,
                )
                self.send_message_to_sink()

            elif task != ExtractionTask.undetermined:
                # Send data to load balancer, which will send to one of its
                # workers

                self.content = pickle.dumps(
                    ThumbnailExtractorArgument(
                        rpd_file=rpd_file,
                        task=task,
                        processing=processing,
                        full_file_name_to_work_on=full_file_name_to_work_on,
                        secondary_full_file_name=secondary_full_file_name,
                        exif_buffer=exif_buffer,
                        thumbnail_bytes=thumbnail_bytes,
                        use_thumbnail_cache=use_thumbnail_cache,
                        file_to_work_on_is_temporary=file_to_work_on_is_temporary,
                        write_fdo_thumbnail=False,
                        send_thumb_to_main=True,
                        force_exiftool=force_exiftool,
                    ),
                    pickle.HIGHEST_PROTOCOL,
                )
                self.frontend.send_multipart([b"data", self.content])

        if arguments.camera:
            self.camera.free_camera()
            # Delete our temporary cache directories if they are empty
            if self.photo_cache_dir is not None:
                if not os.listdir(self.photo_cache_dir):
                    os.rmdir(self.photo_cache_dir)
            if self.video_cache_dir is not None:
                if not os.listdir(self.video_cache_dir):
                    os.rmdir(self.video_cache_dir)

        logging.debug("Finished phase 1 of thumbnail generation for %s",
                      self.device_name)
        if from_thumb_cache:
            logging.info(
                "{} of {} thumbnails for {} came from thumbnail cache".format(
                    from_thumb_cache, len(rpd_files), self.device_name))
        if from_fdo_cache:
            logging.info(
                "{} of {} thumbnails for {} came from Free Desktop cache".format(
                    from_fdo_cache, len(rpd_files), self.device_name
                )
            )

        self.disconnect_logging()
        self.send_finished_command()
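

# --- Editor's sketch (not from the original example) --------------------------------
# generate_thumbnails() above hands extraction work to a load balancer by pushing a
# pickled ThumbnailExtractorArgument over a ZeroMQ PUSH socket, framed as
# [b"data", <pickled bytes>]. The helper below is a minimal, self-contained sketch of
# that pattern only; its name, the port argument, and the payload are illustrative
# assumptions, not part of the project's API.
def _push_to_load_balancer_sketch(frontend_port: int, payload: object) -> None:
    """Illustrative only: push one pickled payload as generate_thumbnails() does."""
    # Local imports keep the sketch self-contained.
    import pickle

    import zmq

    context = zmq.Context.instance()
    frontend = context.socket(zmq.PUSH)
    frontend.connect("tcp://localhost:{}".format(frontend_port))
    # Mirror the [b"data", content] framing used above; a receiver would unpickle the
    # second frame to recover the payload.
    frontend.send_multipart([b"data", pickle.dumps(payload, pickle.HIGHEST_PROTOCOL)])
    frontend.close()

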
def scan(
    folder: str,
    disk_cach_cleared: bool,
    scan_types: List[str],
    errors: bool,
    outfile: str,
    keep_file_names: bool,
    analyze_previews: bool,
) -> Tuple[List[PhotoAttributes], List[VideoAttributes]]:

    global stop
    global kill

    # Files known to cause analysis problems; they are skipped below
    problematic_files = {"RAW_LEICA_M8.DNG"}

    stop = kill = False

    pbs = progress_bar_scanning()
    pbs.start()

    test_files = []
    not_tested = []
    # Phase 1
    # Determine which files are safe to test, i.e. those not already cached

    if analyze_previews:
        disk_cach_cleared = True

    for dir_name, subdirs, filenames in walk(folder):
        for filename in filenames:
            if filename not in problematic_files:
                ext = extract_extension(filename)
                if ext in scan_types:
                    full_file_name = os.path.join(dir_name, filename)

                    if disk_cach_cleared:
                        test_files.append((full_file_name, ext.upper()))
                    else:
                        bytes_cached, total, in_memory = vmtouch_output(
                            full_file_name)
                        if bytes_cached == 0:
                            test_files.append((full_file_name, ext.upper()))
                        else:
                            not_tested.append(full_file_name)

    stop = True
    pbs.join()

    if not_tested:
        print()
        if len(not_tested) > 20:
            for line in textwrap.wrap(
                    "WARNING: {:,} files will not be analyzed because they are already in the "
                    "kernel disk cache.".format(len(not_tested)),
                    width=80,
            ):
                print(line)
        else:
            print(
                "WARNING: these files will not be analyzed because they are already in the "
                "kernel disk cache:")
            for name in not_tested:
                print(name)
        print()
        for line in textwrap.wrap(
                "Run this script as super user and use command line option -c or --clear to safely "
                "clear the disk cache.",
                width=80,
        ):
            print(line)

        if confirm(prompt="\nDo you want to exit?", resp=True):
            sys.exit(0)

    photos = []
    videos = []

    if test_files:
        print("\nAnalyzing {:,} files:".format(len(test_files)))
        if have_progressbar and not errors:
            bar = pyprind.ProgBar(iterations=len(test_files),
                                  stream=1,
                                  track_time=False,
                                  width=80)
    else:
        print("\nNothing to analyze")

    # Phase 2
    # Get info from files

    if errors:
        context = show_errors()
    else:
        # Redirect stderr, hiding error output from exiv2
        context = stdchannel_redirected(sys.stderr, os.devnull)

    metadata_fail = []

    with context:
        with ExifTool() as exiftool_process:
            for full_file_name, ext in test_files:
                if ext.lower() in VIDEO_EXTENSIONS:
                    va = VideoAttributes(full_file_name, ext, exiftool_process)
                    videos.append(va)
                else:
                    # TODO think about how to handle HEIF files!
                    if use_exiftool_on_photo(
                            ext.lower(), preview_extraction_irrelevant=False):
                        pa = ExifToolPhotoAttributes(full_file_name, ext,
                                                     exiftool_process,
                                                     analyze_previews)
                        pa.process(analyze_previews)
                        photos.append(pa)
                    else:
                        try:
                            metadata = mp.MetaData(
                                full_file_name=full_file_name,
                                et_process=exiftool_process,
                            )
                        except Exception:
                            metadata_fail.append(full_file_name)
                        else:
                            pa = PhotoAttributes(full_file_name, ext,
                                                 exiftool_process,
                                                 analyze_previews)
                            pa.metadata = metadata
                            pa.process(analyze_previews)
                            photos.append(pa)

                if have_progressbar and not errors:
                    bar.update()

    if metadata_fail:
        print()
        for full_file_name in metadata_fail:
            print("Could not read metadata from {}".format(full_file_name))

    if outfile is not None:
        if not keep_file_names:
            for pa in photos:
                pa.file_name = None
            for va in videos:
                va.file_name = None

        with open(outfile, "wb") as save_to:
            pickle.dump((photos, videos), save_to, pickle.HIGHEST_PROTOCOL)

    return photos, videos
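

# --- Editor's sketch (not from the original example) --------------------------------
# When an output file is given, scan() pickles its (photos, videos) result using
# pickle.HIGHEST_PROTOCOL. A later analysis session could reload that file as sketched
# below; the default path here is an assumption for illustration only.
def _load_scan_results_sketch(outfile: str = "scan_results.pickle"):
    """Illustrative only: reload the (photos, videos) tuple written by scan()."""
    import pickle

    with open(outfile, "rb") as saved:
        photos, videos = pickle.load(saved)
    return photos, videos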