def get_results(coco, imgs_dir):
    """
    Get the masking results for all images in `imgs_dir`.

    :param coco: COCO object representing the dataset.
    :type coco: COCO
    :param imgs_dir: Path to base directory with images to use for evaluation.
    :type imgs_dir: str
    :return: Masking results. Image IDs are keys, and masking results (output from `Masker.mask`) are values.
    :rtype: dict
    """
    LOGGER.info(__name__, "Building results.")
    tree_walker = TreeWalker(imgs_dir, [], skip_webp=False, precompute_paths=True)
    dataset = get_tf_dataset(tree_walker)
    dataset_iterator = iter(dataset)
    filename_to_image_id = {img_dict["file_name"]: _id for _id, img_dict in coco.imgs.items()}
    masker = Masker()
    results = {}

    for i, paths in enumerate(tree_walker.walk()):
        tic = time.time()
        img = next(dataset_iterator)
        mask_results = masker.mask(img)
        image_id = filename_to_image_id[paths.filename]
        results[image_id] = mask_results
        dt = time.time() - tic
        LOGGER.info(__name__, f"Processed image {i+1}/{tree_walker.n_valid_images} in {round(dt, 2)} s. "
                              f"File: {paths.filename}")

    return results

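# Illustrative usage sketch (hypothetical paths and annotation file; assumes the COCO class used here is
# `pycocotools.coco.COCO`):
#
#     coco = COCO("annotations/val_annotations.json")
#     results = get_results(coco, "/data/eval_images")
#     # `results` maps image IDs to the output of `Masker.mask` for the corresponding image.
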
def _copy_file(source_file, destination_file):
    if os.path.exists(destination_file):
        LOGGER.warning(__name__, f"Archive file {destination_file} already exists. The existing file will be "
                                 f"overwritten.")
    copy2(source_file, destination_file)

def wait_until_path_is_found(paths, retry_interval=config.file_access_retry_seconds,
                             timeout=config.file_access_timeout_seconds):
    """
    Block execution until all elements of `paths` are valid paths, or until `timeout` seconds have passed. If the
    timeout is reached, and one or more paths still do not exist, a `PathNotReachableError` will be raised.

    :param paths: Path, or iterable of paths, to wait for. Each element must be a string (or `bytes`).
    :type paths: str | list of str | tuple of str | np.ndarray
    :param retry_interval: Number of seconds to wait between each retry.
    :type retry_interval: int
    :param timeout: Total number of seconds to wait.
    :type timeout: int
    :return: 0, if the existence of all paths is confirmed before the timeout is reached.
    :rtype: int
    """
    total_wait_time = 0
    if not isinstance(paths, (list, tuple, np.ndarray)):
        paths = [paths]

    while not all_exists(paths):
        time.sleep(retry_interval)
        total_wait_time += retry_interval
        if total_wait_time > timeout:
            raise PathNotReachableError(f"At least one of the paths in {paths} could not be reached in {timeout}s. "
                                        f"Aborting.")
        else:
            LOGGER.warning(__name__, f"At least one of the paths in {paths} could not be reached. Retrying.")
    return 0

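# Usage sketch (hypothetical paths; retry/timeout values given explicitly instead of the `config` defaults):
#
#     wait_until_path_is_found(["/mnt/input", "/mnt/archive"], retry_interval=5, timeout=60)
#     # Execution only continues past this point once both directories are reachable;
#     # otherwise a PathNotReachableError is raised after 60 seconds.
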
def initialize():
    logging.basicConfig(level=logging.DEBUG, format=LOGGER.fmt, datefmt=LOGGER.datefmt)
    set_excepthook([logger_excepthook])

    args = get_args()

    if args.log_folder is not None:
        os.makedirs(args.log_folder, exist_ok=True)
        log_file_name = config.log_file_name.format(datetime=datetime.now().strftime("%Y-%m-%d_%H%M%S"),
                                                    hostname=gethostname())
        log_file = os.path.join(args.log_folder, log_file_name)
        LOGGER.set_log_file(log_file)

    tree_walker = TreeWalker(args.input_dir, [], skip_webp=False, precompute_paths=True, ext="json")
    database_client = DatabaseClient(table_name=args.table_name,
                                     max_n_accumulated_rows=config.db_max_n_accumulated_rows,
                                     max_n_errors=config.db_max_n_errors,
                                     max_cache_size=config.db_max_cache_size,
                                     enable_cache=False)
    return tree_walker, database_client

def initialize():
    logging.basicConfig(level=logging.DEBUG, format=LOGGER.fmt, datefmt=LOGGER.datefmt)
    set_excepthook([logger_excepthook])

    args = get_args()

    if args.log_folder is not None:
        log_dir = os.path.abspath(args.log_folder)
        os.makedirs(log_dir, exist_ok=True)
        log_file_name = config.log_file_name.format(datetime=datetime.now().strftime("%Y-%m-%d_%H%M%S"),
                                                    hostname=gethostname())
        log_file = os.path.join(log_dir, log_file_name)
        LOGGER.set_log_file(log_file)

    input_dir = os.path.abspath(args.input_folder)
    output_dir = os.path.abspath(args.output_folder)
    os.makedirs(output_dir, exist_ok=True)

    tree_walker = TreeWalker(input_dir, [output_dir], skip_webp=False, precompute_paths=True)
    return tree_walker

def get(self):
    """
    Get the result from the worker.

    :return: Return value from `self.async_func`.
    :rtype:
    """
    if self.pool is not None:
        # Try to get the result from the asynchronous worker. If it raises an exception, handle the exception.
        try:
            result = self.async_worker.get()
            assert self.result_is_valid(result), f"Invalid result: '{result}'"
        except self.worker_exceptions as err:
            self.handle_error(err)
            return ERROR_RETVAL
    else:
        # The execution was not run asynchronously, which means that the result is stored in `self.async_worker`.
        result = self.async_worker

    LOGGER.debug(__name__, self.finished_message.format(image_file=self.paths.input_file))
    return result

def _finish_image(self, paths, exif_result):
    """
    Finish processing for an image. This function will:

    - (optionally) Write the EXIF data to the database. (If `config.write_exif_to_db == True`.)
    - Remove the cache file for the image
    - (optionally) Remove the input image. (If `config.delete_input == True`.)

    :param paths: Paths object representing the image file.
    :type paths: src.io.TreeWalker.Paths
    :param exif_result: JSON metadata file contents. Will be used to write to the database if database writing is
                        enabled.
    :type exif_result: dict
    """
    # If we have an active database_client, add the EXIF data to the database client.
    if self.database_client is not None and exif_result is not None:
        self.database_client.add_row(exif_result)

    # Remove the cache file
    paths.remove_cache_file()

    # Delete the input file?
    if config.delete_input:
        wait_until_path_is_found(paths.input_file)
        os.remove(paths.input_file)
        LOGGER.debug(__name__, f"Input file removed: {paths.input_file}")

        # Remove the input folder if it is empty and it's not the base input folder.
        remove_empty_folders(start_dir=paths.input_dir, top_dir=paths.base_input_dir)

    self.n_completed += 1

def process_image(self, image, paths):
    """
    Run the processing pipeline for `image`.

    :param image: Input image. Must be a 4D color image tensor with shape (1, height, width, 3)
    :type image: tf.python.framework.ops.EagerTensor
    :param paths: Paths object representing the image file.
    :type paths: src.io.TreeWalker.Paths
    """
    start_time = time.time()

    # Compute the detected objects and their masks.
    mask_results = self.masker.mask(image)
    time_delta = "{:.3f}".format(time.time() - start_time)
    LOGGER.info(__name__, f"Masked image in {time_delta} s. File: {paths.input_file}")

    # Convert the image to a numpy array
    if not isinstance(image, np.ndarray):
        image = image.numpy()

    # If we have reached the maximum number of workers, wait for them to finish.
    if len(self.workers) >= self.max_num_async_workers:
        self._wait_for_workers()

    # Create workers for the current image.
    self._spawn_workers(paths, image, mask_results)

def get_mappenavn(image_path, exif):
    dirs = image_path.split(os.sep)[:-1]
    if config.exif_top_dir in dirs:
        # Use "/" as the separator in the relative path. Swap in the commented line below to use the OS-specific
        # separator instead.
        rel_path = "/".join(dirs[(dirs.index(config.exif_top_dir) + 1):])
        # rel_path = os.sep.join(dirs[(dirs.index(config.exif_top_dir) + 1):])
    else:
        LOGGER.warning(__name__, f"Top directory '{config.exif_top_dir}' not found in image path '{image_path}'. "
                                 f"'rel_path' will be empty")
        rel_path = ""

    timestamp = iso8601.parse_date(exif["exif_tid"])
    format_values = dict(
        aar=timestamp.year,
        maaned=timestamp.month,
        dag=timestamp.day,
        fylke=str(exif["exif_fylke"]).zfill(2),
        vegkat=exif["exif_vegkat"],
        vegstat=exif["exif_vegstat"],
        vegnr=exif["exif_vegnr"],
        hp=exif["exif_hp"],
        meter=exif["exif_meter"],
        feltkode=exif["exif_feltkode"],
        strekningreferanse=exif["exif_strekningreferanse"],
        relative_input_dir=rel_path
    )
    folder_name = config.exif_mappenavn.format(**format_values)
    assert "{" not in folder_name and "}" not in folder_name, f"Invalid `Mappenavn`: {config.exif_mappenavn} -> " \
                                                              f"{folder_name}."
    return folder_name

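# Illustrative sketch of how the `Mappenavn` template is filled in. The template below is hypothetical; the real one
# comes from `config.exif_mappenavn`:
#
#     config.exif_mappenavn = "{aar}/{fylke}/{vegkat}{vegstat}{vegnr}/{relative_input_dir}"
#     # With exif_tid = "2020-06-01T10:00:00", exif_fylke = "3", exif_vegkat = "E", exif_vegstat = "V",
#     # exif_vegnr = "6" and rel_path = "F1/2020", this would give "2020/03/EV6/F1/2020".
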
def _maybe_restart_worker(self, paths, worker):
    """
    Restart the worker if it has been started less than `self.max_worker_starts` times previously. Otherwise, log an
    error, and save the error image.

    :param paths: Paths object representing the image file.
    :type paths: src.io.TreeWalker.Paths
    :param worker: Worker to maybe restart
    :type worker: src.Workers.BaseWorker
    :return: True if worker was restarted, False otherwise
    :rtype: bool
    """
    if worker.n_starts > self.max_worker_starts:
        LOGGER.error(__name__, f"{worker.__class__.__name__} failed for image: {paths.input_file}.", save=True,
                     email=True, email_mode="error")
        return False
    else:
        worker.start()
        LOGGER.debug(__name__, f"Restarted {worker.__class__.__name__} for image: {paths.input_file}.")
        return True

def remove_cache_file(self):
    if os.path.isfile(self.cache_file):
        os.remove(self.cache_file)
    else:
        LOGGER.warning(__name__, f"Attempted to remove cache file '{self.cache_file}', but it does not exist.")

def ID(json_data):
    # Try to get 'bildeid' from the json_data.
    image_id = json_data.get("bildeid", None)

    # If 'bildeid' could not be found in the json_data, create it from the contents.
    if image_id is None:
        LOGGER.warning(__name__, "Could not find 'bildeid' in JSON data. The ID will be created from the contents of "
                                 "the JSON data instead.")
        image_id = get_deterministic_id(json_data)

    return image_id

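# Usage sketch (hypothetical JSON contents):
#
#     ID({"bildeid": "abc123", ...})              # -> "abc123"
#     ID({"exif_tid": "2020-06-01T10:00:00", ...})  # -> deterministic ID computed by `get_deterministic_id`
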
def _init_model(self):
    """ Initialize the TensorFlow-graph """
    saved_model_path = os.path.join(config.MODEL_PATH, "saved_model")

    # Download and extract model
    if not os.path.exists(saved_model_path):
        LOGGER.info(__name__, "Could not find the model graph file. Downloading...")
        download_model(config.DOWNLOAD_BASE, config.MODEL_NAME, config.MODEL_PATH, extract_all=True)
        LOGGER.info(__name__, "Model graph file downloaded.")

    model = tf.saved_model.load(saved_model_path)
    self.model = model.signatures["serving_default"]

def send_mail(message_type, etype=None, ex=None, tb=None, msg=None):
    """
    Send an email of type `message_type`. The sender, receiver(s) and smtp-server are configured in `email_config.py`.
    If `--log-folder` is specified to `src.main`, the log-file will be attached to the message.

    :param message_type: Type of message. This determines the subject and contents of the message. Must be one of

                         - `critical`: This is suitable for critical errors which cause the program to exit
                           abnormally. A critical message requires `etype`, `ex` and `tb` to be specified, and will
                           include the exception type in the subject, and the traceback in the contents.
                         - `error`: This message is suitable for processing errors which do not cause the program to
                           exit.
                         - `finished`: This message type should be used when the program exits normally.

    :type message_type: str
    :param etype: Exception type
    :type etype: type | None
    :param ex: Exception instance
    :type ex: BaseException | None
    :param tb: Traceback object
    :type tb: traceback.traceback | None
    :param msg: Message to include in the contents of the email.
    :type msg: str | None
    """
    # Determine subject
    if message_type == "critical":
        msg = "".join(traceback.format_exception(etype, ex, tb))
        subject = CRITICAL_SUBJECT.format(etype=etype.__name__, hostname=gethostname())
    elif message_type == "error":
        subject = ERROR_SUBJECT.format(hostname=gethostname())
    elif message_type == "finished":
        subject = FINISHED_SUBJECT.format(hostname=gethostname())
    else:
        raise ValueError(f"Function `email.send_mail` got invalid message type: {message_type}")

    # Create the message
    message = create_base_message(subject, msg)

    # Try to send the email. If sending fails, log the message as an error, and continue.
    try:
        with smtplib.SMTP(email_config.smtp_host, email_config.port) as smtp:
            smtp.sendmail(from_addr=email_config.from_address, to_addrs=email_config.to_addresses, msg=message)
    except Exception as err:
        LOGGER.error(__name__, f"Got error '{str(err)}' when attempting to send e-mail.")

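# Usage sketch: sending a "critical" message from an exception handler. The argument names mirror the docstring
# above; the surrounding handler is hypothetical:
#
#     try:
#         run_pipeline()
#     except Exception as err:
#         send_mail("critical", etype=type(err), ex=err, tb=err.__traceback__)
#         raise
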
def clear_cache_file(file_path):
    """
    Clear the output files for the unfinished image whose cache file is located at `file_path`

    :param file_path: Path to cache file for unfinished image
    :type file_path: str
    """
    # Read the JSON file
    try:
        with open(file_path, "r") as f:
            cache_info = json.load(f)
    except json.JSONDecodeError:
        # If we got a JSONDecodeError, it was most likely because the program was killed before it finished writing
        # the file. Since cache file writing is the first step when exporting the output images, we have no output
        # images to clean up. We therefore remove the (incomplete) cache file and continue.
        os.remove(file_path)
        return

    # Create a `src.io.TreeWalker.Paths` object representing the image
    paths = Paths(base_input_dir=cache_info["base_input_dir"], base_mirror_dirs=cache_info["base_mirror_dirs"],
                  input_dir=cache_info["input_dir"], mirror_dirs=cache_info["mirror_dirs"],
                  filename=cache_info["filename"])

    # Wait for the directories if they cannot be reached
    try:
        wait_until_path_is_found([paths.base_input_dir, *paths.base_mirror_dirs])
    except PathNotReachableError as err:
        raise PathNotReachableError(f"The directories pointed to by the cache file '{file_path}' could not be found. "
                                    f"If they were deleted manually, delete this cache file and run the program "
                                    f"again.") from err

    # Remove any expected output files if they are present
    for expected_file in get_expected_files(paths):
        if os.path.isfile(expected_file):
            os.remove(expected_file)
            LOGGER.info(__name__, f"Removed file '{expected_file}' for unfinished image '{paths.input_file}'")
        else:
            LOGGER.debug(__name__, f"Could not find file '{expected_file}' for unfinished image "
                                   f"'{paths.input_file}'")

    # Remove the cache file
    os.remove(file_path)

def _update_rows(self, cursor, rows):
    LOGGER.info(__name__, f"Attempting to update {len(rows)} row(s) in the database.")

    # Attempt to update the rows. When we have `batcherrors = True`, the valid rows will be updated normally.
    cursor.executemany(self.table.update_sql, rows, batcherrors=True)

    # Get the errors caused by the rows where the update failed.
    errors = [e for e in cursor.getbatcherrors()]

    # Add number of updated rows to total counter
    n_updated = len(rows) - len(errors)
    self.total_updated += n_updated

    LOGGER.info(__name__, f"Successfully updated {n_updated} row(s) in the database.")
    return errors

def clear_cache():
    """
    Clear the cache directory. Each JSON file in the cache directory is expected to represent an image for which the
    export process was aborted due to a critical error. This function will clear the output files written for the
    unfinished image, and then delete the cache file.
    """
    # Return if we couldn't find a cache directory. This probably means that this is the first time the application
    # is run on this machine, so the cache directory has not been created yet.
    if not os.path.exists(config.CACHE_DIRECTORY):
        return

    LOGGER.info(__name__, "Clearing cache files")
    count = 0
    for filename in os.listdir(config.CACHE_DIRECTORY):
        if filename.endswith(".json"):
            clear_cache_file(os.path.join(config.CACHE_DIRECTORY, filename))
            count += 1
    LOGGER.info(__name__, f"Found and cleared {count} cache file(s)")

def get_exif(img, image_path):
    """
    Parse the EXIF data from `img`.

    :param img: Input image
    :type img: PIL.Image
    :param image_path: Path to input image. Used to recreate metadata when EXIF-header is missing
    :type image_path: str
    :return: EXIF data
    :rtype: dict
    """
    # Make a copy of the template dictionary. Values from the EXIF header will be inserted into this dict.
    parsed_exif = EXIF_TEMPLATE.copy()

    # Get the EXIF data
    exif = img._getexif()

    if exif is not None:
        # Convert the integer keys in the exif dict to text
        labeled = label_exif(exif)

        # Process the `ImageProperties` XML
        image_properties_xml = labeled.get("ImageProperties", None)
        assert image_properties_xml is not None, "Unable to get key 40055: `ImageProperties` from EXIF."
        process_image_properties(image_properties_xml, parsed_exif)

        # Process the `ReflinkInfo` XML if it is available
        reflink_info_xml = labeled.get("ReflinkInfo", None)
        process_reflink_info(reflink_info_xml, parsed_exif)

        # Title of image.
        XPTitle = labeled.get("XPTitle", b"").decode("utf16")
        parsed_exif["exif_xptitle"] = XPTitle
    else:
        LOGGER.warning(__name__, "No EXIF data found for image. Attempting to reconstruct data from image path.")
        if image_path is not None:
            get_metadata_from_path(image_path, parsed_exif)

    # Get a deterministic ID from the exif data.
    parsed_exif["bildeid"] = get_deterministic_id(parsed_exif)
    # Insert the folder name
    parsed_exif["mappenavn"] = get_mappenavn(image_path, parsed_exif)
    return parsed_exif

def _insert_rows(self, cursor, rows):
    LOGGER.info(__name__, f"Attempting to insert {len(rows)} row(s) into the database.")

    # Attempt to insert the rows into the database. When we have `batcherrors = True`, the rows which do not
    # violate the unique constraint will be inserted normally. The rows which do violate the constraint will
    # not be inserted.
    cursor.executemany(self.table.insert_sql, rows, batcherrors=True)

    # Get the indices of the rows where the insertion failed.
    errors = [e for e in cursor.getbatcherrors()]

    # Add number of inserted rows to total counter
    n_inserted = len(rows) - len(errors)
    self.total_inserted += n_inserted

    LOGGER.info(__name__, f"Successfully inserted {n_inserted} row(s) into the database.")
    return errors

def handle_errors(self, errors, rows, action="writing to"):
    """
    Log errors caused when running `cursor.executemany`.

    :param errors: Errors from `cursor.getbatcherrors`
    :type errors: list
    :param rows: Rows which caused the errors
    :type rows: list of dict
    :param action: Optional database action for the error message.
    :type action: str
    """
    # Increment total error counter
    self.total_errors += len(errors)

    # Create an error message
    msg = f"Got {len(errors)} error(s) while {action} the database:\n"
    msg += "\n".join([err.message for err in errors])

    # Log the error
    LOGGER.error(__name__, msg, save=False, email=True, email_mode="error")

def check_all_files_written(paths):
    """
    Check that all files for a given image have been saved correctly. The list of checked files is determined by the
    File I/O parameters in `config`. If all expected output files exist, the cache file will be deleted. If all
    expected output files exist, AND `config.delete_input` is True, the input image will be deleted as well.

    :param paths: Paths object representing the input image
    :type paths: src.io.TreeWalker.Paths
    :return: True if all expected files were found. False otherwise
    :rtype: bool
    """
    missing_files = find_missing_files(paths)
    if missing_files:
        _handle_missing_files(paths, missing_files)
        return False
    else:
        LOGGER.info(__name__, f"All output files written for image: {paths.input_file}")
        return True

def masker_category_to_annotation_category(masker_cat, coco):
    """
    Convert from masker category to annotation category, using the category name.

    :param masker_cat: Masker category
    :type masker_cat: int
    :param coco: COCO object representing the dataset
    :type coco: COCO
    :return: Annotation category
    :rtype: int
    """
    masker_cat_name = LABEL_MAP[int(masker_cat)]
    for _id, cat_dict in coco.cats.items():
        if cat_dict["name"] == masker_cat_name:
            return _id

    LOGGER.info(__name__, f"Category {masker_cat} ({masker_cat_name}) not found in annotations. This detection will "
                          f"be ignored.")
    return None

def remove_empty_folders(start_dir, top_dir):
    """
    Bottom-up removal of empty folders. If `start_dir` is empty, it will be removed. If `start_dir`'s parent
    directory is empty after removing `start_dir`, it too will be removed. This process is continued until a parent
    is non-empty, or the current directory is equal to `top_dir`. (The `top_dir` directory will not be removed.)

    NOTE: Use full paths when using this function, to avoid problems when comparing the current directory to
    `top_dir`.

    :param start_dir: Path to bottom directory to remove if empty.
    :type start_dir: str
    :param top_dir: Top directory. Only folders under this will be deleted.
    :type top_dir: str
    """
    assert start_dir.startswith(top_dir), f"remove_empty_folders: Invalid top directory '{top_dir}' for start " \
                                          f"directory '{start_dir}'"
    current_dir = start_dir
    while not os.listdir(current_dir) and current_dir != top_dir:
        os.rmdir(current_dir)
        LOGGER.debug(__name__, f"Input folder removed: {current_dir}")
        current_dir = os.path.dirname(current_dir)

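# Usage sketch (hypothetical absolute paths, per the NOTE above):
#
#     remove_empty_folders(start_dir="/data/input/2020/06/01", top_dir="/data/input")
#     # Removes ".../2020/06/01", then ".../2020/06", then ".../2020" if each is empty in turn,
#     # but never "/data/input" itself.
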
def check_config(args):
    """ Check that the specified configuration variables are valid. """
    if config.archive_json and not config.remote_json:
        raise ValueError("Parameter 'archive_json' requires remote_json=True.")
    if config.archive_mask and not config.remote_mask:
        raise ValueError("Parameter 'archive_mask' requires remote_mask=True.")

    if config.delete_input:
        LOGGER.warning(__name__, "Parameter 'delete_input' is enabled. This will permanently delete the original"
                                 " image from the input directory!")
        assert args.archive_folder, "Argument 'delete_input' requires a valid archive directory to be specified."

    if config.uncaught_exception_email or config.processing_error_email or config.finished_email:
        # Try to import the email_sender module, which checks if the `email_config.py` file is present.
        # Otherwise this will raise an exception prompting the user to create the file.
        import src.email_sender

    valid_log_levels = ["DEBUG", "INFO", "WARNING", "ERROR"]
    assert config.log_level in valid_log_levels, f"config.log_level must be one of {valid_log_levels}"

def create_row(self, json_dict):
    """
    Create a database row from the given `json_dict`.

    :param json_dict: EXIF data
    :type json_dict: dict
    :return: Dict representing the database row.
    :rtype: dict
    """
    out = {}
    for col in self.columns:
        try:
            value = col.get_value(json_dict)
        except Exception as err:
            LOGGER.warning(__name__, f"Got error '{type(err).__name__}: {err}' while getting value for database "
                                     f"column {col.name}. Value will be set to None")
            value = None
        out[col.name] = value
    return out

def __init__(self, input_folder, mirror_folders, skip_webp=True, precompute_paths=True, ext="jpg"):
    LOGGER.info(__name__, f"Searching for {ext}-files in '{input_folder}'.")
    self.input_folder = input_folder
    self.mirror_folders = mirror_folders
    self.skip_webp = skip_webp
    self.precompute_paths = precompute_paths
    self.ext = ext
    self.n_valid_images = self.n_skipped_images = 0

    if self.precompute_paths:
        self.paths = [p for p in self._walk()]
        LOGGER.info(__name__, f"Found {self.n_valid_images} valid {ext}-files.")
        if self.n_skipped_images > 0:
            LOGGER.info(__name__, f"Found {self.n_skipped_images} files with associated webp-files. "
                                  f"These will be skipped.")
    else:
        self.paths = None

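# Usage sketch, mirroring how the walker is constructed in `initialize()` above (paths are hypothetical):
#
#     tree_walker = TreeWalker("/data/input", ["/data/output"], skip_webp=False, precompute_paths=True)
#     for paths in tree_walker.walk():
#         ...  # process `paths.input_file`
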
def _path_is_valid(self, input_dir, mirror_dirs, filename):
    if not filename.endswith(self.ext):
        return False

    input_filepath = os.path.join(input_dir, filename)
    if not os.access(input_filepath, os.R_OK):
        LOGGER.info(__name__, f"Could not read image file '{input_filepath}'")
        return False

    if self.skip_webp:
        webp_path = os.path.join(mirror_dirs[0], self._to_webp(filename))
        if os.path.exists(webp_path):
            LOGGER.debug(__name__, f"Mask already found for '{input_filepath}' at '{webp_path}'.")
            self.n_skipped_images += 1
            return False

    self.n_valid_images += 1
    return True

def clear_db_cache():
    """
    Traverse the database cache directory and insert all cached rows into the database. If insertion was successful,
    the cache files will be deleted.
    """
    if not os.path.isdir(DB_CACHE_DIR):
        return

    rows = []
    files = []
    for filename in os.listdir(DB_CACHE_DIR):
        if not filename.endswith(".pkl"):
            continue

        cache_file = os.path.join(DB_CACHE_DIR, filename)
        LOGGER.debug(__name__, f"Found database cache file: {cache_file}")

        # Load the cached row and append it to `rows`
        with open(cache_file, "rb") as f:
            rows.append(pickle.load(f))

        # Store the path to the cached row
        files.append(cache_file)

    # Return if we didn't find any valid rows.
    if not rows:
        return

    # Attempt to insert the rows into the database
    with DatabaseClient() as cli:
        cli._ignore_error_check = True
        try:
            cli.insert_or_update_rows(rows)
        except Exception as err:
            raise DatabaseError(f"Got error '{err}' when inserting cached rows into the database.") from err

    # Remove the cache files
    for cache_file in files:
        os.remove(cache_file)

def main():
    tree_walker = initialize()

    for i, paths in enumerate(tree_walker.walk()):
        count_str = f"{i + 1} of {tree_walker.n_valid_images}"
        LOGGER.info(__name__, LOG_SEP)
        LOGGER.info(__name__, f"Iteration: {count_str}.")
        LOGGER.info(__name__, f"Processing file {paths.input_file}")

        try:
            worker = EXIFWorker(None, paths, None)
            worker.get()
        except PROCESSING_EXCEPTIONS as err:
            LOGGER.error(__name__, f"Got error '{type(err).__name__}: {str(err)}' when creating JSON from image. "
                                   f"File: {paths.input_file}")

def insert_accumulated_rows(self):
    """ Insert all accumulated rows into the database """
    try:
        # Insert the rows
        self.insert_or_update_rows(self.accumulated_rows)

        # Clear the list of accumulated rows
        self.accumulated_rows = []

        if self.enable_cache:
            # Delete the cached files
            while self.cached_rows:
                cache_file = self.cached_rows.pop(0)
                if os.path.exists(cache_file):
                    os.remove(cache_file)
                else:
                    LOGGER.warning(__name__, f"Could not find cache file to remove: {cache_file}")

    except cxo.DatabaseError as err:
        raise DatabaseError(f"cx_Oracle.DatabaseError: {str(err)}")