def image_summary(series: pd.Series, exif: bool = False, hash: bool = False) -> dict:
    """Build a summary dict for a series of images.

    Args:
        series: series to summarize
        exif: extract exif information
        hash: calculate hash (for duplicate detection)

    Returns:
        A dict containing ``"n_truncated"`` (number of entries whose
        ``"truncated"`` field is truthy), ``"image_dimensions"`` (a series of
        the ``"size"`` fields that were present), the entries produced by
        ``named_aggregate_summary`` for ``"width"``, ``"height"`` and
        ``"area"``, plus ``"n_duplicate_hash"`` when ``hash`` is set and
        ``"exif_keys_counts"`` / ``"exif_data"`` when ``exif`` is set.
    """
    extractor = partial(extract_image_information, exif=exif, hash=hash)
    info = series.apply(extractor)

    # Count entries that report themselves as truncated.
    truncated_count = 0
    for entry in info:
        if "truncated" in entry and entry["truncated"]:
            truncated_count += 1

    # Only entries that actually carry a size contribute a dimension.
    dimensions = pd.Series(
        [entry["size"] for entry in info if "size" in entry],
        name="image_dimensions",
    )

    result = {
        "n_truncated": truncated_count,
        "image_dimensions": dimensions,
    }

    # Each size is indexed as (width, height): position 0, then position 1.
    widths = dimensions.map(lambda dims: dims[0])
    result.update(named_aggregate_summary(widths, "width"))

    heights = dimensions.map(lambda dims: dims[1])
    result.update(named_aggregate_summary(heights, "height"))

    areas = widths * heights
    result.update(named_aggregate_summary(areas, "area"))

    if hash:
        result["n_duplicate_hash"] = count_duplicate_hashes(info)

    if exif:
        exif_entries = [entry["exif"] for entry in info if "exif" in entry]
        exif_result = extract_exif_series(exif_entries)
        result["exif_keys_counts"] = exif_result["exif_keys"]
        result["exif_data"] = exif_result

    return result
def length_summary(series: pd.Series) -> dict:
    """Summarize the element lengths of a series.

    Args:
        series: series whose ``.str.len()`` values are summarized

    Returns:
        A dict with the per-element lengths under ``"length"``, updated with
        the entries produced by ``named_aggregate_summary(lengths, "length")``.
    """
    lengths = series.str.len()

    result = {}
    result["length"] = lengths
    result.update(named_aggregate_summary(lengths, "length"))
    return result