def filter_redactions_by_pixmap(
    redactions: List[RedactionType],
    page: Page,
) -> List[RedactionType]:
    """Keep only the redactions whose area renders as one flat color.

    Each candidate's bounding box is rendered to a pixmap. If the pixels
    vary in color, the area is not a uniform box, so the candidate is not a
    bad redaction and is dropped. If every pixel has the same color, the
    candidate is kept.

    :param redactions: A list of redactions that might be bad
    :param page: The PyMuPDF.Page object where the bad redactions might be
    :return: The subset of ``redactions``, in their original order, whose
        rendered bounding box is unicolor
    """

    def renders_as_one_color(redaction: RedactionType) -> bool:
        # Render in RGB (not grayscale), clipped to just this redaction's
        # bounding box.
        pixmap = page.get_pixmap(
            colorspace=fitz.csRGB,
            clip=fitz.Rect(redaction["bbox"]),
        )
        return pixmap.is_unicolor

    return [
        redaction
        for redaction in redactions
        if renders_as_one_color(redaction)
    ]
# Example no. 2
# 0
def image_from_page(page: fitz.Page, scale: float = 1) -> Image.Image:
    """Render a page to a pixmap and wrap its samples in an image.

    :param page: the page to be represented as an image
    :param scale: the proportion by which to scale the image; the same
        factor is applied to both axes
    :return: an image with the pixmap's width and height, in "RGBA" mode
        if the pixmap has an alpha channel and "RGB" otherwise
    """
    # ``Image`` is used as a module below (``Image.frombytes``), so the
    # return annotation names the class inside it rather than the module.
    pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale))  # type: ignore
    mode = "RGBA" if pix.alpha else "RGB"
    size = (pix.width, pix.height)
    return Image.frombytes(mode, size, pix.samples)  # type: ignore