コード例 #1
0
	def test_metadata(self):
		"""Check that metadata filters show up in the `pdfinfo` output of the redacted PDF.

		The "Title" filter swaps "test" for "sentinel", the "Subject" filter
		reverses the value, and the "DEFAULT" filter returns None. The test
		expects "CreationDate" and "LibreOffice" to be absent afterwards.
		"""
		def retitle(value):
			return value.replace("test", "sentinel")

		def backwards(value):
			return value[::-1]

		def discard(value):
			return None

		options = pdf_redactor.RedactorOptions()
		options.metadata_filters = {
			"Title": [retitle],
			"Subject": [backwards],
			"DEFAULT": [discard],
		}

		with RedactFixture(FIXTURE_PATH, options) as redacted_path:
			pdfinfo_output = subprocess.check_output(["pdfinfo", redacted_path])
			# The rewritten Title and the reversed Subject must be present ...
			for expected in (b"this is a sentinel", b"FDP a si"):
				self.assertIn(expected, pdfinfo_output)
			# ... while these two markers must no longer appear.
			for removed in (b"CreationDate", b"LibreOffice"):
				self.assertNotIn(removed, pdfinfo_output)
コード例 #2
0
	def test_text_ssns(self):
		"""Check that SSN-shaped text in the fixture comes out as "XXX-XX-XXXX"."""
		# First filter: map dash look-alike characters to a plain hyphen.
		dash_like = re.compile(u"[−–—~‐]")
		# Second filter: SSN-shaped groups of 3, 2 and 4 characters. The
		# character classes accept O, o, I, l and i as well as digits.
		ssn_like = re.compile(r"(?<!\d)(?!666|000|9\d{2})([OoIli0-9]{3})([\s-]?)(?!00)([OoIli0-9]{2})\2(?!0{4})([OoIli0-9]{4})(?!\d)")

		def to_hyphen(m):
			return "-"

		def mask(m):
			return "XXX-XX-XXXX"

		options = pdf_redactor.RedactorOptions()
		options.content_filters = [
			(dash_like, to_hyphen),
			(ssn_like, mask),
		]

		expected = "Here are some fake SSNs\n\nXXX-XX-XXXX\n--\n\nXXX-XX-XXXX XXX-XX-XXXX\n\nAnd some more with common OCR character substitutions:\nXXX-XX-XXXX XXX-XX-XXXX XXX-XX-XXXX XXX-XX-XXXX XXX-XX-XXXX"
		with RedactFixture(FIXTURE_PATH, options) as redacted_path:
			extracted = pdf_to_text(redacted_path)
			self.assertIn(expected, extracted)
コード例 #3
0
	def test_xmp(self):
		"""Check that an XMP filter can rewrite element text in the redacted PDF.

		The filter replaces every element whose text is exactly "Writer" with
		"Sentinel"; the result is inspected through `pdfinfo -meta`.
		"""
		def xmp_filter(doc):
			# Visit every element and rewrite only exact "Writer" text.
			for node in doc.iter():
				if node.text != "Writer":
					continue
				node.text = "Sentinel"
			return doc

		def discard(value):
			return None

		options = pdf_redactor.RedactorOptions()
		options.metadata_filters = {"DEFAULT": [discard]}
		options.xmp_filters = [xmp_filter]

		with RedactFixture(FIXTURE_PATH, options) as redacted_path:
			xmp_dump = subprocess.check_output(["pdfinfo", "-meta", redacted_path])
			self.assertIn(b"Sentinel", xmp_dump)
			self.assertNotIn(b"Writer", xmp_dump)
コード例 #4
0
	def test_comment(self):
		"""Redact the fixture with filters aimed at a comment's text and its title.

		NOTE(review): no assertions are visible in this method. The extracted
		text is assigned but never checked, so confirm whether the checks
		were lost.
		"""
		# Literal (escaped) patterns for the comment text and the comment title.
		comment_body = re.compile(re.escape(u"I have a comment!"))
		comment_title = re.compile(re.escape(u"Unknown Author"))

		options = pdf_redactor.RedactorOptions()
		options.content_filters = [
			(comment_body, lambda m: "all gone"),
			(comment_title, lambda m: "Some Person"),
		]

		with RedactFixture(FIXTURE_PATH, options) as redacted_path:
			text = pdf_to_text(redacted_path)
コード例 #5
0
ファイル: smoketest.py プロジェクト: vitalbeats/pdf-redactor
def smoke_test_file(path):
    """Run the redactor over one PDF and report known failure types on stderr.

    The content filter matches every run of word characters and replaces it
    with itself, so the redaction machinery runs without changing the text.
    Output goes to an in-memory buffer that is never read.

    Args:
        path: Path of the PDF file to read.

    Exceptions in the tuple caught below are printed to stderr, class name
    first and then the traceback, and are not re-raised. Any other exception
    propagates, including a failure to open ``path``.
    """
    options = pdf_redactor.RedactorOptions()
    options.output_stream = io.BytesIO()
    # Raw string: "\w" in a plain literal is an invalid escape sequence.
    options.content_filters = [(re.compile(r"\w+"),
                                lambda match: match.group(0))]
    options.metadata_filters = {"ALL": [metadata_filter]}
    # The context manager closes the input file on every exit path,
    # including a failure before the redactor is called.
    with open(path, "rb") as input_stream:
        options.input_stream = input_stream
        try:
            pdf_redactor.redactor(options)
        except (pdfrw.errors.PdfParseError, IndexError, AssertionError,
                xml.etree.ElementTree.ParseError, TypeError, AttributeError,
                StopIteration, ValueError) as e:
            print("{0} while reading {1}".format(e.__class__.__name__, path),
                  file=sys.stderr)
            print(traceback.format_exc(), file=sys.stderr)
コード例 #6
0
ファイル: test_redactor.py プロジェクト: hibellm/pdf-redactor
    def test_link(self):
        """Check that both a link's visible text and its target can be replaced.

        A content filter rewrites the link text and a link filter returns a
        fixed URL for every link; the result is checked in the extracted text
        and in the HTML rendering.
        """
        def replace_target(href, annotation):
            return "https://www.google.com"

        # Literal (escaped) pattern for the visible link text.
        link_text = re.compile(re.escape(u"link to issue #13"))

        options = pdf_redactor.RedactorOptions()
        options.content_filters = [
            (link_text, lambda m: "this link was removed"),
        ]
        options.link_filters = [replace_target]

        with RedactFixture(FIXTURE_PATH, options) as redacted_path:
            extracted = pdf_to_text(redacted_path)
            self.assertNotIn("link to issue #13", extracted)
            # The expected text has "#" in place of some letters
            # (glyph replacements).
            self.assertIn("this link was re#o#e#", extracted)

            markup = pdf_to_html(redacted_path)
            self.assertNotIn("github", markup)
            self.assertIn('href="https://www.google.com"', markup)
コード例 #7
0
# Example file to print the text layer of a PDF.

import re, io, sys

import pdf_redactor

## Set options.


def printer(m):
    """Print the full text of match *m* and return an empty replacement.

    On Python 2 the matched text is encoded to UTF-8 before printing.
    """
    matched = m.group(0)
    running_py2 = sys.version_info < (3, )
    print(matched.encode("utf8") if running_py2 else matched)
    return ""


# Build the redactor configuration.
options = pdf_redactor.RedactorOptions()
# The rewritten PDF goes to an in-memory buffer that is never read, so it is
# effectively discarded; only what `printer` prints matters here.
options.output_stream = io.BytesIO()
# One catch-all filter: [\w\W]+ matches any run of characters, newlines
# included, and passes it to `printer`. Raw string: "\w" and "\W" in a plain
# literal are invalid escape sequences.
options.content_filters = [(re.compile(r"[\w\W]+"), printer)]
pdf_redactor.redactor(options)
コード例 #8
0
def redact(fname, searchlist):
    """Mark search words in a PDF with PyMuPDF, then run a pdf_redactor pass.

    Stage 1 opens ``fname`` with ``fitz``, clears its metadata and calls
    ``mark_word`` for every (page, word) pair. If anything was found, the
    document is saved as ``marked-<basename of the document>`` using a
    relative path.

    Stage 2 builds a ``pdf_redactor.RedactorOptions`` object and calls
    ``pdf_redactor.redactor``.

    Args:
        fname: Path of the PDF passed to ``fitz.open``.
        searchlist: Iterable of words to look for on every page.

    Progress messages are printed to standard output.
    """
    import os

    doc = fitz.open(fname)
    try:
        doc.setMetadata({})    # clear metadata
        doc._delXmlMetadata()  # clear any XML metadata

        new_doc = False  # becomes True once any word is found on any page

        for page in doc:  # scan through the pages
            for word in searchlist:
                print(f"Redacting word {word} in document {doc.name}")
                found = mark_word(page, word)  # mark the page's words
                if found:  # if anything found ...
                    new_doc = True
                    print("found '%s' %i times on page %i" % (word, found, page.number + 1))

        if new_doc:
            # os.path.basename handles the platform's path separators, which
            # splitting on "/" did not.
            doc.save("marked-" + os.path.basename(str(doc.name)))
    finally:
        # Release the document handle even if marking or saving raised.
        doc.close()

    import re
    from datetime import datetime
    import pdf_redactor

    ## Set options.

    options = pdf_redactor.RedactorOptions()
    options.metadata_filters = {
        # Perform some field filtering --- turn the Title into uppercase.
        "Title": [lambda value: value.upper()],

        # Set some values, overriding any value present in the PDF.
        "Producer": [lambda value: "My Name"],
        "CreationDate": [lambda value: datetime.utcnow()],

        # Clear all other fields.
        "DEFAULT": [lambda value: None],
    }

    # Clear any XMP metadata, if present.
    options.xmp_filters = [lambda xml: None]

    options.content_filters = [
        # Replace the literal text "LibreOffice" with "X".
        # This stands in for a disabled dash-normalising filter:
        #   re.compile(u"[−–—~‐]"), lambda m : "-"
        (
            re.compile(u"LibreOffice"),
            lambda m: "X"
        ),

        # Disabled: the SSN regex.
        # See https://github.com/opendata/SSN-Redaction for why this regex is complicated.
        # (
        # 	re.compile(r"(?<!\d)(?!666|000|9\d{2})([OoIli0-9]{3})([\s-]?)(?!00)([OoIli0-9]{2})\2(?!0{4})([OoIli0-9]{4})(?!\d)"),
        # 	lambda m : "XXX-XX-XXXX"
        # ),

        # Content filter that runs on the text comment annotation body.
        (
            re.compile(r"comment!"),
            lambda m: "annotation?"
        ),
    ]

    # Filter the link target URI.
    options.link_filters = [
        lambda href, annotation: "https://www.google.com"
    ]

    # Perform the redaction. Per the original comment this reads the PDF from
    # standard input and writes to standard output.
    # NOTE(review): no input or output stream is set on `options`, and neither
    # `fname` nor the "marked-" copy is passed to this stage. Confirm that is
    # intended.
    pdf_redactor.redactor(options)






# THE SEARCH STRING LIST
# searchlist = ['Marcus', 'Hibell', 'Lorem', 'Hampden-Sydney', 'College', 'loves']
# redact('Lorem.pdf',searchlist)









# WORKING VERSION FOR A SINGLE WORD
# fname = sys.argv[1]                    # filename
# text = sys.argv[2]                     # search string
# doc = fitz.open(fname)

# print("underlining words containing '%s' in document '%s'" % (text, doc.name))

# new_doc = False                        # indicator if anything found at all

# for page in doc:                       # scan through the pages
#     found = mark_word(page, text)      # mark the page's words
#     if found:                          # if anything found ...
#         new_doc = True
#         print("found '%s' %i times on page %i" % (text, found, page.number + 1))

# if new_doc:
#     doc.save("marked-" + str(doc.name).split('/')[-1])