def pdf_redact(input_stream, output_directory, strings_to_filter):
    """Redact a PDF file, writing the result into output_directory.

    Built-in filters blank out things that look like email addresses,
    UK postcodes, URLs, street/city names, street addresses, and phone
    numbers; each extra pattern in strings_to_filter is also redacted.

    Args:
        input_stream: path to the input PDF file.
        output_directory: directory to write the redacted PDF into.
        strings_to_filter: iterable of extra regex pattern strings to redact.

    Returns:
        The path of the redacted output PDF.
    """
    # BUGFIX: was `output_directory + filename`, which silently produced a
    # wrong path when output_directory lacked a trailing separator.
    filename = os.path.basename(input_stream)
    output_stream = os.path.join(output_directory, filename)

    options = pdf_redactor.RedactorOptions()
    options.input_stream = input_stream
    options.output_stream = output_stream

    # Clear any XMP metadata, if present.
    options.xmp_filters = [lambda xml: None]

    options.content_filters = [
        # Email addresses.
        (
            re.compile(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]{0,8})"),
            lambda m: 10 * "X",
        ),
        # UK-style postcodes with leading address text.
        (
            re.compile(r'[,a-zA-Z0-9 ]+[A-Za-z]{1,2}[0-9R][0-9A-Za-z]? [0-9][A-Za-z]{2}'),
            lambda m: 15 * "X",
        ),
        # URLs.
        (
            re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'),
            lambda m: 10 * "X",
        ),
        # Common UK street/city keywords followed by a comma.
        (
            re.compile("Park,|Road,|Hill,|Lane,|London,|Avenue,|Essex,|Green,|Way|Bristol,|Manchester,"),
            lambda m: 8 * "X",
        ),
        # Street addresses: house number + street name + street keyword.
        (
            re.compile(r'[\d]{1,2} +[a-zA-Z]{1,15} +(?:Park|Road|Hill|Lane|London|Green|Avenue|Green|Way)'),
            lambda m: 8 * "X",
        ),
        # Phone numbers (optional +country code, optional area code in parens).
        (
            re.compile(r"\+? {0,2}\d+ {0,2}[(-]?\d(?:[ \d]*\d)?[)-]? {0,2}\d+[/ -]?\d+[/ -]?\d+(?: *- *\d+)?"),
            lambda m: 10 * "X",
        ),
    ]

    # Caller-supplied patterns are redacted with a short fixed mask.
    for pattern in strings_to_filter:
        options.content_filters.append((re.compile(pattern), lambda m: 4 * "X"))

    # Perform the redaction.
    pdf_redactor.redactor(options)
    return output_stream
def __enter__(self):
    """Context-manager entry: redact self.input_path into a temp PDF.

    Returns the filesystem path of the redacted temporary file.
    """
    # Open the source PDF and hand the stream to the redactor options.
    self.input_file = open(self.input_path, "rb")
    self.options.input_stream = self.input_file
    # Create a named temp file to receive the redacted output; keep its
    # path on self so __exit__ (not shown here) can remove it later.
    fd, self.redacted_path = tempfile.mkstemp(".pdf")
    self.redacted_file = os.fdopen(fd, "wb")
    self.options.output_stream = self.redacted_file
    # Run the redaction synchronously; output is complete once this returns.
    pdf_redactor.redactor(self.options)
    self.redacted_file.close()
    # NOTE(review): self.input_file is left open here — presumably closed in
    # __exit__; confirm, otherwise the input file handle leaks.
    return self.redacted_path
def smoke_test_file(path):
    """Run pdf_redactor over the PDF at *path* as a smoke test.

    Output is discarded; any of the known parse/processing exceptions is
    reported to stderr (with a traceback) instead of propagating, so a
    batch run can continue past broken files.
    """
    options = pdf_redactor.RedactorOptions()
    options.output_stream = io.BytesIO()
    # Identity content filter: visits every word without changing it, which
    # forces the full text layer to be parsed and re-serialized.
    options.content_filters = [(re.compile(r"\w+"), lambda match: match.group(0))]
    options.metadata_filters = {"ALL": [metadata_filter]}
    # `with` guarantees the input file is closed even if redactor() raises
    # something outside the caught set (was a manual try/finally + close()).
    with open(path, "rb") as input_file:
        options.input_stream = input_file
        try:
            pdf_redactor.redactor(options)
        except (pdfrw.errors.PdfParseError, IndexError, AssertionError,
                xml.etree.ElementTree.ParseError, TypeError, AttributeError,
                StopIteration, ValueError) as e:
            # Report and continue: this is a smoke test over many files.
            print("{0} while reading {1}".format(e.__class__.__name__, path), file=sys.stderr)
            print(traceback.format_exc(), file=sys.stderr)
def clean_pdf(in_file, out_file, file_metadata, author_names):
    """Redact author/contact information from a CRS report PDF.

    Pipeline: qpdf decompress -> pdf_redactor (metadata, XMP, phone/email/
    author-name content filters) -> qpdf linearize + append branding page ->
    copy to out_file -> render a PNG thumbnail next to it.

    Args:
        in_file: path of the source PDF.
        out_file: path to write the cleaned PDF (thumbnail written beside it).
        file_metadata: dict with at least 'title' and 'date' (ISO date string).
        author_names: iterable of author name strings to redact from the text.
    """
    from pdf_redactor import redactor, RedactorOptions
    # BUGFIX: `datetime` is used below for ModDate but was not imported here.
    import datetime, io, re, subprocess, tempfile, shutil

    # Form a regex for author names, replacing spaces with optional whitespace.
    author_name_regex = "|".join(
        r"\s?".join(re.escape(an1) for an1 in an.split(" "))
        for an in author_names
    )

    # Set redaction options.
    redactor_options = RedactorOptions()
    redactor_options.metadata_filters = {
        # Copy from report metadata.
        "Title": [lambda value: file_metadata['title']],
        "Author": [lambda value: "Congressional Research Service, Library of Congress, USA"],
        "CreationDate": [lambda value: file_metadata['date']],
        # Set these.
        "Producer": [lambda value: "EveryCRSReport.com"],
        "ModDate": [lambda value: datetime.datetime.utcnow()],
        # Clear all other fields.
        "DEFAULT": [lambda value: None],
    }

    # Clear XMP.
    redactor_options.xmp_filters = [lambda xml: None]

    # Redact phone numbers, email addresses, and author names.
    # See the notes on the regular expressions above for the HTML scrubber.
    redactor_options.content_filters = [
        (re.compile(r"((^|[^\d])7-)\d{4}"), lambda m: m.group(1) + "...."),  # use a symbol likely to be available
        (re.compile(r"\(\d\d\d\) \d\d\d-\d\d\d\d"), lambda m: "[redacted]"),  # use a symbol likely to be available
        (re.compile("[a-zA-Z0-9_!#\$%&\'\*\+\-/=\?\^`\{\|\}~]+(@crs.?(loc|gov))"), lambda m: ("[redacted]" + m.group(1))),
        (re.compile(author_name_regex), lambda m: "(name redacted)"),
    ]

    # Avoid inserting ?'s and spaces.
    redactor_options.content_replacement_glyphs = ['#', '*', '/', '-']

    # Run qpdf to decompress.
    data = subprocess.check_output(['qpdf', '--qdf', '--stream-data=uncompress', in_file, "-"])

    with tempfile.NamedTemporaryFile() as f1:
        with tempfile.NamedTemporaryFile() as f2:
            # Run the redactor. Since qpdf in the next step requires an actual
            # file for the input, write the output to a file.
            redactor_options.input_stream = io.BytesIO(data)
            redactor_options.output_stream = f1
            try:
                redactor(redactor_options)
            # BUGFIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit during the fallback path.
            except Exception:
                # The redactor has some trouble on old files. Post them anyway.
                if file_metadata['date'] < "2003-01-01":
                    print("Writing", out_file, "without redacting.")
                    f1.seek(0)
                    f1.write(data)
                else:
                    raise
            f1.flush()

            # Linearize and add our own page to the end of the PDF. The qpdf
            # command for this is pretty weird. All we're doing is appending a page.
            subprocess.check_call(['qpdf', '--linearize', f1.name,
                                   "--pages", f1.name, "branding/pdf-addendum-page.pdf", "--",
                                   f2.name])

            # Copy the final PDF to the output location. We don't write directly to
            # out_file in the previous qpdf step in case of errors. If there's an
            # error during writing, let's not leave a broken file.
            shutil.copyfile(f2.name, out_file)

            # Generate a thumbnail image of the PDF.
            # Note that pdftoppm adds ".png" to the end of the file name.
            subprocess.check_call(['pdftoppm', '-png', '-singlefile',
                                   '-scale-to-x', '600', '-scale-to-y', '-1',
                                   out_file, out_file.replace(".pdf", "")])
# Example file to print the text layer of a PDF.
import re, io, sys
import pdf_redactor

## Set options.

def printer(m):
    """Print each matched text run, then replace it with the empty string."""
    text = m.group(0)
    # On Python 2, stdout expects encoded bytes rather than unicode.
    if sys.version_info < (3,):
        text = text.encode("utf8")
    print(text)
    return ""

options = pdf_redactor.RedactorOptions()
# Discard the rewritten PDF — we only care about the printed text.
options.output_stream = io.BytesIO()
# Match everything (including newlines) so every text run hits printer().
options.content_filters = [(re.compile(r"[\w\W]+"), printer)]
pdf_redactor.redactor(options)
def redact(fname, searchlist):
    """Mark each word of *searchlist* throughout the PDF *fname*.

    When at least one word is found, saves a copy of the document as
    "marked-<basename>" in the current directory.
    """
    doc = fitz.open(fname)
    doc.setMetadata({})      # clear metadata
    doc._delXmlMetadata()    # clear any XML metadata

    found_anything = False   # becomes True on the first hit anywhere
    for page in doc:
        for word in searchlist:
            print(f"Redacting word {word} in document {doc.name}")
            found = mark_word(page, word)
            if found:
                found_anything = True
                print("found '%s' %i times on page %i" % (word, found, page.number + 1))

    if found_anything:
        doc.save("marked-" + str(doc.name).split('/')[-1])


import re
from datetime import datetime

import pdf_redactor

## Set options.
options = pdf_redactor.RedactorOptions()

options.metadata_filters = {
    # Uppercase the Title field.
    "Title": [lambda value: value.upper()],
    # Override these regardless of what the PDF contains.
    "Producer": [lambda value: "My Name"],
    "CreationDate": [lambda value: datetime.utcnow()],
    # Drop every other metadata field.
    "DEFAULT": [lambda value: None],
}

# Remove any XMP metadata wholesale.
options.xmp_filters = [lambda xml: None]

# Content filters run over the page text and annotation bodies.
options.content_filters = [
    (re.compile(u"LibreOffice"), lambda m: "X"),
    (re.compile(r"comment!"), lambda m: "annotation?"),
]

# Rewrite every link target to a fixed URI.
options.link_filters = [lambda href, annotation: "https://www.google.com"]

# Perform the redaction, reading the PDF from standard input and writing
# the result to standard output.
pdf_redactor.redactor(options)