def set_up_meta(self, control_sum):
    """Rewrite the PDF at ``self.path`` with a fresh Info dictionary.

    Every page of the existing file is copied into a new writer, the
    document Info dictionary is replaced, and the result is written back
    to the same path.

    Args:
        control_sum: Value stored under the ``ControlSum`` Info key.
    """
    source = pdfrw.PdfReader(self.path)
    output = pdfrw.PdfWriter()
    output.addpages(source.pages)

    # The Info dictionary is attached only after all pages are in place.
    info = pdfrw.IndirectPdfDict(
        Owner=self.user_id,
        ControlSum=control_sum,
        SignedBy='',
    )
    output.trailer.Info = info
    output.write(self.path)
def make_output(self):
    """Build the PDF metadata stream object from ``self.md``.

    Returns:
        A ``pdfrw.IndirectPdfDict`` with ``Type=Metadata`` and
        ``Subtype=XML`` whose stream holds the serialized metadata.
    """
    # Serialize, drop any byte-order marks, then trim trailing whitespace.
    # NOTE(review): self.md is presumably a libxmp metadata object - confirm.
    serialized = self.md.serialize_to_str()
    without_bom = serialized.replace("\ufeff", "")
    xmp_text = libxmp.core._remove_trailing_whitespace(without_bom)

    metadata = pdfrw.IndirectPdfDict(
        Type=pdfrw.PdfName("Metadata"),
        Subtype=pdfrw.PdfName("XML"),
    )
    # Re-decoding the UTF-8 bytes as latin-1 yields a str with exactly one
    # character per encoded byte.
    utf8_bytes = xmp_text.encode("utf-8")
    metadata.stream = utf8_bytes.decode("latin-1")
    return metadata
def pdfInfo(self):
    """Return a ``pdfrw.IndirectPdfDict`` holding the document Info entries.

    The entries are taken from the instance attributes ``title``,
    ``author``, ``subject``, ``keywords``, ``creator`` and ``producer``.
    ``CreationDate`` and ``ModDate`` each come from their own call to
    ``self.pdftime()``.
    """
    # Values are evaluated top to bottom, so pdftime() is called once for
    # CreationDate and then once more for ModDate.
    info_fields = {
        "Title": self.title,
        "Author": self.author,
        "Subject": self.subject,
        "Keywords": self.keywords,
        "Creator": self.creator,
        "Producer": self.producer,
        "CreationDate": self.pdftime(),
        "ModDate": self.pdftime(),
    }
    return pdfrw.IndirectPdfDict(**info_fields)
def __init__(self):
    """Prepare ``self.output_intent`` from the bundled sRGB2014 ICC profile.

    The profile is read from ``icc/sRGB2014.icc`` next to this module and
    embedded as the destination output profile of a single OutputIntent
    dictionary; ``self.output_intent`` is a one-element list holding it.
    """
    # Locate and open the ICC profile shipped alongside this file.
    module_dir = pathlib.Path(__file__).absolute().parent
    profile_path = module_dir / "icc" / "sRGB2014.icc"
    icc_profile = ImageCms.getOpenProfile(str(profile_path))

    # Output profile stream object; N=3 is required for RGB colorspaces.
    profile_obj = pdfrw.IndirectPdfDict(
        N=3,
        Alternate=pdfrw.PdfName("DeviceRGB"),
    )
    profile_bytes = icc_profile.tobytes()
    profile_obj.stream = profile_bytes.decode("latin-1")

    # The OutputIntent dictionary that references the profile above.
    intent = pdfrw.IndirectPdfDict(
        Type=pdfrw.PdfName("OutputIntent"),
        S=pdfrw.PdfName("GTS_PDFA1"),
        OutputConditionIdentifier="sRGB",
        DestOutputProfile=profile_obj,
        Info=icc_profile.profile.profile_description,
        # Original author's note: unsure whether this registry name is
        # correct, but it does not cause a failure.
        RegistryName="http://color.org/srgbprofiles.xalter",
    )
    self.output_intent = [intent]
return template_pdf def build_datadict(in_file): o = [] with open(in_file) as file: reader = csv.DictReader(file, delimiter=',') for row in reader: m = {} for f in FIELDS: if row[f] and not row[f].isspace() and not row[f] is None: m[f] = row[f] if m: m['Date'] = "January 25th, 2020" o.append(m) return o if __name__ == '__main__': data = build_datadict(IN_FILE) writer = pdfrw.PdfWriter() writer.trailer.Info = pdfrw.IndirectPdfDict(Title='Combined PDF') # Iterate array of 'data_dict's for d in data: this_pages = modify_form(TEMPLATE_FILE, d) # fill the form this_pages.Root.AcroForm.update( pdfrw.PdfDict(NeedAppearances=pdfrw.PdfObject( 'true'))) # maintain appearances writer.addpages(this_pages.pages) # merge into single pdf writer.write(IN_FILE.split(".")[0] + ".pdf")
flag = sys.argv[1] if flag == "-c": # combine import pdfrw import natsort writer = pdfrw.PdfWriter() for file in natsort.natsorted(os.listdir(f"{script_root}/data")): if file.endswith(".pdf"): print(file) pdf_path = f"{script_root}/data/{file}" writer.addpages(pdfrw.PdfReader(pdf_path).pages) writer.trailer.Info = pdfrw.IndirectPdfDict(Title="") writer.write(f"{script_root}/data/combined.pdf") elif flag == "-o": # ocr import ocrmypdf for file in os.listdir(f"{script_root}/data"): if file.endswith(".pdf"): print(file) pdf_path = f"{script_root}/data/{file}" pid = os.fork() if pid > 0: # parent process os.waitpid(pid, 0) # wait for child process to end elif pid == 0: # child process ocrmypdf.ocr( input_file=pdf_path,