def process_content_object(self, objects):
    """Accumulate the text of a content stream into ``self.table``.

    ``objects`` is parsed as a content stream with ``self.finder`` passed as
    the owning document.  While walking the operations the method remembers
    the latest non-``None`` id returned by ``self.get_id`` and the latest
    ``Tf`` font operand; any text produced by an operation is appended to
    ``self.table`` under the remembered id.  ``self.strip_table_spaces()`` is
    called once at the end.
    """
    from PyPDF2.pdf import ContentStream

    stream = ContentStream(objects, self.finder)
    active_id = None
    active_font = None
    if stream is not None:
        for operands, operator in stream.operations:
            piece = u_("")
            found_id = self.get_id(operands)
            if found_id is not None:
                active_id = found_id
            elif operator == b_("Tf"):
                active_font = operands[0]
            elif operator in (b_("Tj"), b_("TJ")):
                piece += self.converter.process_text_objects(
                    operands, active_font)
            elif operator == b_("T*"):
                piece += "\n"
            elif operator == b_("'"):
                piece += "\n"
                shown = operands[0]
                if isinstance(shown, TextStringObject):
                    piece += shown
            elif operator == b_('"'):
                shown = operands[2]
                if isinstance(shown, TextStringObject):
                    piece += "\n" + shown
            # Text is only stored once an id has been seen.
            if active_id is not None and piece:
                self.table[active_id] += piece
    self.strip_table_spaces()
def getTextByPage(self):
    """Return the text drawn on this page, with doubled newlines collapsed.

    Only ``TextStringObject`` operands are used; byte strings whose encoding
    was unknown are skipped because they would come out as gibberish.
    """
    stream = self["/Contents"].getObject()
    if not isinstance(stream, ContentStream):
        stream = ContentStream(stream, self.pdf)

    pieces = []
    for operands, operator in stream.operations:
        if operator == b_("Tj"):
            shown = operands[0]
            if isinstance(shown, TextStringObject):
                pieces.append(shown)
        elif operator == b_("T*"):
            pieces.append("\n")
        elif operator == b_("'"):
            pieces.append("\n")
            shown = operands[0]
            if isinstance(shown, TextStringObject):
                pieces.append(shown)
        elif operator == b_('"'):
            shown = operands[2]
            if isinstance(shown, TextStringObject):
                pieces.extend(("\n", shown))
        elif operator == b_("TJ"):
            pieces.extend(
                item for item in operands[0]
                if isinstance(item, TextStringObject))
            pieces.append("\n")
    return u_("").join(pieces).replace("\n\n", "\n")
def extract_text_objects(page):
    """Yield the text strings of ``page`` in content-stream order.

    The generator emits ``TextStringObject`` instances for the text-showing
    operators and a plain ``"\\n"`` wherever the stream moves to a new line.
    The order is whatever the content stream dictates and may change as the
    PyPDF2 package evolves.  Adapted from ``PageObject.extractText`` in
    PyPDF2.pdf.
    """
    stream = page["/Contents"].getObject()
    if not isinstance(stream, ContentStream):
        stream = ContentStream(stream, page.pdf)

    for operands, operator in stream.operations:
        if operator == b_("Tj"):
            shown = operands[0]
            if isinstance(shown, TextStringObject):
                yield shown
        elif operator == b_("T*"):
            yield "\n"
        elif operator == b_("'"):
            yield "\n"
            shown = operands[0]
            if isinstance(shown, TextStringObject):
                yield shown
        elif operator == b_('"'):
            shown = operands[2]
            if isinstance(shown, TextStringObject):
                yield "\n"
                yield shown
        elif operator == b_("TJ"):
            for item in operands[0]:
                if isinstance(item, TextStringObject):
                    yield item
            yield "\n"
def extractTextList(self):
    """Return the non-empty text strings of this page as a list.

    Strings shown with ``Tj`` are stripped of surrounding whitespace before
    being stored; strings from ``'``, ``"`` and ``TJ`` are stored unchanged.
    Operands that are not ``TextStringObject`` instances are ignored.
    """
    stream = self["/Contents"].getObject()
    if not isinstance(stream, ContentStream):
        stream = ContentStream(stream, self.pdf)

    collected = []
    for operands, operator in stream.operations:
        if operator == b_("Tj"):
            shown = operands[0]
            if isinstance(shown, TextStringObject):
                stripped = shown.strip()
                if stripped:
                    collected.append(stripped)
        elif operator == b_("'"):
            shown = operands[0]
            if isinstance(shown, TextStringObject) and shown:
                collected.append(shown)
        elif operator == b_('"'):
            shown = operands[2]
            if isinstance(shown, TextStringObject) and shown:
                collected.append(shown)
        elif operator == b_("TJ"):
            collected.extend(
                item for item in operands[0]
                if isinstance(item, TextStringObject) and item)
    return collected
def convert_page_to_text(page):
    """Return the text of ``page`` with a space after every shown string.

    Copied from PyPDF2's ``extractText`` method.  Only ``TextStringObject``
    operands contribute: byte strings whose encoding was unknown would only
    add gibberish.
    """
    stream = page.getContents()
    if not isinstance(stream, ContentStream):
        stream = ContentStream(stream, page.pdf)

    pieces = []
    for operands, operator in stream.operations:
        if operator == b_("Tj"):
            shown = operands[0]
            if isinstance(shown, TextStringObject):
                pieces.append(shown + ' ')
        elif operator == b_("T*"):
            pieces.append("\n")
        elif operator == b_("'"):
            pieces.append("\n")
            shown = operands[0]
            if isinstance(shown, TextStringObject):
                pieces.append(shown + ' ')
        elif operator == b_('"'):
            shown = operands[2]
            if isinstance(shown, TextStringObject):
                pieces.append("\n")
                pieces.append(shown + ' ')
        elif operator == b_("TJ"):
            pieces.extend(
                item + ' ' for item in operands[0]
                if isinstance(item, TextStringObject))
            pieces.append("\n")
    return u_("").join(pieces)
def InsertXObject(self, name):
    """Return the drawing commands for the XObject ``name`` of the current page.

    An XObject can be an image or a 'form' (an arbitrary PDF sequence).

    * Form: the form's operations are wrapped in ``q`` / ``cm`` / ``Q``,
      converted once with ``self.ProcessOperators`` and cached in
      ``self.formdrawings`` under ``name``.
    * Image: the raw stream data is handed to ``self.AddBitmap``; its result
      is added only when truthy (the filter may be unimplemented).

    :param name: key of the XObject in the page's ``/Resources`` dictionary.
    :return: list of drawing commands (empty for other subtypes).
    :raises KeyError: if the page has no ``/XObject`` entry for ``name``, or
        an image lacks ``/Width``, ``/Height`` or ``/Filter``.
    """
    dlist = []
    xobject = self.page["/Resources"].getObject()['/XObject']
    stream = xobject[name]
    subtype = stream.get('/Subtype')
    if subtype == '/Form':
        # insert contents into current page drawing
        if name not in self.formdrawings:       # extract if not already done
            pdf_fonts = self.FetchFonts(stream)
            # /Matrix is optional for form XObjects; the PDF default is the
            # identity matrix, so never hand None to the 'cm' operator.
            matrix = stream.get('/Matrix')
            if matrix is None:
                matrix = [1, 0, 0, 1, 0, 0]
            form_ops = ContentStream(stream, self.pdfdoc).operations
            oplist = [([], 'q'), (matrix, 'cm')]    # push state & apply matrix
            oplist.extend(form_ops)                 # add form contents
            oplist.append(([], 'Q'))                # restore original state
            self.formdrawings[name] = self.ProcessOperators(oplist, pdf_fonts)
        dlist.extend(self.formdrawings[name])
    elif subtype == '/Image':
        # /BitsPerComponent is deliberately not read: it was unused, and
        # looking it up raised KeyError for images that legally omit it.
        width = stream['/Width']
        height = stream['/Height']
        filters = stream["/Filter"]
        item = self.AddBitmap(stream._data, width, height, filters)
        if item:            # may be unimplemented
            dlist.append(item)
    return dlist
def extractOperators(self):
    """Return the operations of this page's content stream as a new list."""
    stream = self["/Contents"].getObject()
    if not isinstance(stream, ContentStream):
        stream = ContentStream(stream, self.pdf)
    return list(stream.operations)
def extractOperators(self):
    """
    Collect every command of the content stream, in stream order.

    Used by pdfviewer.

    :return: a new list holding the entries of the stream's ``operations``.
    """
    raw = self["/Contents"].getObject()
    stream = raw if isinstance(raw, ContentStream) else ContentStream(raw, self.pdf)
    return [operation for operation in stream.operations]
def search(self):
    """Feed the text of every page to ``self.tables`` and return its tables.

    For each page the fonts are registered with ``self.converter``, then the
    content stream is walked.  The walk remembers the operands of the last
    ``re`` operator, the last ``Tf`` font and the last coordinates taken from
    ``Td`` / ``cm``; every operation that yields text is passed to
    ``self.tables.process`` together with those remembered values.

    :return: the result of ``self.tables.get_tables()``.
    """
    from PyPDF2.pdf import ContentStream
    for num in range(self.pdf.getNumPages()):
        page = self.pdf.getPage(num)
        self.converter.process_fonts(num, page)
        content = page["/Contents"].getObject()
        if not isinstance(content, ContentStream):
            content = ContentStream(content, page)
        last_font = None
        last_x = None
        last_y = None
        # re = rectangle (operands of the last ``re`` operator).
        # NOTE(review): this local shadows the name of the stdlib ``re``
        # module inside this method.
        re = None
        for operands, operator in content.operations:
            # text is reset for every operation, so each call to
            # tables.process receives only this operation's text
            text = u_("")
            if operator == b_("re"):
                re = operands
            elif operator == b_("Tf"):
                last_font = operands[0]
            elif operator == b_("Tj") or operator == b_("TJ"):
                text += self.converter.process_text_objects(
                    operands, last_font)
            elif operator == b_("T*"):
                text += "\n"
            elif operator == b_("'"):
                text += "\n"
                _text = operands[0]
                if isinstance(_text, TextStringObject):
                    text += operands[0]
            elif operator == b_('"'):
                _text = operands[2]
                if isinstance(_text, TextStringObject):
                    text += "\n"
                    text += _text
            elif operator == b_("Td"):  # text coordinates
                last_x, last_y = operands
            elif operator == b_("cm"):  # text coordinates
                # only the last two operands of the matrix are kept
                *_, last_x, last_y = operands
            if text:
                # print(text)
                self.tables.process(re, text, last_x, last_y)
                # re = None
    # NOTE(review): whether this clean-up ran once per page or once after all
    # pages could not be determined from the source layout — confirm.
    CashObject().clean()
    return self.tables.get_tables()
def process_content(self):
    """Blank out matching text on every page and add the pages to the writer.

    For each page of ``self.reader`` the first operand of every ``Tj`` /
    ``TJ`` operation is tested with ``any_match`` against
    ``self.remove_list``; matches are printed and replaced by an empty
    ``TextStringObject``.  The parsed stream then becomes the page's
    ``/Contents`` and the page is appended to ``self.writer``.
    """
    page_count = self.reader.getNumPages()
    for page_num in range(page_count):
        page = self.reader.getPage(page_num)
        stream = ContentStream(page["/Contents"].getObject(), self.reader)
        for operands, operator in stream.operations:
            if operator not in (b_("TJ"), b_("Tj")):
                continue
            text = operands[0]
            if any_match(text, self.remove_list):
                print(text)
                operands[0] = TextStringObject('')
        page[NameObject('/Contents')] = stream
        self.writer.addPage(page)
def extractOperators(self):
    """
    Locate and return all commands in the order they occur in the content
    stream.

    Operators stored as ``bytes`` are decoded to ``str``; operators that are
    already text are returned unchanged.

    :return: list of ``(operands, operator)`` tuples.
    """
    content = self["/Contents"].getObject()
    if not isinstance(content, ContentStream):
        content = ContentStream(content, self.pdf)
    ops = []
    for op in content.operations:
        operands, operator = op[0], op[1]
        # The previous test, ``type(op[1] == bytes)``, took the type of a
        # boolean and was therefore always true, so ``.decode()`` was called
        # even on operators that are not bytes.
        if isinstance(operator, bytes):
            operator = operator.decode()
        ops.append((operands, operator))
    return ops
def removeWatermark(self):
    """Debug pass over one hard-coded page (index 399) of ``self.pdfObj``.

    Prints the document info and the page's extracted text, walks the ``TJ``
    operations without changing them, and stores the parsed stream back as
    the page's ``/Contents``.
    """
    print(self.pdfObj.getDocumentInfo())
    # Only page index 399 is visited; widen the range to cover the document.
    for page_index in range(399, 400):
        page = self.pdfObj.getPage(page_index)
        print(page.extractText().encode('latin-1'))
        stream = ContentStream(page["/Contents"].getObject(), self.pdfObj)
        for operands, operator in stream.operations:
            if operator != b_("TJ"):
                continue
            first_piece = operands[0][0]
            if isinstance(first_piece, TextStringObject):
                # placeholder: nothing is done with the text yet
                pass
        page[NameObject('/Contents')] = stream
        print("\n\n")
def swapColor(self, pageIndex, fromColor, toColor):
    """
    Replace every colour-setting operation that uses ``fromColor`` with
    ``toColor`` on a single page.

    :param pageIndex: index of evaluated page
    :param fromColor: color which will be substituted
    :param toColor: destination color
    :return: None; progress and the number of replacements are printed.
    """
    if pageIndex >= self.getNumPages():
        print("That page doesn't exist")
        return

    print("Evaluating page no. %d..." % pageIndex)
    page = self.getPage(pageIndex)
    stream = page["/Contents"].getObject()
    if not isinstance(stream, ContentStream):
        stream = ContentStream(stream, page.pdf)

    replaced = 0
    for position, operation in enumerate(stream.operations):
        operands, operator = operation[0], operation[1]
        matches = False
        if operator == b_("cs"):
            self.printDebug("Nonstroking color space.")
        elif operator in self.operators:
            # The evaluator is chosen by the number of colour components.
            component_count = len(operands)
            if component_count == 3:
                matches = self._evaluateColor3(operator, operands, fromColor)
            elif component_count == 1:
                matches = self._evaluateColor1(operator, operands, fromColor)
            elif component_count == 4:
                matches = self._evaluateColor4(operator, operands, fromColor)

        if not matches:
            continue
        replaced += 1
        # When _removeCSRef reports success the command to rewrite is the
        # one just before this position.
        if self._removeCSRef(stream, position):
            target = position - 1
        else:
            target = position
        self._swapColorCmd(stream, target, toColor)

    page[NameObject("/Contents")] = stream
    page.compressContentStreams()
    print("Replaced %d references of given color.\n" % replaced)
def extractText_patch(self):
    """
    Extract the page text in content-stream order, inserting a space for
    large negative adjustments inside ``TJ`` arrays.

    This works well for some PDF files, but poorly for others, depending on
    the generator used.  Do not rely on the order of the text coming out of
    this function, as it will change if the function is made more
    sophisticated.

    :return: a unicode string object.
    """
    stream = self["/Contents"].getObject()
    if not isinstance(stream, ContentStream):
        stream = ContentStream(stream, self.pdf)

    # Only TextStringObjects are kept: ByteStringObjects are strings whose
    # byte->string encoding was unknown and would come out as gibberish.
    pieces = []
    for operands, operator in stream.operations:
        if operator == b_("Tj"):
            shown = operands[0]
            if isinstance(shown, TextStringObject):
                pieces.append(shown)
        elif operator == b_("T*"):
            pieces.append("\n")
        elif operator == b_("'"):
            pieces.append("\n")
            shown = operands[0]
            if isinstance(shown, TextStringObject):
                pieces.append(shown)
        elif operator == b_('"'):
            shown = operands[2]
            if isinstance(shown, TextStringObject):
                pieces.extend(("\n", shown))
        elif operator == b_("TJ"):
            for item in operands[0]:
                if isinstance(item, TextStringObject):
                    pieces.append(item)
                elif isinstance(item, NumberObject) and item < -125:
                    # adjustments below -125 are treated as a word gap
                    pieces.append(" ")
            pieces.append("\n")
    return u_("").join(pieces)
def extract_text_blocks(self) -> list:
    """
    Return the operations found inside each text object of the page.

    Every text block in a PDF begins with BT (begin text) and ends with ET
    (end text); each returned item is the slice of operations strictly
    between one BT and the ET that follows it.

    An ET without a pending BT is ignored.  The index of the pending BT is
    tracked with ``None`` as the "not inside a text object" marker, so a BT
    that is the very first operation (index 0) is handled like any other.

    :return: list of lists of ``(operands, operator)`` entries.
    """
    content = self.pageObject["/Contents"].getObject()
    if not isinstance(content, ContentStream):
        content = ContentStream(content, self.pageObject.pdf)

    begin_text = utils.b_("BT")
    end_text = utils.b_("ET")
    operations = content.operations
    blocks = []
    start = None  # index of the pending BT, None while outside a text object
    for index, operation in enumerate(operations):
        operator = operation[1]
        if operator == begin_text:
            start = index
        elif operator == end_text and start is not None:
            blocks.append(operations[start + 1:index])
            start = None
    return blocks
def dopage(page):
    """Extract the text of ``page`` and return its translation.

    The text is gathered from the text-showing operators (a space follows
    each shown string, and ``rg`` / ``T*`` / ``'`` start a new line), split
    on ``'. '`` and every piece is passed through ``translate``.  Pieces
    that fail to translate are reported with ``print`` and skipped.

    :param page: page object whose ``/Contents`` is read; the module-level
        ``pdf`` is used as the owning document when the stream is parsed.
    :return: the translated pieces, each followed by a newline.
    """
    content = page["/Contents"].getObject()
    if not isinstance(content, ContentStream):
        content = ContentStream(content, pdf)
    text = u_("")
    for operands, operator in content.operations:
        if operator == b_("Tj"):
            _text = operands[0]
            if isinstance(_text, TextStringObject):
                text += _text + " "
        elif operator == b_("rg"):
            text += "\n"
        elif operator == b_("T*"):
            text += "\n"
        elif operator == b_("'"):
            text += "\n"
            _text = operands[0]
            if isinstance(_text, TextStringObject):
                text += _text + " "
        elif operator == b_('"'):
            _text = operands[2]
            if isinstance(_text, TextStringObject):
                text += _text + " "
        elif operator == b_("TJ"):
            for i in operands[0]:
                if isinstance(i, TextStringObject):
                    text += i
            text += " "

    results = ''
    for sentence in text.split('. '):
        try:
            results = results + translate(str(sentence)) + "\n"
        except Exception as e:
            # Best effort: a piece that cannot be translated is reported and
            # left out.  ``print(e)`` is valid on both Python 2 and 3,
            # unlike the statement form ``print e``.
            print(e)
    return results
def read_unicode(self, obj):
    """Build a code -> character table from ``bfchar`` / ``bfrange`` entries.

    ``obj`` is parsed as a content stream and every ``endbfchar`` /
    ``endbfrange`` operation is read: its operands are taken in groups of
    two (``endbfchar``) or three (``endbfrange``).  The first element of a
    group becomes the key (hex string); the last element, passed through
    ``convert``, becomes the value.

    :param obj: object to parse; ``obj.pdf`` is passed as the owning document.
    :return: dict of the collected entries.  It is empty when ``obj`` is
        already a ``ContentStream`` or when parsing raises ``ValueError``.
    """
    from PyPDF2.pdf import ContentStream
    content = None
    table = {}
    # NOTE(review): an ``obj`` that already is a ContentStream is never
    # assigned to ``content``, so it yields an empty table — confirm that
    # this is intended.
    if not isinstance(obj, ContentStream):
        try:
            content = ContentStream(obj, obj.pdf)
        except ValueError:
            pass
    if content is not None:
        for operands, operator in content.operations:
            if operator == b'endbfchar' or operator == b'endbfrange':
                count_el = 2 if operator == b'endbfchar' else 3
                # table has two or three elements
                for index in range(0, len(operands), count_el):
                    key = operands[index]
                    if not isinstance(key, ByteStringObject):
                        key = key.get_original_bytes()
                    key = key.hex()
                    # the mapped value is always the last element of the group
                    value = operands[index + count_el - 1]
                    if not isinstance(value, ByteStringObject):
                        value = value.get_original_bytes()
                    value = convert(value)
                    table[key] = value
                    if count_el == 3 and operands[index] != operands[index + 1]:
                        start = operands[index].get_original_bytes().hex()
                        end = operands[index + 1].get_original_bytes().hex()
                        # sometimes a range of values is given, so the font
                        # table is extended dynamically: each following code
                        # maps to the next character.
                        # NOTE(review): keys built from hex(i) carry no zero
                        # padding, unlike the .hex() keys above — confirm
                        # that lookups account for this.
                        for i in range(
                                int(start, 16) + 1, int(end, 16) + 1):
                            key = hex(i).split('x')[-1]
                            value = chr(ord(value) + 1)
                            table[key] = value
    return table
def remove_text_from_normal_page(self, pg, pdf):
    """Blank out text operations that start with the watermark text.

    Every ``Tj`` / ``TJ`` operation of the page is inspected.  For ``TJ``
    the ``TextStringObject`` pieces of the array are joined before the
    comparison; for ``Tj`` the single string operand is used.  When the text
    starts with ``self.wmtext`` the first operand is replaced by an empty
    ``TextStringObject``.  The parsed stream then becomes the page's
    ``/Contents``.  If nothing was removed and ``self.form`` is truthy, the
    page is handed to ``self.remove_form_from_normal_page``.

    :param pg: page to clean (modified in place).
    :param pdf: document passed to ``ContentStream`` as the owner.
    :return: the cleaned page.
    """
    content_object = pg["/Contents"].getObject()
    content = ContentStream(content_object, pdf)
    flag = False
    for operands, operator in content.operations:
        if operator not in [b_('TJ'), b_('Tj')]:
            continue
        first = operands[0]
        if isinstance(first, list):
            # isinstance (not ``type(...) is list``) so that list subclasses
            # such as PyPDF2's ArrayObject are recognised.  The joined pieces
            # are all TextStringObjects, so the result is safe to compare.
            text = ''.join(
                piece for piece in first
                if isinstance(piece, TextStringObject))
        elif isinstance(first, TextStringObject):
            text = first
        else:
            # byte strings with unknown encoding cannot be compared
            continue
        if text.startswith(self.wmtext):
            operands[0] = TextStringObject('')
            flag = True
    pg[NameObject('/Contents')] = content
    if not flag and self.form:
        pg = self.remove_form_from_normal_page(pg)
    return pg
def customExtractText(self):
    """Return the page text on a single line with normalised whitespace.

    A space is inserted for ``TJ`` adjustments below -100 and whenever a
    ``TD`` / ``Tm`` operator follows text that does not already end in a
    space or newline.  Finally ``" - "`` is tightened to ``"-"`` and every
    run of whitespace is collapsed to one space.
    """
    stream = self["/Contents"].getObject()
    if not isinstance(stream, ContentStream):
        stream = ContentStream(stream, self.pdf)

    # Only TextStringObjects are used: ByteStringObjects are strings whose
    # byte->string encoding was unknown and would come out as gibberish.
    result = u_("")
    for operands, operator in stream.operations:
        if operator == b_("Tj"):
            shown = operands[0]
            if isinstance(shown, TextStringObject):
                result += shown
        elif operator == b_("T*"):
            result += "\n"
        elif operator == b_("'"):
            result += "\n"
            shown = operands[0]
            if isinstance(shown, TextStringObject):
                result += shown
        elif operator == b_('"'):
            shown = operands[2]
            if isinstance(shown, TextStringObject):
                result += "\n"
                result += shown
        elif operator == b_("TJ"):
            for item in operands[0]:
                if isinstance(item, TextStringObject):
                    result += item
                elif isinstance(item, (FloatObject, NumberObject)):
                    if item < -100:
                        result += " "
        elif operator in (b_("TD"), b_("Tm")):
            if result and result[-1] not in (" ", "\n"):
                result += " "

    result = result.replace(" - ", "-")
    return re.sub("\\s+", " ", result)
def alt_extractText(self):
    """
    Locate text and include newline characters.

    ``pic`` is appended after every string shown with ``Tj`` and ``tic`` is
    appended for every operation that is not one of the handled text
    operators.

    :return: a unicode string object.
    """
    pic = ""    # marker added after each Tj string (currently empty)
    tic = "~"   # marker added for every other operator
    text = u_("")
    content = self["/Contents"].getObject()
    if not isinstance(content, ContentStream):
        content = ContentStream(content, self.pdf)
    for operands, operator in content.operations:
        if operator == b_("Tj"):
            _text = operands[0]
            if isinstance(_text, TextStringObject):
                text += _text + pic
        elif operator == b_("T*"):
            text += "\n"
        elif operator == b_("'"):
            text += "\n"
            _text = operands[0]
            if isinstance(_text, TextStringObject):
                text += operands[0]
        elif operator == b_('"'):
            _text = operands[2]
            if isinstance(_text, TextStringObject):
                text += "\n"
                text += _text
        elif operator == b_("TJ"):
            for i in operands[0]:
                if isinstance(i, TextStringObject):
                    text += i
            text += "\n"
        # NOTE(review): the source layout does not show which statement this
        # ``else`` belongs to; it is attached to the operator chain here —
        # confirm against the intended output.
        else:
            text += tic
    return text
def extractText_alt_PageObject(self, Tj_sep="\n"):
    """
    Extract the page text, putting ``Tj_sep`` in front of every string shown
    with ``Tj``.

    :param Tj_sep: separator added before each ``Tj`` string (a newline by
        default).
    :return: a unicode string object.
    """
    stream = self["/Contents"].getObject()
    if not isinstance(stream, ContentStream):
        stream = ContentStream(stream, self.pdf)

    # Only TextStringObjects are used: ByteStringObjects are strings whose
    # byte->string encoding was unknown and would come out as gibberish.
    result = u_("")
    for operands, operator in stream.operations:
        if operator == b_("Tj"):
            shown = operands[0]
            if not isinstance(shown, TextStringObject):
                continue
            result += Tj_sep
            result += shown
        elif operator == b_("T*"):
            result += "\n"
        elif operator == b_("'"):
            result += "\n"
            shown = operands[0]
            if isinstance(shown, TextStringObject):
                result += shown
        elif operator == b_('"'):
            shown = operands[2]
            if isinstance(shown, TextStringObject):
                result += "\n"
                result += shown
        elif operator == b_("TJ"):
            for item in operands[0]:
                if isinstance(item, TextStringObject):
                    result += item
            result += "\n"
    return result
pdf = PdfFileReader(pdf_file) pages_per_part = math.floor(pdf.getNumPages() / parts) rule_index = 0 current_text = '' current_color = 'White' current_rule_actions = [] overflowing = None is_on_speech_person_font = False for part in range(parts): script = base_script for i in range(1 + pages_per_part * part, pages_per_part * (part + 1)): page: PageObject = pdf.getPage(i) contentStream = ContentStream(page.getContents().getObject(), pdf) for operands, operator in contentStream.operations: if operator == b'Tf': is_on_speech_person_font = operands[0] == '/F1' elif operator == b'Tj': text = operands[0].strip() if text.strip() in ('', ':') or re.match( r'Page \d+/123', text): # is ignored continue elif re.match(r'^=*[A-Z0-9# ]+(?::|=+)$', text) or is_on_speech_person_font: # is speech person if len(current_rule_actions) != 0: current_rule_actions.append(None)
def pageOperations(page):
    """Return ``contentOperations`` for the decoded contents of ``page``."""
    stream = page.getContents().getObject()
    # Trigger decoding before ``decodedSelf`` is read
    stream.getData()
    parsed = ContentStream(stream.decodedSelf, page.pdf)
    return contentOperations(parsed)
def removeWordStyle(self, ignoreByteStringObject=False):
    """
    Removes imported styles from Word - Path Constructors rectangles - from
    this output.

    For every page listed under ``/Kids`` the content stream is rebuilt
    operation by operation: rgb/cmyk colour operands are replaced by black
    (text) or white (rectangles), and ``re`` rectangles whose size falls in
    the "highlight" range are dropped.  The rebuilt stream is stored back as
    the page's ``/Contents``.

    :param bool ignoreByteStringObject: optional parameter
        to ignore ByteString Objects.  When true, text operands that are not
        ``TextStringObject`` instances are replaced by an empty
        ``TextStringObject``.
    """
    pages = self.getObject(self._pages)['/Kids']
    for j in range(len(pages)):
        page = pages[j]
        pageRef = self.getObject(page)
        content = pageRef['/Contents'].getObject()
        if not isinstance(content, ContentStream):
            content = ContentStream(content, pageRef)

        # operations that survive the filtering below
        _operations = []
        # size operand of the last Tf whose font name starts with '/F'
        last_font_size = 0

        for operator_index, (operands, operator) in enumerate(content.operations):
            if operator == b_('Tf') and operands[0][:2] == '/F':
                last_font_size = operands[1].as_numeric()

            if operator == b_('Tj'):
                text = operands[0]
                if ignoreByteStringObject:
                    if not isinstance(text, TextStringObject):
                        operands[0] = TextStringObject()
            elif operator == b_("'"):
                text = operands[0]
                if ignoreByteStringObject:
                    if not isinstance(text, TextStringObject):
                        operands[0] = TextStringObject()
            elif operator == b_('"'):
                text = operands[2]
                if ignoreByteStringObject:
                    if not isinstance(text, TextStringObject):
                        operands[2] = TextStringObject()
            elif operator == b_("TJ"):
                for i in range(len(operands[0])):
                    if ignoreByteStringObject:
                        if not isinstance(operands[0][i], TextStringObject):
                            operands[0][i] = TextStringObject()

            operator_type = self._getOperatorType(operator)

            # we are ignoring all grayscale colors
            # tests showed that black underlines, borders and tables are
            # defined by grayscale and aren't using rgb/cmyk colors
            if operator_type == 'rgb' or operator_type == 'cmyk':
                color_target_operation_type = self._getColorTargetOperationType(
                    operator_index, content.operations)

                new_color = None

                # we are coloring all text in black and all rectangles in white
                # removing all colors paints rectangles in black which gives
                # us unwanted results
                if color_target_operation_type == 'text':
                    new_color = 'black'
                elif color_target_operation_type == 'rectangle':
                    new_color = 'white'

                if new_color:
                    # only the local name is rebound; the replacement
                    # operands are what gets appended below
                    operands = self.colors_operands[operator_type][
                        new_color]

            # remove styled rectangles (highlights, lines, etc.)
            # the 're' operator is a Path Construction operator, creates a
            # rectangle; presumably, that's the way Word embeds all of its
            # graphics into a PDF when creating one
            if operator == b_('re'):
                rectangle_width = operands[-2].as_numeric()
                rectangle_height = operands[-1].as_numeric()

                minWidth = self.getMinimumRectangleWidth(
                    last_font_size, 1)  # (length of X letters at the current size)
                maxHeight = last_font_size + 6  # range to catch really big highlights
                minHeight = 1.5  # so that thin lines will not be removed

                # remove only style that:
                # its width is bigger than the minimum
                # its height is smaller than maximum and larger than minimum
                if rectangle_width > minWidth and rectangle_height > minHeight and rectangle_height <= maxHeight:
                    # skipping the append below drops this rectangle
                    continue

            _operations.append((operands, operator))

        content.operations = _operations
        pageRef.__setitem__(NameObject('/Contents'), content)
""" loc = np.array([i.as_numeric() for i in location]) diff = np.abs(loc - target) return np.any(diff.max(axis=1) < epsilon) # 从结果看,MS-Word加的水印,有些指令混在正常数据中,需要更精细调试处理 with open(with_wm_path, 'rb') as f, open(nowm_out_path, 'wb') as f_out: pdf = PdfFileReader(f) pdf_out = PdfFileWriter() # print(pdf.getDocumentInfo()) cn_pages = pdf.getNumPages() for i in range(cn_pages): page = pdf.getPage(i) content = page.getContents() cs = ContentStream(content, pdf) for operands, operator in cs.operations: # `b_`只是python2/3中bytes类型转换的冗余代码 if operator == b_('Tm') and match_location(operands, TARGET_TXT): operands[:] = [] elif operator == b_('cm') and match_location(operands, TARGET_IMG): operands[:] = [] elif operator == b_('gs'): if operands[0] == '/GS0': operands[:] = [] elif operator == b_('Do'): # 引用图片名称 if operands[0] == '/Im0': pass elif operands[0] == '/Fm0': operands[:] = []
def extractText(self, skip_intertwined_text=True):
    """
    Locate all text drawing commands, in the order they are provided in the
    content stream, and extract the text, approximating the layout with
    spaces and newlines.

    This works well for some PDF files, but poorly for others, depending on
    the generator used.  Do not rely on the order of text coming out of this
    function, as it will change if this function is made more sophisticated.

    :param bool skip_intertwined_text: when true, the ``TJ`` that follows a
        same-line ``Td`` positioned left of the text already emitted is
        skipped instead of being appended.
    :return: a unicode string object.
    """
    text = u_("")
    content = self["/Contents"].getObject()
    if not isinstance(content, ContentStream):
        content = ContentStream(content, self.pdf)
    # Note: we check all strings are TextStringObjects.  ByteStringObjects
    # are strings where the byte->string encoding was unknown, so adding
    # them to the text here would be gibberish.

    # Running indentation; must exist before the first ``Td`` adds to it.
    indent = 0
    # Width (in characters) of the text emitted since the last line break.
    previous_width = 0
    skip_next = False
    for operands, operator in content.operations:
        if not operands:  # Empty operands list contributes no text
            operands = [""]
        if operator == b_("Tj"):
            _text = operands[0]
            if isinstance(_text, TextStringObject):
                text += _text
        elif operator == b_("T*"):
            text += "\n"
        elif operator == b_("'"):
            text += "\n"
            _text = operands[0]
            if isinstance(_text, TextStringObject):
                text += operands[0]
        elif operator == b_('"'):
            _text = operands[2]
            if isinstance(_text, TextStringObject):
                text += "\n"
                text += _text
        elif operator == b_("TJ"):
            if skip_intertwined_text and skip_next:
                skip_next = False
            else:
                for i in operands[0]:
                    if isinstance(i, TextStringObject):
                        text += i
                        previous_width += len(i)
                    elif isinstance(i, (FloatObject, NumberObject)):
                        if text and text[-1] not in " \n":
                            # one space per -100 units of adjustment
                            text += " " * int(i / -100)
                            previous_width += int(i / -100)
        elif operator == b_("Td"):
            indent = indent + operands[0]
            if operands[1] == 0:
                # same line: pad up to the new horizontal position
                if int(operands[0] / 20) >= previous_width:
                    text += " " * (int(operands[0] / 20) - previous_width)
                else:
                    skip_next = True
                    # If skip_intertwined_text is false, this will result in
                    # no space between the two 'lines'
            else:
                previous_width = 0
                text += "\n" * max(0, int(operands[1] / -50)) + " " * max(
                    0, int(indent / 20))
        elif operator == b_("Tm"):
            indent = operands[4]
            text += " " * max(0, int(indent / 20))
        elif operator == b_("TD"):
            # ``Tm`` is fully handled by the branch above, so only ``TD``
            # can reach this point.
            if text and text[-1] not in " \n":
                text += " "
    return text
def match_location(location, target, epsilon=1e-5):
    """Return True when ``location`` matches any row of ``target``.

    ``location`` is converted with ``as_numeric()`` element by element; a row
    matches when the largest absolute difference between it and
    ``location`` is below ``epsilon``.
    """
    # target must be an n*6 numpy matrix
    return np.any(
        np.abs(np.array([i.as_numeric()
                         for i in location]) - target).max(axis=1) < epsilon)


# Empty the operands of every ``cm`` operation whose matrix matches one of
# ``target_locations``.
for p in range(source.getNumPages()):
    page = source.getPage(p)
    # print(page.extractText())
    #content_object, = page["/Contents"][0].getObject()
    # NOTE(review): only the second entry of /Contents is parsed — confirm
    # that every page stores its contents as an array with that layout.
    content_object = page["/Contents"][1]
    content = ContentStream(content_object, source)
    for operands, operator in content.operations:
        # print(operator, operands)  # type and value of the pdf element

        # The main code is here: use whatever means work to find the
        # features that identify the watermark.
        # if operator == b_("TJ"):  # `b_` is just redundant py2/3 bytes-conversion code
        #     text = operands[0][0]
        #     # if isinstance(text, bytes):
        #     #     print('==== ', text, ' ====')
        #     #     for c in guess_codes(text):
        #     #         print(c, text.decode(c))
        #     if isinstance(text, TextStringObject) and text in target_str:
        #         operands[0] = TextStringObject('')

        if operator == b_("cm") and match_location(operands, target_locations):
            operands[:] = []
def extract_ops(page: PageObject) -> List[Tuple]:
    """Return the content-stream operations of ``page`` as a new list."""
    raw = page.getContents()
    stream = raw if isinstance(raw, ContentStream) else ContentStream(raw, page.pdf)
    return [operation for operation in stream.operations]
def generateNup(inPathOrFile, n, outPathPatternOrFile=None, dirs="RD",
                verbose=False):
    """Generate a N-up document version.

    If outPathPatternOrFile is None, the output will be written
    in a file named after the input file.

    :param inPathOrFile: path of the source PDF or a file-like object.
    :param n: number of source pages per output page; must satisfy
        ``isSquare(n)`` or ``isHalfSquare(n)``.
    :param outPathPatternOrFile: output path pattern (``%(dirname)s``,
        ``%(basename)s``, ``%(base)s``, ``%(ext)s`` and ``%(n)d`` are
        substituted) or a file-like object.
    :param dirs: passed to ``calcRects`` together with the page size and n.
    :param verbose: when true, print where the output was written.
    :return: the output path.
    """
    assert isSquare(n) or isHalfSquare(n)

    ipof = inPathOrFile
    oppof = outPathPatternOrFile

    # Resolve the input and output arguments into open files.
    # NOTE(review): files opened here are never closed in this function.
    if isFileLike(ipof):
        inFile = ipof
        if oppof is None:
            raise AssertionError("Must specify output for file input!")
        elif isFileLike(oppof):
            outFile = oppof
        elif type(oppof) in (str,):
            outPath = oppof
            outFile = open(outPath, "wb")
    elif type(ipof) in (str,):
        inFile = open(ipof, "rb")
        if isFileLike(oppof):
            outFile = oppof
        elif oppof is None or type(oppof) in (str,):
            if oppof is None:
                oppof = "%(dirname)s/%(base)s-%(n)dup%(ext)s"
            aDict = {
                "dirname": os.path.dirname(inPathOrFile) or ".",
                "basename": os.path.basename(inPathOrFile),
                "base": os.path.basename(os.path.splitext(inPathOrFile)[0]),
                "ext": os.path.splitext(inPathOrFile)[1],
                "n": n,
            }
            outPath = oppof % aDict
            outPath = os.path.normpath(outPath)
            outFile = open(outPath, "wb")

    # get info about source document
    docReader = PdfFileReader(inFile)
    numPages = docReader.getNumPages()
    oldPageSize = docReader.getPage(0).mediaBox.upperRight

    # create empty output document buffer
    if isSquare(n):
        newPageSize = oldPageSize
    elif isHalfSquare(n):
        # swap width and height
        newPageSize = oldPageSize[1], oldPageSize[0]
    np = numPages // n + numPages % n
    buf = exP1multiN(_mtA4Pdf, newPageSize, np)

    # calculate mini page areas
    rects = calcRects(newPageSize, n, dirs)

    # combine: one op per source page, recording its destination page
    # (i // n) and destination rectangle (rects[i % n])
    ops = []
    newPageNum = -1
    for i in range(numPages):
        if i % n == 0:
            newPageNum += 1
        op = (inPathOrFile, i, (0, 0, None, None), i // n, rects[i % n])
        ops.append(op)

    srcr = srcReader = PdfFileReader(inFile)
    srcPages = [srcr.getPage(i) for i in range(srcr.getNumPages())]

    # NOTE(review): this handle is replaced by the "wb" open further down
    # without being used in between.
    if type(oppof) in (str,):
        outFile = open(outPath, "rb")
    outr = outReader = PdfFileReader(buf)
    outPages = [outr.getPage(i) for i in range(outr.getNumPages())]
    output = PdfFileWriter()

    # group the ops by destination page number
    mapping = {}
    for op in ops:
        dummy, dummy, dummy, destPageNum, dummy = op
        if destPageNum not in mapping:
            mapping[destPageNum] = []
        mapping[destPageNum].append(op)

    PO, AO, DO, NO = PageObject, ArrayObject, DictionaryObject, NameObject

    for destPageNum, ops in list(mapping.items()):
        for op in ops:
            inPathOrFile, srcPageNum, srcRect, destPageNum, destRect = op
            page2 = srcPages[srcPageNum]
            page1 = outPages[destPageNum]
            pageWidth, pageHeight = page2.mediaBox.upperRight
            destX, destY, destWidth, destHeight = destRect
            xScale, yScale = calcScalingFactors(
                destWidth, destHeight, pageWidth, pageHeight)

            # merge the resource dictionaries of both pages
            newResources = DO()
            rename = {}
            orgResources = page1["/Resources"].getObject()
            page2Resources = page2["/Resources"].getObject()

            names = "ExtGState Font XObject ColorSpace Pattern Shading"
            for res in names.split():
                res = "/" + res
                new, newrename = PO._mergeResources(orgResources,
                                                    page2Resources, res)
                if new:
                    newResources[NO(res)] = new
                    rename.update(newrename)

            # /ProcSet is the union of both pages' sets
            newResources[NO("/ProcSet")] = AO(
                frozenset(orgResources.get("/ProcSet", AO()).getObject()).union(
                    frozenset(page2Resources.get("/ProcSet", AO()).getObject())
                )
            )

            newContentArray = AO()
            orgContent = page1["/Contents"].getObject()
            newContentArray.append(PO._pushPopGS(orgContent, page1.pdf))
            page2Content = page2['/Contents'].getObject()
            page2Content = PO._contentStreamRename(page2Content, rename,
                                                   page1.pdf)
            page2Content = ContentStream(page2Content, page1.pdf)
            # wrap the source page in q ... Q and place it with a cm matrix
            page2Content.operations.insert(0, [[], "q"])

            # handle rotation
            try:
                rotation = page2["/Rotate"].getObject()
            except KeyError:
                rotation = 0

            if rotation in (180, 270):
                dw, dh = destWidth, destHeight
                arr = [-xScale, 0, 0, -yScale, destX + dw, destY + dh]
            elif rotation in (0, 90):
                arr = [xScale, 0, 0, yScale, destX, destY]
            else:
                # treat any other (illegal) rotation as 0
                arr = [xScale, 0, 0, yScale, destX, destY]

            arr = [FloatObject(str(x)) for x in arr]
            page2Content.operations.insert(1, [arr, "cm"])
            page2Content.operations.append([[], "Q"])
            newContentArray.append(page2Content)
            page1[NO('/Contents')] = ContentStream(newContentArray, page1.pdf)
            page1[NO('/Resources')] = newResources

        output.addPage(page1)

    if type(oppof) in (str,):
        outFile = open(outPath, "wb")
    output.write(outFile)

    if verbose:
        if type(oppof) in (str,):
            print(("written: %s" % outPath))
        # NOTE(review): this tests the name ``isFileLike`` itself rather than
        # calling it — probably meant ``isFileLike(oppof)``.
        elif isFileLike:
            print("written to file-like input parameter")

    # NOTE(review): ``outPath`` is only assigned in the string-path branches
    # above; with a file-like output this line looks like it would fail —
    # confirm.
    return outPath
def update(filename, template_page):
    """Overlay the pages of ``filename`` on ``template_page`` and rewrite the file.

    The file is skipped (with a printed reason, leaving it untouched) when
    it is not a ``.pdf``, cannot be opened or parsed, was already processed,
    has no pages, or does not come from one of the two recognised producers.
    Pages produced by "eBridgeToolkit 7.1" are first cleaned up: the font is
    switched to Courier, the covering image is removed and the hidden-text
    flag is dropped.

    The merged document is written to a temporary file first and only then
    copied over ``filename``, so a failure while writing does not damage the
    original.

    :param filename: path of the PDF to update in place.
    :param template_page: page object that each data page is merged onto.
    :return: None
    """
    if not filename.lower().endswith(".pdf"):
        print(f"Not a pdf file: {filename}")
        return

    try:
        source = open(filename, "rb")
    except OSError as e:
        print(f"Can't open {filename}: {e}")
        return

    # The reader is handed the open file, so the handle stays open until the
    # output has been written; ``with`` guarantees it is closed afterwards.
    with source:
        try:
            data = PdfFileReader(source)
        except OSError as e:
            print(f"Can't open {filename}: {e}")
            return
        except PdfReadError as e:
            print(f"{filename} is not a valid PDF file: {e}")
            return

        info = data.getDocumentInfo()
        producer = None
        creator = None
        title = None
        if info:
            producer = info.get("/Producer", None)
            creator = info.get("/Creator", None)
            title = info.get("/Title", None)

        # Check if we've already filled this
        if producer == "PyPDF2" or producer == SELF_PRODUCER:
            print(f"Skipping {filename}: Already added CMS-1500")
            return

        if data.getNumPages() < 1:
            print(f"Skipping {filename}: No pages")
            # Without this return the file would be overwritten by an empty
            # document.
            return

        output = PdfFileWriter()
        output.addMetadata(
            {"/Producer": codecs.BOM_UTF16_BE + SELF_PRODUCER.encode("utf-16be")})

        for page_no in range(data.getNumPages()):
            data_page = data.getPage(page_no)

            # If it's printed through the eBridge printer driver, it has an
            # image of the output with invisible text on top; look for those
            # and strip off the image
            if producer == "eBridgeToolkit 7.1":
                # Set a fixed-width font
                font = data_page[NameObject("/Resources")][NameObject("/Font")]
                if NameObject("/T1_0") not in font:
                    print(
                        f"Skipping {filename}: it does not match the expected format (font name)"
                    )
                    return
                font[NameObject("/T1_0")][NameObject("/BaseFont")] = NameObject(
                    "/Courier")

                # Remove the image that covers everything
                content = ContentStream(data_page["/Contents"].getObject(), data)
                ops = [op[1] for op in content.operations[0:5]]
                if ops != [b"q", b"cm", b"Do", b"Q", b"rg"]:
                    print(
                        f"Skipping {filename}: it does not match the expected format (obscuring image)"
                    )
                    return
                del content.operations[0:5]

                # Remove the flag that makes the text hidden
                if content.operations[2] != ([3], b"Tr"):
                    print(
                        f"Skipping {filename}: it does not match the expected format (font invisible)"
                    )
                    return
                del content.operations[2]

                # Write that back
                data_page[NameObject("/Contents")] = content
            elif creator == "Intergy" and title == "CMSPrePrinted1500212":
                # Nothing to do; these overlay just fine
                pass
            else:
                print(f"Skipping {filename}: Unknown PDF")
                return

            merged_page = copy.copy(template_page)
            merged_page.mergePage(data_page)
            output.addPage(merged_page)

        # Write the output to a temporary file, so that any failures in
        # writing don't affect the original
        with NamedTemporaryFile() as output_file:
            output.write(output_file)
            output_file.flush()
            shutil.copy(output_file.name, filename)

    print(f"Successfully processed {filename}")