def extract_text_objects(page):
    """Generate the text pieces drawn on a page, in content-stream order.

    Adapted from ``PageObject.extractText`` in ``PyPDF2.pdf``.  Every
    ``TextStringObject`` operand of a text-showing operator is yielded
    unchanged; a plain one-character newline string is yielded wherever
    that method would have appended a line break.  Operands that are not
    ``TextStringObject`` instances are skipped.  The order simply follows
    the content stream and may change as PyPDF2 evolves.

    :param page: a PyPDF2 ``PageObject`` that has a ``/Contents`` entry.
    :return: generator of ``TextStringObject`` / newline strings.
    """
    stream = page["/Contents"].getObject()
    if not isinstance(stream, ContentStream):
        stream = ContentStream(stream, page.pdf)

    show = b_("Tj")
    next_line = b_("T*")
    quote = b_("'")
    double_quote = b_('"')
    show_array = b_("TJ")

    for args, op in stream.operations:
        if op == show:
            if isinstance(args[0], TextStringObject):
                yield args[0]
        elif op == next_line:
            yield "\n"
        elif op == quote:
            # ' moves to the next line before showing its string
            yield "\n"
            if isinstance(args[0], TextStringObject):
                yield args[0]
        elif op == double_quote:
            # " carries two spacing operands first; the string is the third
            if isinstance(args[2], TextStringObject):
                yield "\n"
                yield args[2]
        elif op == show_array:
            for piece in args[0]:
                if isinstance(piece, TextStringObject):
                    yield piece
            yield "\n"
def convert_page_to_text(page):
    """Return the text drawn on ``page`` as one unicode string.

    Copied from the PyPDF2 ``extractText`` method, with one difference
    visible here: every extracted string is followed by a single space.

    :param page: object providing ``getContents()`` and a ``pdf`` attribute.
    :return: the concatenated text.
    """
    stream = page.getContents()
    if not isinstance(stream, ContentStream):
        stream = ContentStream(stream, page.pdf)

    # Only TextStringObjects are collected.  ByteStringObjects are strings
    # whose byte->string encoding was unknown, so they would be gibberish.
    pieces = []
    for args, op in stream.operations:
        if op == b_("Tj"):
            if isinstance(args[0], TextStringObject):
                pieces.append(args[0] + ' ')
        elif op == b_("T*"):
            pieces.append("\n")
        elif op == b_("'"):
            pieces.append("\n")
            if isinstance(args[0], TextStringObject):
                pieces.append(args[0] + ' ')
        elif op == b_('"'):
            if isinstance(args[2], TextStringObject):
                pieces.append("\n")
                pieces.append(args[2] + ' ')
        elif op == b_("TJ"):
            for item in args[0]:
                if isinstance(item, TextStringObject):
                    pieces.append(item + ' ')
            pieces.append("\n")
    return u_("").join(pieces)
def getTextByPage(self):
    """Extract the page text and collapse doubled line breaks.

    Walks the ``/Contents`` stream of ``self`` in order, collecting the
    ``TextStringObject`` operands of the text-showing operators and a
    newline for each line-break operator.  Each pair of consecutive
    newlines in the result is then replaced by one (a single pass).

    :return: a unicode string.
    """
    stream = self["/Contents"].getObject()
    if not isinstance(stream, ContentStream):
        stream = ContentStream(stream, self.pdf)

    # ByteStringObjects (unknown byte->string encoding) are deliberately
    # left out: adding them would only produce gibberish.
    chunks = []
    for args, op in stream.operations:
        if op == b_("Tj"):
            shown = args[0]
            if isinstance(shown, TextStringObject):
                chunks.append(shown)
        elif op == b_("T*"):
            chunks.append("\n")
        elif op == b_("'"):
            chunks.append("\n")
            shown = args[0]
            if isinstance(shown, TextStringObject):
                chunks.append(shown)
        elif op == b_('"'):
            shown = args[2]
            if isinstance(shown, TextStringObject):
                chunks.extend(("\n", shown))
        elif op == b_("TJ"):
            chunks.extend(
                item for item in args[0] if isinstance(item, TextStringObject))
            chunks.append("\n")
    return u_("").join(chunks).replace("\n\n", "\n")
def extract_text_objects(page):
    """Generate the text pieces drawn on a page, in content-stream order.

    Adapted from ``PageObject.extractText`` in ``PyPDF2.pdf``.  ``Tj``,
    ``'`` and ``"`` yield their ``TextStringObject`` operand, with a
    newline string emitted for the implied line breaks.  A ``TJ`` array is
    not unpacked here: it is handed to ``blockify`` and the result is
    yielded as a single item, followed by a newline string.  The order
    follows the content stream and may change as PyPDF2 evolves.

    :param page: a PyPDF2 ``PageObject`` that has a ``/Contents`` entry.
    :return: generator of text items and newline strings.
    """
    stream = page["/Contents"].getObject()
    if not isinstance(stream, ContentStream):
        stream = ContentStream(stream, page.pdf)

    show = b_("Tj")
    next_line = b_("T*")
    quote = b_("'")
    double_quote = b_('"')
    show_array = b_("TJ")

    for args, op in stream.operations:
        if op == show:
            if isinstance(args[0], TextStringObject):
                yield args[0]
        elif op == next_line:
            yield "\n"
        elif op == quote:
            yield "\n"
            if isinstance(args[0], TextStringObject):
                yield args[0]
        elif op == double_quote:
            if isinstance(args[2], TextStringObject):
                yield "\n"
                yield args[2]
        elif op == show_array:
            # the whole array goes to blockify instead of being iterated
            yield blockify(args[0])
            yield "\n"
def _evaluateColor3(self, operator, operands, color):
    """Tell whether a three-operand colour operation selects ``color``.

    Only the ``sc`` and ``rg`` operators are considered.  Each operand is
    multiplied by 255 and truncated with ``int`` before being compared
    with ``color.red``, ``color.green`` and ``color.blue``.

    :return: True on an exact match of all three channels, else False.
    """
    if not (operator == b_("sc") or operator == b_("rg")):
        return False
    red, green, blue = (int(operands[position] * 255) for position in range(3))
    return bool(
        red == color.red and green == color.green and blue == color.blue)
def process_content(self):
    """Blank matching text on every page and queue the pages for writing.

    For each page of ``self.reader`` the content stream is parsed; the
    first operand of every ``TJ``/``Tj`` operation is tested with
    ``any_match`` against ``self.remove_list``.  A matching operand is
    printed and replaced by an empty ``TextStringObject``.  The parsed
    stream is stored back as the page's ``/Contents`` and the page is
    added to ``self.writer``.
    """
    text_operators = (b_("TJ"), b_("Tj"))
    contents_key = '/Contents'
    for page_number in range(self.reader.getNumPages()):
        page = self.reader.getPage(page_number)
        stream = ContentStream(page["/Contents"].getObject(), self.reader)
        for args, op in stream.operations:
            if op not in text_operators:
                continue
            shown = args[0]
            if any_match(shown, self.remove_list):
                print(shown)
                args[0] = TextStringObject('')
        page[NameObject(contents_key)] = stream
        self.writer.addPage(page)
def _evaluateColor4(self, operator, operands, color):
    """Tell whether a four-operand colour operation selects ``color``.

    Only the ``sc`` and ``k`` operators are considered.  The four operands
    are passed to ``cmykToRGB`` as cyan, magenta, yellow and black; each
    returned channel is multiplied by 255, truncated with ``int`` and
    compared with ``color.red``, ``color.green`` and ``color.blue``.

    :return: True on an exact match of all three channels, else False.
    """
    if not (operator == b_("sc") or operator == b_("k")):
        return False
    channels = cmykToRGB(operands[0], operands[1], operands[2], operands[3])
    (red, green, blue) = channels
    red, green, blue = int(red * 255), int(green * 255), int(blue * 255)
    return bool(
        red == color.red and green == color.green and blue == color.blue)
def search(self):
    """Feed every text fragment of the document to ``self.tables``.

    For each page, fonts are registered through
    ``self.converter.process_fonts`` and the content stream is walked.
    While walking, the operands of the latest ``re`` (rectangle), the
    latest ``Tf`` font name and the latest coordinates (both operands of
    ``Td``, or the last two operands of ``cm``) are remembered.  Each
    operation that produces non-empty text is reported with
    ``self.tables.process(rectangle, text, x, y)``.

    :return: whatever ``self.tables.get_tables()`` returns.
    """
    from PyPDF2.pdf import ContentStream

    for page_index in range(self.pdf.getNumPages()):
        page = self.pdf.getPage(page_index)
        self.converter.process_fonts(page_index, page)

        stream = page["/Contents"].getObject()
        if not isinstance(stream, ContentStream):
            stream = ContentStream(stream, page)

        current_font = None
        pos_x = None
        pos_y = None
        rectangle = None
        for args, op in stream.operations:
            fragment = u_("")
            if op == b_("re"):
                rectangle = args
            elif op == b_("Tf"):
                current_font = args[0]
            elif op == b_("Tj") or op == b_("TJ"):
                fragment += self.converter.process_text_objects(
                    args, current_font)
            elif op == b_("T*"):
                fragment += "\n"
            elif op == b_("'"):
                fragment += "\n"
                if isinstance(args[0], TextStringObject):
                    fragment += args[0]
            elif op == b_('"'):
                if isinstance(args[2], TextStringObject):
                    fragment += "\n"
                    fragment += args[2]
            elif op == b_("Td"):
                # text coordinates
                pos_x, pos_y = args
            elif op == b_("cm"):
                # text coordinates: only the last two operands are kept
                *_leading, pos_x, pos_y = args
            if fragment:
                self.tables.process(rectangle, fragment, pos_x, pos_y)
    CashObject().clean()
    return self.tables.get_tables()
def _swapColorCmd(self, content, index, toColor):
    """Replace the operation at ``index`` with an ``rg`` colour command.

    The new operation carries the ``red``, ``green`` and ``blue``
    attributes of ``toColor``, each divided by 255.0 and wrapped in a
    ``FloatObject``.  The old entry is popped from ``content.operations``
    and the new one inserted at the same index.
    """
    components = [
        FloatObject((getattr(toColor, channel) / 255.0))
        for channel in ("red", "green", "blue")
    ]
    replacement = (components, b_("rg"))
    operations = content.operations
    operations.pop(index)
    operations.insert(index, replacement)
def build_pdf_preview(
    self,
    file_path: str,
    preview_name: str,
    cache_path: str,
    extension: str = ".pdf",
    page_id: int = -1,
    mimetype: str = "",
) -> None:
    """Build a PDF preview of ``file_path`` inside ``cache_path``.

    The whole document is first converted to an intermediate PDF named
    after the part of ``preview_name`` that precedes ``"-page"``.  When
    ``page_id`` is negative that intermediate file is the result.
    Otherwise the single page ``page_id`` is copied into
    ``<preview_name><extension>``.

    :param file_path: path of the source document.
    :param preview_name: base name of the preview to produce.
    :param cache_path: directory holding intermediate and final files.
    :param extension: suffix appended to ``preview_name`` for one page.
    :param page_id: zero-based page to extract; negative means all pages.
    :param mimetype: passed through to ``self._convert_to_pdf``.
    """
    intermediate_pdf_filename = preview_name.split("-page")[0] + ".pdf"
    intermediate_pdf_file_path = os.path.join(cache_path, intermediate_pdf_filename)
    flag_file_path = intermediate_pdf_file_path + "_flag"

    # Info - B.L - 2018/09/28 - Protection for concurrent file access.
    # If two people try to preview the same file, one would overwrite the
    # file while the other is reading it.  While the flag file exists and
    # the intermediate PDF does not, wait 2 seconds and check again.  This
    # is a loop rather than a recursive retry so a long wait cannot
    # exhaust the interpreter's recursion limit.
    while not os.path.exists(intermediate_pdf_file_path) and os.path.exists(
        flag_file_path
    ):
        time.sleep(2)

    if not os.path.exists(intermediate_pdf_file_path):
        with open(file_path, "rb") as input_stream:
            input_extension = os.path.splitext(file_path)[1]
            # first step is to convert full document to full pdf
            self._convert_to_pdf(
                file_content=input_stream,
                input_extension=input_extension,
                cache_path=cache_path,
                output_filepath=intermediate_pdf_file_path,
                mimetype=mimetype,
            )

    if page_id < 0:
        return  # in this case, the intermediate file is the requested one

    pdf_out = PdfFileWriter()
    with open(intermediate_pdf_file_path, "rb") as pdf_stream:
        # HACK - G.M - 2020-08-19 - Transform stream in a way pypdf2 can handle it
        # this should be removed with a future pdf builder.
        stream = BytesIO(b_(pdf_stream.read()))
    # the reader works on the in-memory copy, so the file can be closed
    pdf_in = utils.get_decrypted_pdf(stream)
    output_file_path = os.path.join(
        cache_path, "{}{}".format(preview_name, extension))
    pdf_out.addPage(pdf_in.getPage(page_id))
    with open(output_file_path, "wb") as output_file:
        pdf_out.write(output_file)
def extractText_PageObject(self):
    """Collect the text of all text-drawing commands, in stream order.

    The result quality depends heavily on the generator that produced the
    PDF.  Do not rely on the ordering of the returned text: it follows the
    content stream and may change if this routine becomes smarter.

    :return: a unicode string object.
    """
    stream = self["/Contents"].getObject()
    if not isinstance(stream, ContentStream):
        stream = ContentStream(stream, self.pdf)

    # Only TextStringObjects are kept.  A ByteStringObject is a string
    # whose byte->string encoding was unknown; it would read as gibberish.
    fragments = []
    emit = fragments.append
    for args, op in stream.operations:
        if op == b_("Tj"):
            if isinstance(args[0], TextStringObject):
                emit(args[0])
        elif op == b_("T*"):
            emit("\n")
        elif op == b_("'"):
            emit("\n")
            if isinstance(args[0], TextStringObject):
                emit(args[0])
        elif op == b_('"'):
            if isinstance(args[2], TextStringObject):
                emit("\n")
                emit(args[2])
        elif op == b_("TJ"):
            for item in args[0]:
                if isinstance(item, TextStringObject):
                    emit(item)
            emit("\n")
    return u_("").join(fragments)
def extract_text_blocks(self) -> list:
    """Return the operations found between each BT/ET pair of the page.

    Every text block in a PDF content stream begins with ``BT`` (begin
    text) and ends with ``ET`` (end text).  Each returned element is the
    slice of ``content.operations`` strictly between one ``BT`` and the
    next ``ET``.

    :return: list of operation lists, one per text block, in stream order.
    """
    blocks = []
    content = self.pageObject["/Contents"].getObject()
    if not isinstance(content, ContentStream):
        content = ContentStream(content, self.pageObject.pdf)

    begin_text = utils.b_("BT")
    end_text = utils.b_("ET")

    # None (not 0) marks "no open block": index 0 is a valid position for
    # BT, and a block opening the stream must not be dropped.
    start = None
    for index, (_operands, operator) in enumerate(content.operations):
        if operator == begin_text:
            start = index
        elif operator == end_text and start is not None:
            blocks.append(content.operations[start + 1:index])
            start = None
        # an ET without a preceding BT is ignored
    return blocks
def process_content_object(self, objects):
    """Append the text of a content stream to ``self.table`` by id.

    The stream built from ``objects`` is walked in order.
    ``self.get_id(operands)`` is consulted for every operation: a
    non-None result becomes the current id and that operation is not
    examined further.  Otherwise ``Tf`` updates the current font and the
    text-showing / line-break operators produce text, which is appended
    to ``self.table[current id]`` once an id is known.  Finally
    ``self.strip_table_spaces()`` is called.
    """
    from PyPDF2.pdf import ContentStream

    stream = ContentStream(objects, self.finder)
    current_id = None
    current_font = None
    for args, op in stream.operations:
        piece = u_("")
        found_id = self.get_id(args)
        if found_id is not None:
            current_id = found_id
        elif op == b_("Tf"):
            current_font = args[0]
        elif op == b_("Tj") or op == b_("TJ"):
            piece += self.converter.process_text_objects(args, current_font)
        elif op == b_("T*"):
            piece += "\n"
        elif op == b_("'"):
            piece += "\n"
            if isinstance(args[0], TextStringObject):
                piece += args[0]
        elif op == b_('"'):
            if isinstance(args[2], TextStringObject):
                piece += "\n"
                piece += args[2]
        if piece and current_id is not None:
            self.table[current_id] += piece
    self.strip_table_spaces()
def _xobj_to_image(x_object_obj):
    """Turn an image XObject into a file extension and its bytes.

    Users need to have the pillow package installed.  It's unclear if
    PyPDF2 will keep this function here, hence it's private; it might get
    removed at any point.

    Flate-encoded images, and images without any filter, are re-encoded
    as PNG (Flate images also receive their soft mask as alpha channel
    when one is present).  For the other recognised filters the data is
    returned as obtained from ``getData()`` with a matching extension.
    When a filter is present but not recognised the extension is None.

    :return: Tuple[file extension, bytes]
    """
    import io
    from PIL import Image
    from PyPDF2.constants import GraphicsStateParameters as G

    def _png_bytes(image):
        # serialise a PIL image to PNG in memory
        buffer = io.BytesIO()
        image.save(buffer, format="PNG")
        return buffer.getvalue()

    size = (x_object_obj[IA.WIDTH], x_object_obj[IA.HEIGHT])
    data = x_object_obj.getData()
    mode = "RGB" if x_object_obj[IA.COLOR_SPACE] == ColorSpaces.DEVICE_RGB else "P"

    if SA.FILTER not in x_object_obj:
        return ".png", _png_bytes(Image.frombytes(mode, size, data))

    extension = None
    if x_object_obj[SA.FILTER] == FT.FLATE_DECODE:
        extension = ".png"
        img = Image.frombytes(mode, size, data)
        if G.S_MASK in x_object_obj:  # add alpha channel
            alpha = Image.frombytes("L", size, x_object_obj[G.S_MASK].getData())
            img.putalpha(alpha)
        data = _png_bytes(img)
    elif x_object_obj[SA.FILTER] in (
        [FT.LZW_DECODE],
        [FT.ASCII_85_DECODE],
        [FT.CCITT_FAX_DECODE],
    ):
        # the filter entry is compared with one-element lists here
        from PyPDF2.utils import b_

        extension = ".png"
        data = b_(data)
    elif x_object_obj[SA.FILTER] == FT.DCT_DECODE:
        extension = ".jpg"
    elif x_object_obj[SA.FILTER] == "/JPXDecode":
        extension = ".jp2"
    elif x_object_obj[SA.FILTER] == FT.CCITT_FAX_DECODE:
        extension = ".tiff"
    return extension, data
def remove_text_from_normal_page(self, pg, pdf):
    """Blank every shown text on ``pg`` that starts with ``self.wmtext``.

    The page content stream is parsed and every ``TJ``/``Tj`` operation
    is inspected.  For ``Tj`` the operand itself is tested; for ``TJ``
    the ``TextStringObject`` items of the array are concatenated and the
    result is tested.  Matching text is removed and the parsed stream is
    stored back as the page's ``/Contents``.  When nothing matched and
    ``self.form`` is truthy, the page is handed to
    ``self.remove_form_from_normal_page``.

    :param pg: the page to clean (modified in place).
    :param pdf: reader passed to ``ContentStream``.
    :return: the cleaned page.
    """
    content_object = pg["/Contents"].getObject()
    content = ContentStream(content_object, pdf)
    text_operators = (b_('TJ'), b_('Tj'))
    removed = False
    for operands, operator in content.operations:
        if operator not in text_operators:
            continue
        shown = operands[0]
        # isinstance, not ``type(...) is list``: the parsed TJ operand is a
        # list *subclass*, which an exact type test never matches.
        if isinstance(shown, list):
            text = ''.join(
                item for item in shown if isinstance(item, TextStringObject))
            if text.startswith(self.wmtext):
                # empty the array in place so the operand stays an array
                del shown[:]
                removed = True
        elif isinstance(shown, TextStringObject) and shown.startswith(
                self.wmtext):
            operands[0] = TextStringObject('')
            removed = True
    pg[NameObject('/Contents')] = content
    if not removed and self.form:
        pg = self.remove_form_from_normal_page(pg)
    return pg
def __extract(self, page: list) -> str:
    """Concatenate the text shown by a list of content-stream operations.

    Text-drawing commands are handled in the order given.  This works
    well for some PDF files and poorly for others, depending on the
    generator used; do not rely on the ordering of the output.  The
    operators (Tj, TJ etc) can be found in page 196 of the PDF Reference.

    :param page: iterable of ``(operands, operator)`` pairs.
    :return: a unicode string object.
    """
    show = utils.b_("Tj")
    next_line = utils.b_("T*")
    quote = utils.b_("'")
    double_quote = utils.b_('"')
    show_array = utils.b_("TJ")

    # Only TextStringObjects are kept.  A ByteStringObject is a string
    # whose byte->string encoding was unknown; it would read as gibberish.
    fragments = []
    for args, op in page:
        if op == show:
            if isinstance(args[0], TextStringObject):
                fragments.append(args[0])
        elif op == next_line:
            fragments.append("\n")
        elif op == quote:
            fragments.append("\n")
            if isinstance(args[0], TextStringObject):
                fragments.append(args[0])
        elif op == double_quote:
            if isinstance(args[2], TextStringObject):
                fragments.append("\n")
                fragments.append(args[2])
        elif op == show_array:
            fragments.extend(
                item for item in args[0] if isinstance(item, TextStringObject))
            fragments.append("\n")
    return utils.u_("").join(fragments)
def extractText_alt_PageObject(self, Tj_sep="\n"):
    """Extract page text, putting a separator before every ``Tj`` string.

    Same walk as the plain text extraction, except that each
    ``TextStringObject`` shown with ``Tj`` is preceded by ``Tj_sep``.

    :param Tj_sep: string inserted before each ``Tj`` text (a newline by
        default).
    :return: a unicode string object.
    """
    stream = self["/Contents"].getObject()
    if not isinstance(stream, ContentStream):
        stream = ContentStream(stream, self.pdf)

    # Only TextStringObjects are kept.  A ByteStringObject is a string
    # whose byte->string encoding was unknown; it would read as gibberish.
    fragments = []
    for args, op in stream.operations:
        if op == b_("Tj"):
            if isinstance(args[0], TextStringObject):
                fragments.append(Tj_sep)
                fragments.append(args[0])
        elif op == b_("T*"):
            fragments.append("\n")
        elif op == b_("'"):
            fragments.append("\n")
            if isinstance(args[0], TextStringObject):
                fragments.append(args[0])
        elif op == b_('"'):
            if isinstance(args[2], TextStringObject):
                fragments.append("\n")
                fragments.append(args[2])
        elif op == b_("TJ"):
            for item in args[0]:
                if isinstance(item, TextStringObject):
                    fragments.append(item)
            fragments.append("\n")
    return u_("").join(fragments)
def removeWatermark(self):
    """Print diagnostics for one hard-coded page and re-attach its stream.

    Prints the document info of ``self.pdfObj``, then, for page index 399
    only, prints the extracted text encoded as latin-1, parses the
    content stream and stores the parsed stream back as ``/Contents``.
    No operand is modified: the ``TJ`` branch only reads the first array
    element and does nothing with it.

    NOTE(review): looks like unfinished debugging scaffolding (fixed page
    range, ``pass`` body) -- confirm before relying on it.
    """
    # full-document loop, currently disabled:
    #for pageNum in range(self.pdfObj.getNumPages()):
    print(self.pdfObj.getDocumentInfo())
    for pageNum in range(399, 400):
        page = self.pdfObj.getPage(pageNum)
        print(page.extractText().encode('latin-1'))
        contentObj = page["/Contents"].getObject()
        content = ContentStream(contentObj, self.pdfObj)
        for opr, opt in content.operations:
            if opt == b_("TJ"):
                # first element of the TJ array; raises IndexError if empty
                txt = opr[0][0]
                if isinstance(txt, TextStringObject):
                    pass
        page.__setitem__(NameObject('/Contents'), content)
        print("\n\n")
def swapColor(self, pageIndex, fromColor, toColor):
    """Substitute every colour-setting operation using fromColor with toColor.

    Operations whose operator is listed in ``self.operators`` are
    evaluated according to their operand count (1, 3 or 4 operands).  A
    matching operation is replaced through ``_swapColorCmd``; a ``cs``
    operation directly in front of it is removed by ``_removeCSRef``.
    The modified stream is stored back on the page and compressed.

    :param pageIndex: index of evaluated page
    :param fromColor: color which will be substituted
    :param toColor: destination color
    :return: None
    """
    if pageIndex >= self.getNumPages():
        print("That page doesn't exist")
        return

    print("Evaluating page no. %d..." % pageIndex)
    page = self.getPage(pageIndex)
    content = page["/Contents"].getObject()
    if not isinstance(content, ContentStream):
        content = ContentStream(content, page.pdf)

    swap_counter = 0
    # Explicit index loop: _removeCSRef shortens content.operations, and
    # iterating the list directly while it shrinks would skip the
    # operation that follows each removed ``cs``.
    index = 0
    while index < len(content.operations):
        operands, operator = content.operations[index]
        should_swap = False

        if operator == b_("cs"):
            self.printDebug("Nonstroking color space.")
        elif operator in self.operators:
            if len(operands) == 3:
                should_swap = self._evaluateColor3(operator, operands, fromColor)
            elif len(operands) == 1:
                should_swap = self._evaluateColor1(operator, operands, fromColor)
            elif len(operands) == 4:
                should_swap = self._evaluateColor4(operator, operands, fromColor)

        if should_swap:
            swap_counter += 1
            if self._removeCSRef(content, index):
                # the colour operation moved one slot to the left
                index -= 1
            self._swapColorCmd(content, index, toColor)
        index += 1

    key = NameObject("/Contents")
    page[key] = content
    page.compressContentStreams()
    print("Replaced %d references of given color.\n" % swap_counter)
def get_pdf_content(path):
    """Return the ``Tj`` text strings of the first page of a PDF file.

    The reader is created through ``capture``; anything it reports on
    stderr is logged as a warning (without its last character).  Only
    ``TextStringObject`` operands of ``Tj`` operations are collected.

    :param path: path of the PDF file.
    :return: list of text strings in content-stream order.
    """
    # The contents will end up in this list
    lines = []

    # Keep the file open while PyPDF2 reads from it, and close it
    # afterwards; the collected strings do not depend on the file handle.
    with open(path, "rb") as pdf_file:
        # Load PDF into pyPDF
        pdf, _stdout, stderr = capture(PdfFileReader, pdf_file)
        if len(stderr) > 0:
            logger.warning(stderr[:-1])

        # This voodoo code is taken from the extractText method of the
        # PageObject.
        content = pdf.getPage(0)["/Contents"].getObject()
        if not isinstance(content, ContentStream):
            content = ContentStream(content, pdf)
        for operands, operator in content.operations:
            if operator == b_("Tj"):
                text = operands[0]
                if isinstance(text, TextStringObject):
                    lines.append(text)
    return lines
def __filter_blocks(self, blocks: list, options: list, included: bool) -> list:
    """Extract the text of the blocks whose first font passes the filter.

    For each block, the first ``Tf`` operation decides: its first operand
    is looked up in ``options``.  Blocks without a ``Tf`` operation are
    never selected.

    :param blocks: the list of text with its formatting options
    :param options: the options to compare against
    :param included: keep blocks whose font is in ``options`` when truthy,
        blocks whose font is not in ``options`` otherwise.
    :return: list of extracted strings for the selected blocks.
    """
    set_font = utils.b_("Tf")
    selected = []
    for block in blocks:
        keep = False
        for operands, operator in block:
            if operator != set_font:
                continue
            font_listed = operands[0] in options
            keep = font_listed if included else not font_listed
            break  # only the first font selection of a block counts
        if keep:
            selected.append(self.__extract(block))
    return selected
def generate_facturx_from_file(
        pdf_invoice, facturx_xml, facturx_level='autodetect',
        check_xsd=True, pdf_metadata=None, output_pdf_file=None,
        additional_attachments=None):
    """
    Generate a Factur-X invoice from a regular PDF invoice and a Factur-X
    XML file. The input PDF is re-written in place unless
    output_pdf_file is given.

    :param pdf_invoice: the regular PDF invoice as file path (type string)
        or as file object
    :type pdf_invoice: string or file
    :param facturx_xml: the Factur-X XML
    :type facturx_xml: string, file or etree object
    :param facturx_level: the level of the Factur-X XML file. Default
        value is 'autodetect'. Giving an explicit value only brings a
        very small perf improvement.
        Possible values: minimum, basicwl, basic, en16931.
    :type facturx_level: string
    :param check_xsd: if enabled, checks the Factur-X XML file against the
        XSD (XML Schema Definition). Disable it when that check has
        already been done, to avoid a double check.
    :type check_xsd: boolean
    :param pdf_metadata: metadata of the generated Factur-X PDF. If None
        (default), metadata is generated from info extracted from the
        Factur-X XML. Example:
        pdf_metadata = {
            'author': 'Akretion',
            'keywords': 'Factur-X, Invoice',
            'title': 'Akretion: Invoice I1242',
            'subject':
              'Factur-X invoice I1242 dated 2017-08-17 issued by Akretion',
            }
        Values that are not strings are replaced by '' in the given dict.
    :type pdf_metadata: dict
    :param output_pdf_file: File Path to the output Factur-X PDF file
    :type output_pdf_file: string or unicode
    :param additional_attachments: other files to embed in the PDF file,
        as a dict where keys are filepath and value is the description of
        the file (as unicode or string).
    :type additional_attachments: dict
    :return: Returns True. This method re-writes the input PDF invoice
        file, unless if the output_pdf_file is provided.
    :rtype: bool
    """
    start_chrono = datetime.now()
    logger.debug('1st arg pdf_invoice type=%s', type(pdf_invoice))
    logger.debug('2nd arg facturx_xml type=%s', type(facturx_xml))
    logger.debug('optional arg facturx_level=%s', facturx_level)
    logger.debug('optional arg check_xsd=%s', check_xsd)
    logger.debug('optional arg pdf_metadata=%s', pdf_metadata)
    logger.debug(
        'optional arg additional_attachments=%s', additional_attachments)

    # --- argument validation, in the original order ---
    if not pdf_invoice:
        raise ValueError('Missing pdf_invoice argument')
    if not facturx_xml:
        raise ValueError('Missing facturx_xml argument')
    none_type = type(None)
    type_rules = (
        (facturx_level, (str, unicode), 'Wrong facturx_level argument'),
        (check_xsd, bool, 'check_xsd argument must be a boolean'),
        (pdf_metadata, (none_type, dict),
         'pdf_metadata argument must be a dict or None'),
        (additional_attachments, (dict, none_type),
         'additional_attachments argument must be a dict or None'),
        (output_pdf_file, (none_type, str, unicode),
         'output_pdf_file argument must be a string or None'),
    )
    for value, accepted, message in type_rules:
        if not isinstance(value, accepted):
            raise ValueError(message)

    file_type = 'path' if isinstance(pdf_invoice, (str, unicode)) else 'file'

    # --- normalise the XML input to a byte string ---
    xml_root = None
    if isinstance(facturx_xml, str):
        xml_string = facturx_xml
    elif isinstance(facturx_xml, unicode):
        xml_string = facturx_xml.encode('utf8')
    elif isinstance(facturx_xml, type(etree.Element('pouet'))):
        xml_root = facturx_xml
        xml_string = etree.tostring(
            xml_root, pretty_print=True, encoding='UTF-8',
            xml_declaration=True)
    elif isinstance(facturx_xml, file):
        facturx_xml.seek(0)
        xml_string = facturx_xml.read()
        facturx_xml.close()
    else:
        raise TypeError(
            "The second argument of the method generate_facturx must be "
            "either a string, an etree.Element() object or a file "
            "(it is a %s)." % type(facturx_xml))

    # --- read the extra attachments; the dict is keyed by file content ---
    additional_attachments_read = {}
    if additional_attachments:
        for attach_filepath, attach_desc in additional_attachments.items():
            filename = os.path.basename(attach_filepath)
            mod_dt = datetime.fromtimestamp(os.path.getmtime(attach_filepath))
            with open(attach_filepath, 'r') as fa:
                additional_attachments_read[fa.read()] = {
                    'filename': filename,
                    'desc': attach_desc,
                    'mod_date': mod_dt,
                    }

    # --- metadata: generate it, or clean up what was given ---
    if pdf_metadata is None:
        if xml_root is None:
            xml_root = etree.fromstring(xml_string)
        base_info = _extract_base_info(xml_root)
        pdf_metadata = _base_info2pdf_metadata(base_info)
    else:
        # clean-up pdf_metadata dict
        for key, value in pdf_metadata.iteritems():
            if not isinstance(value, (str, unicode)):
                pdf_metadata[key] = ''

    facturx_level = facturx_level.lower()
    if facturx_level not in FACTURX_LEVEL2xsd:
        if xml_root is None:
            xml_root = etree.fromstring(xml_string)
        logger.debug('Factur-X level will be autodetected')
        facturx_level = get_facturx_level(xml_root)
    if check_xsd:
        check_facturx_xsd(
            xml_string, flavor='factur-x', facturx_level=facturx_level)

    # --- build the new PDF ---
    original_pdf = PdfFileReader(pdf_invoice)
    # Extract /OutputIntents obj from original invoice
    output_intents = _get_original_output_intents(original_pdf)
    new_pdf_filestream = PdfFileWriter()
    new_pdf_filestream._header = b_("%PDF-1.6")
    new_pdf_filestream.appendPagesFromReader(original_pdf)

    original_pdf_id = original_pdf.trailer.get('/ID')
    logger.debug('original_pdf_id=%s', original_pdf_id)
    if original_pdf_id:
        new_pdf_filestream._ID = original_pdf_id
        # else : generate some ?
    _facturx_update_metadata_add_attachment(
        new_pdf_filestream, xml_string, pdf_metadata, facturx_level,
        output_intents=output_intents,
        additional_attachments=additional_attachments_read)

    # --- write the result ---
    if output_pdf_file:
        with open(output_pdf_file, 'wb') as output_f:
            new_pdf_filestream.write(output_f)
    elif file_type == 'path':
        with open(pdf_invoice, 'wb') as f:
            new_pdf_filestream.write(f)
    elif file_type == 'file':
        new_pdf_filestream.write(pdf_invoice)

    logger.info('%s file added to PDF invoice', FACTURX_FILENAME)
    end_chrono = datetime.now()
    logger.info(
        'Factur-X invoice generated in %s seconds',
        (end_chrono - start_chrono).total_seconds())
    return True
def removeWordStyle(self, ignoreByteStringObject=False):
    """
    Removes imported styles from Word - Path Constructors rectangles -
    from this output.

    For every page: rgb/cmyk colour operations get their operands
    replaced (black when they target text, white when they target a
    rectangle), and ``re`` rectangles whose size looks like a text
    highlight or line relative to the last font size are dropped.

    :param bool ignoreByteStringObject: optional parameter to ignore
        ByteString Objects; when true, text operands that are not
        ``TextStringObject`` are replaced by an empty one.
    """
    def _blank_unless_text(container, position):
        # replace a non-text operand by an empty text string, if requested
        item = container[position]
        if ignoreByteStringObject and not isinstance(item, TextStringObject):
            container[position] = TextStringObject()

    replacement_colors = {'text': 'black', 'rectangle': 'white'}

    for page in self.getObject(self._pages)['/Kids']:
        pageRef = self.getObject(page)
        content = pageRef['/Contents'].getObject()
        if not isinstance(content, ContentStream):
            content = ContentStream(content, pageRef)

        kept_operations = []
        last_font_size = 0
        for operator_index, (operands, operator) in enumerate(
                content.operations):
            if operator == b_('Tf') and operands[0][:2] == '/F':
                last_font_size = operands[1].as_numeric()

            if operator == b_('Tj'):
                _blank_unless_text(operands, 0)
            elif operator == b_("'"):
                _blank_unless_text(operands, 0)
            elif operator == b_('"'):
                _blank_unless_text(operands, 2)
            elif operator == b_("TJ"):
                for position in range(len(operands[0])):
                    _blank_unless_text(operands[0], position)

            operator_type = self._getOperatorType(operator)
            # we are ignoring all grayscale colors
            # tests showed that black underlines, borders and tables are
            # defined by grayscale and aren't using rgb/cmyk colors
            if operator_type in ('rgb', 'cmyk'):
                color_target = self._getColorTargetOperationType(
                    operator_index, content.operations)
                # all text becomes black and all rectangles white;
                # removing all colors paints rectangles in black, which
                # gives unwanted results
                new_color = replacement_colors.get(color_target)
                if new_color:
                    operands = self.colors_operands[operator_type][new_color]

            # remove styled rectangles (highlights, lines, etc.)
            # 're' is a Path Construction operator that creates a
            # rectangle; presumably that's how Word embeds its graphics
            # when creating a PDF
            if operator == b_('re'):
                rectangle_width = operands[-2].as_numeric()
                rectangle_height = operands[-1].as_numeric()
                # (length of X letters at the current size)
                minWidth = self.getMinimumRectangleWidth(last_font_size, 1)
                maxHeight = last_font_size + 6  # catches really big highlights
                minHeight = 1.5  # so that thin lines will not be removed
                # drop only rectangles wider than the minimum whose height
                # is above the minimum and at most the maximum
                is_style = (
                    rectangle_width > minWidth
                    and rectangle_height > minHeight
                    and rectangle_height <= maxHeight)
                if is_style:
                    continue
            kept_operations.append((operands, operator))

        content.operations = kept_operations
        pageRef[NameObject('/Contents')] = content
def _getOperatorType(self, operator):
    """Classify a content-stream operator.

    :param operator: the operator as produced by ``b_``.
    :return: one of 'text', 'rgb', 'cmyk', 'grayscale', 'rectangle',
        'line', or None for any other operator.
    """
    categories = (
        ('text', ('Tj', "'", '"', 'TJ')),
        ('rgb', ('rg', 'RG')),          # color
        ('cmyk', ('k', 'K')),           # color
        ('grayscale', ('g', 'G')),      # color
        ('rectangle', ('re',)),
        # l: line, m: start line, S: stroke(paint) line
        ('line', ('l', 'm', 'S')),
    )
    lookup = {
        b_(name): category
        for category, names in categories
        for name in names
    }
    return lookup.get(operator)
return np.any(diff.max(axis=1) < epsilon) # 从结果看,MS-Word加的水印,有些指令混在正常数据中,需要更精细调试处理 with open(with_wm_path, 'rb') as f, open(nowm_out_path, 'wb') as f_out: pdf = PdfFileReader(f) pdf_out = PdfFileWriter() # print(pdf.getDocumentInfo()) cn_pages = pdf.getNumPages() for i in range(cn_pages): page = pdf.getPage(i) content = page.getContents() cs = ContentStream(content, pdf) for operands, operator in cs.operations: # `b_`只是python2/3中bytes类型转换的冗余代码 if operator == b_('Tm') and match_location(operands, TARGET_TXT): operands[:] = [] elif operator == b_('cm') and match_location(operands, TARGET_IMG): operands[:] = [] elif operator == b_('gs'): if operands[0] == '/GS0': operands[:] = [] elif operator == b_('Do'): # 引用图片名称 if operands[0] == '/Im0': pass elif operands[0] == '/Fm0': operands[:] = [] page.__setitem__(NameObject('/Contents'), cs) pdf_out.addPage(page) pdf_out.write(f_out)
def extractText(self, skip_intertwined_text=True):
    """
    Locate all text drawing commands, in the order they are provided in
    the content stream, and extract the text, inserting spaces and line
    breaks estimated from ``TJ`` spacing values and ``Td``/``Tm``
    positioning operands.  This works well for some PDF files, but
    poorly for others, depending on the generator used.  Do not rely on
    the order of text coming out of this function, as it will change if
    this function is made more sophisticated.

    :param bool skip_intertwined_text: when true, the ``TJ`` operation
        that follows a same-line ``Td`` moving back over already emitted
        text is skipped.
    :return: a unicode string object.
    """
    text = u_("")
    content = self["/Contents"].getObject()
    if not isinstance(content, ContentStream):
        content = ContentStream(content, self.pdf)
    # Note: we check all strings are TextStringObjects.  ByteStringObjects
    # are strings where the byte->string encoding was unknown, so adding
    # them to the text here would be gibberish.

    # ``indent`` must exist before the first Td, which adds to it.
    indent = 0
    previous_width = 0
    skip_next = False
    for operands, operator in content.operations:
        if not operands:
            # Empty operands list contributes no text
            operands = [""]
        if operator == b_("Tj"):
            _text = operands[0]
            if isinstance(_text, TextStringObject):
                text += _text
        elif operator == b_("T*"):
            text += "\n"
        elif operator == b_("'"):
            text += "\n"
            _text = operands[0]
            if isinstance(_text, TextStringObject):
                text += _text
        elif operator == b_('"'):
            _text = operands[2]
            if isinstance(_text, TextStringObject):
                text += "\n"
                text += _text
        elif operator == b_("TJ"):
            if skip_intertwined_text and skip_next:
                skip_next = False
            else:
                for i in operands[0]:
                    if isinstance(i, TextStringObject):
                        text += i
                        previous_width += len(i)
                    elif isinstance(i, (FloatObject, NumberObject)):
                        # a numeric array item is turned into spaces,
                        # unless the text already ends with whitespace
                        if text and text[-1] not in " \n":
                            text += " " * int(i / -100)
                            previous_width += int(i / -100)
        elif operator == b_("Td"):
            indent = indent + operands[0]
            if operands[1] == 0:
                # same line: pad up to the new horizontal position
                if int(operands[0] / 20) >= previous_width:
                    text += " " * (int(operands[0] / 20) - previous_width)
                else:
                    # If skip_intertwined_text is false, this will result
                    # in no space between the two 'lines'
                    skip_next = True
            else:
                previous_width = 0
                text += "\n" * max(0, int(operands[1] / -50)) + \
                    " " * max(0, int(indent / 20))
        elif operator == b_("Tm"):
            indent = operands[4]
            text += " " * max(0, int(indent / 20))
        elif operator == b_("TD"):
            # (the original also tested Tm here, but Tm is always handled
            # by the branch above, so that test could never succeed)
            if text and text[-1] not in " \n":
                text += " "
    return text
def _evaluateColor1(self, operator, operands, color):
    """Tell whether a one-operand colour operation selects ``color``.

    Only the ``sc`` and ``g`` operators are considered.  The operand is
    multiplied by 255, truncated with ``int`` and must equal all of
    ``color.red``, ``color.green`` and ``color.blue``.

    :return: True on an exact match, else False.
    """
    if not (operator == b_("sc") or operator == b_("g")):
        return False
    grey = int(operands[0] * 255)
    return bool(
        grey == color.red and grey == color.green and grey == color.blue)
""" # Copyright (c) 2020 Ben Zimmer. All rights reserved. import sys from typing import List, Tuple, Optional import PyPDF2 as pdf from PyPDF2.utils import b_ from PyPDF2.pdf import ContentStream, PageObject DEBUG = False TEXT_POSITION_OPS = [ # page 310 b_(x) for x in ["Td", "TD", "Tm", "T*"] ] TEXT_SHOW_OPS = [ # page 311 b_(x) for x in ["TJ", "'", "\"", "TJ"] ] SPACING_EXPECTED = {-14.445, -14.446} START_EXPECTED = { 528.045, # normal pages # 400.95 # chapter title pages 402.143 # chapter title pages } FONT_WEIGHT_MAPPING = {"/F34": "regular", "/F32": "bold"}
class PdfColorConverter(PdfFileWriter):
    """
    PdfFileWriter that rewrites colour-setting operations of a page.

    ``swapColor`` scans a page's content stream for the operators listed in
    ``operators`` and replaces every one whose colour equals a given source
    colour with an ``rg`` operation carrying the destination colour.
    Colours are objects exposing ``red``, ``green`` and ``blue`` attributes
    that are compared against operands scaled by 255.
    """

    # Colour-setting operators inspected by swapColor.
    operators = [b_("sc"), b_("rg"), b_("g"), b_("k")]

    def __init__(self, debug=False):
        """
        :param bool debug: when true, printDebug messages are printed.
        """
        PdfFileWriter.__init__(self)
        self.debug = debug

    def printDebug(self, str):
        """Print ``str`` only when debugging was enabled at construction."""
        if self.debug:
            print(str)

    def swapColor(self, pageIndex, fromColor, toColor):
        """
        Substitutes all the color switching operators with fromColor with
        toColor.

        :param pageIndex: index of evaluated page
        :param fromColor: color which will be substituted
        :param toColor: destination color
        :return: None
        """
        if pageIndex >= self.getNumPages():
            print("That page doesn't exist")
            return

        print("Evaluating page no. %d..." % pageIndex)
        page = self.getPage(pageIndex)
        content = page["/Contents"].getObject()
        if not isinstance(content, ContentStream):
            content = ContentStream(content, page.pdf)

        # The number of operands selects how the colour is interpreted.
        evaluators = {
            1: self._evaluateColor1,
            3: self._evaluateColor3,
            4: self._evaluateColor4,
        }

        swap_counter = 0
        # Walk the list by explicit index: operations are removed while
        # scanning, and iterating the list directly would silently skip the
        # operation that follows every removed "cs".
        index = 0
        while index < len(content.operations):
            operands, operator = content.operations[index]
            should_swap = False

            if operator == b_("cs"):
                self.printDebug("Nonstroking color space.")
            elif operator in self.operators:
                evaluate = evaluators.get(len(operands))
                if evaluate is not None:
                    should_swap = evaluate(operator, operands, fromColor)

            if should_swap:
                swap_counter += 1
                if self._removeCSRef(content, index):
                    # The colour operation moved one slot to the left.
                    index -= 1
                self._swapColorCmd(content, index, toColor)
            index += 1

        key = NameObject("/Contents")
        page[key] = content
        page.compressContentStreams()
        print("Replaced %d references of given color.\n" % swap_counter)

    # removing operator that switches color space
    def _removeCSRef(self, content, index):
        """
        Remove the operation preceding ``index`` when its operator is ``cs``.

        :return: True when an operation was removed, False otherwise.
        """
        if index >= 1 and content.operations[index - 1][1] == b_("cs"):
            content.operations.pop(index - 1)
            self.printDebug("Removing CS")
            return True
        return False

    def _swapColorCmd(self, content, index, toColor):
        """
        Replace the operation at ``index`` with an ``rg`` operation whose
        operands are the components of ``toColor`` divided by 255.
        """
        components = [
            FloatObject(toColor.red / 255.0),
            FloatObject(toColor.green / 255.0),
            FloatObject(toColor.blue / 255.0),
        ]
        content.operations[index] = (components, b_("rg"))

    def _evaluateColor3(self, operator, operands, color):
        """
        Return True when a three-operand ``sc``/``rg`` operation equals
        ``color`` component by component.
        """
        if operator not in (b_("sc"), b_("rg")):
            return False
        # Round rather than truncate so that operands serialised with few
        # decimals (127 written as 0.498 -> 126.99) still match.
        red = round(operands[0] * 255)
        green = round(operands[1] * 255)
        blue = round(operands[2] * 255)
        return red == color.red and green == color.green and blue == color.blue

    def _evaluateColor1(self, operator, operands, color):
        """
        Return True when a one-operand ``sc``/``g`` operation sets a grey
        level equal to all three components of ``color``.
        """
        if operator not in (b_("sc"), b_("g")):
            return False
        grey = round(operands[0] * 255)
        return grey == color.red and grey == color.green and grey == color.blue

    def _evaluateColor4(self, operator, operands, color):
        """
        Return True when a four-operand ``sc``/``k`` operation, converted
        with cmykToRGB, equals ``color`` component by component.
        """
        if operator not in (b_("sc"), b_("k")):
            return False
        cyan, magenta, yellow, black = operands[0], operands[1], operands[2], operands[3]
        (red, green, blue) = cmykToRGB(cyan, magenta, yellow, black)
        red = round(red * 255)
        green = round(green * 255)
        blue = round(blue * 255)
        return red == color.red and green == color.green and blue == color.blue
def _removeCSRef(self, content, index):
    """
    Drop the operation directly before ``index`` when its operator is ``cs``.

    :param content: object whose ``operations`` list is edited in place.
    :param index: position of the operation whose predecessor is examined.
    :return: True when an operation was removed, False otherwise.
    """
    # There is no predecessor to inspect at the very start of the list.
    if index < 1:
        return False

    preceding = index - 1
    if not content.operations[preceding][1] == b_("cs"):
        return False

    content.operations.pop(preceding)
    self.printDebug("Removing CS")
    return True
# Rewrite every page of `source`, emptying the operands of the matching
# `cm` operations, then save the result to tmp/output.pdf.
for p in range(source.getNumPages()):
    page = source.getPage(p)
    # print(page.extractText())
    # content_object, = page["/Contents"][0].getObject()
    # NOTE(review): the second entry of /Contents is hard-coded; presumably
    # that is where the targeted drawing commands live -- confirm.
    content_object = page["/Contents"][1]

    content = ContentStream(content_object, source)
    for operands, operator in content.operations:
        # print(operator, operands)  # type and value of the PDF element

        # The main logic lives here: use whatever means are available to
        # find features that identify the watermark.
        # if operator == b_("TJ"):  # `b_` is only Python 2/3 bytes-conversion boilerplate
        #     text = operands[0][0]
        #     # if isinstance(text, bytes):
        #     #     print('==== ', text, ' ====')
        #     #     for c in guess_codes(text):
        #     #         print(c, text.decode(c))
        #     if isinstance(text, TextStringObject) and text in target_str:
        #         operands[0] = TextStringObject('')

        if operator == b_("cm") and match_location(operands, target_locations):
            # Empty the operand list in place so the stored operation changes.
            operands[:] = []

    page[NameObject('/Contents')] = content
    output.addPage(page)

# The context manager closes the output file even when writing fails.
with open("tmp/output.pdf", "wb") as outputStream:
    output.write(outputStream)
file.close()