def __get_font_settings(self, constant_option: str) -> list:
    """
    Collect the names of fonts whose base font contains ``constant_option``.

    :param constant_option: substring searched for in the lower-cased
        ``/BaseFont`` entry of every font held in ``self.fonts``
    :return: list of the matching font names, each passed through
        ``utils.u_``, in the iteration order of ``self.fonts``
    """
    # Entries are fetched with ``self.fonts[name]`` (not ``.items()``) so the
    # container's own item lookup is used, exactly as before.
    return [
        utils.u_(name)
        for name in self.fonts
        if constant_option in self.fonts[name][utils.u_('/BaseFont')].lower()
    ]
def getTextByPage(self):
    """
    Extract the text of this page and collapse doubled line breaks.

    The content stream is walked in order. Strings attached to the ``Tj``,
    ``'``, ``"`` and ``TJ`` operators are collected; ``T*``, ``'``, ``"``
    and ``TJ`` also contribute a line break.

    :return: the collected text after a single pass that replaces every
        pair of consecutive line breaks with one line break.
    """
    stream = self["/Contents"].getObject()
    if not isinstance(stream, ContentStream):
        stream = ContentStream(stream, self.pdf)

    op_show = b_("Tj")
    op_next_line = b_("T*")
    op_quote = b_("'")
    op_dquote = b_('"')
    op_show_array = b_("TJ")

    # Only TextStringObjects are kept: ByteStringObjects are strings whose
    # byte->string encoding was unknown, so they would read as gibberish.
    pieces = []
    for operands, operator in stream.operations:
        if operator == op_show:
            shown = operands[0]
            if isinstance(shown, TextStringObject):
                pieces.append(shown)
        elif operator == op_next_line:
            pieces.append("\n")
        elif operator == op_quote:
            pieces.append("\n")
            shown = operands[0]
            if isinstance(shown, TextStringObject):
                pieces.append(shown)
        elif operator == op_dquote:
            shown = operands[2]
            if isinstance(shown, TextStringObject):
                pieces.append("\n")
                pieces.append(shown)
        elif operator == op_show_array:
            pieces.extend(
                element for element in operands[0]
                if isinstance(element, TextStringObject)
            )
            pieces.append("\n")

    return u_("").join(pieces).replace("\n\n", "\n")
def convert_page_to_text(page):
    '''
    Return the text found in ``page``'s content stream.

    Adapted from the PyPDF2 extractText method; the difference is that every
    extracted string is followed by a single space.

    :param page: object providing ``getContents()`` and a ``pdf`` attribute
    :return: a unicode string object.
    '''
    stream = page.getContents()
    if not isinstance(stream, ContentStream):
        stream = ContentStream(stream, page.pdf)

    def fragments():
        # Only TextStringObjects are emitted: ByteStringObjects are strings
        # whose byte->string encoding was unknown, so they would be gibberish.
        for operands, operator in stream.operations:
            if operator == b_("Tj"):
                shown = operands[0]
                if isinstance(shown, TextStringObject):
                    yield shown + ' '
            elif operator == b_("T*"):
                yield "\n"
            elif operator == b_("'"):
                yield "\n"
                shown = operands[0]
                if isinstance(shown, TextStringObject):
                    yield shown + ' '
            elif operator == b_('"'):
                shown = operands[2]
                if isinstance(shown, TextStringObject):
                    yield "\n"
                    yield shown + ' '
            elif operator == b_("TJ"):
                for element in operands[0]:
                    if isinstance(element, TextStringObject):
                        yield element + ' '
                yield "\n"

    return u_("").join(fragments())
def process_content_object(self, objects):
    """
    Walk a content stream and append its text to ``self.table``.

    Each operation's operands are first passed to ``self.get_id``; a
    non-None result becomes the current id and the operation is otherwise
    ignored. ``Tf`` records the current font. Text produced by the remaining
    handled operators is appended to ``self.table[<current id>]`` once an id
    has been seen. ``self.strip_table_spaces()`` is called at the end.

    :param objects: stream data handed to ``ContentStream`` together with
        ``self.finder``
    :return: None
    """
    from PyPDF2.pdf import ContentStream

    stream = ContentStream(objects, self.finder)
    current_id = None
    current_font = None

    for operands, operator in stream.operations:
        found_id = self.get_id(operands)
        if found_id is not None:
            current_id = found_id
            continue
        if operator == b_("Tf"):
            current_font = operands[0]
            continue

        chunk = u_("")
        if operator in (b_("Tj"), b_("TJ")):
            chunk += self.converter.process_text_objects(
                operands, current_font)
        elif operator == b_("T*"):
            chunk += "\n"
        elif operator == b_("'"):
            chunk += "\n"
            shown = operands[0]
            if isinstance(shown, TextStringObject):
                chunk += shown
        elif operator == b_('"'):
            shown = operands[2]
            if isinstance(shown, TextStringObject):
                chunk += "\n"
                chunk += shown

        if current_id is not None and chunk:
            self.table[current_id] += chunk

    # NOTE(review): placed after the loop (once per call); confirm against
    # the intended nesting.
    self.strip_table_spaces()
def process_text_objects(self, operands, current_font):
    """
    Convert the operands of a text operator into a single string.

    :param operands: either one text object, a list of text objects, or a
        list whose first element is itself a list of text objects
    :param current_font: font identifier forwarded unchanged to
        ``self.process_text_object``
    :return: concatenation of ``self.process_text_object`` results; an empty
        string when ``operands`` is falsy
    """
    if not operands:
        return u_("")

    if not isinstance(operands, list):
        return u_("") + self.process_text_object(operands, current_font)

    # A nested list in first position holds the actual text objects.
    first = operands[0]
    entries = first if isinstance(first, list) else operands

    result = u_("")
    for entry in entries:
        result += self.process_text_object(entry, current_font)
    return result
def search(self):
    """
    Scan every page of ``self.pdf`` and feed its text to ``self.tables``.

    For each page the fonts are registered with ``self.converter`` and the
    content stream is walked. The most recent ``re`` operands, the current
    font (``Tf``) and the last coordinates taken from ``Td`` / ``cm`` are
    remembered. Whenever an operation yields text,
    ``self.tables.process(rect, text, x, y)`` is called with that state.

    :return: the value of ``self.tables.get_tables()``
    """
    from PyPDF2.pdf import ContentStream

    page_count = self.pdf.getNumPages()
    for page_index in range(page_count):
        page = self.pdf.getPage(page_index)
        self.converter.process_fonts(page_index, page)

        stream = page["/Contents"].getObject()
        if not isinstance(stream, ContentStream):
            stream = ContentStream(stream, page)

        font = None
        pos_x = None
        pos_y = None
        rect = None  # operands of the most recent "re" (rectangle) operator

        for operands, operator in stream.operations:
            # State-only operators produce no text, so move on right away.
            if operator == b_("re"):
                rect = operands
                continue
            if operator == b_("Tf"):
                font = operands[0]
                continue
            if operator == b_("Td"):  # text coordinates
                pos_x, pos_y = operands
                continue
            if operator == b_("cm"):  # text coordinates: last two operands
                *_, pos_x, pos_y = operands
                continue

            chunk = u_("")
            if operator in (b_("Tj"), b_("TJ")):
                chunk += self.converter.process_text_objects(operands, font)
            elif operator == b_("T*"):
                chunk += "\n"
            elif operator == b_("'"):
                chunk += "\n"
                shown = operands[0]
                if isinstance(shown, TextStringObject):
                    chunk += shown
            elif operator == b_('"'):
                shown = operands[2]
                if isinstance(shown, TextStringObject):
                    chunk += "\n"
                    chunk += shown

            if chunk:
                self.tables.process(rect, chunk, pos_x, pos_y)

    # NOTE(review): placed after the page loop (once per call); confirm
    # against the intended nesting.
    CashObject().clean()
    return self.tables.get_tables()
def extractText_PageObject(self):
    """
    Gather the text drawn on this page, in content-stream order.

    Results are good for some PDF files and poor for others, depending on
    the generator that produced them. The order of the returned text is not
    a stable contract and may change if the extraction becomes smarter.

    :return: a unicode string object.
    """
    stream = self["/Contents"].getObject()
    if not isinstance(stream, ContentStream):
        stream = ContentStream(stream, self.pdf)

    result = u_("")
    for args, op in stream.operations:
        if op == b_("T*"):
            result += "\n"
            continue

        if op == b_("TJ"):
            for element in args[0]:
                if isinstance(element, TextStringObject):
                    result += element
            result += "\n"
            continue

        # The remaining handled operators carry one string operand.
        if op == b_("Tj"):
            shown = args[0]
        elif op == b_("'"):
            result += "\n"
            shown = args[0]
        elif op == b_('"'):
            shown = args[2]
            if isinstance(shown, TextStringObject):
                result += "\n"
        else:
            continue

        # ByteStringObjects have an unknown byte->string encoding; adding
        # them would produce gibberish, so only TextStringObjects are kept.
        if isinstance(shown, TextStringObject):
            result += shown

    return result
def process_text_object(self, item, current_font):
    """
    Turn one text operand into a string.

    :param item: a ``TextStringObject``, a ``ByteStringObject`` or anything
        else (which yields an empty string)
    :param current_font: key used to look up the glyph map in ``self.fonts``
    :return: ``item`` itself for a ``TextStringObject`` when
        ``self.text_to_hex`` is falsy; otherwise the text decoded through
        the font's glyph map, or an empty string for unsupported operands
    """

    def decode(raw):
        # Split the hex form of the bytes into fixed-width codes and map
        # each through the font table; unknown codes become '?'.
        hex_digits = raw.hex()
        glyph_map = self.fonts.get(current_font, {})
        # Code width is taken from the first key of the glyph map.
        if glyph_map:
            width = len(next(iter(glyph_map.keys())))
        else:
            width = 1
        return ''.join(
            glyph_map.get(hex_digits[start:start + width], '?')
            for start in range(0, len(hex_digits), width)
        )

    if isinstance(item, TextStringObject):
        if not self.text_to_hex:
            return item
        return decode(item.get_original_bytes())

    if isinstance(item, ByteStringObject):
        return decode(item)

    return u_("")
def __extract(self, page: list) -> str:
    """
    Build the text of a page from its already parsed operations.

    Strings are taken in content-stream order. This works well for some PDF
    files and poorly for others, depending on the generator used; do not
    rely on the order of the returned text.

    Operators (Tj, TJ etc) can be found in page 196 of PDF Reference.

    :param page: iterable of ``(operands, operator)`` pairs
    :return: a unicode string object.
    """
    op_show, op_next_line, op_quote, op_dquote, op_show_array = (
        utils.b_(name) for name in ("Tj", "T*", "'", '"', "TJ")
    )

    # Only TextStringObjects are collected: ByteStringObjects are strings
    # whose byte->string encoding was unknown and would read as gibberish.
    chunks = []
    for operands, operator in page:
        if operator == op_show:
            shown = operands[0]
            if isinstance(shown, TextStringObject):
                chunks.append(shown)
        elif operator == op_next_line:
            chunks.append("\n")
        elif operator == op_quote:
            chunks.append("\n")
            shown = operands[0]
            if isinstance(shown, TextStringObject):
                chunks.append(shown)
        elif operator == op_dquote:
            shown = operands[2]
            if isinstance(shown, TextStringObject):
                chunks += ["\n", shown]
        elif operator == op_show_array:
            chunks += [
                element for element in operands[0]
                if isinstance(element, TextStringObject)
            ]
            chunks.append("\n")

    return utils.u_("").join(chunks)
def customExtractText(self):
    """
    Extract the page text as a single whitespace-normalised line.

    Besides collecting the strings of the ``Tj``, ``'``, ``"`` and ``TJ``
    operators, a space is inserted for every numeric ``TJ`` element below
    -100 and before text that follows a ``TD`` / ``Tm`` operator. Finally
    " - " is rewritten to "-" and every run of whitespace becomes one space.

    :return: a unicode string object.
    """
    stream = self["/Contents"].getObject()
    if not isinstance(stream, ContentStream):
        stream = ContentStream(stream, self.pdf)

    numeric_types = (FloatObject, NumberObject)
    out = u_("")

    # Only TextStringObjects are added: ByteStringObjects are strings whose
    # byte->string encoding was unknown and would read as gibberish.
    for args, op in stream.operations:
        if op == b_("Tj"):
            shown = args[0]
            if isinstance(shown, TextStringObject):
                out += shown
        elif op == b_("T*"):
            out += "\n"
        elif op == b_("'"):
            out += "\n"
            shown = args[0]
            if isinstance(shown, TextStringObject):
                out += shown
        elif op == b_('"'):
            shown = args[2]
            if isinstance(shown, TextStringObject):
                out += "\n"
                out += shown
        elif op == b_("TJ"):
            for element in args[0]:
                if isinstance(element, TextStringObject):
                    out += element
                elif isinstance(element, numeric_types) and element < -100:
                    out += " "
        elif op in (b_("TD"), b_("Tm")):
            # Separate from preceding text unless a break is already there.
            if out and not out.endswith((" ", "\n")):
                out += " "

    out = out.replace(" - ", "-")
    return re.sub("\\s+", " ", out)
def extractText_alt_PageObject(self, Tj_sep="\n"):
    """
    Extract the page text, putting a separator in front of each Tj string.

    :param Tj_sep: text inserted before every string shown by a ``Tj``
        operator (a line break by default)
    :return: a unicode string object.
    """
    stream = self["/Contents"].getObject()
    if not isinstance(stream, ContentStream):
        stream = ContentStream(stream, self.pdf)

    op_show = b_("Tj")
    op_next_line = b_("T*")
    op_quote = b_("'")
    op_dquote = b_('"')
    op_show_array = b_("TJ")

    collected = u_("")
    # Only TextStringObjects are added: ByteStringObjects are strings whose
    # byte->string encoding was unknown and would read as gibberish.
    for args, op in stream.operations:
        if op == op_show:
            shown = args[0]
            if isinstance(shown, TextStringObject):
                collected += Tj_sep
                collected += shown
        elif op == op_next_line:
            collected += "\n"
        elif op == op_quote:
            collected += "\n"
            shown = args[0]
            if isinstance(shown, TextStringObject):
                collected += shown
        elif op == op_dquote:
            shown = args[2]
            if isinstance(shown, TextStringObject):
                collected += "\n"
                collected += shown
        elif op == op_show_array:
            for element in args[0]:
                if isinstance(element, TextStringObject):
                    collected += element
            collected += "\n"

    return collected
def alt_extractText(self):
    """
    Extract the page text, marking every unhandled operator with "~".

    Strings of the ``Tj``, ``'``, ``"`` and ``TJ`` operators are collected
    and line breaks are added for ``T*``, ``'``, ``"`` and ``TJ``. Any other
    operator contributes a single "~" to the output.

    :return: a unicode string object.
    """
    tj_suffix = ""       # appended after each string shown by Tj
    other_marker = "~"   # emitted once per operator not handled below

    stream = self["/Contents"].getObject()
    if not isinstance(stream, ContentStream):
        stream = ContentStream(stream, self.pdf)

    out = u_("")
    for args, op in stream.operations:
        if op == b_("Tj"):
            shown = args[0]
            if isinstance(shown, TextStringObject):
                out += shown + tj_suffix
            continue
        if op == b_("T*"):
            out += "\n"
            continue
        if op == b_("'"):
            out += "\n"
            shown = args[0]
            if isinstance(shown, TextStringObject):
                out += shown
            continue
        if op == b_('"'):
            shown = args[2]
            if isinstance(shown, TextStringObject):
                out += "\n"
                out += shown
            continue
        if op == b_("TJ"):
            for element in args[0]:
                if isinstance(element, TextStringObject):
                    out += element
            out += "\n"
            continue
        out += other_marker

    return out
def extractText(self, skip_intertwined_text=True):
    """
    Locate all text drawing commands, in the order they are provided in the
    content stream, and extract the text, approximating the page layout
    with spaces and line breaks.

    This works well for some PDF files, but poorly for others, depending on
    the generator used. Do not rely on the order of text coming out of this
    function, as it will change if this function is made more sophisticated.

    :param skip_intertwined_text: when true, the ``TJ`` operation that
        follows a same-line ``Td`` move landing before the text already
        written is dropped instead of being appended.
    :return: a unicode string object.
    """
    text = u_("")
    content = self["/Contents"].getObject()
    if not isinstance(content, ContentStream):
        content = ContentStream(content, self.pdf)

    # ``indent`` must exist before the first Td, which accumulates into it;
    # without this a Td seen before any Tm raised UnboundLocalError.
    indent = 0
    previous_width = 0   # characters written on the current line by TJ
    skip_next = False    # set when the next TJ should be dropped

    # Note: we check all strings are TextStringObjects.  ByteStringObjects
    # are strings where the byte->string encoding was unknown, so adding
    # them to the text here would be gibberish.
    for operands, operator in content.operations:
        if not operands:
            # Empty operands list contributes no text
            operands = [""]

        if operator == b_("Tj"):
            _text = operands[0]
            if isinstance(_text, TextStringObject):
                text += _text
        elif operator == b_("T*"):
            text += "\n"
        elif operator == b_("'"):
            text += "\n"
            _text = operands[0]
            if isinstance(_text, TextStringObject):
                text += _text
        elif operator == b_('"'):
            _text = operands[2]
            if isinstance(_text, TextStringObject):
                text += "\n"
                text += _text
        elif operator == b_("TJ"):
            if skip_intertwined_text and skip_next:
                skip_next = False
            else:
                for i in operands[0]:
                    if isinstance(i, TextStringObject):
                        text += i
                        previous_width += len(i)
                    elif isinstance(i, (FloatObject, NumberObject)):
                        # Numeric elements are turned into spaces
                        # (heuristic: one space per -100 units), unless a
                        # break is already present.
                        if text and text[-1] not in " \n":
                            gap = int(i / -100)
                            text += " " * gap
                            previous_width += gap
        elif operator == b_("Td"):
            indent = indent + operands[0]
            if operands[1] == 0:
                # Same line: pad up to the new position (heuristic: 20
                # units per character).
                column = int(operands[0] / 20)
                if column >= previous_width:
                    text += " " * (column - previous_width)
                else:
                    # If skip_intertwined_text is false, this will result
                    # in no space between the two 'lines'.
                    skip_next = True
            else:
                # New line: emit line breaks for the vertical move, then
                # re-indent.
                previous_width = 0
                text += ("\n" * max(0, int(operands[1] / -50))
                         + " " * max(0, int(indent / 20)))
        elif operator == b_("Tm"):
            indent = operands[4]
            text += " " * max(0, int(indent / 20))
        elif operator == b_("TD"):
            # Tm is handled by the branch above, so only TD can reach this
            # point.
            if text and text[-1] not in " \n":
                text += " "
    return text