def extractTextList(self):
    """
    Gather the non-empty text strings shown on this page, in content-stream
    order.

    Strings shown with ``Tj`` are stripped of surrounding whitespace before
    being tested and stored; strings shown with ``'``, ``"`` and ``TJ`` are
    stored as they are. Operands that are not ``TextStringObject`` instances
    are ignored.

    :return: a list of text strings.
    """
    stream = self["/Contents"].getObject()
    if not isinstance(stream, ContentStream):
        stream = ContentStream(stream, self.pdf)

    fragments = []
    for args, op in stream.operations:
        if op == b_("Tj"):
            candidate = args[0]
            if isinstance(candidate, TextStringObject):
                trimmed = candidate.strip()
                if len(trimmed):
                    fragments.append(trimmed)
            continue

        # "T*" only moves to the next line and contributes no text, so it
        # needs no handling here.

        if op == b_("'"):
            candidate = args[0]
            if isinstance(candidate, TextStringObject) and len(candidate):
                fragments.append(candidate)
            continue

        if op == b_('"'):
            # For this operator the string is the third operand.
            candidate = args[2]
            if isinstance(candidate, TextStringObject) and len(candidate):
                fragments.append(candidate)
            continue

        if op == b_("TJ"):
            # The single operand is an array mixing strings with other
            # objects; only the non-empty strings are kept.
            fragments.extend(
                piece for piece in args[0]
                if isinstance(piece, TextStringObject) and len(piece)
            )

    return fragments
def is_continuation(content, item):
    """
    Tell whether the text shown by operation ``item`` continues the previous
    text run.

    The operation right before ``item`` must be a ``Tm``. Its operands are
    compared with those of the closest earlier ``Tm`` found among the
    operations at ``item - 2`` down to ``item - 14``. The text counts as a
    continuation when operand 4 of both matrices (the horizontal translation
    of ``Tm`` in the PDF specification) shows the same digits after the
    decimal point once formatted with five decimals.

    :param content: object exposing ``operations``, a sequence of
        ``(operands, operator)`` pairs.
    :param item: index in ``content.operations`` of the text-showing
        operation being examined.
    :return: ``True`` when the fractional digits match; ``False`` otherwise,
        including when ``item`` is not preceded by a ``Tm`` or when no earlier
        ``Tm`` lies within the search window.
    """
    operations = content.operations
    set_matrix = b_("Tm")

    # With no preceding operation there is nothing to compare against, and a
    # negative index would silently wrap around to the end of the stream.
    if item < 1 or operations[item - 1][1] != set_matrix:
        return False
    current = operations[item - 1][0]

    # Walk backwards from item - 2 to find the previous "Tm". Stop at the
    # start of the stream instead of wrapping around.
    previous = None
    for distance in range(2, 15):
        index = item - distance
        if index < 0:
            break
        if operations[index][1] == set_matrix:
            previous = operations[index][0]
            break
    if previous is None:
        return False

    def fraction_digits(value):
        # Digits after the decimal point at five-decimal precision.
        return '{0:.5f}'.format(value).split(".")[1]

    # Only operand 4 is compared; operand 5 is deliberately not considered.
    return fraction_digits(current[4]) == fraction_digits(previous[4])
def original_extractText(self):
    """
    Concatenate the text shown on this page, in content-stream order.

    A newline is emitted for every ``T*``, before the string of ``'`` and
    ``"``, and after every ``TJ`` array.

    :return: a unicode string object.
    """
    stream = self["/Contents"].getObject()
    if not isinstance(stream, ContentStream):
        stream = ContentStream(stream, self.pdf)

    # Only TextStringObjects are collected. ByteStringObjects are strings
    # whose byte->string encoding was unknown, so including them would
    # produce gibberish.
    pieces = []
    for args, op in stream.operations:
        if op == b_("Tj"):
            if isinstance(args[0], TextStringObject):
                pieces.append(args[0])
        elif op == b_("T*"):
            pieces.append("\n")
        elif op == b_("'"):
            pieces.append("\n")
            if isinstance(args[0], TextStringObject):
                pieces.append(args[0])
        elif op == b_('"'):
            # The string is the third operand; the newline is only added
            # when that string is usable.
            if isinstance(args[2], TextStringObject):
                pieces.append("\n")
                pieces.append(args[2])
        elif op == b_("TJ"):
            pieces.extend(
                part for part in args[0]
                if isinstance(part, TextStringObject)
            )
            pieces.append("\n")

    return u_("").join(pieces)
def is_header(content, item):
    """
    Heuristically decide whether the text shown by operation ``item`` is a
    header, judging by the operators that come right before it.

    The text is treated as a header when either:

    * the preceding operator is ``Td``; or
    * the preceding operators are ``Tf`` then ``Tm``, and the one before
      those is ``BT`` or ``scn``.

    Indices are computed as ``item - 1`` .. ``item - 3`` without bounds
    checks, so for ``item < 3`` Python's negative indexing looks at the end
    of the operation list.

    :param content: object exposing ``operations``, a sequence of
        ``(operands, operator)`` pairs.
    :param item: index in ``content.operations`` of the text-showing
        operation being examined.
    :return: ``True`` if one of the patterns above matches, else ``False``.
    """
    operations = content.operations

    just_before = operations[item - 1][1]
    if just_before == b_("Td"):
        return True

    if just_before != b_("Tm") or operations[item - 2][1] != b_("Tf"):
        return False

    opener = operations[item - 3][1]
    return opener == b_("BT") or opener == b_("scn")
def extractText_with_separator(self, remove_headers=False):
    """
    Concatenate the text shown on this page, starting a new line before each
    ``Tj``/``TJ`` run that ``is_continuation`` does not recognise as a
    continuation of the previous one.

    ``Tj``/``TJ`` runs that ``is_header`` classifies as headers are left out.
    A newline is also emitted for every ``T*`` and before the string of
    ``'`` and ``"``. Unlike the plain extraction, no newline is appended after
    a ``TJ`` array.

    :param remove_headers: NOTE(review): accepted but never read; header
        runs are skipped regardless of its value. Confirm the intended
        default before wiring it in.
    :return: a unicode string object.
    """
    stream = self["/Contents"].getObject()
    if not isinstance(stream, ContentStream):
        stream = ContentStream(stream, self.pdf)

    # Only TextStringObjects are collected. ByteStringObjects are strings
    # whose byte->string encoding was unknown, so including them would
    # produce gibberish.
    pieces = []
    for position, (args, op) in enumerate(stream.operations):
        if op == b_("Tj"):
            if is_header(stream, position):
                continue
            if not is_continuation(stream, position):
                pieces.append("\n")
            if isinstance(args[0], TextStringObject):
                pieces.append(args[0])
        elif op == b_("T*"):
            pieces.append("\n")
        elif op == b_("'"):
            pieces.append("\n")
            if isinstance(args[0], TextStringObject):
                pieces.append(args[0])
        elif op == b_('"'):
            # The string is the third operand; the newline is only added
            # when that string is usable.
            if isinstance(args[2], TextStringObject):
                pieces.append("\n")
                pieces.append(args[2])
        elif op == b_("TJ"):
            if is_header(stream, position):
                continue
            if not is_continuation(stream, position):
                pieces.append("\n")
            pieces.extend(
                part for part in args[0]
                if isinstance(part, TextStringObject)
            )

    return u_("").join(pieces)
def extractText_patch(self):
    """
    Extract the page text by visiting every text-drawing command in the order
    it appears in the content stream.

    Inside a ``TJ`` array, a number below -125 is rendered as a single space;
    other non-string entries are dropped. Results depend on how the PDF
    generator ordered its drawing commands, so the order of the returned
    text should not be relied upon.

    :return: a unicode string object.
    """
    stream = self["/Contents"].getObject()
    if not isinstance(stream, ContentStream):
        stream = ContentStream(stream, self.pdf)

    # Only TextStringObjects are collected. ByteStringObjects are strings
    # whose byte->string encoding was unknown, so including them would
    # produce gibberish.
    pieces = []
    for args, op in stream.operations:
        if op == b_("Tj"):
            if isinstance(args[0], TextStringObject):
                pieces.append(args[0])
        elif op == b_("T*"):
            pieces.append("\n")
        elif op == b_("'"):
            pieces.append("\n")
            if isinstance(args[0], TextStringObject):
                pieces.append(args[0])
        elif op == b_('"'):
            # The string is the third operand; the newline is only added
            # when that string is usable.
            if isinstance(args[2], TextStringObject):
                pieces.append("\n")
                pieces.append(args[2])
        elif op == b_("TJ"):
            for entry in args[0]:
                if isinstance(entry, TextStringObject):
                    pieces.append(entry)
                elif isinstance(entry, NumberObject) and entry < -125:
                    # Numeric entry below the threshold: emit a space.
                    pieces.append(" ")
            pieces.append("\n")

    return u_("").join(pieces)
def alt_extractText(self):
    """
    Extract the page text with newlines, marking every operation that is not
    a text-showing or ``T*`` operator with a ``~``.

    :return: a unicode string object.
    """
    text_suffix = ""   # appended after each "Tj" string
    other_mark = "~"   # emitted for every operator not handled below

    stream = self["/Contents"].getObject()
    if not isinstance(stream, ContentStream):
        stream = ContentStream(stream, self.pdf)

    # NOTE(review): the source had lost its indentation; the final ``else``
    # is taken to belong to the operator chain (not to the inner loop).
    # Confirm against the original layout.
    pieces = []
    for args, op in stream.operations:
        if op == b_("Tj"):
            if isinstance(args[0], TextStringObject):
                pieces.append(args[0] + text_suffix)
        elif op == b_("T*"):
            pieces.append("\n")
        elif op == b_("'"):
            pieces.append("\n")
            if isinstance(args[0], TextStringObject):
                pieces.append(args[0])
        elif op == b_('"'):
            # The string is the third operand; the newline is only added
            # when that string is usable.
            if isinstance(args[2], TextStringObject):
                pieces.append("\n")
                pieces.append(args[2])
        elif op == b_("TJ"):
            pieces.extend(
                part for part in args[0]
                if isinstance(part, TextStringObject)
            )
            pieces.append("\n")
        else:
            pieces.append(other_mark)

    return u_("").join(pieces)
def customExtractText(self):
    """
    Extract the page text as a single whitespace-normalised string.

    Spaces are inserted for numeric ``TJ`` entries below -100 and whenever a
    ``TD`` or ``Tm`` operator follows text that does not already end with a
    space or newline. Afterwards every `` - `` is collapsed to ``-`` and each
    run of whitespace (newlines included) is replaced by one space.

    :return: a unicode string object.
    """
    stream = self["/Contents"].getObject()
    if not isinstance(stream, ContentStream):
        stream = ContentStream(stream, self.pdf)

    # Only TextStringObjects are collected. ByteStringObjects are strings
    # whose byte->string encoding was unknown, so including them would
    # produce gibberish.
    result = u_("")
    for args, op in stream.operations:
        if op == b_("Tj"):
            shown = args[0]
            if isinstance(shown, TextStringObject):
                result += shown
        elif op == b_("T*"):
            result += "\n"
        elif op == b_("'"):
            result += "\n"
            shown = args[0]
            if isinstance(shown, TextStringObject):
                result += shown
        elif op == b_('"'):
            # The string is the third operand; the newline is only added
            # when that string is usable.
            shown = args[2]
            if isinstance(shown, TextStringObject):
                result += "\n"
                result += shown
        elif op == b_("TJ"):
            for entry in args[0]:
                if isinstance(entry, TextStringObject):
                    result += entry
                elif isinstance(entry, (FloatObject, NumberObject)):
                    # Numeric entry below the threshold: emit a space.
                    if entry < -100:
                        result += " "
        elif op == b_("TD") or op == b_("Tm"):
            # Separate from what was written so far unless a separator is
            # already there.
            if result and result[-1] not in (" ", "\n"):
                result += " "

    result = result.replace(" - ", "-")
    return re.sub("\\s+", " ", result)