def text_parsing_example():
    """Pull a few labelled fields out of a crash-report PDF's text markdown.

    Renders the viewer's current page of ``./example-text-crash-report.pdf``,
    prints the canvas text content and strings, writes the markdown to
    ``./example-crash-markdown.txt`` and prints three values located by
    splitting the markdown on their label markers.

    Raises:
        FileNotFoundError: if the example PDF cannot be opened.
        IndexError: if a label marker (or the parenthesised value after it)
            is missing from the markdown.
    """
    pdf_filepath = './example-text-crash-report.pdf'

    # ``with`` replaces the former try/finally: when open() itself failed, the
    # finally clause touched an unbound ``fd`` and hid the real error.
    with open(pdf_filepath, 'rb') as fd:
        viewer = SimplePDFViewer(fd)
        viewer.render()
        markdown = viewer.canvas.text_content
        print('markdown = {}.'.format(markdown))
        print('viewer.canvas.strings = {}.'.format(viewer.canvas.strings))

        # Parse PDF markdown.
        print('isinstance(markdown, str) = {}.'.format(isinstance(markdown, str)))
        with open('./example-crash-markdown.txt', 'w') as fd2:
            fd2.write(markdown)

        def value_after(label):
            # The value is the first "(...)" group that follows the label.
            tail = markdown.split(label, 1)[1]
            return tail.split('(', 1)[1].split(')', 1)[0]

        # Any text-processing tool (regex, grep, custom parser) would work
        # here; plain splitting is enough for this example.
        reporting_agency = value_after('(REPORTING AGENCY NAME *)')
        print('reporting_agency = {}.'.format(reporting_agency))
        local_report_number = value_after('(LOCAL REPORT NUMBER *)')
        print('local_report_number = {}.'.format(local_report_number))
        crash_severity = value_after('( ERROR)')
        print('crash_severity = {}.'.format(crash_severity))
def parse_vaccinations(filename):
    """Infer vaccination totals from the numbers printed in a PDF report.

    Every string on the rendered page is passed through ``clean_count``;
    strings it rejects are skipped. The four largest resulting values are
    taken, in descending order, as total vaccinations, people vaccinated,
    people fully vaccinated and total boosters.

    Args:
        filename: path of the PDF to read.

    Returns:
        Tuple ``(total_vaccinations, people_vaccinated,
        people_fully_vaccinated, total_boosters)``.

    Raises:
        IndexError: if fewer than four numbers were found.
        ValueError: if the three components do not add up to the total.
    """
    # Read pdf (for metrics)
    with open(filename, mode="rb") as f:
        viewer = SimplePDFViewer(f)
        viewer.render()

    # Infer figures: keep whatever clean_count can turn into a number.
    numbers = []
    for text in viewer.canvas.strings:  # do not shadow the builtin ``str``
        try:
            numbers.append(clean_count(text))
        except Exception:
            # Non-numeric strings are expected; unlike a bare ``except`` this
            # still lets KeyboardInterrupt/SystemExit through.
            continue
    numbers.sort()

    total_vaccinations = numbers[-1]
    people_vaccinated = numbers[-2]
    people_fully_vaccinated = numbers[-3]
    total_boosters = numbers[-4]

    # Sanity check
    if people_vaccinated + people_fully_vaccinated + total_boosters != total_vaccinations:
        raise ValueError(
            f"people_vaccinated + people_fully_vaccinated + total_boosters != total_vaccinations ({people_vaccinated} + {people_fully_vaccinated} + {total_boosters} != {total_vaccinations})"
        )
    return total_vaccinations, people_vaccinated, people_fully_vaccinated, total_boosters
def __init__(self, pdf_path):
    """Open ``pdf_path`` and prepare a viewer plus the list of its pages.

    Sets ``self.pdf_path``, ``self.viewer`` and ``self.pages``. The file
    handle is not closed here; the stored viewer is built on top of it.
    """
    self.pdf_path = pdf_path
    stream = open(pdf_path, "rb")
    document = PDFDocument(stream)
    self.viewer = SimplePDFViewer(stream)
    # Materialise the page generator once so it can be indexed later.
    self.pages = list(document.pages())
def GetSiteKeys():
    """Collect work-order IDs from the invoice PDFs under ``./Invoices``.

    Each file in the ``Invoices`` directory (relative to the current working
    directory) is rendered, its strings are flattened with ``listToString``
    and split on spaces. The token following ``'Order'`` is split on ``'-'``
    and its second part is recorded. Invoices without such a token are
    skipped.

    Returns:
        List of work-order ID strings; it always starts with the seed entry
        ``'372856'``.
    """
    pwd = pathlib.Path().absolute()

    # INVOICE PDF WORK STARTS HERE
    invoice_directory = '%s/Invoices' % (pwd)
    invoice_list = os.listdir(invoice_directory)
    WorkIDList = ['372856']
    print(invoice_list)

    for invoice in invoice_list:
        pdf = "%s/Invoices/%s" % (pwd, invoice)
        # Close each invoice as soon as its strings are extracted; the old
        # code left one handle open per invoice.
        with open(pdf, "rb") as fd:
            viewer = SimplePDFViewer(fd)
            viewer.render()
            raw_invoice_data = viewer.canvas.strings

        tokens = listToString(raw_invoice_data).split(' ')
        try:
            work_order = tokens[tokens.index('Order') + 1].split('-')[1]
        except (ValueError, IndexError):
            # No 'Order' token, nothing after it, or no '-' separated part:
            # this invoice simply carries no work order.
            continue
        WorkIDList.append(work_order)

    print("Work Orders", WorkIDList)
    return WorkIDList
class PDFPageIterator:
    """Iterate over the pages of a PDF, yielding the iterator itself per page.

    After each ``next()`` the iterator is positioned on a new page; use
    ``get_page_number()`` and ``get_strings()`` to inspect it. The underlying
    file can be released with ``close()`` or by using the iterator as a
    context manager.
    """

    def __init__(self, filename):
        """Open ``filename`` if given; a falsy filename creates no viewer."""
        self._pdf_viewer = None
        self._pdf_file = None
        self._page_number = 0
        self._rendered = False
        if filename:
            # Keep the handle so it can be closed later; previously it was
            # opened inline and could never be released explicitly.
            self._pdf_file = open(filename, 'rb')
            try:
                self._pdf_viewer = SimplePDFViewer(self._pdf_file)
            except Exception:
                self.close()
                raise

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def __iter__(self):
        return self

    def __next__(self):
        try:
            self._go_to_next_pdf_page()
            return self
        except PageDoesNotExist as e:
            raise StopIteration(e)

    def close(self):
        """Close the underlying PDF file, if one was opened. Safe to repeat."""
        if self._pdf_file is not None:
            self._pdf_file.close()
            self._pdf_file = None

    def get_page_number(self):
        """Return the 1-based number of the current page (0 before iterating)."""
        return self._page_number

    def get_strings(self):
        """Return the strings of the current page, rendering it on first use."""
        if not self._rendered:
            self._pdf_viewer.render()
            self._rendered = True
        return self._pdf_viewer.canvas.strings

    def _go_to_next_pdf_page(self):
        # The very first step only claims page 1 without asking the viewer
        # to advance; every later step calls the viewer's next().
        if self._page_number != 0:
            self._pdf_viewer.next()
        self._page_number += 1
        self._rendered = False
def encrypted_and_password_protected_pdf_tutorial():
    """Show how to open a password-protected PDF with pdfreader.

    Opens ``./encrypted-with-qwerty.pdf`` three ways: through
    ``SimplePDFViewer`` (printing the joined page strings), through
    ``PDFDocument`` (printing the first page's ``Contents``), and once more
    through ``PDFDocument`` while catching ``ValueError``.

    NOTE(review): the password literals appear masked in this source; the
    last call presumably demonstrates a wrong password - confirm.

    Raises:
        FileNotFoundError: if the example PDF cannot be opened.
    """
    pdf_filepath = './encrypted-with-qwerty.pdf'

    # ``with`` replaces the former try/finally: when open() itself failed, the
    # finally clause touched an unbound ``fd`` and hid the real error.
    with open(pdf_filepath, 'rb') as fd:
        viewer = SimplePDFViewer(fd, password='******')
        viewer.render()
        text = ''.join(viewer.canvas.strings)
        print('text = {}.'.format(text))

        #--------------------
        doc = PDFDocument(fd, password='******')
        page_one = next(doc.pages())
        print('page_one.Contents = {}.'.format(page_one.Contents))

        #--------------------
        try:
            doc = PDFDocument(fd, password='******')
        except ValueError as ex:
            print('ValueError raised: {}.'.format(ex))
def get_simple_pdf_text(self, file):
    """Return normalised text from ``file``, keeping what follows '% Chg'.

    The strings of every canvas produced by iterating the viewer are
    concatenated. If that is blank, ``""`` is returned. Otherwise whitespace
    is collapsed, only the text after the last ``'% Chg'`` is kept, a space
    is inserted after each ``'%'`` and after every run of letters,
    parentheses or ampersands, and whitespace is collapsed again.
    """
    viewer = SimplePDFViewer(file)
    viewer.render()
    raw_text = "".join("".join(canvas.strings) for canvas in viewer)
    if not raw_text.strip():
        return ""

    text = re.sub(r'\s+', ' ', raw_text)
    # Pad the marker, then keep only the part after its last occurrence.
    text = text.replace('% Chg', ' % Chg ').split('% Chg')[-1].strip()
    text = text.replace('%', '% ')

    word_pattern = r"(?P<name>[a-zA-Z\(\)\&]+)"
    words = [match["name"] for match in re.finditer(word_pattern, text)]
    # The words are collected up front; each replace pass then works on the
    # progressively padded text. Surplus spaces are collapsed at the end.
    for word in words:
        text = text.replace(word, f'{word} ')

    return re.sub(r'\s+', ' ', text).strip()
def _text_from_pdf(self, pdf_link: str):
    """Download the PDF at ``pdf_link`` and return its page strings joined.

    Only the page the viewer renders by default is read.

    Args:
        pdf_link: URL of the PDF document.

    Returns:
        The rendered page's strings concatenated without a separator.
    """
    with tempfile.NamedTemporaryFile() as tf:
        # Write through the handle we already hold rather than re-opening
        # the file by name: a second open of a NamedTemporaryFile is not
        # portable (it fails on Windows while the first handle is open).
        tf.write(requests.get(pdf_link).content)
        tf.flush()
        tf.seek(0)
        viewer = SimplePDFViewer(tf)
        viewer.render()
        raw_text = "".join(viewer.canvas.strings)
    return raw_text
def calculate_page_count(filepath):
    """Probe a PDF page by page and return where navigation first fails.

    Pages are tried in order starting at 1; the number returned is the first
    page number for which ``viewer.navigate`` raises ``PageDoesNotExist``.

    NOTE(review): that value is one more than the last page that navigated
    successfully. Confirm callers expect this before treating it as a plain
    page count.
    """
    with open(filepath, "rb") as fd:
        viewer = SimplePDFViewer(fd)
        candidate = 1
        while True:
            try:
                viewer.navigate(candidate)
            except PageDoesNotExist:
                return candidate
            candidate += 1
def xobject_image_example():
    """Demonstrate extracting XObject images, inline images and image masks.

    Part one reads ``./example-image-xobject.pdf``: it fetches the ``img0``
    XObject from the first page's resources and converts it to a Pillow
    image, then shows the same via the viewer's ``canvas.images`` and
    ``canvas.inline_images``. Part two reads ``./tutorial-example.pdf``,
    navigates to page 5 and converts the first inline image flagged
    ``ImageMask`` to a Pillow image.

    Raises:
        FileNotFoundError: if either example PDF cannot be opened.
        StopIteration: if page 5 has no inline image with ``ImageMask`` set.
    """
    pdf_filepath = './example-image-xobject.pdf'

    # ``with`` replaces the former try/finally blocks: when open() itself
    # failed, the finally clause touched an unbound ``fd`` and hid the error.
    with open(pdf_filepath, 'rb') as fd:
        doc = PDFDocument(fd)

        # Extract XObject image.
        page = next(doc.pages())
        print('page.Resources.XObject = {}.'.format(page.Resources.XObject))

        xobj = page.Resources.XObject['img0']
        print('xobj.Type = {}, xobj.Subtype = {}.'.format(xobj.Type, xobj.Subtype))

        pil_image = xobj.to_Pillow()
        #pil_image.save('./extract-logo.png')

        #--------------------
        # Extract Images: a very simple way.
        viewer = SimplePDFViewer(fd)
        viewer.render()

        all_page_images = viewer.canvas.images
        if 'img0' in all_page_images:
            img = all_page_images['img0']
            print('img.Type = {}, img.Subtype = {}.'.format(img.Type, img.Subtype))

        all_page_inline_images = viewer.canvas.inline_images
        if all_page_inline_images:
            img = all_page_inline_images[0]
            print('img.Type = {}, img.Subtype = {}.'.format(img.Type, img.Subtype))

    #--------------------
    pdf_filepath = './tutorial-example.pdf'

    with open(pdf_filepath, 'rb') as fd:
        viewer = SimplePDFViewer(fd)

        # Extract image masks.
        viewer.navigate(5)
        viewer.render()

        inline_images = viewer.canvas.inline_images
        image_mask = next(img for img in inline_images if img.ImageMask)

        pil_img = image_mask.to_Pillow()
        #pil_img.save('./mask.png')
def read(path):
    """Extract transaction rows from a Nubank invoice PDF.

    Pages are rendered from page 1 onwards until one contains the string
    ``'TRANSAÇÕES'``. That page's non-blank strings, minus the first three
    and last six, are grouped in threes to form the rows.

    Args:
        path: path of the invoice PDF.

    Returns:
        A list whose first element is ``['Fatura Nubank']`` followed by one
        three-item list per transaction, or ``False`` if the file could not
        be read or no transactions page was found.
    """
    try:
        print('\n=> Nubank Robot: In Progress...')

        with open(path, 'rb') as file:
            viewer = SimplePDFViewer(file)
            viewer.navigate(1)

            while True:
                try:
                    viewer.render()
                    if 'TRANSAÇÕES' in viewer.canvas.strings:
                        break
                    viewer.next()
                except Exception:
                    print(
                        'Nubank Robot: Não foi achado dados de transações na fatura do nubank'
                    )
                    # Previously the loop carried on after this message and
                    # never terminated once the pages ran out.
                    return False

            content = list(
                filter(lambda s: len(s.strip()), viewer.canvas.strings))[3:-6]

        result = [[value, content[index * 3 + 1], content[index * 3 + 2]]
                  for index, value in enumerate(content[::3])]
        result.insert(0, ['Fatura Nubank'])

        print(' * Nubank Robot: Done\n')
        return result
    except Exception:
        # Deliberate best-effort: any failure is reported as False, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        print('Nubank Robot: Tivemos um erro ao ler a fatura do nubank\n')
        return False
def create_sample_pdf(pdf_path):
    """Read ``pdf_path`` with three different PDF libraries.

    The file is rendered with ``SimplePDFViewer``, its first page's content
    stream is read through ``PdfReader``, and its first page's text is
    extracted through ``PdfFileReader``. All results are currently
    discarded and the function returns ``None``.

    NOTE(review): despite the name, nothing is written; the unused
    ``PdfFileWriter`` instance suggests this is unfinished - confirm.
    """
    # Use a context manager so the handle is closed; it was leaked before.
    with open(pdf_path, "rb") as fd:
        reader_viewer = SimplePDFViewer(fd)
        reader_viewer.render()
        markdown = reader_viewer.canvas.text_content
        pdf_str = reader_viewer.canvas.strings

    rw_viewer = PdfReader(pdf_path)
    rw_content = rw_viewer.pages[0].Contents.stream

    pdf = PdfFileReader(pdf_path)
    pdf_writer = PdfFileWriter()
    report_page = pdf.getPage(0)
    report_page.extractText()
    return
def navigate_pages(doc: PDFDocument, viewer: SimplePDFViewer):
    """Render every page and feed its strings to the office-info extractor.

    ``doc`` is used only to count pages as they are enumerated; the text of
    each page comes from ``viewer``. For every page the strings are copied,
    their line ranges computed with ``get_line_ranges``, normalised with
    ``establish_uniformity`` and passed to
    ``get_county_election_office_info``.
    """
    for page_number, _ in enumerate(doc.pages(), start=1):
        # Position the viewer on the page and render it.
        viewer.navigate(page_number)
        viewer.render()

        # Work on a copy so the viewer's own list is left untouched.
        raw_strings = list(viewer.canvas.strings)
        line_ranges = get_line_ranges(strings_list=raw_strings)
        uniform_strings = establish_uniformity(
            strings_list=raw_strings, line_range_list=line_ranges)
        get_county_election_office_info(strings_list=uniform_strings)
def readPDF(pdfFile):
    """Map label text to the numeric string that follows it in a PDF page.

    Non-numeric strings on the rendered page are concatenated into a label.
    When a numeric string is met, it is stored under the accumulated label
    and the label is reset. Reading stops after the label ``'Wright'``.

    Args:
        pdfFile: path of the PDF to read.

    Returns:
        Dict mapping each label to its numeric string (values stay strings).
    """
    from pdfreader import SimplePDFViewer

    # Close the file once the page strings are extracted; it used to leak.
    with open(pdfFile, "rb") as fd:
        viewer = SimplePDFViewer(fd)
        viewer.render()
        strings = viewer.canvas.strings

    countyHospitalData = {}
    label_parts = []
    for stringData in strings:
        if not stringData.isnumeric():
            label_parts.append(stringData)
            continue
        label = "".join(label_parts)
        countyHospitalData[label] = stringData
        if label == 'Wright':
            break
        label_parts = []
    return countyHospitalData
def parse(self):
    """Parse every PDF statement in ``self.input_dir`` into one activity list.

    Statements that yield no activities are skipped. The remaining
    statements are ordered by the ``trade_date`` of the activity at the
    index given by ``self.get_first_non_ssp_activity_index`` and then
    flattened.

    Raises:
        SystemExit: with code 1 if no statement files are found.
    """
    statement_files = list_statement_files(self.input_dir, "pdf")
    if not statement_files:
        logger.error("No statement files found.")
        raise SystemExit(1)
    logger.info(
        f"Collected statement files for processing: {statement_files}.")

    per_statement = []
    for statement_file in statement_files:
        logger.debug(f"Processing statement file[{statement_file}]")
        with open(statement_file, "rb") as fd:
            activities = self.extract_activities(SimplePDFViewer(fd))
        if activities:
            per_statement.append(activities)

    def ordering_key(activities):
        anchor = self.get_first_non_ssp_activity_index(activities)
        return activities[anchor]["trade_date"]

    per_statement.sort(key=ordering_key)

    flattened = []
    for activities in per_statement:
        flattened.extend(activities)
    return flattened
def form_text_extraction_example():
    """Demonstrate reading page text and form (XObject) text from a PDF.

    Renders ``./example-form.pdf``, prints whether the page text contains
    ``'Farmworkers and Laborers'``, prints the sorted form names found on
    the canvas and then the text of the form named ``'Fm9'``.

    Raises:
        FileNotFoundError: if the example PDF cannot be opened.
        KeyError: if the page has no form named ``'Fm9'``.
    """
    pdf_filepath = './example-form.pdf'

    # ``with`` replaces the former try/finally: when open() itself failed, the
    # finally clause touched an unbound ``fd`` and hid the real error.
    with open(pdf_filepath, 'rb') as fd:
        viewer = SimplePDFViewer(fd)
        viewer.render()

        plain_text = ''.join(viewer.canvas.strings)
        print('("Farmworkers and Laborers" in plain_text) = {}.'.format('Farmworkers and Laborers' in plain_text))

        print('sorted(list(viewer.canvas.forms.keys())) = {}.'.format(sorted(viewer.canvas.forms.keys())))

        form9_canvas = viewer.canvas.forms['Fm9']
        print('"".join(form9_canvas.strings) = {}.'.format(''.join(form9_canvas.strings)))
class ParserInterface:
    """Base for PDF parsers: opens a document and offers page text lookup.

    Attributes set by ``__init__``: ``pdf_path``, ``viewer`` (a
    ``SimplePDFViewer`` on the opened file) and ``pages`` (list of the
    document's pages).
    """

    def __init__(self, pdf_path):
        """Open ``pdf_path`` and build the viewer and the page list."""
        self.pdf_path = pdf_path
        # The handle is kept on the instance so it can be released via
        # close(); previously it was a local that could never be closed.
        self._fd = open(pdf_path, "rb")
        try:
            doc = PDFDocument(self._fd)
            self.viewer = SimplePDFViewer(self._fd)
            self.pages = list(doc.pages())
        except Exception:
            self._fd.close()
            raise

    def close(self):
        """Close the underlying PDF file. Safe to call more than once."""
        fd = getattr(self, "_fd", None)
        if fd is not None:
            fd.close()
            self._fd = None

    def contains(self, msg: str, page: int) -> bool:
        """Return True if ``msg`` is one of the strings on page ``page``.

        The comparison is against whole canvas strings, not substrings.
        """
        self.viewer.navigate(page)
        self.viewer.render()
        return msg in self.viewer.canvas.strings

    def process(self, show_progress: bool) -> PDFContents:
        """Placeholder that does nothing and returns ``None`` here.

        NOTE(review): presumably meant to be overridden by concrete parsers
        - confirm.
        """
        pass
def main():
    """Download the Arkansas county clerks PDF and print the parsed offices.

    The downloaded bytes are handed both to ``PDFDocument`` (page iteration)
    and to ``SimplePDFViewer`` (reading page strings); ``navigate_pages``
    processes them and ``ELECTION_OFFICE_INFO`` is pretty-printed afterwards.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    # Get the PDF
    r = requests.get(
        "https://www.sos.arkansas.gov/uploads/elections/ARCountyClerks.pdf",
        timeout=30)
    # Fail on an HTTP error instead of feeding an error page to the parser.
    r.raise_for_status()

    # Pass byte stream to PDFDocument parser (used for iterating through pages)
    doc = PDFDocument(r.content)
    # Pass byte stream to PDF viewer (used for reading strings on pages)
    viewer = SimplePDFViewer(r.content)

    navigate_pages(doc, viewer)
    pprint(ELECTION_OFFICE_INFO)
def parse_vaccinations(filename):
    """Read total, first-dose and second-dose counts from a PDF report.

    The rendered page's strings are split into three consecutive sections
    starting at the labels ``"ümumi sayı"``, ``"1-ci mərhələ üzrə "`` and
    ``"2-ci mərhələ üzrə "``; the largest numeric string in each section is
    taken as that section's figure.

    Returns:
        Tuple ``(total_vaccinations, dose_1, dose_2)``.

    Raises:
        ValueError: if a label is missing, a section holds no numeric
            string, or ``dose_1 + dose_2`` differs from the total.
    """
    # Read pdf (for metrics)
    with open(filename, mode="rb") as f:
        viewer = SimplePDFViewer(f)
        viewer.render()

    strs = viewer.canvas.strings

    # Section boundaries
    start_total = strs.index("ümumi sayı")
    start_dose_1 = strs.index("1-ci mərhələ üzrə ")
    start_dose_2 = strs.index("2-ci mərhələ üzrə ")

    def largest_number(section):
        return max(int(s) for s in section if s.isnumeric())

    total_vaccinations = largest_number(strs[start_total:start_dose_1])
    dose_1 = largest_number(strs[start_dose_1:start_dose_2])
    dose_2 = largest_number(strs[start_dose_2:])

    # Sanity check
    if dose_1 + dose_2 != total_vaccinations:
        raise ValueError(
            f"Apparently, dose_1 + dose_2 != total_vaccinations ({dose_1} + {dose_2} != {total_vaccinations})"
        )
    return total_vaccinations, dose_1, dose_2
def read(path):
    """Extract transaction rows from page 2 of a Neon invoice PDF.

    The page strings are joined, the part between the table header and the
    footer is isolated, and ``--space--`` markers are inserted around dates
    and before currency amounts so the text can be split into cells. The
    card-type column and the ``R$`` prefix are removed, and the remaining
    non-blank cells are grouped in threes.

    Args:
        path: path of the invoice PDF.

    Returns:
        A list whose first element is ``['Fatura neon']`` followed by one
        three-item list per transaction.
    """
    print('=> Neon Robot: In Progress...')

    with open(path, 'rb') as file:
        viewer = SimplePDFViewer(file)
        viewer.navigate(2)
        viewer.render()
        full_string = ''.join(viewer.canvas.strings)

    # All patterns are raw strings: sequences such as ``\$`` and ``\d`` in a
    # normal string literal are invalid escapes and trigger warnings. The
    # pattern text itself is unchanged.
    re_pattern = r'(.*R\$CartãoData)(.*)(Fique atento:Pagamento Mínimo:.*)'
    bill_string = sub(re_pattern, r'\2', full_string)

    after_date_spaces = sub(r'(.\d{2}\/\d{2}\/\d{4})(.)', r'\1--space--\2',
                            bill_string)
    before_date_spaces = sub(r'(.)(\d{2}\/\d{2}\/\d{4})', r'\1--space--\2',
                             after_date_spaces)
    currency_spaces = sub(r'(.)(R\$\d)', r'\1--space--\2', before_date_spaces)
    remove_card_column = sub(r'(Físico|Virtual)', '', currency_spaces)
    remove_currency_string = sub(r'R\$', '', remove_card_column)

    bill_list = remove_currency_string.split('--space--')
    content = list(filter(lambda s: len(s.strip()), bill_list))

    result = [[value, content[index * 3 + 1], content[index * 3 + 2]]
              for index, value in enumerate(content[::3])]
    result.insert(0, ['Fatura neon'])

    print(' * Neon Robot: Done\n')
    return result
def get_text_pypdf(DOI:str) -> str:
    """Download the medRxiv PDF for ``DOI`` and return its text, lower-cased.

    The PDF is saved under ``pdfs/`` next to this module as
    ``<hostname><DOI without slashes>.pdf`` and read page by page; pages
    whose rendering raises ``OverflowError`` are skipped. Each page's text is
    followed by a newline.

    Returns:
        The lower-cased text, or ``""`` if anything goes wrong (the error
        and the DOI are printed).
    """
    try:
        hostname = socket.gethostname()
        path = pathlib.Path(__file__).parent.absolute()
        name = hostname + str(DOI).replace("/", "") + ".pdf"
        fp = Path(path / "pdfs" / name)  # build filepath
        url = "https://www.medrxiv.org/content/" + str(DOI) + "v1.full.pdf"  # build url
        response = requests.get(url)
        # Do not save an HTTP error page as if it were the PDF.
        response.raise_for_status()
        fp.write_bytes(response.content)  # save .pdf

        page_texts = []
        # The context manager closes the handle even when parsing fails.
        with fp.open("rb") as fd:
            doc = PDFDocument(fd)
            page_count = sum(1 for _ in doc.pages())
            viewer = SimplePDFViewer(fd)
            for page_number in range(1, page_count + 1):
                viewer.navigate(page_number)
                try:
                    viewer.render()
                    # Round-trip through the console encoding, replacing
                    # what it cannot represent, then decode as
                    # windows-1252 (kept exactly as before).
                    page_texts.append(
                        u"".join(viewer.canvas.strings).encode(
                            sys.stdout.encoding,
                            errors='replace').decode("windows-1252") + '\n')
                except OverflowError:
                    pass
        return "".join(page_texts).lower()
    except Exception as e:
        # Deliberate best-effort: callers get "" for any failure.
        print(e, DOI)
        return ""
def parse_statements(statement_files):
    """Parse PDF statements and return all activities ordered by statement.

    Each file is read with ``extract_activities``; files yielding nothing
    are skipped. Statements are ordered by the ``trade_date`` of their first
    activity and then flattened into a single list.
    """
    per_statement = []
    for statement_file in statement_files:
        with open(statement_file, "rb") as fd:
            activities = extract_activities(SimplePDFViewer(fd))
        if activities:
            per_statement.append(activities)

    per_statement.sort(key=lambda group: group[0]["trade_date"])

    flattened = []
    for group in per_statement:
        flattened.extend(group)
    return flattened
def page_extractor(filepath, page_number):
    """Return a ``Page`` holding the joined strings of one PDF page.

    Args:
        filepath: path of the PDF.
        page_number: page to navigate to before rendering.

    A progress line is written to standard error after extraction.
    """
    with open(filepath, "rb") as fd:
        viewer = SimplePDFViewer(fd)
        viewer.navigate(page_number)
        viewer.render()
        text = ''.join(viewer.canvas.strings)

    print(f'extracted page {page_number}', file=sys.stderr)
    return Page(page_number, text)
def pdfToText(string: str) -> list:
    """Return the strings of every page of the PDF at path ``string``.

    Pages are rendered in order, starting from the viewer's initial page,
    until the viewer raises ``PageDoesNotExist``.

    Args:
        string: path of the PDF file (the name is kept for compatibility).

    Returns:
        A flat list of all page strings in page order.
    """
    plain_text = []
    # The context manager closes the file; the handle used to be leaked.
    with open(string, "rb") as fd:
        viewer = SimplePDFViewer(fd)
        try:
            while True:
                viewer.render()
                plain_text += viewer.canvas.strings
                viewer.next()
        except PageDoesNotExist:
            # Raised once the pages run out - the normal way the loop ends.
            pass
    return plain_text
def GetFrontPageText(document, ID_page=0):
    """Return the text of one page, falling back to pdfreader if needed.

    PyPDF2 is tried first. If it extracts an empty text, the page is
    rendered with ``SimplePDFViewer`` instead and its strings are joined.

    Args:
        document: passed unchanged to both ``PyPDF2.PdfFileReader`` and
            ``SimplePDFViewer``.
        ID_page: zero-based page index (the viewer is given ``ID_page + 1``).
    """
    page_text = PyPDF2.PdfFileReader(document).getPage(ID_page).extractText()
    if len(page_text) != 0:
        return page_text

    viewer = SimplePDFViewer(document)
    viewer.navigate(ID_page + 1)
    viewer.render()
    return ''.join(viewer.canvas.strings)
def uploaded_file():
    """Store an uploaded PDF and index its words in the database.

    On a POST request the uploaded file is saved into the configured upload
    folder, its creation date and page strings are read, the strings are
    grouped into words and a document with ``name``, ``creationDate`` and
    ``text`` is inserted into ``collection``.

    Returns:
        ``'file uploaded successfully'`` for POST requests; nothing is
        returned for other methods.
    """
    if request.method == 'POST':
        f = request.files['file']
        filename = secure_filename(f.filename)
        filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
        f.save(filepath)

        # Close the saved file once everything has been read from it; the
        # handle used to be leaked on every upload.
        with open(filepath, "rb") as fd:
            doc = PDFDocument(fd)
            print(doc.metadata)
            creationDate = doc.metadata.get('CreationDate')

            viewer = SimplePDFViewer(fd)
            textData = []
            for canvas in viewer:
                textData += canvas.strings

        textWords = _join_fragments_into_words(textData)

        print(filename)
        print(creationDate)
        print(textWords)

        fileDocument = {
            "name": filename,
            "creationDate": creationDate,
            "text": textWords
        }
        collection.insert_one(fileDocument)
        return 'file uploaded successfully'


def _join_fragments_into_words(fragments):
    """Concatenate string fragments into words, splitting at ``' '`` entries."""
    words = []
    current = []
    for fragment in fragments:
        if fragment != ' ':
            current.append(fragment)
            continue
        word = ''.join(current)
        if word:
            words.append(word)
        current = []
    # Flush the trailing word: it used to be lost whenever the text did not
    # end with a ' ' fragment.
    word = ''.join(current)
    if word:
        words.append(word)
    return words
def init_cmb_from_pdf(month, year=2020):
    """Load one month's CMB statement PDF into a DataFrame.

    The file at ``FILE_PATH`` formatted with the zero-padded month is read
    page by page (the first four strings of every page are dropped). The
    strings between the first ``'记账日'`` and the last ``'本期还款总额'``
    are kept, the next eleven header strings are skipped, and the rest is
    shaped into six-column rows.

    Args:
        month: month number used to build the file name.
        year: year prefixed to each transaction date (which the PDF gives
            without a year). Defaults to 2020, the previously fixed value.

    Returns:
        DataFrame with columns ``transaction_date``, ``transction_amount``,
        ``transaction_description`` and ``type`` (always ``'cmb'``).
    """
    filename = FILE_PATH.format(str(month).zfill(2))

    collected = []
    # The context manager closes the file; the handle used to be leaked.
    with open(filename, "rb") as fd:
        doc = PDFDocument(fd)
        page_count = sum(1 for _ in doc.pages())
        viewer = SimplePDFViewer(fd)
        for page_number in range(1, page_count + 1):
            viewer.navigate(page_number)
            viewer.render()
            collected.extend(viewer.canvas.strings[4:])

    # Build the array once instead of re-allocating it with np.append per page.
    records = np.array(collected, dtype=str)

    head = np.where(records == '记账日')[0][0]
    tail = np.where(records == '本期还款总额')[0][-1]
    # Skip the 11 header strings (5 Chinese + 6 English column titles).
    records = records[head:tail][11:]

    # NOTE: the misspelled 'transction_amount' is kept because the column
    # name is part of the returned frame.
    column_en = ['transaction_date', 'transaction_description',
                 'transction_amount', 'card_number', 'bill_date', 'str_rmb']
    # Data: ['' '掌上生活还款' '-3,011.49' '9978' '07/24' '-3,011.49']
    df = pd.DataFrame(records.reshape(-1, 6), columns=column_en)
    df['type'] = 'cmb'
    df['transaction_date'] = df['transaction_date'].apply(
        lambda value: f'{year}/' + value)
    df['transaction_date'] = pd.to_datetime(
        df['transaction_date'], format="%Y/%m/%d", errors='coerce')
    df['transction_amount'] = df['transction_amount'].apply(decimal_from_value)
    df = df[['transaction_date', 'transction_amount',
             'transaction_description', 'type']]
    return df
def Symptom_pdf():
    """Download the COVID symptom study report and export page-7 images.

    The PDF is saved as ``ss.pdf``; every image on page 7 is converted with
    ``to_Pillow`` and saved as ``ss<image name>.png``, and each file name is
    printed.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    url = 'https://covid-assets.joinzoe.com/latest/covid_symptom_study_report.pdf'
    response = requests.get(url=url, proxies={}, timeout=30)
    # Do not save or parse an HTTP error page as if it were the report.
    response.raise_for_status()
    data = response.content

    with open('ss.pdf', 'wb') as symptom_pdf:
        symptom_pdf.write(data)

    viewer = SimplePDFViewer(data)
    viewer.navigate(7)
    viewer.render()
    for k, v in viewer.canvas.images.items():
        image = v.to_Pillow()
        name = 'ss' + k + '.png'
        image.save(name)
        print(name)
def parse_statements(statement_files):
    """Parse PDF and CSV statements into one ordered list of activities.

    ``.pdf`` files go through ``extract_activities_from_pdf`` and ``.csv``
    files through ``extract_activities_from_csv``; any other file, and any
    file yielding no activities, is skipped. Statements are ordered by the
    ``trade_date`` of their first activity and then flattened.
    """
    per_statement = []
    for statement_file in statement_files:
        logger.debug(f"Processing statement file[{statement_file}]")

        if statement_file.endswith('.pdf'):
            with open(statement_file, "rb") as fd:
                activities = extract_activities_from_pdf(SimplePDFViewer(fd))
        elif statement_file.endswith('.csv'):
            with open(statement_file, "r") as fd:
                activities = extract_activities_from_csv(
                    csv.reader(fd, delimiter=","))
        else:
            activities = []

        if activities:
            per_statement.append(activities)

    per_statement.sort(key=lambda group: group[0]["trade_date"])

    flattened = []
    for group in per_statement:
        flattened.extend(group)
    return flattened
def hyperlink_and_annotation_tutorial():
    """Demonstrate reading link annotations from a PDF page.

    Renders page 1 of ``./annot-sample.pdf``, prints whether ``'http'``
    occurs in the page text, prints the number of annotations and the URI of
    every annotation whose subtype is ``'Link'``.

    Raises:
        FileNotFoundError: if the example PDF cannot be opened.
    """
    pdf_filepath = './annot-sample.pdf'

    # ``with`` replaces the former try/finally: when open() itself failed, the
    # finally clause touched an unbound ``fd`` and hid the real error.
    with open(pdf_filepath, 'rb') as fd:
        viewer = SimplePDFViewer(fd)
        viewer.navigate(1)
        viewer.render()

        plain_text = ''.join(viewer.canvas.strings)
        print('"http" in plain_text = {}.'.format('http' in plain_text))

        print('len(viewer.annotations) = {}.'.format(len(viewer.annotations)))
        links = [annot.A.URI for annot in viewer.annotations if annot.Subtype == 'Link']
        print('links = {}.'.format(links))