def scrape_OPR800(filename):
    """Scrape every "Total for" line and the "EI Date To:" date from an OPR800 PDF.

    The document is processed in 10-page batches, re-creating the PDFQuery
    object for each batch (and deleting it afterwards) to bound memory use
    on large files.

    :param filename: path to the PDF file to scrape
    :return: tuple ``(totals, date)`` — ``totals`` is a list of matched line
        texts, ``date`` is the extracted "EI Date To:" value or None if it
        was never found.
    """
    pdf = pdfquery.PDFQuery(filename)
    pages = pdf.doc.catalog['Pages'].resolve()['Count']
    print("pdf has %d pages" % pages)
    totals = []
    date = None
    # BUGFIX: the original used range((pages / 10) + 1); on Python 3 the
    # `/` produces a float and range() raises TypeError.  Use integer
    # ceiling division to get one iteration per 10-page batch.
    for i in range((pages + 9) // 10):
        try:
            # load the next 10 pages
            pdf = pdfquery.PDFQuery(filename)
            pdf.load(range(i * 10, min((i + 1) * 10, pages)))
            # if we don't already have the date, search for it
            if date is None:
                date_field = "EI Date To: "
                date = pdf.pq("LTTextBoxHorizontal:contains('%s')" % date_field)[0].text
                date = date[date.index(date_field) + len(date_field):]
                if " " in date:
                    date = date[:date.index(" ")]
                print("got date (%s)" % date)
            # search for the totals and append them
            total = pdf.pq('LTTextLineHorizontal:contains("Total for")')
            for t in total:
                totals.append(t.text)
            print("finished page %d" % (min((i + 1) * 10, pages)))
            del pdf
        except Exception as exc:
            # Best-effort: report the batch that failed and keep going.
            print("error in %s on page %s (%s)" % (filename, ((i + 1) * 10), exc))
    return (totals, date)
def gen_data(path, pages):
    """Assemble the extracted-field dict for one statement PDF.

    :param path: path to the PDF file
    :param pages: dict mapping section names ("main_page", "Item_L_detail")
        to lists of 1-based page numbers
    :return: dict of extracted fields, merged with the box-detail totals
    """
    doc = pdfquery.PDFQuery(path)
    doc.load(pages["main_page"][0] - 1)
    data = {
        "year": get_cy(doc),
        "fund_ein": get_fund_ein(doc),
        "fund_name": get_fund_name(doc),
        "partner_ein": get_partner_ein(doc),
        "beginning_ca": get_beginning_ca(doc),
        "capital_cont": get_cap_contr(doc),
        "cy_increase": get_cy_increase(doc),
        "withdrawls": get_withdrawals(doc),
        "ending_ca": get_ending_ca(doc),
    }
    box_data = get_box_detail(doc)
    # The Item L detail lives on a different page; reload the document there.
    item_l_page = pages['Item_L_detail'][0] - 1
    doc = pdfquery.PDFQuery(path)
    doc.load(item_l_page)
    data['summary_income_loss_item_l'] = summary_income_loss_item_l(doc, item_l_page)
    data['less_deductions_item_l'] = less_deductions_item_l(doc, item_l_page)
    # Fold the "other" categories into the box totals, then merge everything.
    box_data = calculate_other_income(data, box_data)
    box_data = calculate_other_deductions(data, box_data)
    data.update(box_data)
    return data
def pdfquery_FindText(filenamme, words, offset):
    """Record, for each search word, every page of the PDF containing it.

    :param filenamme: path of the PDF file to search
    :param words: iterable of strings to look for
    :param offset: value added to each 0-based page index before recording
    :return: defaultdict mapping word -> PreallocatedList of page numbers
    """
    start_time = time.time()
    selectors = [f'LTTextLineHorizontal:contains("{word}")' for word in words]
    res = defaultdict(lambda: PreallocatedList(1000, int))
    pdf = pdfquery.PDFQuery(filenamme, parse_tree_cacher=FileCache("tmp/"))
    page_num = 0
    while True:
        try:
            pdf.load(page_num)
        except StopIteration:
            # pdfquery signals "past the last page" this way.
            break
        for selector, word in zip(selectors, words):
            if pdf.pq(selector):
                res[word].append(page_num + offset)
        page_num += 1
        if page_num % 30 == 0:
            # Periodically drop the parser and force a GC pass so memory
            # stays bounded on large documents.
            del pdf
            collected = gc.collect()
            print(f"Garbage collector: collected {collected} objects.")
            pdf = pdfquery.PDFQuery(filenamme, parse_tree_cacher=FileCache("tmp/"))
    print("--- Batch futási idő: %s másodperc ---" % (time.time() - start_time))
    return res
def test_annot_dereferencing(self):
    """Regression test for issues #37 and #42: annotation references in
    these samples must dereference without raising during load()."""
    for sample in ("tests/samples/bug37.pdf", "tests/samples/bug42.pdf"):
        pdf = pdfquery.PDFQuery(sample)
        pdf.load()
def convert(filename, filepath=""):
    """Convert a PDF into a pretty-printed XML dump of its pdfquery tree.

    The XML file is written alongside the source, named after the PDF with
    the ".pdf" suffix replaced by ".xml".

    :param filename: name of the PDF file (e.g. "report.pdf")
    :param filepath: optional directory containing the file; when empty,
        ``filename`` is treated as a path relative to the CWD
    """
    try:
        # Build source/target paths once instead of duplicating the whole
        # conversion in two nearly identical branches.
        source = filepath + "/" + filename if filepath else filename
        base = filename.replace(".pdf", "")
        target = '{}/{}.xml'.format(filepath, base) if filepath else '{}.xml'.format(base)
        pdf = pdfquery.PDFQuery(source)
        pdf.load()
        with open(target, 'wb') as f:
            f.write(etree.tostring(pdf.tree, pretty_print=True))
    except Exception:
        # Best-effort: report and continue.  Was a bare `except:`, which
        # would also have swallowed KeyboardInterrupt/SystemExit.
        print(traceback.format_exc())
def read_cordinates1(path):
    """Read the text and bounding-box coordinates of every
    LTTextLineHorizontal element on every page of a PDF.

    :param path: path to the PDF file
    :return: tuple ``(PagePosDict, PageDict, page_num)`` where
        PagePosDict maps (page, x0, y0, x1, y1) -> line text,
        PageDict maps page index -> list of line texts, and
        page_num holds one page-index entry per extracted line.
    """
    PagePosDict = defaultdict()
    page_num = []
    PageDict = defaultdict()
    pdf = pdfquery.PDFQuery(path)
    # BUGFIX: the original opened the file without ever closing it; the
    # page count is all we need, so read it inside a context manager.
    with open(path, 'rb') as fh:
        pages = PdfFileReader(fh).getNumPages()
    for i in range(0, pages):
        try:
            pdf.load(i)
            print(i)
            JQuery = pdf.pq('LTPage')
            for j in JQuery("LTTextLineHorizontal"):
                text = JQuery(j).text()
                # setdefault replaces the original try/except KeyError dance.
                PageDict.setdefault(i, []).append(text)
                page_num.append(i)
                cord = JQuery(j).attr('bbox')
                for ch in ['[', ']']:
                    cord = cord.replace(ch, '')
                # Key: page index followed by the four bbox floats.
                cordinates = [i] + [float(a) for a in cord.split(', ')]
                PagePosDict[tuple(cordinates)] = text
        except Exception:
            # Skip pages that fail to parse rather than aborting the scan.
            continue
    return PagePosDict, PageDict, page_num
def read_cordinates12(path, max_pages=5):
    """Read the text and bounding-box coordinates of each
    LTTextBoxHorizontal on the first ``max_pages`` pages of a PDF.

    Generalized from a hard-coded 5-page limit; calling with one argument
    behaves exactly as before.

    :param path: path to the PDF file
    :param max_pages: number of leading pages to scan (default 5)
    :return: dict mapping (page, x0, y0, x1, y1) -> box text
    """
    PagePosDict = defaultdict()
    pdf = pdfquery.PDFQuery(path)
    for i in range(0, max_pages):
        try:
            pdf.load(i)
            print(i)
            JQuery = pdf.pq('LTPage')
            for j in JQuery("LTTextBoxHorizontal"):
                cord = JQuery(j).attr('bbox')
                for ch in ['[', ']']:
                    cord = cord.replace(ch, '')
                # Key: page index followed by the four bbox floats.
                cordinates = [i] + [float(a) for a in cord.split(', ')]
                PagePosDict[tuple(cordinates)] = JQuery(j).text()
        except Exception:
            # Skip pages that fail to parse (including pages past the end).
            continue
    return PagePosDict
def check_regexs(self, regexs, search_extensions, enable_pdf):
    """Checks the file for matching regular expressions: if a ZIP then each file in the ZIP (recursively) or the text in a document

    Dispatches on self.type ('ZIP', 'TEXT', 'SPECIAL') and records any
    failure via self.set_error() instead of raising, so a single bad file
    never aborts a scan.  Returns self.matches in every case.
    """
    if self.type == 'ZIP':
        try:
            # .docx is a ZIP container; extract its text first, then scan
            # the archive members as well.
            if get_ext(self.path) == '.docx':
                doctext = docx2txt.process(self.path)
                self.check_text_regexs(doctext, regexs, '')
            if zipfile.is_zipfile(self.path):
                zf = zipfile.ZipFile(self.path)
                self.check_zip_regexs(zf, regexs, search_extensions, enable_pdf, '')
            else:
                self.set_error('Invalid ZIP file')
        except IOError:
            self.set_error(sys.exc_info()[1])
        except:
            # NOTE(review): bare except — also swallows KeyboardInterrupt;
            # kept as-is because set_error() is the established failure path.
            self.set_error(sys.exc_info()[1])
    elif self.type == 'TEXT':
        try:
            file_text = read_file(self.path, 'rb')
            self.check_text_regexs(file_text, regexs, '')
        except WindowsError:
            # WindowsError only exists on Windows; on other platforms this
            # clause would itself raise NameError if reached — presumably
            # this scanner targets Windows.  TODO confirm.
            self.set_error(sys.exc_info()[1])
        except IOError:
            self.set_error(sys.exc_info()[1])
        except:
            self.set_error(sys.exc_info()[1])
    elif self.type == 'SPECIAL':
        # Outlook .msg files.
        if get_ext(self.path) == '.msg':
            try:
                msg = msmsg.MSMSG(self.path)
                if msg.validMSG:
                    self.check_msg_regexs(msg, regexs, search_extensions, enable_pdf, '')
                else:
                    self.set_error('Invalid MSG file')
                msg.close()
            except IOError:
                self.set_error(sys.exc_info()[1])
            except:
                self.set_error(sys.exc_info()[1])
        # PDF scanning is optional (pdfquery parsing is expensive).
        if enable_pdf:
            if get_ext(self.path) == '.pdf':
                try:
                    pdf = pdfquery.PDFQuery(self.path)
                    pdf.load()
                    self.check_pdf_regexs(pdf, regexs, '')
                except:
                    self.set_error(sys.exc_info()[1])
        # Access databases.
        if get_ext(self.path) == '.mdb':
            try:
                self.check_access_regexs(self.path, 'mdb', regexs)
            except:
                self.set_error(sys.exc_info()[1])
    return self.matches
def process_pdf(pdf_link, filename, folder):
    """Extract the alert date and all http(s) story links from a news-alert PDF.

    :param pdf_link: source passed straight to PDFQuery (path or file-like)
    :param filename: unused; kept for interface compatibility
    :param folder: unused; kept for interface compatibility
    :return: DataFrame with columns ['feed-date', 'link']
    """
    print(pdf_link)
    pdf = pdfquery.PDFQuery(pdf_link)
    pdf.load()
    pdf_holder = []
    pages = pdf.pq('LTPage')
    # The date sits in the 3rd text line of the first text box on page 1,
    # after a "Label: ..." prefix; keep words 2-4 of what follows the colon.
    date_label = pdf.pq(
        pdf.pq(pdf.pq(pages[0])('LTTextBoxHorizontal')[0])(
            'LTTextLineHorizontal')[2]).text()
    date_label = date_label[date_label.find(':') + 1:].strip()
    date_label = ' '.join(date_label.split()[1:4])
    print(date_label)
    print("--------")
    # BUGFIX: the original reused loop variable `i` for both the page loop
    # and the URL loop (shadowing); neither index was actually needed.
    # Unused locals `as_i` and `news_alert_date` were removed.
    for p in pages:
        page = pdf.pq(p)
        for url in page('Annot'):
            url = str(pdf.pq(url))
            # The target URL is embedded as "...url=<target>&..." in the
            # annotation's string representation.
            formatted = url[url.find('url=') + len('url='):]
            formatted = formatted[:formatted.find('&')]
            link_to_story = formatted
            if link_to_story[0:4] == "http":
                pdf_holder.append([date_label, link_to_story])
    df = pd.DataFrame(pdf_holder, columns=['feed-date', 'link'])
    return (df)
def add_all_options_of_one_trim_to_file(range_begin, range_end):
    """For each window-sticker PDF numbered in [range_begin, range_end),
    append its optional-equipment lines and its equipment-group line to the
    module-level `options_file`, then close that file.
    """
    line_step = 9.1   # vertical distance between consecutive option lines
    box_height = 9    # height of the bbox used to grab a single line
    for sticker_idx in range(range_begin, range_end):
        print(sticker_idx)
        pdf = pdfquery.PDFQuery('windowsticker (%d).pdf' % sticker_idx,
                                parse_tree_cacher=FileCache("/tmp/"))
        pdf.load()
        equipment_group_label = pdf.pq('LTTextLineHorizontal:contains("EQUIPMENT GROUP")')
        options_label = pdf.pq('LTTextLineHorizontal:contains("OPTIONAL EQUIPMENT")')
        bottom_corner_equip_group = float(equipment_group_label.attr('y0'))
        left_corner = float(options_label.attr('x0'))
        bottom_corner_options = float(options_label.attr('y0'))
        offset = 0
        # Walk downward one line at a time until an empty read signals the
        # end of the options list.
        while True:
            options = pdf.pq('LTTextLineHorizontal:in_bbox("%s, %s, %s, %s")' % (
                left_corner,
                bottom_corner_options - box_height - offset,
                left_corner + 300,
                bottom_corner_options - offset)).text()
            if options == "":
                break
            options_file.write(options + '\n')
            offset += line_step
        equipment = pdf.pq('LTTextLineHorizontal:in_bbox("%s, %s, %s, %s")' % (
            left_corner,
            bottom_corner_equip_group,
            left_corner + 300,
            bottom_corner_equip_group + 9)).text()
        options_file.write(equipment + '\n')
    options_file.close()
def setUpClass(cls):
    """Load the shared sample PDF once for the whole test class.

    A FileCache-backed parse tree is used only when the test run was
    invoked with a 'cache' command-line argument.
    """
    # BUGFIX: guard the argv lookup — the original indexed sys.argv[1]
    # unconditionally and raised IndexError when the runner was started
    # without extra arguments.
    use_cache = len(sys.argv) > 1 and sys.argv[1] == 'cache'
    cls.pdf = pdfquery.PDFQuery(
        "tests/samples/IRS_1040A.pdf",
        parse_tree_cacher=FileCache("/tmp/") if use_cache else None,
    )
    cls.pdf.load()
def parse(self, response):
    """Parse a PDF response: extract its horizontal text lines and yield a FaqItem.

    NOTE(review): the FaqItem is currently yielded empty — the extracted
    text is only printed.  Populate its fields before relying on the item.
    """
    self.logger.info('Getting faq at %s', response.url)
    # Feed the PDF bytes to pdfquery through an in-memory buffer instead
    # of a temporary file (dead commented-out temp-file code removed).
    data = io.BytesIO()
    data.write(response.body)
    data.seek(0)  # rewind so the parser reads from the start of the buffer
    pdf = pdfquery.PDFQuery(data)
    pdf.load()
    text = pdf.tree.xpath('//LTTextLineHorizontal//text()')
    print(text)
    faqItem = FaqItem()
    yield faqItem
def getxml(filename):
    """Dump the pdfquery element tree of ``<filename>.pdf`` to ``<filename>.xml``.

    :param filename: path stem without extension
    """
    pdf = pdfquery.PDFQuery(filename + '.pdf')
    pdf.load()
    tree_root = pdf.tree
    # BUGFIX: etree.tostring returns bytes, so the output must be opened in
    # binary mode — text mode 'w' raised TypeError on Python 3.
    with open(filename + '.xml', 'wb') as f:
        f.write(etree.tostring(tree_root, pretty_print=True))
def processRequest(req):
    """Handle a "searchPDF" webhook request: return up to 5 lines of
    rms-160-rn.pdf containing the requested text, with their page numbers.

    :param req: parsed webhook JSON payload
    :return: {} when the request is not a searchPDF action or carries no
        searchText; otherwise the response built by makeResult().
    """
    if req.get("result").get("action") != "searchPDF":
        return {}
    result = req.get("result")
    parameters = result.get("parameters")
    searchText = parameters.get("searchText")
    if searchText is None:
        return {}
    strTxt = 'LTTextLineHorizontal:contains("' + str(searchText) + '")'
    from pdfquery.cache import FileCache
    pdf = pdfquery.PDFQuery("rms-160-rn.pdf", parse_tree_cacher=FileCache("tmp/"))
    pdf.load()
    results = []
    # enumerate replaces the manual `count` bookkeeping; cap at 5 matches.
    for count, pq1 in enumerate(pdf.pq(strTxt), 1):
        # Use just the first LTPage ancestor for the page number.
        # BUGFIX: next(...) builtin instead of the Python-2-only .next()
        # method, so this also runs on Python 3.
        page_pq = next(pq1.iterancestors('LTPage'))
        if pq1.text is None:
            # Text lives on the first child element instead.
            results.append({
                "page#": page_pq.get("pageid"),
                "txt": pq1[0].text
            })
        else:
            results.append({"page#": page_pq.get("pageid"), "txt": pq1.text})
        if count == 5:
            break
    res = makeResult(results)
    return res
def read_cordinates(path, page_no=None):
    """Read the text and bounding-box coordinates of each
    LTTextBoxHorizontal on the requested pages of a PDF.

    Pages whose text contains 'Service of Process Transmittal' are skipped.
    On any error, whatever was collected so far is returned (best-effort).

    :param path: path to the PDF file
    :param page_no: iterable of 0-based page indices; defaults to the
        first six pages
    :return: dict mapping (page, x0, y0, x1, y1) -> box text
    """
    import pdfquery
    from collections import defaultdict
    PagePosDict = defaultdict()
    pdf = pdfquery.PDFQuery(path)
    # `is None` instead of `== None`; the unused `global PLAINTIFF,
    # DEFENDANT` declaration was removed (neither name was referenced).
    if page_no is None:
        page_no = range(6)
    try:
        for i in page_no:
            pdf.load(i)
            JQuery = pdf.pq('LTPage')
            if JQuery.text().find('Service of Process Transmittal') >= 0:
                continue
            for j in JQuery("LTTextBoxHorizontal"):
                cord = JQuery(j).attr('bbox')
                for ch in ['[', ']']:
                    cord = cord.replace(ch, '')
                # Key: page index followed by the four bbox floats.
                cordinates = [i] + [float(a) for a in cord.split(', ')]
                PagePosDict[tuple(cordinates)] = JQuery(j).text()
    except Exception:
        # Best-effort: return everything gathered before the failure.
        return (PagePosDict)
    return (PagePosDict)
def test_unicode_text(self):
    """Non-ASCII text in bug18.pdf must survive the parse round-trip."""
    expected = (u'5 Hop Hing Oils and Fats (Hong Kong) Ltd \uf06c '
                u'\u7279\u5bf6\u7cbe\u88fd\u8c6c\u6cb9')
    pdf = pdfquery.PDFQuery("tests/samples/bug18.pdf")
    pdf.load()
    actual = pdf.pq('LTTextLineHorizontal:contains("Hop Hing Oils")').text()
    self.assertEqual(actual, expected)
def test_xml_conversion(self):
    """Converted XML for bug28.pdf must match the saved reference output."""
    sample = pdfquery.PDFQuery("tests/samples/bug28.pdf")
    sample.load()
    self.assertValidOutput(sample, "bug28_output")
def getUser_FromPdf(self, pdfUrl):
    """Download a PDF and extract its Model / username / password fields,
    appending them as a dict to self.listPdf.

    :param pdfUrl: URL of the PDF to fetch
    :return: None
    """
    # Download to a local temp file; the context manager and try/finally
    # guarantee both handles are closed even on error.
    web_file = urllib.request.urlopen(pdfUrl)
    try:
        with open('tempPdfFile.pdf', 'wb') as local_file:
            local_file.write(web_file.read())
    finally:
        web_file.close()
    pdf = pdfquery.PDFQuery("tempPdfFile.pdf")
    pdf.load()
    model = pdf.pq(
        'LTTextLineHorizontal:contains("Model")').text().replace(
            "Model", "")
    # BUGFIX: .text() was chained twice for username/password in the
    # original, raising AttributeError ('str' object has no attribute
    # 'text') — .text() already returns a plain string.
    userName = pdf.pq(
        'LTTextLineHorizontal:contains("username")').text().replace(
            "username", "")
    password = pdf.pq(
        'LTTextLineHorizontal:contains("password")').text().replace(
            "password", "")
    self.listPdf.append({
        'Model': model,
        'Username': userName,
        'Password': password
    })
def main():
    """Entry point: convert a PDF (argv[1]) into an XML dump (argv[2] + ".xml").

    :return: 0 on completion or on a file-open failure; None on usage error
        (matching the original control flow).
    """
    # Number of arguments must be 2, otherwise a usage message is shown.
    if len(sys.argv) < 3:
        # print(...) with a single argument behaves identically on
        # Python 2 and 3 (the original used Py2 print statements).
        print("\nUsage : python pdf_extract <file_name>.pdf <output_file>")
        print("Note : XML file will be created. No need to put .xml extension in output file name\n")
    else:
        try:
            # Takes the PDF file from command line argument
            pdf = pdfquery.PDFQuery(sys.argv[1])
        except Exception:
            # Narrowed from a bare except so Ctrl-C still works.
            print("File doesn't exists in the specified directory!")
            return 0
        # Loads page 0 into memory.
        # Use pdf.load() to load the entire file, pdf.load(1, 3, 5) for
        # selected pages.
        pdf.load(0)
        # Outputs the entire contents into the output file; use jQuery-style
        # selectors on the tree to extract data afterwards.
        pdf.tree.write(str(sys.argv[2] + ".xml"), pretty_print=True)
        return 0
def pdf_link2soup(self, link):
    """Download a PDF from `link`, convert it to XML via pdfquery, and
    return the XML parsed into a BeautifulSoup object.

    The XML is written to a per-link path derived from the link's md5 so
    repeated conversions of the same link reuse the same file name.
    """
    xml_path = '%s-%s' % (self.xml_tmp_path, md5(link).hexdigest())
    # Link -> PDF.  Context managers close the handles deterministically
    # (the original leaked both the PDF and XML file objects).
    pdf_content = urlopen(link).read()
    with open(self.pdf_tmp_path, 'wb') as f:
        f.write(pdf_content)
    # PDF -> XML
    # BUGFIX: merge_tags=('LTChar') is just the string 'LTChar' — the
    # parentheses without a trailing comma do not make a tuple.  pdfquery
    # expects a sequence of tag names here.
    pdf = pdfquery.PDFQuery(self.pdf_tmp_path,
                            merge_tags=('LTChar',),
                            round_floats=True,
                            round_digits=3,
                            input_text_formatter=None,
                            normalize_spaces=False,
                            resort=True,
                            parse_tree_cacher=None,
                            laparams={
                                'all_texts': True,
                                'detect_vertical': False
                            })
    pdf.load()
    pdf.tree.write(xml_path)
    # XML -> Soup
    with open(xml_path, 'r') as f:
        xml_content = f.read()
    return BeautifulSoup(xml_content, 'xml')
def extract(fileName, table_out):
    """Write one markdown table per entry in table_out, listing each
    professor's "total de aulas" value scraped from their PDF.

    :param fileName: list of per-report file-name suffixes
    :param table_out: list of output table names (one per suffix)
    """
    outs = [
        open('../out/' + f + '.md', 'w', encoding='utf-8') for f in table_out
    ]
    # Markdown table headers.
    for f in outs:
        print('| Nome | total de aulas |', file=f)
        print('| :---- | :---- |', file=f)
    progress = 0
    d_progress = 100 / len(profs)
    for prof in profs:
        for idx, suffix in enumerate(fileName):
            try:
                pdf = pdfquery.PDFQuery('../out/Docentes/' + prof.name + suffix)
            except FileNotFoundError:
                # Missing report: note it both on stdout and in the table.
                print(
                    f'{prof.name} não possui informaçao de {suffix[1:-3]}'
                )
                print(
                    f'| {prof.name} | Nao possui arquivo {suffix[1:-3]}|',
                    file=outs[idx])
                continue
            pdf.load(0)
            # Grab the "total de aulas" cell; the extracted string is
            # doubled by the formatter, so keep only its first half.
            extracted = pdf.extract([('with_formatter', 'text'),
                                     ('total de aulas', f':in_bbox("{square(100, 131, 137, 81)}")')])
            total = extracted["total de aulas"]
            print(
                f'| {prof.name} | {total[:len(total) // 2]} |',
                file=outs[idx])
        progress += d_progress
        print("%.2f %%" % progress)
    for f in outs:
        f.close()
def single_file_coordinates(pdf_menu_file):
    """Extract item text and coordinates for every page of a menu PDF.

    :param pdf_menu_file: path to the PDF file
    :return: DataFrame with columns
        ['items', 'height', 'x0', 'x1', 'y0', 'y1', 'page_num']
    """
    # create tree of elements
    menu_tree = pdfquery.PDFQuery(pdf_menu_file)
    menu_tree.load()
    # number of pages in the pdf_menu_file
    num_pages = len(menu_tree.tree.xpath('//*/LTPage'))
    print('number of pages', num_pages)
    menu_pd = pd.DataFrame(
        columns=['items', 'height', 'x0', 'x1', 'y0', 'y1', 'page_num'])
    for page_num in range(1, num_pages + 1):
        selector = '//LTPage[@pageid = "' + str(page_num) + '"]//*'
        treeExtract = menu_tree.tree.xpath(selector)
        menu_pd_page = single_page_coordinates(treeExtract, menu_tree)
        # BUGFIX: the original's else-branch reset menu_pd to a brand-new
        # empty DataFrame whenever a page had no rows, discarding every
        # previously collected page.  Empty pages are now simply skipped.
        if menu_pd_page.shape[0] != 0:
            menu_pd_page['page_num'] = page_num
            menu_pd = pd.concat([menu_pd, menu_pd_page])
    return menu_pd
def load(self, file):
    """Load the given PDF into self.pdf, remembering the path on self.file.

    :param file: path to the PDF (backslash-separated on Windows)
    """
    self.file = file
    # Display just the final path component (as a one-element list repr,
    # matching the original output).
    print('Loading %s' % str(self.file.split('\\')[-1:]))
    self.pdf = pdfquery.PDFQuery(self.file)
    # Load the whole document.  The unused page_count computation and the
    # commented-out page-range load it supported were removed.
    self.pdf.load()
def get_part_3():
    """Load every PDF listed in the module-level `part_3` and return the
    loaded PDFQuery objects, using a FileCache-backed parse tree."""
    loaded = []
    for path in part_3:
        doc = pdfquery.PDFQuery(path, parse_tree_cacher=FileCache("./tmp/"))
        doc.load()
        loaded.append(doc)
    print("Finished getting part 3")
    return loaded
def main():
    """Invoice extractor entry point (Python 2).

    Opens the PDF named by argv[1]; with a second argument, redirects all
    output to "<pdf>.log.txt".  Each page is matched against the invoice
    identifiers and either parsed into an Invoice or recorded as unknown,
    then everything is rendered via printToPDF().
    """
    #1. Loads up the PDF and gets the number of pages
    if len(sys.argv) >= 2:
        pdf_name = sys.argv[1]
        if len(sys.argv) > 2:
            # Second argument present: tee everything to a log file.
            log_name = pdf_name + ".log.txt"
            print "Check Logfile (" + log_name + ")"
            sys.stdout = open(log_name, 'w')
            print "About to open " + str(pdf_name)
            sys.stdout.flush()
    else:
        #pdf_name = "binder_combined copy.pdf"
        exit("Error: Please Input a PDF")
    pdf = pdfquery.PDFQuery(pdf_name)
    # Total page count from the PDF catalog.
    pdf_count = pdf.doc.catalog['Pages'].resolve()['Count']
    #2. Create Invoices Object, which A) holds an array of invoices
    # and B) holds
    invoices = Invoices()
    count = 0
    for page in range(pdf_count):
        # Py2 trailing comma: print without a newline.
        print "About to load " + str(page + 1),
        sys.stdout.flush()
        try:
            pdf.load(page)
        except:
            # NOTE(review): bare except — any load failure (including
            # Ctrl-C) marks the page unknown and moves on.
            print "ERROR: Couldn't load page " + str(page + 1)
            sys.stdout.flush()
            invoices.addUnknownInvoicePage()
            continue
        # Try each registered identifier string against the page text.
        identifiers = invoices.getIdentifiers()
        foundPage = False
        for identifier in identifiers:
            pdf_id = pdf.pq('LTTextLineHorizontal:contains("' + str(identifier) + '")')
            if (pdf_id):
                print identifier,
                searchFunction = identifiers[identifier]
                #Call the search function with a blank invoice Object
                blank_obj = Invoice()
                obj = searchFunction(blank_obj, pdf)
                # NOTE(review): searchFunction is invoked a second time
                # here with a fresh Invoice — the page is parsed twice and
                # `obj` (printed below) is not the instance stored.  Looks
                # unintentional; confirm before relying on either copy.
                invoices.add(page, searchFunction(Invoice(), pdf))
                #print "Invoice:",obj.num,"PO:",obj.po,"JOB:",obj.job
                obj.printInvoice()
                foundPage = True
                break
        if (not foundPage):
            invoices.addUnknownInvoicePage()
        print ""
    printToPDF(pdf_name, invoices)
def parse_data(filename="../data/input/4_ini_3.pdf", page=1, label="Kalenderwoche:"):
    """Print the text of the LTTextBoxHorizontal containing `label` on one
    page of a PDF.

    Generalized from hard-coded values; calling parse_data() with no
    arguments behaves exactly as before.

    :param filename: path to the PDF file
    :param page: 0-based page index to load
    :param label: text to search for inside the page's text boxes
    """
    pdf = pdfquery.PDFQuery(filename)
    pdf.load(page)
    print(
        pdf.pq('LTTextBoxHorizontal:contains("' + label + '")').text())
def load_file(file):
    """Open `file` with pdfquery; files listed in PAGE_ANOMALIES load only
    their registered page selection, all others load fully.

    :param file: path to the PDF file
    :return: the loaded PDFQuery object
    """
    global PAGE_ANOMALIES
    pdf = pdfquery.PDFQuery(file)
    if file not in PAGE_ANOMALIES:
        pdf.load()
    else:
        # Known-problematic file: restrict to the registered pages.
        pdf.load(PAGE_ANOMALIES[file])
    LOG.print("\tFile loaded")
    return pdf
def __init__(self, pdfpath, pdf=None):
    """Base statement: keep the path, load the PDF when one was not
    supplied, and initialise the parsed-field slots to None."""
    self.logger = logging.getLogger("hsbcpdf.helpers.basestatement")
    self.pdfpath = pdfpath
    if pdf is None:
        # No pre-loaded document supplied — open and parse it ourselves.
        pdf = pdfquery.PDFQuery(pdfpath)
        pdf.load()
    self.pdf = pdf
    # Filled in later by the scraping methods.
    self.page_height = None
    self.page_width = None
    self.account_number = None
    self.st_date = None
def extract_text_if_valid(filename):
    """Parses text from a pdf if it is valid; otherwise, returns an empty
    string.
    """
    if filename.endswith('.pdf'):
        # Keep space normalization and resorting off so distinct text
        # blocks on the same line stay separated.
        pdf = pdfquery.PDFQuery(filename, normalize_spaces=False, resort=False)
        pdf.load()
        return extract_transactions(pdf)
    print('Sorry, only PDF files are supported for text extraction.')
    return ""
def get_scraper(cls, pdfpath, pdf=None):
    """Return an instance of the first registered scraper whose bank and
    type probes both match the given PDF.

    :param pdfpath: path of the statement PDF
    :param pdf: optional pre-loaded PDFQuery document; when omitted the
        file is opened and loaded here
    :raises ScraperException: when pdfpath does not exist or is not a file
    :return: a scraper instance, or None when no registered scraper matches
    """
    if not os.path.exists(pdfpath):
        raise ScraperException(f'"{pdfpath}" file not found')
    if not os.path.isfile(pdfpath):
        raise ScraperException(f'"{pdfpath}" not a file')
    # BUGFIX: the original unconditionally re-parsed the file, silently
    # discarding a caller-supplied pdf (cf. the __init__ that honours it).
    if pdf is None:
        pdf = pdfquery.PDFQuery(pdfpath)
        pdf.load()
    for s in cls._scrapers:
        if s.probe_bank(pdf) and s.probe_type(pdf):
            logger.debug("pdf file matches {}.{}".format(
                s.st_bank, s.st_type))
            return s(pdfpath, pdf)