def parse_pdf(file_obj):
    pdf = pdfplumber.load(file_obj)
    checks = pd.concat(list(map(parse_page, pdf.pages)))\
        .reset_index(drop=True)
    return checks[checks["state"] != "Totals"]
def extract_data(feed): data = "" with pdfplumber.load(feed) as pdf: pages = pdf.pages for p in pages: data = data + p.extract_text() return data # build more code to return a dataframe
def uploaded_pdf_to_text(uploaded_file):
    doc = []
    pdf = pdfplumber.load(uploaded_file)
    for page in pdf.pages:
        doc.append(page.extract_text())
    text_lookup_res = '\n'.join(doc)
    return text_lookup_res
def extract_data(feed):
    data = []
    with pdfplumber.load(feed) as pdf:
        pages = pdf.pages
        for p in pages:
            data.append(p.extract_text())
    return data
def extractDataFromPDFFile(pdfSource, x_coord, y_coord):
    text = ''
    with pdfplumber.load(pdfSource) as pdf:
        for page in pdf.pages:
            text1 = page.extract_text(x_tolerance=x_coord, y_tolerance=y_coord)
            # extract_text() may return None for pages without text
            text = text + (text1 or '')
    return text
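The tolerance arguments used above deserve a quick illustration: pdfplumber merges characters into words and lines based on x_tolerance and y_tolerance, so loosening them joins widely spaced characters that a tight setting would split. A minimal sketch under that assumption ("sample.pdf" is a placeholder path, and pdfplumber.open stands in for the older load()):

import pdfplumber

with pdfplumber.open("sample.pdf") as pdf:  # placeholder path
    page = pdf.pages[0]
    # Characters farther apart than x_tolerance are split into separate words;
    # lines farther apart than y_tolerance are treated as separate lines.
    tight = page.extract_text(x_tolerance=1, y_tolerance=1)
    loose = page.extract_text(x_tolerance=5, y_tolerance=5)
    print(tight == loose)  # often False for loosely kerned PDFs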
def main():
    args = parse_args()
    pdf = pdfplumber.load(args.infile, pages=args.pages)
    if args.format == "csv":
        to_csv(pdf, args.types, args.encoding)
    else:
        to_json(pdf, args.types, args.encoding)
def extractDataFromPDFFile(pdfSource):
    text = ''
    with pdfplumber.load(pdfSource) as pdf:
        for page in pdf.pages:
            text1 = page.extract_text()
            # extract_text() may return None for pages without text
            text = text + (text1 or '')
    return text
def _load_file(self):
    self.viewer.clear()
    path = self.paths[self.pathidx]
    filename = os.path.basename(path)
    try:
        if filename.split('.')[-1].lower() in ['jpg', 'png']:
            image = Image.open(path)
            pdf = io.BytesIO(
                pytesseract.image_to_pdf_or_hocr(image, extension='pdf'))
            self.pdf = pdfplumber.load(pdf)
        else:
            self.pdf = pdfplumber.open(path)
        self.viewer.display_pdf(self.pdf)
        self.doc_label.configure(
            text="{} of {}".format(self.pathidx + 1, len(self.paths)))
        self.logger.clear()
        self.logger.log("Showing invoice '{}'".format(path))
    except WandException:
        result = messagebox.askokcancel(
            "Error",
            "ImageMagick Policy Error! Should InvoiceNet try to fix the error?")
        if result:
            result = self._fix_policy_error()
        if result:
            messagebox.showinfo(
                "Policy Fixed!",
                "ImageMagick Policy Error fixed! Restart InvoiceNet.")
        else:
            messagebox.showerror(
                "ImageMagick Policy Error",
                "Could not fix ImageMagick policy. Rejecting the current pdf file!")
    except (IndexError, IOError, TypeError):
        pass
def parse_pdf(file_obj):
    pdf = pdfplumber.load(file_obj)
    # Note: As of Nov. 2019 file, first page is documentation
    checks_gen = map(parse_page, pdf.pages[1:])
    checks = pd.concat(checks_gen).reset_index(drop=True)
    return checks[checks["state"] != "Totals"]
def test_loading_fileobj(self):
    path = os.path.join(HERE, "pdfs/nics-background-checks-2015-11.pdf")
    with open(path, "rb") as f:
        with pdfplumber.open(f) as pdf:
            assert len(pdf.metadata)

    # Will be removed from library soon
    with open(path, "rb") as f:
        with pdfplumber.load(f) as pdf:
            assert len(pdf.metadata)
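As the comment in this test notes, pdfplumber.load was deprecated and later removed from the library; pdfplumber.open accepts both filesystem paths and binary file objects, so it covers the old load() use case. A minimal sketch of the replacement ("example.pdf" is a placeholder path):

import pdfplumber

# pdfplumber.open takes a filesystem path...
with pdfplumber.open("example.pdf") as pdf:
    print(len(pdf.metadata))

# ...or an already-open binary file object, replacing pdfplumber.load(f).
with open("example.pdf", "rb") as f:
    with pdfplumber.open(f) as pdf:
        print(len(pdf.metadata))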
def main():
    parser = argparse.ArgumentParser(
        description='Extract financial lines from a PDF document')
    parser.add_argument('infile', type=argparse.FileType('rb'))
    args = parser.parse_args()
    pdf = pdfplumber.load(args.infile)
    rows = get_finances(pdf)
    for r in rows:
        print(r)
def extract_text(path):
    """Returns a generator object with a list of rows for each page.

    :param path: can be a URL or system path to a pdf file.

    Usage::

        If the file is small, you may be able to create one object
        containing all text in the pdf file:

        >>> pdf_pages = pdf_yeah.extract_text('https://www.nostarch.com/download/Automate_the_Boring_Stuff_sample_ch17.pdf')
        >>> pdf_full_text = [page for page in pdf_pages]

        Otherwise it's better to iterate through the generator like so:

        >>> import pdf_yeah
        >>> pdf_pages = pdf_yeah.extract_text('https://abc.xyz/investor/pdf/20160331_alphabet_10Q.pdf')
        >>> pg1 = next(pdf_pages)
        >>> print('\n'.join(pg1[:9]))
        UNITED STATES
        SECURITIES AND EXCHANGE COMMISSION
        Washington, D.C. 20549
        ________________________________________________________________________________________
        FORM 10-Q
        ________________________________________________________________________________________
        (Mark One)
        QUARTERLY REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934
        For the quarterly period ended March 31, 2016
    """
    if path.startswith('http'):
        r = requests.get(path)
        fp = io.BytesIO(r.content)
        pdf = pdfplumber.load(fp)
        for page in pdf.pages:
            yield page.extract_text().split('\n')
    else:
        with open(path, 'rb') as fp:
            pdf = pdfplumber.load(fp)
            for page in pdf.pages:
                yield page.extract_text().split('\n')
def __init__(self, datasheet_path):
    self.path = Path(datasheet_path)
    self.pdf_file = PyPDF3.PdfFileReader(self.path.open('rb'))
    self.plumber = pdfplumber.load(self.path.open('rb'))
    self.raw_outline = []
    self.tables, self.figures = {}, {}  # type: Dict
    self.table_of_content = DataSheetNode('ROOT', [0])
    self.table_root = DataSheetNode('TABLES', [-1])
    self.table_of_content.append(self.table_root)
    self.fallback_table: DataSheetTableNode = None
    self.flatten_outline()
    self.sort_raw_outline()
    self.collect_tables()
def extract_table_to_dfs(page):
    pdf_page = pdfplumber.load(page)
    tables = pdf_page.pages[0].extract_tables()
    dfs = []
    for table in tables:
        df = pd.DataFrame(table[1:], columns=table[0])
        number_of_not_nans = np.sum(df.count())
        if number_of_not_nans > 0:
            number_of_nans = df.isnull().sum().sum()
            percentage_of_nans = number_of_nans / (number_of_not_nans + number_of_nans)
            if percentage_of_nans <= NANS_THRESHOLD:
                dfs.append(df)
    return dfs
def extract_tables(self):
    data = []
    raw_table = []
    with pdfplumber.load(self.pdf) as pdf:
        pages = pdf.pages
        for p in pages:
            tbl = p.extract_tables()
            raw_table.append(tbl)
            # data.append(pd.DataFrame(tbl))
            if len(tbl):
                column_names = tbl[0].pop(0)
                df = pd.DataFrame(tbl[0], columns=column_names)
                data.append(df)
    self.tables = data
    return data
def parse_pdf(file_obj):
    pdf = pdfplumber.load(file_obj)
    rects = pd.DataFrame(pdf.rects)
    chars = pd.DataFrame(pdf.chars)

    # Find the leftmost side of the rectangles that appear on each page.
    rect_counts = rects["x0"].value_counts()
    edges = rect_counts[
        rect_counts == len(pdf.pages)
    ].sort_index().index
    edges = ((pd.Series(edges) / 2).round() * 2).drop_duplicates()

    # Use these edges to create boundaries, defining fields.
    bounds = list(zip(edges, edges[1:]))

    def parse_line(chars):
        fields = [
            "".join(get_between(chars, x0, x1)["text"])
            for x0, x1 in bounds
        ]
        parsed = list(map(parse_field, fields))
        return parsed

    def parse_page_chars(chars):
        c = chars[
            (chars["top"] >= DATA_START_TOP) &
            (chars["top"] < DATA_END_TOP)
        ]
        month = parse_month("".join(chars[
            (chars["size"] == 14.183) &
            (chars["top"] > 28)
        ]["text"]))
        data = c.groupby((c["doctop"] / 3).round()).apply(parse_line)
        df = pd.DataFrame([[month] + d for d in data], columns=COLUMNS)
        df.loc[(df["state"] == "llinois"), "state"] = "Illinois"
        try:
            validate_data(df)
        except:
            raise Exception("Invalid data for " + month)
        return df

    checks = pd.concat([
        parse_page_chars(chars[chars["pageid"] == p.pageid])
        for p in pdf.pages
    ]).reset_index(drop=True)

    return checks
def parse(self, content):
    """
    Parse pdf content, primarily using the pdfplumber module.

    :param content: pdf data read from memory (binary)
    :return: the extracted text as a string
    """
    # Load the pdf file (binary data)
    pdf = pdfplumber.load(content)
    targets = []  # collect the results
    # Go through the content of every page
    for page in pdf.pages:
        # Extract all text on the current page
        words = page.extract_text()
        # Clean up the content
        word = words.replace(' ', '').replace('\n', '')
        targets.append(word)
    # Close the pdf resource
    pdf.close()
    return ''.join(targets)
def new_get_auditor(url, page):
    '''get audit firm name by searching the regex pattern on a page'''
    rq = requests.get(url)
    if rq.status_code == 200:
        # logging.info('request success. start extracting text...')
        print('request success, loading pdf...')
        try:
            pdf = pdfplumber.load(BytesIO(rq.content))
            txt = pdf.pages[page].extract_text()
        except:
            logging.warning(f'Not a pdf file. Check {url}.')
            return None
        txt = re.sub("([^\x00-\x7F])+", "", txt)  # strip non-ASCII (e.g. Chinese) characters
        pattern = r'\n(?!.*?Institute.*?).*?(?P<auditor>.+?)(?:LLP\s*)?\s*((PRC.*?|Chinese.*?)?[Cc]ertified [Pp]ublic|[Cc]hartered) [Aa]ccountants'
        auditor = re.search(pattern, txt, flags=re.MULTILINE).group('auditor').strip()
        return auditor
def parse_pdf(file_obj):
    pdf = pdfplumber.load(file_obj, pandas=True)
    rects = pdf.rects
    chars = pdf.chars

    # Find the leftmost side of the rectangles that appear on each page.
    rect_counts = rects["x0"].value_counts()
    edges = rect_counts[
        rect_counts == len(pdf.pages)
    ].sort_index().index

    # Use these edges to create boundaries, defining fields.
    bounds = list(zip(edges, edges[1:]))

    def parse_line(chars):
        fields = [
            "".join(get_between(chars, x0, x1)["text"])
            for x0, x1 in bounds
        ]
        parsed = list(map(parse_field, fields))
        return parsed

    def parse_page_chars(chars):
        c = chars[
            (chars["top"] >= DATA_START_TOP) &
            (chars["top"] < DATA_END_TOP)
        ].sort_values(["doctop", "x0"])
        month = parse_month("".join(chars[
            (chars["size"] == 14.183) &
            (chars["top"] > 28)
        ]["text"]))
        data = c.groupby("doctop").apply(parse_line)
        return pd.DataFrame([[month] + d for d in data], columns=COLUMNS)

    checks = pd.concat([
        parse_page_chars(chars[chars["pageid"] == p.pageid])
        for p in pdf.pages
    ]).reset_index(drop=True)

    assert(len(checks) > 0)
    assert((checks.fillna(0).sum(axis=1) != (checks["totals"] * 2)).sum() == 0)
    return checks
def get_text(self, url):
    if url[-4:] == '.pdf':
        try:
            r = self.request('GET', url, stream=True,
                             timeout=self.options['timeout'], verify=False)
            if 200 == r.status_code and 'application/pdf' == r.headers['Content-Type']:
                with io.BytesIO() as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
                    pdf = pdfplumber.load(f)
                    text = ''
                    for i in range(0, len(pdf.pages)):
                        page = pdf.pages[i]
                        page_text = page.extract_text()
                        if isinstance(page_text, str):
                            text = text + page_text
                    pdf.close()
                    return text
        except Exception as e:
            self.alert(url + ' ' + str(e))
    else:
        try:
            r = self.request('GET', url, stream=True,
                             timeout=self.options['timeout'], verify=False)
            if 200 == r.status_code:
                return r.text
        except Exception as e:
            self.alert(url + ' ' + str(e))
def _run_ocr(self):
    if self.pdf is None:
        return

    pdf_pages = list()
    for page in self.pdf.pages:
        image = page.to_image(resolution=100)
        # myocr
        # text_strings, text_recs_alls = predict([image.original], language='chn')
        # boxes = [[(recs[0], recs[1]), (recs[4], recs[5])] for recs in text_recs_alls[0]][:-1]
        # imgdraw = ImageDraw.Draw(image.original)
        # for box in boxes:
        #     imgdraw.rectangle(box, width=2, outline='red')  # draw bounding boxes
        # boxes = [box.split(' ') for box in pytesseract.image_to_boxes(image.original).split('\n')][:-1]
        # imgcopy = image.original.copy()
        # imgdraw = ImageDraw.Draw(imgcopy)
        # for box in boxes:
        #     x1, y1, x2, y2 = int(box[1]), int(box[2]), int(box[3]), int(box[4])
        #     imgdraw.rectangle([(x1, y1), (x2, y2)], width=2, outline='red')
        pdf = pytesseract.image_to_pdf_or_hocr(image.original, extension='pdf')
        pdf_pages.append(pdf)

    pdf_writer = PyPDF2.PdfFileWriter()
    for page in pdf_pages:
        pdf = PyPDF2.PdfFileReader(io.BytesIO(page))
        pdf_writer.addPage(pdf.getPage(0))

    pdf = io.BytesIO()
    pdf_writer.write(pdf)

    self.pdf = pdfplumber.load(pdf)
    self.viewer.display_pdf(self.pdf)
def extract_data_lonseddel(feed):
    data_list = []
    with pdfplumber.load(feed) as pdf:
        page = pdf.pages[0]
        text = page.extract_text()
        for row in text.split('\n'):
            # st.write(row)
            if '1100' in row:
                global timer
                products_dict = {}
                text = row.split()[1]
                products_dict["Beskrivelse"] = text
                products_dict["Enheder"] = row.split()[-3]
                timer = row.split()[-3]
                products_dict["Sats"] = row.split()[-2]
                products_dict["Beløb"] = row.split()[-1]
                data_list.append(products_dict)
            if '1104' in row:
                products_dict = {}
                text_1 = row.split()[1]
                text_2 = row.split()[2]
                text = text_1 + " " + text_2
                products_dict["Beskrivelse"] = text
                products_dict["Enheder"] = row.split()[-3]
                products_dict["Sats"] = row.split()[-2]
                products_dict["Beløb"] = row.split()[-1]
                data_list.append(products_dict)
            if '1330' in row:
                products_dict = {}
                text_1 = row.split()[1]
                text_2 = row.split()[2]
                text_3 = row.split()[3]
                text_4 = row.split()[4]
                text = text_1 + " " + text_2 + " " + text_3 + " " + text_4
                products_dict["Beskrivelse"] = text
                products_dict["Enheder"] = row.split()[-3]
                products_dict["Sats"] = row.split()[-2]
                products_dict["Beløb"] = row.split()[-1]
                data_list.append(products_dict)
            if '3992' in row:
                products_dict = {}
                text_1 = row.split()[1]
                text_2 = row.split()[2]
                text_3 = row.split()[3]
                text_4 = row.split()[4]
                text_5 = row.split()[5]
                text = text_1 + " " + text_2 + " " + text_3 + " " + text_4 + " " + text_5
                products_dict["Beskrivelse"] = text
                products_dict["Enheder"] = row.split()[-3]
                products_dict["Sats"] = row.split()[-2]
                products_dict["Beløb"] = row.split()[-1]
                data_list.append(products_dict)
            if 'Overført til reg./konto' in row:
                global udbetaling
                if '8100' in row:
                    timer = row.split()[-2]
                    products_dict = {}
                    text_1 = row.split()[0]
                    text_2 = row.split()[1]
                    text_3 = row.split()[2]
                    text = text_1 + " " + text_2 + " " + text_3
                    products_dict["Beskrivelse"] = text
                    products_dict["Enheder"] = timer
                    products_dict["Sats"] = " "
                    udbetaling = row.split()[-1]
                    products_dict["Beløb"] = udbetaling
                    data_list.append(products_dict)
            if 'Lønseddel for perioden' in row:
                text_1 = row.split()
                # print(text_1)
                global start_dato, slut_dato, year_dato
                start_dato = str([' '.join(text_1[3:5])])[2:-2]
                slut_dato = str([' '.join(text_1[-3:-1])])[2:-2]
                year_dato = str([''.join(text_1[-1:])])[2:-2]
    return data_list  # build more code to return a dataframe
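The trailing comment above asks for a DataFrame. A minimal sketch of that follow-up, assuming pandas is available and relying on the fact that the row dicts built above share the same keys ("lonseddel.pdf" is a placeholder path):

import pandas as pd

with open("lonseddel.pdf", "rb") as f:  # placeholder path
    rows = extract_data_lonseddel(f)

# A list of dicts with identical keys converts directly into a DataFrame,
# one row per payslip line.
df = pd.DataFrame(rows, columns=["Beskrivelse", "Enheder", "Sats", "Beløb"])
print(df)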
((chars["fontname"] == "Arial") & (chars["size"] == 5.628)) ].copy() data = pdfplumber.utils.extract_columns(data_chars, x_tolerance=1, y_tolerance=1) if len(data.columns) == 6: data.columns = COLUMNS else: data.columns = [ "sivigila_code" ] + COLUMNS data = data.drop_duplicates().reset_index(drop=True) data[INT_COLS] = data[INT_COLS].astype(int) data["department"] = data["department"].str.strip().str.upper().apply(lambda x: DEPT_FIXES.get(x, x)) data["municipality"] = data["municipality"].str.strip().str.upper().apply(lambda x: MUNI_FIXES.get(x, x)) sums = data[INT_COLS].sum(axis=1) equalities = (sums == (data["zika_total"] * 2)).unique().tolist() assert(equalities == [ True ]) return data if __name__ == "__main__": import sys if hasattr(sys.stdin, "buffer"): buf = sys.stdin.buffer else: buf = sys.stdin pdf = pdfplumber.load(buf) data = parse(pdf) data.to_csv(sys.stdout, index=False, encoding="utf-8")
def receiveFile():
    print("Receiving File", flush=True)
    length = int(request.form['length'])
    fileNames = json.loads(request.form['fileNames'])
    returnJson = {}
    absaDocument = {}
    sentimentWordDocument = {}
    corpus = []
    corpusEntity = {}
    corpusRelation = []

    for i in range(length):
        file = request.files[f'file{i}']

        # Get filename
        fileName = fileNames[i]

        # Get file extension
        name, extension = os.path.splitext(fileName)
        print('POST SUCCESSFUL', fileName, flush=True)

        try:
            if extension == '.txt':
                byteString = file.read()
                encoding = chardet.detect(byteString)['encoding']
                text = byteString.decode(encoding)
            elif extension == '.pdf':
                text = ''
                with pdfplumber.load(file) as pdf:
                    for page in pdf.pages:
                        text += page.extract_text()
                text = re.sub('\\\\', '', text)

            tempJson = runAlice(text)
            absaChapter = tempJson['sentiment'][2]['absaChapter'].copy()
            sentimentWordChapter = tempJson['sentiment'][2]['sentimentWordChapter'].copy()
            absaDocument = absa_document_combined_c(absaDocument, absaChapter, name)
            sentimentWordDocument = entity_sentimentwords_document(
                sentimentWordDocument, sentimentWordChapter)
            returnJson[name] = tempJson

            tempEntity = tempJson['ner']['ents'].copy()
            for entity in tempEntity:
                key = entity['text'] + '_' + entity['type']
                if key in corpusEntity:
                    corpusEntity[key]['value'] += 1
                    corpusEntity[key]['documents'].add(name)
                else:
                    corpusEntity[key] = {
                        'id': entity['text'],
                        'label': entity['text'],
                        'value': 1,
                        'documents': set([name]),
                        'type': entity['type'],
                        'color': nercolors[entity['type']]
                    }
            # corpusPassToRelation.extend(tempJson['ner'].pop('passToRelation'))
            corpus.append(text)
            print(f"Current Corpus Text: {corpus}", flush=True)

            newRelation = tempJson['relation'].copy()
            for relation in newRelation:
                relation['documents'] = [name]
                corpusRelation.append(relation)
        except Exception as err:
            print(err, "occurred in " + fileName)
        except:
            print('Unknown error in ' + fileName)

    if length > 1:
        print(f"Corpus being sent to overview {corpus}", flush=True)
        returnJson['Overview'] = getOverview(corpus, corpusEntity, corpusRelation,
                                             absaDocument, sentimentWordDocument, fileNames)

    print('RESULT', json.dumps(returnJson))
    returnJson = jsonify(returnJson)
    return returnJson
import pikepdf
import io
import pdfplumber

with io.BytesIO() as f:
    with pikepdf.open(r'C:\Users\BashamF\Documents\c06278453.pdf') as pdf:
        pdf.save(f)
    with pdfplumber.load(f) as pdf:
        print(pdf.pages[0].extract_text())
#!/usr/bin/env python
import pandas as pd
import pdfplumber
import requests
import datetime
import re
from io import BytesIO


def parse_date(pdf):
    chars = pd.DataFrame(pdf.chars)
    updated_text = "".join(chars[
        (chars["fontname"] == "Times New Roman") &
        (chars["doctop"] < 175)
    ].sort_values(["doctop", "x0"])["text"])
    date_pat = r"UPDATED:\s+As of (.+)$"
    updated_date = re.search(date_pat, updated_text).group(1)
    d = datetime.datetime.strptime(updated_date, "%B %d, %Y")
    return d


if __name__ == "__main__":
    URL = "https://www.fbi.gov/about-us/cjis/nics/reports/active_records_in_the_nics-index.pdf"
    raw = requests.get(URL).content
    pdf = pdfplumber.load(BytesIO(raw))
    d = parse_date(pdf)
    print(d.strftime("%Y-%m"))
def save_line_items(invoice_file, forceOcr):
    folder = settings.UPLOAD_PATH
    if not os.path.exists(folder):
        os.makedirs(folder)

    # Save file locally first, before uploading to AWS
    fs = FileSystemStorage(location=folder)
    filename = fs.save(invoice_file.name, invoice_file)
    temp_pdf_path = folder + filename

    invoice_text = ''
    try:
        ocrmypdf.ocr(temp_pdf_path, temp_pdf_path, force_ocr=forceOcr)
        # Open in binary mode; pdfplumber needs a bytes stream
        with open(temp_pdf_path, "rb") as temp_file:
            with pdfplumber.load(temp_file) as pdf:
                page = pdf.pages[0]
                invoice_text = page.extract_text()
    except Exception as err:
        # Fall back to the file as-is if OCR fails
        with pdfplumber.open(temp_pdf_path) as pdf:
            page = pdf.pages[0]
            invoice_text = page.extract_text()

    # Save to AWS
    upload_to_AWS(temp_pdf_path, invoice_file.name)

    # Delete the local pdf after extraction is complete
    if os.path.isfile(temp_pdf_path):
        os.remove(temp_pdf_path)

    # Regular expressions for identifying the supplier
    delta_re = re.compile(r'(?i)DELTA')
    johnstone_re = re.compile(r'(?i)(JOHNSTONE)')
    carrier_re = re.compile(r'(?i)(Distributor)')
    capco_re = re.compile(r'(?i)(capco)')
    ferguson_re = re.compile(r'(?i)(ferguson)')

    meta_data = {}
    lines = invoice_text.split("\n")
    for i in range(len(lines)):
        line = lines[i]
        supplier = ""
        if delta_re.search(line):
            meta_data = parse_delta_invoice(invoice_text)
            supplier = "Delta"
        if johnstone_re.search(line):
            meta_data = parse_johnstone_invoice(invoice_text)
            supplier = "Johnstone"
        if carrier_re.search(line):
            meta_data = parse_carrier_invoice(invoice_text)
            supplier = "Carrier"
        if capco_re.search(line):
            meta_data = parse_capco_invoice(invoice_text)
            supplier = "Capco"
        if ferguson_re.search(line):
            meta_data = parse_ferguson_invoice(invoice_text)
            supplier = "Ferguson"

        if supplier:
            meta_data["supplier_id"] = Supplier.objects.filter(
                supplier_name__icontains=supplier)[0].id
            meta_data["invoice_date"] = meta_data["invoice_date"].strip()
            meta_data["invoice_number"] = meta_data["invoice_number"].strip()
            meta_data["new_invoice_name"] = supplier + " " + \
                meta_data["invoice_date"].replace("/", "-") + " " + \
                meta_data["invoice_number"] + ".pdf"

            # Create item and price keys
            for i in range(len(meta_data['line_items'])):
                item, price = meta_data['line_items'][i]
                item_key = "item" + str(i + 1)
                price_key = "price" + str(i + 1)
                meta_data['line_items'][i] = (item_key, item.strip(), price_key, price.strip())
            break

    return meta_data
#!/usr/bin/env python
import pandas as pd
import pdfplumber
import requests
import datetime
import re
from io import BytesIO


def parse_date(pdf):
    text = pdf.pages[0].extract_text(x_tolerance=5)
    date_pat = r"UPDATED:\s+As of (.+)\n"
    updated_date = re.search(date_pat, text).group(1)
    d = datetime.datetime.strptime(updated_date, "%B %d, %Y")
    return d


if __name__ == "__main__":
    URL = "https://www.fbi.gov/about-us/cjis/nics/reports/active_records_in_the_nics-index.pdf"
    raw = requests.get(URL).content
    pdf = pdfplumber.load(BytesIO(raw))
    d = parse_date(pdf)
    print(d.strftime("%Y-%m"))
def main(string):
    with pdfplumber.load(string) as pdf:
        first_page = pdf.pages[0]
        text = first_page.extract_text()
        print(text)
def parse_pdf(x_tolerance, y_tolerance, path=None, savePath=None, data=None):
    '''
    Function: process a pdf file.
    :param: maximum gap between words, maximum gap between lines,
            input path, output path
    :return: None
    '''
    # Read in the file
    if path is not None:
        pdf = pdfplumber.open(path)
    elif data is not None:
        pdf = pdfplumber.load(data)

    # Track the running row count across pages
    pdfRowNumber = 0
    wb = Workbook()
    ws = wb.active
    for page in pdf.pages:
        pageContainer = []  # holds all word dicts for this page
        theMaxColNum = 0  # track the maximum number of columns
        words = page.extract_words(x_tolerance=x_tolerance,
                                   y_tolerance=y_tolerance,
                                   keep_blank_chars=True)
        pageContainer, theMaxColNum = compileByRowLocation(
            words, x_tolerance, y_tolerance)

        # Sort by position information
        for line in pageContainer:
            line.sort(key=itemgetter('x0'))
        pageContainer.sort(key=lambda line: line[0]['top'])

        # Check the leading rows for a header or several rows merged into one
        if len(pageContainer[0]) < theMaxColNum:
            for i in range(len(pageContainer)):
                if len(pageContainer[i]) == theMaxColNum:
                    repairList, repairNum = align_front_row(
                        pageContainer[0:i + 1], theMaxColNum)
                    for i in range(repairNum):
                        del pageContainer[0]
                    pageContainer.insert(0, repairList)
                    break

        # Check the last row
        if len(pageContainer[-1]) < theMaxColNum:
            pageContainer[-1] = align_last_row(pageContainer[-2:], theMaxColNum)

        # Write to excel
        # ftTitle = Font(name='font', size=14)
        # ftText = Font(name='', size=10)
        for idx, line in enumerate(pageContainer):
            for idy, item in enumerate(line):
                cellIndex = ws.cell(row=idx + 1 + pdfRowNumber, column=idy + 1)
                if item['text'] == '':
                    pass
                elif item['text'] is None:
                    ws.merge_cells(start_row=idx + 1 + pdfRowNumber,
                                   start_column=1,
                                   end_row=idx + 1 + pdfRowNumber,
                                   end_column=len(line))
                    break
                else:
                    # cellIndex.font = ftText
                    if item['text'].isdigit():
                        cellIndex.value = int(item['text'])
                    elif is_float(item['text']):
                        cellIndex.value = float(item['text'])
                    else:
                        cellIndex.value = item['text']

        # Add this page's row count so the excel rows stay continuous
        pdfRowNumber += len(pageContainer)

    # Save the excel file locally
    if savePath is not None:
        wb.save(savePath)
    else:
        wb.save(path.replace('.pdf', '.xlsx'))
# Read the pdf as text, then store it back to the database
def returnPdfcontent(content):
    # NOTE: building SQL via string formatting is injection-prone;
    # prefer a parameterized query if DB.execution supports one
    sqlstr = "insert into proposal (content) VALUES (%s)" % (content)
    return DB.execution(DB.create, sqlstr)


def returnHashtag(hashtags):
    for tag in hashtags:
        print("insert into hashtag (hashtag_name) VALUES ('%s')" % (tag[0]))


for j in pdfdb["data"]:
    rq = requests.get(j["pdfUrl"])
    pdf = pdfplumber.load(BytesIO(rq.content))
    allText = ""
    for i in pdf.pages:
        allText += str(i.extract_text())
    # print(allText)

    text = ""
    table = ""
    for i in pdf.pages:
        content = i.extract_table()
        if content is not None:
            table = str(getColName(content))
            break
    # page = pdf.pages[1]
    # text = page.extract_text()
    # print(allText)