f.write(resp.content)

from glob import glob

pdf_filenames = glob('CAWARN-*.pdf')
for pdf_fname in pdf_filenames:
    print("This is a filename of a pdf:", pdf_fname)

import csv
import pdfplumber

outfile = open('CAWARN12.csv', 'w')
outcsv = csv.writer(outfile)
pdf_fname12 = 'CAWARN-eddwarncn12.pdf'
pdf = pdfplumber.open(pdf_fname12)
for page in pdf.pages:
    table = page.extract_table()
    for row in table[1:]:  # skip the header row
        outcsv.writerow(row)
outfile.close()

outfile = open('CAWARN13.csv', 'w')
outcsv = csv.writer(outfile)
pdf_fname13 = 'CAWARN-eddwarncn13.pdf'
pdf = pdfplumber.open(pdf_fname13)
page = pdf.pages
import os
from decimal import Decimal

import pdfplumber


def extract_title_main(file_name):
    title = ""
    subtitle = ""
    strapline = ""
    sub_strapline = ""
    main_text = ""
    check_list = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", ","]
    with pdfplumber.open(os.getcwd() + '/GICorpus2/' + file_name) as pdf:
        # total number of pages
        total_pages = pdf.pages
        for page in total_pages:
            total_char = page.chars
            charsize = len(total_char)
            for ch in range(charsize):
                # font sizes are stored as Decimal, so convert to int
                temp = int(Decimal(total_char[ch].get("size")))
                if temp == 23:  # title
                    title = title + total_char[ch].get("text")
                elif temp == 16:  # subheading
                    strapline = strapline + total_char[ch].get("text")
                    if int(Decimal(total_char[ch + 1].get("size"))) < 16:
                        strapline = strapline + "\t"
                elif temp == 11:  # sub-subheading
                    sub_strapline = sub_strapline + total_char[ch].get("text")
                else:  # body text (includes >, <, ",", etc. whose font size isn't 10)
                    # skip italic characters
                    if total_char[ch].get("upright") == False:
                        continue
                    # skip line-break hyphens
                    if total_char[ch].get("text") == "-":
                        if (int(Decimal(total_char[ch].get("x0"))) > 220) and (int(Decimal(total_char[ch].get("x1"))) > 220):
                            continue
                    main_text = main_text + total_char[ch].get("text")
                    # sentences can't be split when there is no space after "." --> add a space after "."
                    # except when a digit or "," follows
                    # the ch + 1 lookahead caused an IndexError, hence the bounds check
                    if ch < charsize - 2:
                        if (total_char[ch].get("text") == ".") and not (total_char[ch + 1].get("text") in check_list):
                            main_text = main_text + " "
        title = title + "."
        f = open(os.getcwd() + "/GICorpus2/" + file_name[0:-4] + ".txt", "w")
        f.write(title)
        f.write("\n\n")
        f.write(main_text)
        f.close()
def extractLinesFromPDF(filepath, agency, date):
    print(filepath)
    with pdfplumber.open(filepath) as pdf:
        # init lines list
        lines = list()
        # loop over pages
        for page_index, page in enumerate(pdf.pages):
            # crop page
            page = cropHeaderAndFooter(page, page_index)
            # convert to a list of lines with formatting
            lines += getLinesWithFormatting(page, page_index, agency, date)
    # convert font information into a set of ranked dummy vars
    lines = cleanFontNames(lines)
    lines = assignFontStyles(lines)
    # bucket left indentations into 5 ranked dummy vars
    lines = bucketLeftIndentation(lines, agency)
    return lines
import pdfplumber
import os

# os.chdir(r"D:\\code\\python_project\\python_code\\get_pdf\\")
with pdfplumber.open(r"get_pdf\\延安高铁站.pdf") as pdf:
    pages = pdf.pages
    # extract and print the text of each page
    for i in pages:
        text = i.extract_text()
        print(text)
"text_y_tolerance": 0, "text_x_tolerance": 2, }) table = pd.DataFrame([ [ month ] + row for row in _table ]) table.columns = COLUMNS table[table.columns[2:]] = table[table.columns[2:]].applymap(parse_value) table.loc[(table["state"] == "llinois"), "state"] = "Illinois" table = table.loc[lambda df: df["state"].fillna("").str.strip() != ""] try: validate_data(table) except: raise Exception("Invalid data for " + month) return table def parse_pdf(pdf): # Note: As of Nov. 2019 file, first page is documentation checks_gen = map(parse_page, pdf.pages[1:]) checks = pd.concat(checks_gen).reset_index(drop=True) return checks[checks["state"] != "Totals"] if __name__ == "__main__": with pdfplumber.open(sys.stdin.buffer) as pdf: checks = parse_pdf(pdf) checks.to_csv(sys.stdout, index=False, float_format="%.0f") sys.stderr.write("\r\n")
def test_issue_21(self):
    pdf = pdfplumber.open(
        os.path.join(HERE, "pdfs/150109DSP-Milw-505-90D.pdf"))
    assert len(pdf.objects)
    pdf.close()
def test_issue_203(self):
    path = os.path.join(HERE, "pdfs/issue-203-decimalize.pdf")
    with pdfplumber.open(path) as pdf:
        assert len(pdf.objects)
import pdfplumber
import csv

pdf_name = 'raw-pdfs/crime-stats-2013.pdf'
pdf = pdfplumber.open(pdf_name)
page = pdf.pages[32]
table = page.extract_table()

with open('2013.csv', 'w') as outfile:
    outcsv = csv.writer(outfile)
    for row in table:
        outcsv.writerow(row)
    'https://www.nccommerce.com/Portals/11/Documents/Reports/WARN/warn-2015.pdf',
    'https://www.nccommerce.com/Portals/11/WARN/Warn2014.pdf',
    'https://www.nccommerce.com/Portals/11/WARN/Warn-2013.pdf'
]

for url in urls:
    pdf_fname = 'NCWARN-' + basename(url)
    print("Downloading", url, 'into', pdf_fname)
    resp = requests.get(url)
    with open(pdf_fname, 'wb') as f:
        f.write(resp.content)

pdf_filenames = glob('NCWARN-*.pdf')
for pdf_fname in pdf_filenames:
    print("This is a filename of a pdf:", pdf_fname)
    pdf = pdfplumber.open(pdf_fname)
    type(pdf)

# PDF 1
pdf_fname = 'NCWARN-Warn.pdf'
outfile = open('NCWARN-Warn.csv', 'w')
outcsv = csv.writer(outfile)
pdf = pdfplumber.open(pdf_fname)
for page in pdf.pages:
    table = page.extract_table()
    for row in table[1:]:  # note how I'm still skipping the header
        outcsv.writerow(row)
outfile.close()
import csv

import requests
import pdfplumber

url_a = "https://jfs.ohio.gov/warn/"
url_b = ".stm"
NAMES = ["WARN_2015", "WARN2014", "WARN_2013", "WARN_2012"]

for name in NAMES:
    fname_pdf = name + ".pdf"
    url = url_a + name + url_b
    print("Downloading", url, "into", fname_pdf)
    resp = requests.get(url)
    with open(fname_pdf, "wb") as f:
        f.write(resp.content)

    outfile = open(name + ".csv", "w")
    outcsv = csv.writer(outfile)
    pdf = pdfplumber.open(fname_pdf)
    for page in pdf.pages:
        table = page.extract_table()
        for row in table[1:]:  # skip the header row
            outcsv.writerow(row)
    outfile.close()
def test_text_colors(self):
    path = os.path.join(HERE, "pdfs/nics-background-checks-2015-11.pdf")
    with pdfplumber.open(path) as pdf:
        char = pdf.pages[0].chars[3358]
        assert char["non_stroking_color"] == [1, 0, 0]
def test_colors(self):
    path = os.path.join(HERE, "pdfs/nics-background-checks-2015-11.pdf")
    with pdfplumber.open(path) as pdf:
        rect = pdf.pages[0].rects[0]
        assert rect["non_stroking_color"] == [0.8, 1, 1]
def test_password(self):
    path = os.path.join(HERE, "pdfs/password-example.pdf")
    with pdfplumber.open(path, password="******") as pdf:
        assert len(pdf.chars) > 0
path = "./data/full/" texts_path = "./data/texts/" pathlib.Path(texts_path).mkdir(parents=True, exist_ok=True) files = glob.glob(path + "*") for f in files: f_name = f.split("\\")[-1] txt_path = texts_path + f_name + ".txt" file_path = pathlib.Path(txt_path) if file_path.exists(): continue pdf = pdfplumber.open(f) text = u'' for page in pdf.pages: try: # since pdfplumber tries to convert "P14" into decimal for some reason et = page.extract_text() except: pass if et is not None: # since pdfplumber returns None when an empty page occurs text += et else: print("Extracted text is None in file " + f) file = open(txt_path, "wb") file.write(text.encode("utf-8", "ignore")) file.close()
    filename = 'warn-{}.pdf'.format(i)
    with open(filename, 'wb') as save_file:
        save_file.write(resp.content)
    print('Saved to', filename)
    filenames.append(filename)

################################################################################
# CSV building + counting

total = 0
with open('warn-2012-2014.csv', 'w') as outfile:
    outcsv = csv.writer(outfile)
    outcsv.writerow(['Company Name', 'Location', 'Employees\nAffected', 'Layoff\nDate'])  # Manually write header
    for filename in filenames[3:]:
        pdf = pdfplumber.open(filename)
        for i, page in enumerate(pdf.pages):
            print('Extracting page', i + 1, 'from', filename)
            table = page.extract_table()
            for j, row in enumerate(table):
                if i == 0 and j == 0:
                    continue  # Skip header on first page of each doc
                try:
                    total += int(row[2])
                except (TypeError, ValueError):
                    print('Couldn\'t get num employees from', row)
                outcsv.writerow(row)

print(total, 'employees affected from 2012-2014 dataset')

# Don't count totals from here, because precalculated in pdf
# These pdfs don't seem to have their tables extracted as well...
with open('warn-2014-2016.csv', 'w') as outfile:
""" Run pdfplumber on sample PDFs. Install with `pip install pdfplumber`. """ import pdfplumber with open("pdfplumber.out", "w") as f: with pdfplumber.open("pdfs/1412.6980.pdf") as pdf: for page in pdf.pages: text = page.extract_text() f.write(text)
# =============================================================================
# 10.2 PDF text-parsing basics, by 王宇韬
# =============================================================================

# 1. Parse the text of the first page
import pdfplumber

pdf = pdfplumber.open('公司A理财公告.PDF')  # open the PDF file
pages = pdf.pages  # the pages attribute returns a list with every page's information
page = pages[0]  # get the first page
text = page.extract_text()  # extract the first page's text
print(text)  # print the first page's content
pdf.close()  # close the PDF file

# 2. Parse the text of every page
import pdfplumber

pdf = pdfplumber.open('公司A理财公告.PDF')
pages = pdf.pages
text_all = []
for page in pages:  # iterate over every page
    text = page.extract_text()  # extract the current page's text
    text_all.append(text)  # collect each page's content with list.append()
text_all = ''.join(text_all)  # join the list into a single string
print(text_all)  # print all of the text
pdf.close()
def test_load_with_custom_laparams(self):
    # See https://github.com/jsvine/pdfplumber/issues/168
    path = os.path.join(HERE, "pdfs/cupertino_usd_4-6-16.pdf")
    laparams = dict(line_margin=0.2)
    with pdfplumber.open(path, laparams=laparams) as pdf:
        assert float(pdf.pages[0].chars[0]["top"]) == 66.384
def setUp(self):
    path = os.path.join(HERE, "pdfs/nics-background-checks-2015-11.pdf")
    self.pdf = pdfplumber.open(path)
def Fin_Analysis_Mian(self, results):
    # unpack the parameters
    for param in results:
        bond_company = param[0]
        keyword = param[1]
        whether_cross_page = param[2]
        whether_around = param[3]
        print("证券公司名称:" + bond_company)
        pdf = pdfplumber.open('D:/年报/' + bond_company + '.pdf')
        table = []
        list = []
        for page in pdf.pages[1:400]:
            data = page.extract_text()
            if keyword in data:
                page_number = page.page_number
                print('开始读取数据页数:' + str(page_number))
                for i in range(page_number - 2, page_number + 2):
                    page_use = pdf.pages[i]
                    for table_list in page_use.extract_tables():
                        table.append(table_list)
        # decide which indicator the keyword refers to
        if keyword.endswith('结算备付金'):
            # walk over every collected table
            for i in range(0, len(table[0:]) - 1):
                # and over every row of that table
                for j in range(0, len(table[0:][i])):
                    # whether_cross_page == 2 means the table crosses a page boundary
                    if table[0:][i][j][0] != None and table[0:][i][j][0].startswith('客户备付金') or \
                       table[0:][i][j][0] != None and table[0:][i][j][0].startswith('公司自有备付金') or \
                       table[0:][i][j][0] != None and table[0:][i][j][0].startswith('公司自有'):
                        list.append(table[0:][i])
                        if whether_cross_page == 2:
                            # whether_around == 1: the continuation is on the following page
                            if whether_around == 1:
                                if table[0:][i][j][0] != None and table[0:][i][j][0].startswith('公司自有'):
                                    list.append(table[0:][i + 1])
                            # whether_around == -1: the continuation is on the preceding page
                            elif whether_around == -1:
                                if table[0:][i][j][0] != None and table[0:][i][j][0].startswith('客户信用备'):
                                    list.append(table[0:][i - 1])
        elif keyword.endswith('融出资金'):
            for i in range(0, len(table[0:]) - 1):
                for j in range(0, len(table[0:][i])):
                    if table[0:][i][j][0] != None and table[0:][i][j][0].startswith('减:减值') or \
                       table[0:][i][j][0] != None and table[0:][i][j][0].startswith('个人') or \
                       table[0:][i][j][0] != None and table[0:][i][j][0].startswith('1-3个月') or \
                       table[0:][i][j][0] != None and table[0:][i][j][0].startswith('资金'):
                        list.append(table[0:][i])
                        if whether_cross_page == 2:
                            if whether_around == 1:
                                if table[0:][i][j][0] != None and table[0:][i][j][0].startswith('6个月'):
                                    list.append(table[0:][i + 1])
                            elif whether_around == -1:
                                if table[0:][i][j][0] != None and table[0:][i][j][0].startswith('6个月'):
                                    list.append(table[0:][i - 1])
        elif keyword.endswith('衍生金融工具'):
            for i in range(0, len(table[0:]) - 1):
                for j in range(0, len(table[0:][i])):
                    if table[0:][i][j][0] != None and table[0:][i][j][0].startswith('商品期货') or \
                       table[0:][i][j][0] != None and table[0:][i][j][0].endswith('生工具') or \
                       table[0:][i][j][0] != None and table[0:][i][j][0].startswith('权益\n互换') or \
                       table[0:][i][j][0] != None and table[0:][i][j][0].startswith('资金'):
                        list.append(table[0:][i])
                        if whether_cross_page == 2:
                            if whether_around == 1:
                                if table[0:][i][j][0] != None and table[0:][i][j][0].startswith('6个月'):
                                    list.append(table[0:][i + 1])
                            elif whether_around == -1:
                                if table[0:][i][j][0] != None and table[0:][i][j][0].startswith('6个月'):
                                    list.append(table[0:][i - 1])
        pdf.close()
        # remove duplicate tables
        new_list = []
        for i in list:
            if i not in new_list:
                new_list.append(i)
        # walk over the de-duplicated tables
        for table in new_list:
            for row in table:
                row.append(bond_company)
                row.append(date)
                print(row)
        return bond_company
def parse_pdf(path, year):
    with pdfplumber.open(path) as pdf:
        df = pd.concat([
            parse_page(page, year)
            for page in pdf.pages
        ])
    return df
def get_report_startpage(pdf):
    """Find the page on which the financial statements start.

    Arguments:
        pdf {str} -- path to the PDF file

    Returns:
        start_page {int} -- start page of the financial statements
    """
    getp = pdfplumber.open(pdf)
    total = len(getp.pages)
    # counter used to check whether the current page is within the first 10 pages
    count = 0
    # stores the start page of the statements
    start_page = 0
    # flag: is this an annual-report-style document?
    flag = False
    # create a PDF resource-manager object to hold shared resources
    if total > 30:
        print('总页数', total)
        # iterate over every page of the pdf
        for page in getp.pages:
            count += 1
            teststr = page.extract_text()
            if teststr is None:
                return 0
            # does the first page mention an annual/quarterly report?
            # If not, there is no need to look for a start page.
            rs = re.search('(年\s*|季\s*)度?\s*报\s*告?', teststr)
            # print(teststr)
            if rs != None and count == 1:
                # report-related text found on page 1; look for the table of contents from the next page on
                flag = True
                continue
            elif rs == None and count == 1:
                # no report-related text on page 1; check page 2
                # (some reports carry a seal on the first page, which makes text extraction incomplete)
                print('第1页未检测到年/季报等文字,检测第二页')
                continue
            elif rs != None and count == 2:
                # report-related text found on page 2; look for the table of contents from page 3 on
                flag = True
                continue
            elif rs == None and count == 2:
                # if neither page 1 nor page 2 mentions an annual/quarterly report,
                # assume this is not an annual/quarterly report file
                if flag == False:
                    print('当前文件的财务报表起始页为', start_page)
                    return start_page
            # if page 1 or 2 mentions an annual/quarterly report,
            # look for the table-of-contents page within the first 10 pages
            if flag == True:
                # 1. only process the first 10 pages
                if count < 11:
                    # look for the table-of-contents page
                    if re.search('目\s*录', teststr, flags=0):
                        # does this TOC page contain a financial-statements entry?
                        ret = re.search('财务报告\s*(.)*\d', teststr)  # NOTE: this regex may be problematic (lhj)
                        if ret != None:
                            ret = ret.group()
                            # strip whitespace
                            tstr = [
                                y.strip()
                                for y in re.split(r'[…¨ .]', ret)
                                if len(y) != 0
                            ]
                            # the first value is the TOC entry name, the second is the page number
                            start_page = int(tstr[1])
                            print('当前文件的财务报表起始页为', start_page)
                            return start_page
                        else:
                            # this TOC page has no financial-statements entry; process the next page
                            count += 1
                            continue
                    else:
                        # no TOC text on the current page; keep checking the next page
                        print('第', count, '页未找到目录二字,查找下一页')
                        continue
                else:
                    print('10页内未找到目录二字')
                    # TOC page not found within 10 pages; exit the loop
                    break
    else:
        # files of 30 pages or fewer are not processed
        print('当前文件的财务报表起始页为', start_page)
        return start_page
    print('当前文件的财务报表起始页为', start_page)
    return start_page
def test_issue_140(self):
    path = os.path.join(HERE, "pdfs/issue-140-example.pdf")
    with pdfplumber.open(path) as pdf:
        page = pdf.pages[0]
        cropped_page = page.crop((0, 0, page.width, 122))
        assert len(cropped_page.extract_table()) == 5
def main():
    dbs_source_dir = Path("/Users/jeromeko/Desktop/2020_Bank_Statements/DBS")
    uob_source_dir = Path("/Users/jeromeko/Desktop/2020_Bank_Statements/UOB")
    dest_csv = Path("/Users/jeromeko/Desktop/2020_Bank_Statements")

    dbs_all_txns = []
    uob_all_txns = []

    for folder, subfolder, pdf_files in os.walk(dbs_source_dir):
        for pdf_file in pdf_files:
            with pdfplumber.open(dbs_source_dir / pdf_file) as pdf:
                for i in range(2):  # txns only extend up to 2nd page
                    page_text = pdf.pages[i].extract_text()
                    sub_total_bool, sub_total_content = contains_sub_total(
                        pdf.pages[0].extract_text())
                    if i == 0:
                        txns_raw = txn_trimming(
                            page_text, "NEW TRANSACTIONS JEROME KO JIA JIN")
                        dbs_all_txns.append(
                            process_txn_amt(filter_legitimate_txns(txns_raw)))
                    elif i == 1 and not sub_total_bool:  # if txns extend to 2nd page
                        txns_raw = txn_trimming(page_text, "2 of 3")
                        dbs_all_txns.append(
                            process_txn_amt(filter_legitimate_txns(txns_raw)))

    for folder, subfolder, pdf_files in os.walk(uob_source_dir):
        for pdf_file in pdf_files:
            with pdfplumber.open(uob_source_dir / pdf_file) as pdf:
                for i in range(2):  # txns only extend up to 2nd page
                    page_text = pdf.pages[i].extract_text()
                    sub_total_bool, sub_total_content = contains_sub_total(
                        pdf.pages[0].extract_text())
                    if i == 0:
                        txns_raw = txn_trimming(page_text, "PREVIOUS BALANCE")
                        uob_all_txns.append(
                            process_txn_amt(filter_legitimate_txns(txns_raw)))
                    elif i == 1 and not sub_total_bool:  # if txns extend to 2nd page
                        txns_raw = txn_trimming(page_text, "Date Date SGD")
                        uob_all_txns.append(
                            process_txn_amt(filter_legitimate_txns(txns_raw)))

    for monthly_txns in uob_all_txns:
        for txn in monthly_txns:
            del txn[0:2]  # remove post dates

    all_txns = dbs_all_txns.copy()
    all_txns.extend(uob_all_txns)

    # Represent txns according to dates, desc and amt
    categorized_txns = [{
        "Date": " ".join(txn[0:2]),
        "Txn Desc": " ".join(txn[2:len(txn) - 1]),
        "Amt": txn[-1]
    } for monthly_txns in all_txns for txn in monthly_txns]

    # Load into dataframe for further manipulation
    df_categorized_txns = pd.DataFrame(categorized_txns)

    # Format date column
    df_categorized_txns["Date"] = df_categorized_txns["Date"] + " 2020"

    # Categorizing txns
    df_categorized_txns["Category"] = df_categorized_txns.apply(
        categorize_txns, axis=1)

    # Write into csv
    # df_categorized_txns.to_csv(dest_csv / "2020 transactions.csv")
    df_categorized_txns.to_csv(dest_csv / "2020 transactions test.csv")
def test_issue_14(self):
    pdf = pdfplumber.open(
        os.path.join(HERE, "pdfs/cupertino_usd_4-6-16.pdf"))
    assert len(pdf.objects)
    pdf.close()
def pdf2txt(txt_path, pdf_path, img_path):
    print("\n[PDF to TXT 변환 시작] 슬라이드 이미지를 텍스트로 변환을 시작합니다")
    # check whether the output directory exists and create it if it doesn't
    try:
        if not os.path.exists(txt_path):
            os.makedirs(txt_path)
    except OSError:
        print('Error: Creating directory. ' + txt_path)  # directory creation error

    textt = ""
    textt1 = ""
    txt_res = ""
    table_final_text = []
    text_com = ""
    table_list = []
    a = 1
    b = 1

    Pdf = pdfplumber.open(pdf_path)
    for page_idx, page in enumerate(Pdf.pages):
        # text file that will hold the converted content
        txtFile = open(txt_path + set_Filenum_of_Name(page_idx + 1) + ".txt", "w", -1, "utf-8")
        txtFile.write(str(page_idx + 1) + "번 슬라이드 해설 시작" + "\n" + "\n")

        # text -> table
        result = page.extract_text()
        text = str(page.extract_text())
        # text = text.replace('\n', " ")
        text = re.sub('\\n+', '\n', text)
        text = text + "\n"

        for table in page.extract_tables():
            for row in table:
                for column in range(0, len(row)):
                    text_com = text_com + row[column] + " "
                    textt = str(a) + "행"
                    textt1 = " " + str(b) + "열 " + row[column] + "\n"
                    txt_res = txt_res + textt + textt1
                    b = b + 1
                b = 1
                a = a + 1
            text_com = text_com[:-1]
            text_com = text_com + "\n"
            table_new = '표 시작\n' + txt_res + '표 끝 \n'
            table_final_text.append(table_new)
            table_list.append(text_com)
            # print(text_com)
            txt_res = ""
            text_com = ""
            a = 1
            b = 1

        # replace each raw table string (spaces kept) with its row/column description
        for i, j in zip(table_list, table_final_text):
            text = text.replace(i, j)

        imgcaption = imgExtract(page_idx, text, pdf_path, img_path)
        if imgcaption == "이미지 없음":
            print("이미지 없음")
            txtFile.write(text + "\n")
        else:
            imgcaption = "".join(imgcaption)
            txtFile.write(text + imgcaption + "\n")
        txtFile.close()

        # filter the converted text
        NLP(txt_path + set_Filenum_of_Name(page_idx + 1) + ".txt")
        # adjust the position of image captions
        modifytxt(txt_path + set_Filenum_of_Name(page_idx + 1) + ".txt", page_idx)
        print(">>>", page_idx + 1, "번째 PDF 슬라이드 텍스트 변환 완료")

    Pdf.close()
    print("[PDF to TXT 변환 종료] 슬라이드 이미지를 텍스트로 변환을 종료합니다\n")
def test_issue_33(self):
    pdf = pdfplumber.open(
        os.path.join(HERE, "pdfs/issue-33-lorem-ipsum.pdf"))
    assert len(pdf.metadata.keys())
    pdf.close()
import pdfplumber

ms = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December'
]

for year in range(2004, 2014):
    for month in ms:
        s = '01pdf\\' + month + str(year) + 'ChiefsDirectory'
        with open(s + '.txt', 'w', encoding="utf-8") as fw:
            print('...')
            with pdfplumber.open(s + '.pdf') as pdf:
                for page in pdf.pages:
                    fw.write(page.extract_text())
def __init__(self, path):
    self.pdf = pdfplumber.open(path)
    self.draw = False
    self.debug = False
import pdfplumber
import pyttsx3 as speech

pdf = pdfplumber.open('half.pdf')
pages = pdf.pages[16:]
speaker = speech.init()

for i, pg in enumerate(pages):
    text = pages[i].extract_text()
    speaker.say(text)
    speaker.runAndWait()

pdf.close()
from PIL import Image
import pytesseract
import sys
import os
import pdfplumber

pdf_file = "source.pdf"
txt_file = "target.txt"
tempLoc = "tempPages/"
pageName = "page"
doc = ''

pytesseract.pytesseract.tesseract_cmd = 'C:/Program Files/Tesseract-OCR/tesseract.exe'

with pdfplumber.open(pdf_file) as pdf:
    with open(txt_file, "w", encoding="utf-8") as outFile:
        for page in pdf.pages:
            # best way to get text is if the doc has the text component
            # already encoded (this only occurs if it was not scanned)
            doc = page.extract_text()
            # if we can't get the text the easy way, try the best we
            # can using OCR extraction of the text
            if not doc:
                im = page.to_image(resolution=512)
                filename = tempLoc + pageName + ".png"
                im.save(filename, format="PNG")
                doc = str(pytesseract.image_to_string(Image.open(filename)))
            outFile.write(doc)
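# The comments above describe a "text layer first, OCR as fallback" strategy.
# Here is a minimal sketch of that pattern as a standalone helper; the function
# name, temp path, and 300-dpi resolution are assumptions, not from the original script.
import pdfplumber
import pytesseract
from PIL import Image


def page_text_or_ocr(page, temp_png="tempPages/page.png"):
    # Prefer the embedded text layer; fall back to Tesseract OCR on a rendered page image.
    text = page.extract_text()
    if text:
        return text
    image = page.to_image(resolution=300)
    image.save(temp_png, format="PNG")
    return pytesseract.image_to_string(Image.open(temp_png))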
def test_pr_77(self):
    # via https://github.com/jsvine/pdfplumber/pull/77
    path = os.path.join(HERE, "pdfs/pr-77-example.pdf")
    with pdfplumber.open(path) as pdf:
        first_page = pdf.pages[0]
        first_page.objects
def setup_class(self):
    path = os.path.join(HERE, "pdfs/nics-background-checks-2015-11.pdf")
    self.pdf = pdfplumber.open(path)
    self.PDF_WIDTH = self.pdf.pages[0].width
def test_issue_13(self):
    """
    Test slightly simplified from gist here:
    https://github.com/jsvine/pdfplumber/issues/13
    """
    pdf = pdfplumber.open(
        os.path.join(HERE, "pdfs/issue-13-151201DSP-Fond-581-90D.pdf"))

    # Only find checkboxes this size
    RECT_WIDTH = 9.3
    RECT_HEIGHT = 9.3
    RECT_TOLERANCE = 2

    def filter_rects(rects):
        ## Just get the rects that are the right size to be checkboxes
        rects_found = []
        for rect in rects:
            if (rect['height'] > (RECT_HEIGHT - RECT_TOLERANCE) and
                    (rect['height'] < RECT_HEIGHT + RECT_TOLERANCE) and
                    (rect['width'] < RECT_WIDTH + RECT_TOLERANCE) and
                    (rect['width'] < RECT_WIDTH + RECT_TOLERANCE)):
                rects_found.append(rect)
        return rects_found

    def determine_if_checked(checkbox, curve_list):
        # This figures out if the bounding box of (either) line used to make
        # one half of the 'x' is the right size and overlaps with a rectangle.
        # This isn't foolproof, but works for this case.
        # It's not totally clear (to me) how common this style of checkboxes
        # are used, and whether this is useful approach to them.
        # Also note there should be *two* matching LTCurves for each checkbox.
        # But here we only test there's at least one.
        for curve in curve_list:
            if (checkbox['height'] > (RECT_HEIGHT - RECT_TOLERANCE) and
                    (checkbox['height'] < RECT_HEIGHT + RECT_TOLERANCE) and
                    (checkbox['width'] < RECT_WIDTH + RECT_TOLERANCE) and
                    (checkbox['width'] < RECT_WIDTH + RECT_TOLERANCE)):
                xmatch = False
                ymatch = False
                if (max(checkbox['x0'], curve['x0']) <= min(
                        checkbox['x1'], curve['x1'])):
                    xmatch = True
                if (max(checkbox['y0'], curve['y0']) <= min(
                        checkbox['y1'], curve['y1'])):
                    ymatch = True
                if xmatch and ymatch:
                    return True
        return False

    p0 = pdf.pages[0]
    curves = p0.objects["curve"]
    rects = filter_rects(p0.objects["rect"])
    n_checked = sum([determine_if_checked(rect, curves) for rect in rects])
    assert (n_checked == 5)
    pdf.close()
import json

import pdfplumber

PATH_TO_FILE = "/Users/healthi/Downloads/HARSHITHA.PDF"
END_OF_PAGE = "---End Of Report---"


def pretty(d, indent=0):
    for key, value in d.items():
        print('\t' * indent + str(key))
        if isinstance(value, dict):
            pretty(value, indent + 1)
        else:
            print('\t' * (indent + 1) + str(value))


pdf = pdfplumber.open(PATH_TO_FILE)
# p0 = pdf.pages[4]  # works
page_to_parse = [8, 9]
final_result = {}
header_name = [
    "CREATININE, SERUM", "LIPID PROFILE", "LIVER FUNCTION TESTS (LFT)",
    "UREA - SERUM / PLASMA"
]
test_name = [
    "CREATININE-SERUM/PLASMA", "CHOLESTEROL", "HDL", "TRIGLYCERIDES", "LDL",
    "VLDL", "TOTALCHOLESTEROLHDLCHOLESTEROLRATIO", "BILIRUBINTOTAL", "ALBUMIN",
    "A/GRatio", "AST(SGOT)", "ALT(SGPT)", "ALKALINEPHOSPHATASE",
    "GAMMAGLUTAMYLTRANSPEPTIDASE", "BILIRUBINCONJUGATED(DIRECT)", "UREA,SERUM",
    "URICACID-SERUM"
]
header = ""
def test_issue_53(self):
    pdf = pdfplumber.open(os.path.join(HERE, "pdfs/issue-53-example.pdf"))
    assert len(pdf.objects)
    pdf.close()
        if len(filtered.chars) == 0:
            continue
        tops = [c["top"] for c in filtered.chars]
        cropped = filtered.crop((0, min(tops) - 2, filtered.width, max(tops) + 6))
        rows = cropped.extract_table(x_tolerance=1, y_tolerance=1)
        table += rows

    data = pd.DataFrame(table)
    if len(data.columns) == 6:
        data.columns = COLUMNS
    else:
        data.columns = ["sivigila_code"] + COLUMNS

    data = data.drop_duplicates().reset_index(drop=True)
    data[INT_COLS] = data[INT_COLS].astype(int)
    data["department"] = data["department"].str.strip().str.upper().apply(lambda x: DEPT_FIXES.get(x, x))
    data["sivigila_code"] = data["sivigila_code"].str.strip()
    data["municipality"] = data["municipality"].str.strip().str.upper().apply(lambda x: MUNI_FIXES.get(x, x))

    sums = data[INT_COLS].sum(axis=1)
    equalities = (sums == (data["zika_total"] * 2)).unique().tolist()
    assert equalities == [True]

    return data

if __name__ == "__main__":
    import sys
    with pdfplumber.open(sys.argv[1]) as pdf:
        data = parse(pdf)
    data.to_csv(sys.stdout, index=False, encoding="utf-8")
def test_issue_67(self):
    pdf = pdfplumber.open(os.path.join(HERE, "pdfs/issue-67-example.pdf"))
    assert len(pdf.metadata.keys())
    pdf.close()
def test_issue_90(self):
    path = os.path.join(HERE, "pdfs/issue-90-example.pdf")
    with pdfplumber.open(path) as pdf:
        page = pdf.pages[0]
        page.extract_words()
def test_pr_136(self):
    path = os.path.join(HERE, "pdfs/pr-136-example.pdf")
    with pdfplumber.open(path) as pdf:
        page = pdf.pages[0]
        page.extract_words()
#!/usr/bin/env python

# Note: Some Python best-practices have been sacrificed below for simplicity's sake.
import pdfplumber
import sys, os

COLUMNS = [
    "state", "permit", "handgun", "long_gun", "other", "multiple", "admin",
    "prepawn_handgun", "prepawn_long_gun", "prepawn_other",
    "redemption_handgun", "redemption_long_gun", "redemption_other",
    "returned_handgun", "returned_long_gun", "returned_other",
    "rentals_handgun", "rentals_long_gun",
    "private_sale_handgun", "private_sale_long_gun", "private_sale_other",
    "return_to_seller_handgun", "return_to_seller_long_gun",
    "return_to_seller_other", "totals"
]

pdf_path = os.path.join(sys.argv[1])
pdf = pdfplumber.open(pdf_path)
first_page = pdf.pages[0]
cropped = first_page.crop((0, 80, first_page.width, 485))

table = cropped.extract_table(
    v="lines",
    h="gutters",
    x_tolerance=5,
    y_tolerance=5
)

print("\t".join(COLUMNS))
for row in table:
    cols = [(row[i] or "") for i in range(len(COLUMNS))]
    print("\t".join(cols).replace(",", ""))
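# For context on the crop() call above: pdfplumber bounding boxes are
# (x0, top, x1, bottom) in PDF points, measured from the top-left corner of the
# page. A minimal sketch, assuming a hypothetical "example.pdf":
import pdfplumber

with pdfplumber.open("example.pdf") as pdf:  # hypothetical file
    first_page = pdf.pages[0]
    # keep only the region between y=80 and y=485, across the full page width
    body = first_page.crop((0, 80, first_page.width, 485))
    print(body.extract_text())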
def test_issue_13(self):
    """
    Test slightly simplified from gist here:
    https://github.com/jsvine/pdfplumber/issues/13
    """
    pdf = pdfplumber.open(
        os.path.join(HERE, "pdfs/issue-13-151201DSP-Fond-581-90D.pdf"))

    # Only find checkboxes this size
    RECT_WIDTH = 9.3
    RECT_HEIGHT = 9.3
    RECT_TOLERANCE = 2

    def filter_rects(rects):
        # Just get the rects that are the right size to be checkboxes
        rects_found = []
        for rect in rects:
            if (rect["height"] > (RECT_HEIGHT - RECT_TOLERANCE) and
                    (rect["height"] < RECT_HEIGHT + RECT_TOLERANCE) and
                    (rect["width"] < RECT_WIDTH + RECT_TOLERANCE) and
                    (rect["width"] < RECT_WIDTH + RECT_TOLERANCE)):
                rects_found.append(rect)
        return rects_found

    def determine_if_checked(checkbox, checklines):
        """
        This figures out if the bounding box of (either) line used to make
        one half of the 'x' is the right size and overlaps with a rectangle.
        This isn't foolproof, but works for this case.
        It's not totally clear (to me) how common this style of checkboxes
        are used, and whether this is useful approach to them.
        Also note there should be *two* matching LTCurves for each checkbox.
        But here we only test there's at least one.
        """
        for cl in checklines:
            if (checkbox["height"] > (RECT_HEIGHT - RECT_TOLERANCE) and
                    (checkbox["height"] < RECT_HEIGHT + RECT_TOLERANCE) and
                    (checkbox["width"] < RECT_WIDTH + RECT_TOLERANCE) and
                    (checkbox["width"] < RECT_WIDTH + RECT_TOLERANCE)):
                xmatch = False
                ymatch = False
                if max(checkbox["x0"], cl["x0"]) <= min(
                        checkbox["x1"], cl["x1"]):
                    xmatch = True
                if max(checkbox["y0"], cl["y0"]) <= min(
                        checkbox["y1"], cl["y1"]):
                    ymatch = True
                if xmatch and ymatch:
                    return True
        return False

    p0 = pdf.pages[0]
    checklines = [
        line for line in p0.lines
        if round(line["height"], 2) == round(line["width"], 2)
    ]  # These are diagonals
    rects = filter_rects(p0.objects["rect"])
    n_checked = sum(
        [determine_if_checked(rect, checklines) for rect in rects])
    assert n_checked == 5
    pdf.close()
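# The docstring above reduces to an axis-aligned overlap test on two bounding
# boxes. A standalone sketch of that check (the helper name is an assumption):
def bboxes_overlap(a, b):
    # a and b are dicts with "x0", "x1", "y0", "y1" keys, as pdfplumber objects provide
    x_overlap = max(a["x0"], b["x0"]) <= min(a["x1"], b["x1"])
    y_overlap = max(a["y0"], b["y0"]) <= min(a["y1"], b["y1"])
    return x_overlap and y_overlap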
        elif index > 2:
            for row in data:
                if row[4] is not None:
                    writer.writerow([row[2], row[4], row[6], row[8], row[10], row[12], row[14]])
        else:
            for row in data:
                writer.writerow(row)

jobs_lost = 0

# Loop through each PDF file
for index, doc in enumerate(all_pdfs):
    master_table = []
    pdf = pdfplumber.open(doc)
    fname = doc[:len(doc) - 3] + 'csv'
    # Check if this is the first of three files whose
    # format is the same. Otherwise, use other format
    for i in range(len(pdf.pages)):
        page = pdf.pages[i]
        table = page.extract_table()
        for row in table:
            if index <= 2:
                master_table.append(row)
            elif index != 4:
                if row[4] is not None:
                    master_table.append(row)
url = 'http://www2.illinoisworknet.com/DownloadPrint/December%202015%20Monthly%20WARN%20Report.pdf'
pdf_fname = 'ILWARN-' + basename(url)
print("Downloading", url, 'into', pdf_fname)
resp = requests.get(url)
with open(pdf_fname, 'wb') as f:
    f.write(resp.content)

from glob import glob

pdf_filename = glob('ILWARN-*.pdf')
for pdf_fname in pdf_filename:
    print("This is a filename of a pdf:", pdf_fname)

import csv
import pdfplumber

outfile = open('ILWARN.csv', 'w')
outcsv = csv.writer(outfile)
pdf_fnameDEC = 'ILWARN-December%202015%20Monthly%20WARN%20Report.pdf'
pdf = pdfplumber.open(pdf_fnameDEC)
for page in pdf.pages:
    table = page.extract_table()
    for row in table[1:]:  # skip the header row
        outcsv.writerow(row)
outfile.close()