def test_issue_13(self):
    """
    Test slightly simplified from gist here:
    https://github.com/jsvine/pdfplumber/issues/13

    Counts the checkbox-sized rects on the first page whose bounding box
    overlaps the bounding box of at least one curve, and expects five.
    """
    pdf = pdfplumber.from_path(
        os.path.join(HERE, "pdfs/issue-13-151201DSP-Fond-581-90D.pdf")
    )

    # Only find checkboxes this size
    RECT_WIDTH = 9.3
    RECT_HEIGHT = 9.3
    RECT_TOLERANCE = 2

    def is_checkbox_sized(obj):
        # Both dimensions must lie strictly inside the tolerance band.
        # The lower width bound matters: without it, arbitrarily narrow
        # rects of checkbox height would be accepted as checkboxes.
        height_ok = (
            RECT_HEIGHT - RECT_TOLERANCE
            < obj['height']
            < RECT_HEIGHT + RECT_TOLERANCE
        )
        width_ok = (
            RECT_WIDTH - RECT_TOLERANCE
            < obj['width']
            < RECT_WIDTH + RECT_TOLERANCE
        )
        return height_ok and width_ok

    def boxes_overlap(a, b):
        # Two bounding boxes intersect when their x-ranges and their
        # y-ranges both intersect (touching edges count as overlap).
        x_overlap = max(a['x0'], b['x0']) <= min(a['x1'], b['x1'])
        y_overlap = max(a['y0'], b['y0']) <= min(a['y1'], b['y1'])
        return x_overlap and y_overlap

    def filter_rects(rects):
        # Just get the rects that are the right size to be checkboxes
        return [rect for rect in rects if is_checkbox_sized(rect)]

    def determine_if_checked(checkbox, curve_list):
        # A checkbox counts as checked when its bounding box overlaps the
        # bounding box of any curve (one stroke of the 'x').
        # This isn't foolproof, but works for this case.
        # It's not totally clear how commonly this style of checkbox is
        # used, and whether this is a useful approach to them.
        # Also note there should be *two* matching curves for each
        # checkbox, but here we only test that there's at least one.
        # NOTE(review): only the checkbox's size is verified; the size of
        # the curve itself is not checked -- confirm that is intended.
        if not is_checkbox_sized(checkbox):
            return False
        return any(boxes_overlap(checkbox, curve) for curve in curve_list)

    p0 = pdf.pages[0]
    curves = p0.objects["curve"]
    rects = filter_rects(p0.objects["rect"])
    n_checked = sum(determine_if_checked(rect, curves) for rect in rects)
    assert n_checked == 5
def test_rotation(self):
    # Compare the first page of the fixture held on self.pdf with the
    # first page of its rotated counterpart: width and height swap, the
    # cropbox is equal, and the bbox differs.
    rotated_path = os.path.join(
        HERE, "pdfs/nics-background-checks-2015-11-rotated.pdf"
    )
    rotated = pdfplumber.from_path(rotated_path)

    upright_page = self.pdf.pages[0]
    assert upright_page.width == 1008
    assert upright_page.height == 612

    rotated_page = rotated.pages[0]
    assert rotated_page.width == 612
    assert rotated_page.height == 1008

    assert rotated_page.cropbox == self.pdf.pages[0].cropbox
    assert rotated_page.bbox != self.pdf.pages[0].bbox
def extract(pdf_path: str, filter=None, flavor='lattice', lang: str = 'eng', **imgOcrSettings):
    '''
    Extract the table data contained in a PDF.

    Tables are read with camelot first. If the result does not yet hold one
    entry per page, every page number missing from the result is handed to
    ``extract_imgbase`` (the OCR fallback) and whatever it returns is merged in.

    :param pdf_path: path of the PDF file to read
    :param filter: forwarded unchanged to ``extract_imgbase``
    :param flavor: camelot flavor; also forwarded to ``extract_imgbase``
    :param lang: forwarded to ``extract_imgbase``
    :param imgOcrSettings: extra keyword arguments forwarded to ``extract_imgbase``
    :return: the list of tables accumulated by ``merge_table`` / ``merge_tables``
    '''
    pdf = pdfplumber.from_path(pdf_path)
    # try/finally guarantees the PDF handle is released on every exit path,
    # including the early return below and any exception raised on the way.
    try:
        total_page = len(pdf.pages)
        tables: list[PageTable] = []

        # Extract tables with camelot
        print('use camelot extract tables')
        camelot_tables = camelot.read_pdf(
            pdf_path, pages='all', flavor=flavor, suppress_stdout=False)
        for t in camelot_tables:
            # camelot page numbers are 1-based; pdf.pages is 0-based
            text = pdf.pages[t.page - 1].extract_text()
            merge_table(tables, t.page, t.data, text)

        # Done if every page is already covered
        if len(tables) == total_page:
            return tables

        # Otherwise fall back to OCR for the pages that are still missing.
        # Sorted so the fallback always runs in ascending page order rather
        # than in set-iteration order.
        extracted_pages = {t.page for t in tables}
        other_pages = sorted(set(range(1, total_page + 1)) - extracted_pages)
        for page_number in other_pages:
            other_tables = extract_imgbase(
                pdf, page_number, flavor, lang, filter, **imgOcrSettings)
            if other_tables is not None:
                # NOTE(review): `merge_tables` (plural) is a different name from
                # `merge_table` used above -- confirm both helpers exist.
                merge_tables(tables, page_number, other_tables)
        return tables
    finally:
        pdf.close()
def test_issue_33(self):
    # The issue-33 fixture must load and expose at least one metadata key.
    fixture = os.path.join(HERE, "pdfs/issue-33-lorem-ipsum.pdf")
    pdf = pdfplumber.from_path(fixture)
    metadata_keys = pdf.metadata.keys()
    assert len(metadata_keys) > 0
def test_issue_21(self):
    # The fixture must load and yield a non-empty `objects` collection.
    fixture = os.path.join(HERE, "pdfs/150109DSP-Milw-505-90D.pdf")
    pdf = pdfplumber.from_path(fixture)
    object_count = len(pdf.objects)
    assert object_count > 0
def test_issue_14(self):
    # The fixture must load and yield a non-empty `objects` collection.
    fixture = os.path.join(HERE, "pdfs/cupertino_usd_4-6-16.pdf")
    pdf = pdfplumber.from_path(fixture)
    object_count = len(pdf.objects)
    assert object_count > 0
def setUp(self):
    """Open the NICS background-checks fixture and store it on ``self.pdf``."""
    self.pdf = pdfplumber.from_path(
        os.path.join(HERE, "pdfs/nics-background-checks-2015-11.pdf")
    )
def setUp(self):
    """Open the PDF under test and record the width of its first page."""
    print('i am running')
    # NOTE(review): absolute, machine-specific path -- this fixture only
    # exists on the original author's machine.
    self.pdf = pdfplumber.from_path('/home/sxs/yuhsuan/datasets/personal/003.pdf')
    first_page = self.pdf.pages[0]
    self.PDF_WIDTH = first_page.width
def setUp(self):
    """Open the NICS background-checks fixture and record its first page's width."""
    self.pdf = pdfplumber.from_path(
        os.path.join(HERE, "pdfs/nics-background-checks-2015-11.pdf")
    )
    first_page = self.pdf.pages[0]
    self.PDF_WIDTH = first_page.width
def setUp(self):
    """Open the WARN-report fixture and record its first page's width."""
    self.pdf = pdfplumber.from_path(
        os.path.join(HERE, "pdfs/WARN-Report-for-7-1-2015-to-03-25-2016.pdf")
    )
    first_page = self.pdf.pages[0]
    self.PDF_WIDTH = first_page.width
def setUp(self):
    """Open the LA precinct-bulletin fixture and record its first page's width."""
    self.pdf = pdfplumber.from_path(
        os.path.join(HERE, "pdfs/la-precinct-bulletin-2014-p1.pdf")
    )
    first_page = self.pdf.pages[0]
    self.PDF_WIDTH = first_page.width
def test_load(self):
    # Smoke test: passes as long as loading the fixture does not raise.
    pdfplumber.from_path(os.path.join(HERE, "pdfs/cupertino_usd_4-6-16.pdf"))
def test_issue_53(self):
    # The issue-53 fixture must load and yield a non-empty `objects` collection.
    fixture = os.path.join(HERE, "pdfs/issue-53-example.pdf")
    pdf = pdfplumber.from_path(fixture)
    object_count = len(pdf.objects)
    assert object_count > 0
def test_issue_67(self):
    # The issue-67 fixture must load and expose at least one metadata key.
    fixture = os.path.join(HERE, "pdfs/issue-67-example.pdf")
    pdf = pdfplumber.from_path(fixture)
    metadata_keys = pdf.metadata.keys()
    assert len(metadata_keys) > 0