def test_plain(self):
    """Extract the table from the first page and sanity-check its columns.

    The page is cropped to a fixed vertical band, the table is extracted
    with text-based horizontal lines plus one explicit vertical line at the
    leftmost character, and every column after the first is checked so that
    the sum over all rows (last row included) is exactly twice the last
    row's value. Finally the text inside a fixed header band must read
    "November - 2015".
    """
    first_page = self.pdf.pages[0]
    body = first_page.crop((0, 80, self.PDF_WIDTH, 485))

    left_edge = min(ch["x0"] for ch in body.chars)
    settings = {
        "horizontal_strategy": "text",
        "explicit_vertical_lines": [left_edge],
        "intersection_tolerance": 5,
    }
    raw_rows = body.extract_table(settings)

    def to_value(col_index, cell):
        # Column 0 is kept verbatim; other columns become ints (or None
        # when the cell is missing/empty), with thousands separators removed.
        if col_index == 0:
            return cell
        if cell in (None, ""):
            return None
        return int(cell.replace(",", ""))

    def to_record(cells):
        return {
            COLUMNS[idx]: to_value(idx, cell)
            for idx, cell in enumerate(cells)
        }

    records = [to_record(cells) for cells in raw_rows]
    last_record = records[-1]

    # Skip the first column: it holds the state name, not a number.
    for column in COLUMNS[1:]:
        expected_total = last_record[column]
        column_sum = sum(record[column] or 0 for record in records)
        assert column_sum == (expected_total * 2)

    header_chars = within_bbox(first_page.chars, (0, 35, self.PDF_WIDTH, 65))
    header_text = extract_text(header_chars)
    assert header_text == "November - 2015"
def extract_text(self, x_tolerance=utils.DEFAULT_X_TOLERANCE, y_tolerance=utils.DEFAULT_Y_TOLERANCE):
    """Return the result of ``utils.extract_text`` applied to ``self.chars``.

    Args:
        x_tolerance: Forwarded unchanged as ``x_tolerance``; defaults to
            ``utils.DEFAULT_X_TOLERANCE``.
        y_tolerance: Forwarded unchanged as ``y_tolerance``; defaults to
            ``utils.DEFAULT_Y_TOLERANCE``.
    """
    tolerances = {
        "x_tolerance": x_tolerance,
        "y_tolerance": y_tolerance,
    }
    return utils.extract_text(self.chars, **tolerances)
def test_extract_text_layout(self):
    """Check layout-mode text extraction against a stored reference file.

    The first page of ``self.pdf_scotus`` is extracted with ``layout=True``
    both through the page method and through ``utils.extract_text``; the two
    results must agree with each other and with the contents of
    ``comparisons/scotus-transcript-p1.txt`` (resolved relative to ``HERE``).
    """
    target_path = os.path.join(HERE, "comparisons/scotus-transcript-p1.txt")
    # Use a context manager so the fixture file is closed even when one of
    # the assertions below fails (the original left the handle open).
    with open(target_path) as target_file:
        target = target_file.read()

    page = self.pdf_scotus.pages[0]
    text = page.extract_text(layout=True)
    utils_text = utils.extract_text(page.chars, layout=True)

    assert text == utils_text
    assert text == target
def extract(self, x_tolerance=utils.DEFAULT_X_TOLERANCE, y_tolerance=utils.DEFAULT_Y_TOLERANCE):
    """Return the table's contents as a list of rows of cell text.

    For every row in ``self.rows`` and every cell in ``row.cells``:

    * a ``None`` cell yields ``None``;
    * a cell containing no characters yields ``""``;
    * otherwise the cell's characters are passed to ``utils.extract_text``
      and the stripped result is used.

    Side effect: characters assigned to a cell are appended to
    ``self.used_chars``.

    Args:
        x_tolerance: Forwarded to ``utils.extract_text``.
        y_tolerance: Forwarded to ``utils.extract_text``.

    Returns:
        A list with one inner list per row; each inner list holds one entry
        (``str`` or ``None``) per cell.
    """
    chars = self.page.chars
    table_arr = []

    def char_in_bbox(char, bbox):
        # A character belongs to a bbox when its midpoint lies inside it;
        # the left/top edges are inclusive, the right/bottom edges exclusive.
        v_mid = (char["top"] + char["bottom"]) / 2
        h_mid = (char["x0"] + char["x1"]) / 2
        x0, top, x1, bottom = bbox
        return (x0 <= h_mid < x1) and (top <= v_mid < bottom)

    for row in self.rows:
        arr = []
        # Narrow to the row first so each cell scans only the row's chars.
        row_chars = [char for char in chars if char_in_bbox(char, row.bbox)]

        for cell in row.cells:
            # Identity test: ``None`` marks a missing cell.
            if cell is None:
                arr.append(None)
                continue

            cell_chars = [
                char for char in row_chars if char_in_bbox(char, cell)
            ]
            if cell_chars:
                self.used_chars += cell_chars
                cell_text = utils.extract_text(
                    cell_chars,
                    x_tolerance=x_tolerance,
                    y_tolerance=y_tolerance,
                ).strip()
            else:
                cell_text = ""
            arr.append(cell_text)

        table_arr.append(arr)

    return table_arr
def parse_page(page):
    """Parse one PDF page into a validated ``pandas.DataFrame``.

    Steps, in order:

    1. Collect the characters whose ``non_stroking_color`` is ``(1, 0, 0)``,
       extract their text and turn it into a month via ``parse_month``; the
       month is echoed to stderr as a progress indicator.
    2. Crop the page from the bottom of the word "State" down to the bottom
       of the last rect, and extract a table using text-based horizontal
       lines and explicit vertical lines (leftmost char plus every distinct
       edge ``x0``).
    3. Prefix every row with the month, label the columns with ``COLUMNS``,
       apply ``parse_value`` to every column from the third onward, rewrite
       the state ``"llinois"`` to ``"Illinois"``, and drop rows whose state
       is missing or blank.
    4. Run ``validate_data`` on the result.

    Args:
        page: A page object exposing ``chars``, ``extract_words()``,
            ``rects``, ``width`` and ``crop()``.

    Returns:
        The cleaned table as a ``pandas.DataFrame``.

    Raises:
        IndexError: If no word with text ``"State"`` is found on the page.
        Exception: ``"Invalid data for <month>"`` when ``validate_data``
            raises; the original error is attached as ``__cause__``.
    """
    # NOTE(review): (1, 0, 0) is presumably pure red in RGB - confirm.
    month_chars = [
        c for c in page.chars if c["non_stroking_color"] == (1, 0, 0)
    ]
    month_text = extract_text(month_chars, x_tolerance=2)
    month = parse_month(month_text)
    # Carriage return rewrites the same terminal line on each call.
    sys.stderr.write("\r" + month)

    table_crop = page.crop((
        0,
        [w for w in page.extract_words() if w["text"] == "State"][0]["bottom"],
        page.width,
        page.rects[-1]["bottom"],
    ))

    edge_xs = list(set(map(itemgetter("x0"), table_crop.edges)))
    leftmost_char = min(map(itemgetter("x0"), table_crop.chars))

    _table = table_crop.extract_table({
        "horizontal_strategy": "text",
        "vertical_strategy": "explicit",
        "explicit_vertical_lines": [leftmost_char] + edge_xs,
        "intersection_tolerance": 5,
        "text_y_tolerance": 0,
        "text_x_tolerance": 2,
    })

    table = pd.DataFrame([[month] + row for row in _table])
    table.columns = COLUMNS
    # The first two columns are left as-is; the rest go through parse_value.
    table[table.columns[2:]] = table[table.columns[2:]].applymap(parse_value)

    # Repair the state name that comes out of extraction as "llinois".
    table.loc[(table["state"] == "llinois"), "state"] = "Illinois"
    table = table.loc[lambda df: df["state"].fillna("").str.strip() != ""]

    try:
        validate_data(table)
    except Exception as err:
        # Catch only ordinary errors so KeyboardInterrupt/SystemExit still
        # propagate, and chain the cause so the validation failure is kept.
        raise Exception("Invalid data for " + month) from err

    return table
def extract_text(self, x_tolerance=0, y_tolerance=0):
    """Return the result of ``utils.extract_text`` applied to ``self.chars``.

    Args:
        x_tolerance: Forwarded unchanged as ``x_tolerance``; defaults to 0.
        y_tolerance: Forwarded unchanged as ``y_tolerance``; defaults to 0.
    """
    extract = utils.extract_text
    chars = self.chars
    return extract(chars, x_tolerance=x_tolerance, y_tolerance=y_tolerance)