def test_pandas(self):
    """Check per-column totals and the month header via a pandas DataFrame.

    The first page is cropped to the box (0, 80, PDF_WIDTH, 485) and its
    table is extracted with text-based horizontal rules plus one explicit
    vertical line at the smallest character ``x0`` in the crop. Every column
    after the first is converted to integers. The last row is compared
    against the column sum, so that sum (which includes the last row) must
    equal twice the last row's value.
    """
    first_page = self.pdf.pages[0]
    body = first_page.crop((0, 80, self.PDF_WIDTH, 485))

    leftmost_x0 = min(map(itemgetter("x0"), body.chars))
    settings = {
        "horizontal_strategy": "text",
        "explicit_vertical_lines": [leftmost_x0],
        "intersection_tolerance": 5,
    }
    frame = pd.DataFrame(body.extract_table(settings))

    def to_int(cell):
        """Parse a comma-grouped digit string; null or empty cells give None."""
        if pd.isnull(cell) or cell == "":
            return None
        return int(cell.replace(",", ""))

    frame.columns = COLUMNS
    value_columns = frame.columns[1:]
    frame[value_columns] = frame[value_columns].applymap(to_int)

    # Skip COLUMNS[0]: the first column is the state name, not a number.
    for name in COLUMNS[1:]:
        series = frame[name]
        last_row_value = series.iloc[-1]
        assert series.sum() == last_row_value * 2

    header_chars = within_bbox(page_chars := first_page.chars, (0, 35, self.PDF_WIDTH, 65)) if False else within_bbox(first_page.chars, (0, 35, self.PDF_WIDTH, 65))
    assert collate_chars(header_chars) == "November - 2015"
def test_pandas(self):
    """Check per-column totals and the month header via a pandas DataFrame.

    The first page is cropped to the box (0, 80, PDF_WIDTH, 485) and its
    table is extracted with the "gutters" horizontal setting. Every column
    after the first is converted to integers. The last row is compared
    against the column sum, so that sum (which includes the last row) must
    equal twice the last row's value.
    """
    first_page = self.pdf.pages[0]
    table_area = first_page.crop((0, 80, self.PDF_WIDTH, 485))
    rows = table_area.extract_table(
        h="gutters",
        x_tolerance=5,
        y_tolerance=5,
        gutter_min_height=5,
    )
    frame = pd.DataFrame(rows)

    def to_int(cell):
        """Parse a comma-grouped digit string; null cells give None."""
        return None if pd.isnull(cell) else int(cell.replace(",", ""))

    frame.columns = COLUMNS
    value_columns = frame.columns[1:]
    frame[value_columns] = frame[value_columns].applymap(to_int)

    # Skip COLUMNS[0]: the first column is the state name, not a number.
    for name in COLUMNS[1:]:
        series = frame[name]
        last_row_value = series.iloc[-1]
        assert series.sum() == last_row_value * 2

    header_box = (0, 35, self.PDF_WIDTH, 65)
    header_chars = within_bbox(first_page.chars, header_box)
    assert collate_chars(header_chars, x_tolerance=2) == "November - 2015"
def test_plain(self):
    """Check per-column totals and the month header using plain Python.

    The first page is cropped to the box (0, 80, PDF_WIDTH, 485) and its
    table is extracted with text-based horizontal rules plus one explicit
    vertical line at the smallest character ``x0`` in the crop. Each row
    becomes a dict keyed by ``COLUMNS``. The last row is compared against
    the column sum, so that sum (which includes the last row) must equal
    twice the last row's value.
    """
    page = self.pdf.pages[0]
    cropped = page.crop((0, 80, self.PDF_WIDTH, 485))
    table = cropped.extract_table({
        "horizontal_strategy": "text",
        "explicit_vertical_lines": [
            min(map(itemgetter("x0"), cropped.chars)),
        ],
        "intersection_tolerance": 5,
    })

    def parse_value(k, x):
        """Return cell ``x`` of column index ``k`` in its parsed form.

        Column 0 is passed through untouched; missing or empty cells give
        None; anything else is parsed as a comma-grouped integer.
        """
        if k == 0:
            return x
        if x in (None, ""):
            return None
        return int(x.replace(",", ""))

    def parse_row(row):
        """Map one extracted row onto the names in ``COLUMNS``."""
        return {COLUMNS[i]: parse_value(i, v) for i, v in enumerate(row)}

    parsed_table = [parse_row(row) for row in table]

    # [1:] because first column is state name
    for c in COLUMNS[1:]:
        total = parsed_table[-1][c]
        # Missing cells (None) count as zero in the column sum.
        colsum = sum(row[c] or 0 for row in parsed_table)
        assert colsum == total * 2

    month_chars = within_bbox(page.chars, (0, 35, self.PDF_WIDTH, 65))
    month_text = collate_chars(month_chars)
    assert month_text == "November - 2015"
def test_plain(self):
    """Check per-column totals and the month header using plain Python.

    The first page is cropped to the box (0, 80, PDF_WIDTH, 485) and its
    table is extracted with the "gutters" horizontal setting. Each row
    becomes a dict keyed by ``COLUMNS``. The last row is compared against
    the column sum, so that sum (which includes the last row) must equal
    twice the last row's value.
    """
    page = self.pdf.pages[0]
    cropped = page.crop((0, 80, self.PDF_WIDTH, 485))
    table = cropped.extract_table(
        h="gutters",
        x_tolerance=5,
        y_tolerance=5,
        gutter_min_height=5,
    )

    def parse_value(k, x):
        """Return cell ``x`` of column index ``k`` in its parsed form.

        Column 0 is passed through untouched; missing cells give None;
        anything else is parsed as a comma-grouped integer.
        """
        if k == 0:
            return x
        # Identity test: None is a singleton, so `is` is the correct check.
        if x is None:
            return None
        return int(x.replace(",", ""))

    def parse_row(row):
        """Map one extracted row onto the names in ``COLUMNS``."""
        return {COLUMNS[i]: parse_value(i, v) for i, v in enumerate(row)}

    parsed_table = [parse_row(row) for row in table]

    # [1:] because first column is state name
    for c in COLUMNS[1:]:
        total = parsed_table[-1][c]
        # Missing cells (None) count as zero in the column sum.
        colsum = sum(row[c] or 0 for row in parsed_table)
        assert colsum == total * 2

    month_chars = within_bbox(page.chars, (0, 35, self.PDF_WIDTH, 65))
    month_text = collate_chars(month_chars, x_tolerance=2)
    assert month_text == "November - 2015"
def test_pandas(self):
    """Validate the extracted table with pandas and confirm the month text.

    The table comes from the first page, cropped to the box
    (0, 80, PDF_WIDTH, 485) and extracted with the "gutters" horizontal
    setting. All columns but the first are parsed to integers. For each of
    them the column sum, which includes the last row, must be exactly
    double the last row's value.
    """
    pg = self.pdf.pages[0]
    region = pg.crop((0, 80, self.PDF_WIDTH, 485))
    extracted = region.extract_table(
        h="gutters", x_tolerance=5, y_tolerance=5, gutter_min_height=5
    )
    df = pd.DataFrame(extracted)

    def as_number(raw):
        """Strip thousands separators and convert to int; nulls give None."""
        if pd.isnull(raw):
            return None
        digits = raw.replace(",", "")
        return int(digits)

    df.columns = COLUMNS
    numeric = df.columns[1:]
    df[numeric] = df[numeric].applymap(as_number)

    # The leading column holds the state name, so start from index 1.
    for column in COLUMNS[1:]:
        final_value = df[column].iloc[-1]
        column_sum = df[column].sum()
        assert column_sum == final_value * 2

    month_region = (0, 35, self.PDF_WIDTH, 65)
    month_text = collate_chars(within_bbox(pg.chars, month_region), x_tolerance=2)
    assert month_text == "November - 2015"
def precinct(self):
    """Build the precinct identifier from the left part of the "h1" box.

    The second-to-last coordinate of the "h1" bounding box is halved
    (presumably its right edge, which would keep only the left half --
    confirm against the bbox layout). The characters inside are grouped by
    ``top`` and collated, and the last resulting line is split on runs of
    two or more whitespace characters. Fields 1 and 2 of that split are
    joined with "|".
    """
    left_box = list(self.bboxes["h1"])
    left_box[-2] = float(left_box[-2]) / 2

    chars_in_box = within_bbox(self.chars, left_box)
    lines = chars_in_box.groupby("top").apply(_collate_chars)

    fields = re.split(r"\s{2,}", lines.iloc[-1])
    return "|".join(fields[1:3])
def objects(self):
    """Return the parent page's objects restricted to this object's bbox.

    The result is computed once and stored on ``self._objects``; later
    calls return the stored value. When ``self.strict`` is truthy the
    filter is called with ``strict=True``, otherwise with ``crop=True``.
    """
    if not hasattr(self, "_objects"):
        mode = {"strict": True} if self.strict else {"crop": True}
        self._objects = utils.within_bbox(
            self.parent_page.objects, self.bbox, **mode
        )
    return self._objects
def results(self):
    """Collect the parsed contents of boxes "c1" to "c4" into one list.

    For each key, in order, the characters inside that bounding box are
    passed to ``self.parse_col`` and whatever it yields is appended to the
    combined result.
    """
    combined = []
    for key in ("c1", "c2", "c3", "c4"):
        column_chars = within_bbox(self.chars, self.bboxes[key])
        combined.extend(self.parse_col(column_chars))
    return combined
def registered_voters(self):
    """Return the registered-voter count read from the "h2" box.

    Characters inside the "h2" bounding box are grouped by ``top`` and
    collated; the second group (index 1) must start with
    "<digits> REGISTERED VOTERS". If it does not, ``re.match`` yields None
    and the ``.group`` call raises AttributeError.
    """
    chars_in_box = within_bbox(self.chars, self.bboxes["h2"])
    lines = chars_in_box.groupby("top").apply(collate_chars)
    found = re.match(r"(\d+) REGISTERED VOTERS", lines.iloc[1])
    return int(found.group(1))
def ballots_cast(self):
    """Return the ballots-cast count read from the "h2" box.

    Characters inside the "h2" bounding box are grouped by ``top`` and
    collated; the first group (index 0) must start with
    "<digits> BALLOTS CAST". If it does not, ``re.match`` yields None and
    the ``.group`` call raises AttributeError.
    """
    chars_in_box = within_bbox(self.chars, self.bboxes["h2"])
    lines = chars_in_box.groupby("top").apply(collate_chars)
    found = re.match(r"(\d+) BALLOTS CAST", lines.iloc[0])
    return int(found.group(1))
def registered_voters(self):
    """Return the registered-voter count read from the "h2" box.

    Characters inside the "h2" bounding box are grouped by ``top`` and
    collated; the second group (index 1) must start with
    "<digits> REGISTERED VOTERS". If it does not, ``re.match`` yields None
    and the ``.group`` call raises AttributeError.
    """
    header_chars = within_bbox(self.chars, self.bboxes["h2"])
    text_by_top = header_chars.groupby("top").apply(_collate_chars)
    second_line = text_by_top.iloc[1]
    hit = re.match(r"(\d+) REGISTERED VOTERS", second_line)
    return int(hit.group(1))
def ballots_cast(self):
    """Return the ballots-cast count read from the "h2" box.

    Characters inside the "h2" bounding box are grouped by ``top`` and
    collated; the first group (index 0) must start with
    "<digits> BALLOTS CAST". If it does not, ``re.match`` yields None and
    the ``.group`` call raises AttributeError.
    """
    header_chars = within_bbox(self.chars, self.bboxes["h2"])
    text_by_top = header_chars.groupby("top").apply(_collate_chars)
    first_line = text_by_top.iloc[0]
    hit = re.match(r"(\d+) BALLOTS CAST", first_line)
    return int(hit.group(1))
def test_plain(self):
    """Check per-column totals and the month header using plain Python.

    The first page is cropped to the box (0, 80, PDF_WIDTH, 485) and its
    table is extracted with the "gutters" horizontal setting. Each row
    becomes a dict keyed by ``COLUMNS``. The last row is compared against
    the column sum, so that sum (which includes the last row) must equal
    twice the last row's value.
    """
    page = self.pdf.pages[0]
    cropped = page.crop((0, 80, self.PDF_WIDTH, 485))
    table = cropped.extract_table(
        h="gutters",
        x_tolerance=5,
        y_tolerance=5,
        gutter_min_height=5,
    )

    def parse_value(k, x):
        """Return cell ``x`` of column index ``k`` in its parsed form.

        Column 0 is passed through untouched; missing cells give None;
        anything else is parsed as a comma-grouped integer.
        """
        if k == 0:
            return x
        # Identity test: None is a singleton, so `is` is the correct check.
        if x is None:
            return None
        return int(x.replace(",", ""))

    def parse_row(row):
        """Map one extracted row onto the names in ``COLUMNS``."""
        return {COLUMNS[i]: parse_value(i, v) for i, v in enumerate(row)}

    parsed_table = [parse_row(row) for row in table]

    # [1:] because first column is state name
    for c in COLUMNS[1:]:
        total = parsed_table[-1][c]
        # Missing cells (None) count as zero in the column sum.
        colsum = sum(row[c] or 0 for row in parsed_table)
        assert colsum == total * 2

    month_chars = within_bbox(page.chars, (0, 35, self.PDF_WIDTH, 65))
    month_text = collate_chars(month_chars, x_tolerance=2)
    assert month_text == "November - 2015"