def extract_other_creditors_ef(page: pdfplumber.pdf.Page, start: Dict,
                               stop: Dict, creditors: List) -> List:
    """Process other creditors to be notified if any

    Crops three regions of the page positioned relative to the ``start`` and
    ``stop`` markers, extracts the key, address and account text from them
    and appends that entry to the ``other_creditors`` list of every creditor
    whose ``key`` equals the extracted key.

    :param page: Page to crop
    :param start: Marker for the top of the entry; its ``x0``, ``x1`` and
        ``top`` values are read
    :param stop: Marker for the bottom of the entry; its ``top`` value is
        read
    :param creditors: Creditors collected so far; matching entries are
        updated in place
    :return: The same ``creditors`` list that was passed in
    """
    key_bbox = (start["x0"], start["top"] - 20, start["x1"], start["top"])
    addy_bbox = (0, start["top"] - 20, start["x0"] - 20, stop["top"])
    acct_bbox = (start["x1"] + 150, start["top"] + 20, page.width, stop["top"])

    key = page.crop(key_bbox).filter(just_text_filter).extract_text()
    address = page.crop(addy_bbox).filter(keys_and_input_text).extract_text()
    acct = page.crop(acct_bbox).filter(keys_and_input_text).extract_text()

    # Creditor keys are compared against the string form of the extracted key.
    key_text = str(key)
    for creditor in creditors:
        if creditor["key"] == key_text:
            # The list is mutated in place, so no re-assignment is needed.
            creditor["other_creditors"].append({
                "address": address,
                "acct": acct,
                "key": key,
            })
    return creditors
def crop_and_extract(
    page: pdfplumber.pdf.Page,
    line: Dict,
    adjust=False,
    left_shift: int = 0,
    up_shift: int = 20,
) -> str:
    """Extract the text found in a band ending at a PDF line, if any

    The cropped band spans from ``x0 - left_shift`` to ``x1`` horizontally
    and from ``top - up_shift`` to ``top`` vertically. With ``adjust`` set,
    any other line found inside that band moves the band's upper edge to
    that line's ``top`` so only the text between the two lines is read.

    :param page: Page to crop
    :param line: Line to crop around
    :param adjust: Whether to check if another line is inside our crop
    :param left_shift: Leftward crop adjustment
    :param up_shift: Upward crop adjustment
    :return: Content of the section
    """
    left = int(line["x0"]) - left_shift
    upper = int(line["top"]) - up_shift
    right = line["x1"]
    lower = line["top"]

    region = page.crop((left, upper, right, lower))
    if not adjust:
        return region.filter(keys_and_input_text).extract_text()

    # Lines inside the band other than the one we were asked to crop around.
    other_tops = [seg["top"] for seg in region.lines if seg["top"] != lower]
    if other_tops:
        region = page.crop((left, other_tops[-1], right, lower))
    return region.filter(keys_and_input_text).extract_text()
def find_property_sections(
    only_page: pdfplumber.pdf.Page,
) -> Optional[Iterator[Tuple[Union[int, Any], int, Union[int, Any]]]]:
    """Locate the property sections of a page

    Section identifiers are words longer than two characters whose first
    character is one of ``P12345`` and whose second character is a period.
    Section bottoms are taken from page lines wider than 530 that lie below
    the first identifier.

    :param only_page: PDF page to crop
    :return: None when no identifier is found, otherwise an iterator of
        (top, bottom, identifier) tuples
    """
    words = only_page.filter(input_white_text_and_left_side).extract_words()

    markers = []
    for word in words:
        text = word["text"]
        if len(text) > 2 and text[0] in "P12345" and text[1] == ".":
            markers.append({"top": int(word["top"]), "text": text})

    if not markers:
        return None

    first_top = markers[0]["top"]
    wide_line_tops = [
        int(line["top"]) for line in only_page.lines
        if line["top"] > first_top and line["width"] > 530
    ]
    # Never keep more bottoms than there are identifiers.
    bottoms = wide_line_tops[:len(markers)]

    tops = [marker["top"] for marker in markers]
    keys = [marker["text"] for marker in markers]
    return zip(tops, bottoms, keys)
def parse_unsecured_creditors(page: pdfplumber.pdf.Page, top: int,
                              bottom: int) -> Dict:
    """Extract the information in one unsecured creditor section

    Lines of the section are visited from top to bottom. Collection starts
    at the first line no wider than 20 whose extracted text equals the
    section key; every later line contributes its text until the section's
    field limit is reached.

    :param page: PDF page
    :param top: Y coordinate of the top of section
    :param bottom: Y coordinate of the bottom of section
    :return: Organized creditor data, or an empty dict if nothing matched
    """
    # Wider crop used for text extraction; it extends above the section.
    context = page.crop((0, max(100, top - 500), page.width, bottom))
    section = context.crop((0, top, page.width, bottom))

    key = section.filter(key_filter).extract_text().replace("\n", "")
    boxes = get_checkboxes(section)
    section_lines = section.filter(remove_margin_lines).lines

    collected = []
    for line in sorted(section_lines, key=lambda seg: seg["top"]):
        started = bool(collected)
        if not started and line["width"] > 20:
            continue

        text = crop_and_extract(context, line, adjust=True, up_shift=100)

        # Until the key line is found, skip everything that is not the key.
        if not started and (text is None or key != text.replace("\n", "")):
            continue

        count = len(collected)
        if (count == 10 and "2." in key) or (count == 8 and "4." in key):
            continue
        collected.append(text)

    if not collected:
        return {}
    return make_creditor_dict(collected, boxes, key)
def extract_other_creditors_d(page: pdfplumber.pdf.Page, markers: List[Dict],
                              creditors: List) -> List:
    """Crop and extract address, key and acct # from the PDF

    The extracted entry is appended to the ``other_creditors`` list of
    every creditor whose ``key`` equals the extracted (stripped) key.

    :param page: PDF page
    :param markers: The markers bounding the entry; the first, second,
        third-from-last and last markers are read
    :param creditors: Creditors collected so far; matching entries are
        updated in place
    :return: The same ``creditors`` list that was passed in
    """
    # The key box reaches 12 units higher unless there are exactly 5 markers.
    adjust = 0 if len(markers) == 5 else 12

    addy_bbox = (
        0,
        markers[0]["top"],
        int(markers[-1]["x1"]) * 0.35,
        markers[-1]["top"],
    )
    key_bbox = (
        markers[-3]["x0"],
        markers[0]["top"] - adjust,
        markers[-3]["x1"],
        markers[-3]["top"],
    )
    acct_bbox = (
        markers[1]["x0"],
        markers[1]["top"] - 12,
        markers[1]["x1"],
        markers[1]["top"],
    )

    address = page.crop(addy_bbox).filter(keys_and_input_text).extract_text()
    key = page.crop(key_bbox).filter(
        keys_and_input_text).extract_text().strip()
    acct = page.crop(acct_bbox).filter(keys_and_input_text).extract_text()

    for creditor in creditors:
        if creditor["key"] == key:
            # The list is mutated in place, so no re-assignment is needed.
            creditor["other_creditors"].append({
                "key": key,
                "address": address,
                "acct": acct,
            })
    return creditors
def get_3_to_8_form_a_b(
    page: pdfplumber.pdf.Page,
) -> Tuple[List[Union[Dict[str, Any], Dict[Optional[Any], str]]], list,
           Optional[dict]]:
    """Parse sections 3 to 8 of 106 A/B property form

    The filtered page text is walked row by row while counting rows that
    start with "Part <digit>:". Inside parts 3 to 7, rows that do not look
    like a numbered question are buffered and stored under the preceding
    numbered row once the next numbered row is reached. Rows in part 8 are
    buffered and turned into totals when a row containing "63. " is seen.

    :param page:The pdf page to parse
    :return: A tuple of (per-question results, debtors as returned by
        ``get_ab_debtors``, part 8 totals or ``None`` if no "63. " row was
        seen).
    """
    part = 0
    totals, section, key = None, None, None
    results, data, part_eight = [], [], []
    rows = page.filter(filter_106_ab_content).extract_text().splitlines()
    debtors = get_ab_debtors(rows)
    # Remove debtor rows from lines
    rows = [r for r in rows if r not in debtors]
    # Also drop any row that merely contains a debtor string.
    for debtor in debtors:
        rows = [r for r in rows if debtor not in r]
    # The first remaining row is skipped.
    for row in rows[1:]:
        # Each "Part N:" heading advances the counter; the digit itself is
        # not read.
        match = re.match(r"Part \d:", row)
        if match:
            part += 1
            continue
        # Extract parts 3 to 7
        if part in [3, 4, 5, 6, 7]:
            # A numbered row starts with one or two digits and a period; any
            # row starting with "5" is treated the same way.
            match = re.match(r"^\d{1,2}\. ?|^5", row)
            if not match:
                # Not a numbered row: buffer it for the current question.
                data.append(row)
                continue
            # A repeat of the current numbered row is ignored.
            if section == row:
                continue
            if "54. " in row:
                # Row 54 carries its value on the same line.
                results.append({"54.": row.split(" ")[1]})
            if key:
                # Buffered rows containing "[" are discarded.
                data = [d for d in data if "[" not in d]
                if data:
                    # NOTE(review): special case for question 24 whose only
                    # buffered row is "2"; the reason is not visible here.
                    if key == "24." and data == ["2"]:
                        data = []
                        continue
                    results.append({key: clean_ab_data(data)})
            # Start buffering for the numbered row just reached.
            data = []
            section = row
            key = row
        if part == 8:
            # Part 8 is the section containing grand totals.
            part_eight.append(row)
            if "63. " in row:
                # this is the final row of Part 8
                totals = make_ab_totals(part_eight)
    return results, debtors, totals
def get_1_to_2_from_a_b(only_page: pdfplumber.pdf.Page) -> List[Dict]:
    """Extract real estate, automobile, jet ski, boats etc, from A/B.

    Each section found on the page is cropped and converted to a dict by
    the builder matching its identifier ("1.", "3." or "4."), then
    completed with the checkbox values read from the same crop.

    :param only_page: The PDF page to extract from
    :return: Extracted content, one dict per recognised section
    """
    unreadable = "Checkbox unreadable"
    found = []

    sections = find_property_sections(only_page)
    if not sections:
        return found

    for top, bottom, key in sections:
        crop = only_page.crop((0, top, only_page.width, bottom))
        data = get_all_values_from_crop(crop.lines, only_page)

        if "1." in key:
            entry = make_property_dict(key, data)
            boxes = get_checkboxes(crop)
            if boxes:
                entry["property_interest"] = boxes["property"]
                entry["debtor"] = boxes["debtor"]
            else:
                entry["property_interest"] = unreadable
                entry["debtor"] = unreadable
            found.append(entry)

        # Checked independently of the branch above, as both may apply.
        is_car = "3." in key
        if is_car or "4." in key:
            builder = make_car_dict if is_car else make_other_dict
            entry = builder(key, data)
            boxes = get_checkboxes(crop)
            entry["debtor"] = boxes["debtor"] if boxes else unreadable
            found.append(entry)

    return found
def parse_secured_creditors(only_page: pdfplumber.pdf.Page, top: int,
                            bottom: int) -> Dict:
    """Find and extract content from secured creditor portion of 106D

    Lines of the section are visited from top to bottom. Collection starts
    at the first line no wider than 20 whose extracted text equals the
    section key; every later line contributes the text cropped from just
    before it. A result is only produced when more than ten values were
    collected.

    :param only_page: PDF page
    :param top: Y coordinate for top of section
    :param bottom: Y coordinate of bottom of section
    :return: Organized data of the section, or an empty dict
    """
    # Wider crop used for text extraction; it extends above the section.
    context = only_page.crop(
        (0, max(100, top - 500), only_page.width, bottom))
    section = context.crop((0, top, only_page.width, bottom))
    key = section.filter(key_filter).extract_text()
    checkboxes = get_checkboxes(section)

    ordered_lines = sorted(section.filter(remove_margin_lines).lines,
                           key=lambda seg: seg["top"])
    data = []
    for line in ordered_lines:
        line_top = int(line["top"])
        if not data and line["width"] > 20:
            continue

        x0, x1 = line["x0"], line["x1"]
        window = context.crop((x0, line_top - 200, x1, line_top))
        other_tops = [
            seg["top"] for seg in window.lines if int(seg["top"]) != line_top
        ]
        if other_tops:
            # The crop's upper edge depends on how many values are already
            # collected (positions 6 and 8 use fixed offsets).
            collected = len(data)
            if collected == 6:
                bbox = (x0, other_tops[-1] - 20, x1, line_top)
            elif collected == 8:
                bbox = (x0, line_top - 50, x1, line_top)
            else:
                bbox = (x0, other_tops[-1], x1, line["top"])
            window = context.crop(bbox)

        text = window.filter(keys_and_input_text).extract_text()
        if data or key == text:
            data.append(text)

    if len(data) > 10:
        return make_secured_creditor_dict(data, checkboxes)
    return {}
def get_checkboxes(crop: pdfplumber.pdf.Page) -> Dict:
    """Find and identify checked checkboxes

    Using multiple tolerances, find checkboxes and identify them by the
    content to the right of the checkbox. Lines containing "√" are treated
    as checked and are sorted into categories by the keywords they contain.
    Results of the different tolerances are merged: a later tolerance only
    overrides a category when it found something for it.

    :param crop: Section of pdf to extract checkboxes from
    :return: Dictionary of selected checkboxes with the keys ``debtor``,
        ``community``, ``offset``, ``info``, ``claim_type`` and
        ``property``; an empty dict if any pass finds no "[]" in the text
    """
    results = {}
    # Use multiple tolerances to line up checkboxes on weird PDFs
    for tolerance in [3, 4, 5]:
        filtered_data = crop.filter(filter_boxes).extract_text(
            y_tolerance=tolerance)
        filtered_data = filtered_data.replace(
            "Type of NONPRIORITY unsecured claim:", "")
        if "[]" not in filtered_data:
            # Checkboxes unreadable
            # Note that this also discards results of earlier tolerances.
            return {}
        filtered_lines = filtered_data.splitlines()
        # NOTE(review): as shown this replace is a no-op; the first argument
        # was probably a different whitespace sequence (e.g. two spaces)
        # lost in formatting - confirm against the original file.
        checkboxes = [x.replace(" ", " ") for x in filtered_lines if "[" in x]
        # Keywords used to assign a checked line to a category.
        query1 = ["debtor"]
        query2 = ["community", "see instructions", "claim relates"]
        query3 = ["No", "Yes"]
        query4 = ["contingent", "unliquidated", "disputed"]
        query5 = [
            "domestic",
            "taxes",
            "death",
            "specify",
            "loans",
            "obligations",
            "pension",
            "including",
            "judgment",
            "statutory",
            "agreement",
        ]
        # For every category: keep the text after the first space of each
        # checked line that mentions one of the category's keywords.
        debtor = [
            box.split(" ", 1)[1].strip() for box in checkboxes
            if "√" in box and any(s in box.lower() for s in query1)
        ]
        community = [
            box.split(" ", 1)[1].strip() for box in checkboxes
            if "√" in box and any(s in box.lower() for s in query2)
        ]
        # Yes/No matching is case sensitive, unlike the other categories.
        offset = [
            box.split(" ", 1)[1].strip() for box in checkboxes
            if "√" in box and any(s in box for s in query3)
        ]
        # Only answers that are exactly "Yes" or "No" are kept.
        offset = [ans for ans in offset if re.match(r"^(Yes|No)$", ans)]
        info = [
            box.split(" ", 1)[1].strip() for box in checkboxes
            if "√" in box and any(s in box.lower() for s in query4)
        ]
        claim_type = [
            box.split(" ", 1)[1].strip() for box in checkboxes
            if "√" in box and any(s in box.lower() for s in query5)
        ]
        property_values = [
            box.split(" ", 1)[1].strip() for box in checkboxes
            if "√" in box and any(s in box for s in property_options)
        ]
        # Normalise to the matching entries of property_options.
        property_values = [
            s for s in property_options
            if any(s in box for box in property_values)
        ]
        if claim_type:
            if "Specify" in claim_type[0]:
                # NOTE(review): this literal was split across a line break
                # in the reviewed copy; the exact whitespace between
                # "Other." and "Specify" should be confirmed.
                claim_type = ["Other. Specify"]
        data = {
            "debtor": debtor,
            "community": community,
            "offset": offset,
            "info": info,
            "claim_type": claim_type,
            "property": property_values,
        }
        if not results:
            results = data
        else:
            # Merge only the categories this pass actually found, so empty
            # lists never overwrite earlier findings.
            datum = [{k: v} for k, v in data.items() if v != []]
            data = {}
            for item in datum:
                data = {**data, **item}
            results = {**results, **data}
    return results