def try_tesco_bank(text_boxes, parent_logger) -> Optional[NameComponents]:
    """Recognise Tesco Bank documents and extract their naming components.

    Two layouts are handled: the "Annual Summary of Interest" communication,
    and Current Account documents that carry a ``tescobank.com/mmc`` link.

    Args:
        text_boxes: Ordered sequence of text box strings for the document.
        parent_logger: Logger from which a ``tesco_bank`` child is derived.

    Returns:
        The ``NameComponents`` for the document, or ``None`` when neither
        layout is recognised.

    Raises:
        AssertionError: If the document is recognised but one of the layout
            expectations checked below does not hold.
    """
    logger = parent_logger.getChild("tesco_bank")

    # Before checking for statements, check other communications.
    if text_boxes[0].startswith("Tesco Bank\n") and any(
        box.startswith("Annual Summary of Interest\n") for box in text_boxes
    ):
        assert "Minicom:" in text_boxes[2]

        account_holder_name = text_boxes[4].strip()

        tax_year_line = [box for box in text_boxes if box.startswith("Tax Year:")]
        assert len(tax_year_line) == 1

        # Only the end of the tax year is captured; it becomes the document date.
        tax_year_match = re.search(
            r"^Tax Year: [0-9]{1,2} [A-Z][a-z]+ [0-9]{4} to ([0-9]{1,2} [A-Z][a-z]+ [0-9]{4})\n$",
            tax_year_line[0],
        )
        assert tax_year_match

        # Pin the language like the statement branch below does, so parsing
        # does not depend on dateparser's language auto-detection.
        document_date = dateparser.parse(tax_year_match.group(1), languages=["en"])

        return NameComponents(
            document_date,
            "Tesco Bank",
            account_holder_name,
            "Annual Summary of Interest",
        )

    if not any("tescobank.com/mmc" in box for box in text_boxes):
        return None

    assert "Current Account\n" in text_boxes[0]

    # The second box is the document title; monthly statements get the short
    # "Statement" label, anything else is title-cased as-is.
    if text_boxes[1] == "Monthly statement\n":
        document_type = "Statement"
    else:
        document_type = text_boxes[1].strip().title()

    account_holder_name = extract_account_holder_from_address(text_boxes[2])

    # Labels and values live in two adjacent boxes.
    fields_box = text_boxes[3]
    values_box = text_boxes[4]

    statement_info = build_dict_from_fake_table(fields_box, values_box)

    statement_date = dateparser.parse(
        statement_info["Statement date:"], languages=["en"]
    )

    return NameComponents(
        statement_date,
        "Tesco Bank",
        account_holder_name,
        document_type,
    )
def try_americanexpress(text_boxes, parent_logger) -> Optional[NameComponents]:
    """Recognise American Express (UK) documents and extract naming components.

    Args:
        text_boxes: Ordered sequence of text box strings for the document.
        parent_logger: Logger from which an ``americanexpress`` child is
            derived.

    Returns:
        The ``NameComponents`` for the document, or ``None`` when the first
        box is not the American Express UK website address.

    Raises:
        AssertionError: If the "Prepared for" or "Membership Number" boxes
            are missing, or the box after the membership number is not a
            "Date" box.
    """
    logger = parent_logger.getChild("americanexpress")

    if text_boxes[0] != "www.americanexpress.co.uk\n":
        return None

    # Presumably the fifth box is the document title; the long-form statement
    # title is shortened. This value used to be computed and then discarded in
    # favour of a hard-coded "Statement"; it is now what gets returned.
    document_type = text_boxes[4].strip()
    if document_type == "Statement of Account":
        document_type = "Statement"

    account_holder_box = find_box_starting_with(text_boxes, "Prepared for\n")
    assert account_holder_box
    # The name is on the line right after the "Prepared for" label.
    account_holder_name = account_holder_box.split("\n")[1].strip().title()

    # The date is the box after the Membership Number. We can't look for the one starting
    # with "Date" because there's more than one.
    membership_box = find_box_starting_with(text_boxes, "Membership Number\n")
    assert membership_box
    membership_index = text_boxes.index(membership_box)

    date_box = text_boxes[membership_index + 1]
    date_fields = date_box.split("\n")
    assert date_fields[0] == "Date"

    statement_date = datetime.datetime.strptime(date_fields[1], "%d/%m/%y")

    return NameComponents(
        statement_date,
        "American Express",
        account_holder_name,
        document_type,
    )
def try_soenergy(text_boxes, parent_logger) -> Optional[NameComponents]:
    """Recognise So Energy statements and extract their naming components.

    Args:
        text_boxes: Ordered sequence of text box strings for the document.
        parent_logger: Logger from which a ``soenergy`` child is derived.

    Returns:
        The ``NameComponents`` of the statement, dated at the end of the
        billing period, or ``None`` if no box is exactly ``www.so.energy``.

    Raises:
        AssertionError: If the greeting box or the period line does not have
            the expected content.
    """
    log = parent_logger.getChild("soenergy")

    if "www.so.energy\n" not in text_boxes:
        return None

    assert text_boxes[1] == "Hello, here is your statement.\n"

    # The very first box is the postal address, which starts with the holder.
    account_holder_name = extract_account_holder_from_address(text_boxes[0])

    period_text = text_boxes[2]
    log.debug("found period specification: %r", period_text)

    # Only the closing date of the period is captured.
    period = re.match(
        r"^For the period of [0-9]{1,2} [A-Z][a-z]{2} [0-9]{4} - ([0-9]{1,2} [A-Z][a-z]{2} [0-9]{4})\n$",
        period_text,
    )
    assert period

    return NameComponents(
        dateparser.parse(period.group(1), languages=["en"]),
        "So Energy",
        account_holder_name,
        "Statement",
    )
def try_thameswater(text_boxes, parent_logger) -> Optional[NameComponents]:
    """Recognise Thames Water documents and extract their naming components.

    Args:
        text_boxes: Ordered sequence of text box strings for the document.
        parent_logger: Logger from which a ``thameswater`` child is derived.

    Returns:
        The ``NameComponents`` for the document (type "Payment Plan", "Bill"
        or "Other" depending on the subject box), or ``None`` if the last box
        does not link to the Thames Water website.

    Raises:
        AssertionError: If the first box is not a "Page 1 of" marker or the
            second box does not carry the date.
    """
    log = parent_logger.getChild("thameswater")

    # Several different footers have been used at the bottom of page 1 since
    # 2017, but every one of them links to the Thames Water website.
    footer = text_boxes[-1]
    if "thameswater.co.uk/" not in footer:
        return None

    assert text_boxes[0].startswith("Page 1 of ")

    date_found = re.search(
        "^Date\n([0-9]{1,2} [A-Z][a-z]+ [0-9]{4})\n", text_boxes[1]
    )
    assert date_found

    document_date = dateparser.parse(date_found.group(1), languages=["en"])

    account_holder_name = extract_account_holder_from_address(text_boxes[5])

    # Map the known subject lines to a document type; anything else is "Other".
    known_subjects = {
        "Your payment plan.\n": "Payment Plan",
        "Your new payment plan.\n": "Payment Plan",
        "Your water and wastewater bill.\n": "Bill",
    }
    document_type = known_subjects.get(text_boxes[7], "Other")

    return NameComponents(
        document_date,
        "Thames Water",
        account_holder_name,
        document_type,
    )
def try_ms_bank(text_boxes, parent_logger) -> Optional[NameComponents]:
    """Recognise M&S Bank statements and extract their naming components.

    Args:
        text_boxes: Ordered sequence of text box strings for the document.
        parent_logger: Logger from which an ``ms_bank`` child is derived.

    Returns:
        The ``NameComponents`` of the statement, dated at the end of the
        statement period, or ``None`` if the last box does not mention
        "M&S Bank".

    Raises:
        AssertionError: If the "Account Name" box is missing or the box
            before it is not a recognisable statement period.
    """
    log = parent_logger.getChild("ms_bank")

    if "M&S Bank" not in text_boxes[-1]:
        return None

    name_box = find_box_starting_with(text_boxes, "Account Name\n")
    assert name_box

    # The holder's name is the line following the "Account Name" label.
    account_holder_name = name_box.split("\n")[1].strip()

    # The statement period sits in the box immediately before the name box.
    period_text = text_boxes[text_boxes.index(name_box) - 1]
    log.debug("found period specification %r", period_text)

    # The year of the opening date is optional; only the end date is captured.
    period = re.search(
        r"^[0-9]{2} [A-Z][a-z]+(?: [0-9]{4})? to ([0-9]{2} [A-Z][a-z]+ [0-9]{4})\n$",
        period_text,
    )
    assert period

    return NameComponents(
        dateparser.parse(period.group(1), languages=["en"]),
        "M&S Bank",
        account_holder_name,
        "Statement",
    )
def try_aws(text_boxes: Sequence[str], parent_logger) -> Optional[NameComponents]:
    """Recognise Amazon Web Services invoices and extract naming components.

    Args:
        text_boxes: Ordered sequence of text box strings for the document.
        parent_logger: Unused by this parser; accepted so the signature
            matches the other ``try_*`` functions.

    Returns:
        The ``NameComponents`` of the invoice, or ``None`` if neither known
        invoice heading is present.

    Raises:
        AssertionError: If the "Invoice Number:" or "Bill to Address:" boxes
            are missing, or the addressee line lacks the "ATTN: " prefix.
    """
    # Two headings are known, with and without ", Inc.".
    heading = find_box_starting_with(
        text_boxes, "Amazon Web Services, Inc. Invoice\n"
    ) or find_box_starting_with(text_boxes, "Amazon Web Services Invoice\n")
    if not heading:
        return None

    labels = find_box_starting_with(text_boxes, "Invoice Number:\n")
    assert labels
    labels_at = text_boxes.index(labels)

    # The values are either in the very next box or the one after that;
    # pick whichever has the same number of lines as the labels.
    values = text_boxes[labels_at + 1]
    if labels.count("\n") != values.count("\n"):
        values = text_boxes[labels_at + 2]

    invoice_info = build_dict_from_fake_table(labels, values)
    invoice_date = dateparser.parse(invoice_info["Invoice Date:"], languages=["en"])

    billing_box = find_box_starting_with(text_boxes, "Bill to Address:\n")
    assert billing_box

    attention_prefix = "ATTN: "
    addressee = billing_box.split("\n")[1]
    assert addressee.startswith(attention_prefix)
    # Strip the attention marker, leaving only the name.
    addressee = addressee[len(attention_prefix):]

    return NameComponents(invoice_date, "AWS", addressee, "Invoice")
def try_chase(text_boxes, parent_logger) -> Optional[NameComponents]:
    """Recognise Chase statements and extract their naming components.

    Args:
        text_boxes: Ordered sequence of text box strings for the document.
        parent_logger: Logger from which a ``chase`` child is derived.

    Returns:
        The ``NameComponents`` of the statement, dated at the end of the
        statement period, or ``None`` if the document is not from Chase, has
        no recognisable period line, or has no usable address box.
    """
    log = parent_logger.getChild("chase")

    if not find_box_starting_with(text_boxes, "JPMorgan Chase Bank, N.A.\n"):
        return None

    # The period line moves around between statements, so scan every box for
    # it. Some recent statements drop the space before "through", hence the
    # optional space in the pattern.
    period_pattern = re.compile(
        r"^[A-Z][a-z]+ [0-9]{1,2}, [0-9]{4} ?through ([A-Z][a-z]+ [0-9]{1,2}, [0-9]{4})\n"
    )
    candidates = (period_pattern.search(box) for box in text_boxes)
    period_match = next((found for found in candidates if found), None)
    if period_match is None:
        log.debug("unable to find period line")
        return None

    log.debug("found period specification: %r", period_match.group(0))

    statement_date = dateparser.parse(period_match.group(1), languages=["en"])

    # Try the box following the "Deaf and Hard of Hearing" contact line first,
    # and fall back to the layout where the address precedes the period line.
    hearing_box = find_box_starting_with(text_boxes, "Deaf and Hard of Hearing: ")
    if hearing_box:
        holder_box = text_boxes[text_boxes.index(hearing_box) + 1]
        account_holder_name = holder_box.strip().title()
    else:
        # Without the contact line, locate the address as the box just before
        # the one holding the period line.
        period_box = find_box_starting_with(text_boxes, period_match.group(0))
        address_box = text_boxes[text_boxes.index(period_box) - 1]
        if address_box.count("\n") < 2:
            log.debug("unable to find the account holder name")
            return None

        # When communications are attached to the first page, the mail routing
        # number is glued on top of the address, so drop that first line.
        if re.search(r"^[0-9]+ [A-Z]+ ", address_box):
            address_box = address_box.split("\n", 1)[1]

        account_holder_name = extract_account_holder_from_address(address_box)

    return NameComponents(
        statement_date,
        "Chase",
        account_holder_name,
        "Statement",
    )
def try_enel(text_boxes, parent_logger) -> Optional[NameComponents]:
    """Recognise ENEL Energia bills and extract their naming components.

    Three layouts are handled, identified in the comments below by the years
    in which they were seen.

    Args:
        text_boxes: Ordered sequence of text box strings for the document.
        parent_logger: Logger from which an ``enel`` child is derived.

    Returns:
        The ``NameComponents`` of the bill, or ``None`` if the ENEL Energia
        address box is not present.

    Raises:
        AssertionError: If the due date, customer number (2018 layout) or
            invoice number boxes cannot be found.
    """
    log = parent_logger.getChild("enel")

    enel_box = find_box_starting_with(
        text_boxes, "Enel Energia - Mercato libero dell'energia\n"
    )
    if not enel_box:
        return None
    enel_position = text_boxes.index(enel_box)

    # Late 2019: the ENEL address comes first and the customer address is two
    # boxes before the payment due date.
    due_box = find_box_starting_with(text_boxes, "Entro il ")
    assert due_box
    holder_box = text_boxes[text_boxes.index(due_box) - 2]

    # 2020: the customer address comes right _before_ the ENEL address. A box
    # with too few lines means the guess above was wrong.
    if holder_box.count("\n") < 2:
        holder_box = text_boxes[enel_position - 1]

    account_holder_name = extract_account_holder_from_address(holder_box)

    # 2018: the address sat before the customer number instead; "Periodo" as
    # a name means we picked up the wrong box, so try once more.
    if account_holder_name == "Periodo":
        customer_box = find_box_starting_with(text_boxes, "N° CLIENTE\n")
        assert customer_box
        holder_box = text_boxes[text_boxes.index(customer_box) - 1]
        account_holder_name = extract_account_holder_from_address(holder_box)

    # The issue date is in the box right after the invoice number.
    invoice_box = find_box_starting_with(text_boxes, "N. Fattura ")
    assert invoice_box
    issued_box = text_boxes[text_boxes.index(invoice_box) + 1]

    bill_date = datetime.datetime.strptime(issued_box, "Del %d/%m/%Y\n")

    return NameComponents(bill_date, "ENEL Energia", account_holder_name, "Bolletta")
def try_o2(text_boxes, parent_logger) -> Optional[NameComponents]:
    """Recognise O2 UK copy bills and extract their naming components.

    Args:
        text_boxes: Ordered sequence of text box strings for the document.
        parent_logger: Logger from which an ``o2`` child is derived.

    Returns:
        The ``NameComponents`` of the bill, or ``None`` if the last box does
        not mention "Telefónica UK Limited".

    Raises:
        AssertionError: If the first box is not "Copy Bill".
    """
    log = parent_logger.getChild("o2")

    if "Telefónica UK Limited" not in text_boxes[-1]:
        return None

    assert text_boxes[0] == "Copy Bill\n"

    # Boxes 1 and 2 are the label and value columns of the bill details.
    bill_info = build_dict_from_fake_table(text_boxes[1], text_boxes[2])
    bill_date = dateparser.parse(bill_info["Bill date"], languages=["en"])

    # The postal address follows the details table.
    account_holder_name = extract_account_holder_from_address(text_boxes[3])

    return NameComponents(bill_date, "O2 UK", account_holder_name, "Bill")
def try_scaleway(text_boxes, parent_logger) -> Optional[NameComponents]:
    """Recognise Scaleway invoices and extract their naming components.

    Args:
        text_boxes: Ordered sequence of text box strings for the document.
        parent_logger: Logger from which a ``scaleway`` child is derived.

    Returns:
        The ``NameComponents`` of the invoice, or ``None`` if the document is
        not from "Online SAS," or no issue date can be located.

    Raises:
        ValueError: If neither a combined "Customer " box nor a standalone
            "Customer" label box is present.
    """
    logger = parent_logger.getChild("scaleway")

    if not find_box_starting_with(text_boxes, "Online SAS,"):
        return None

    customer_box = find_box_starting_with(text_boxes, "Customer \n")
    if customer_box:
        # Latest template: label and name share one box.
        account_holder = customer_box.split("\n")[1].strip()
    else:
        # Previous templates split this into two separate boxes.
        customer_label_idx = text_boxes.index("Customer\n")
        customer_box = text_boxes[customer_label_idx + 1]
        account_holder = customer_box.strip()

    date_box = find_box_starting_with(text_boxes, "Issued: \n")
    if date_box:
        # Latest template: the date is the line after the label.
        date_str = date_box.split("\n")[1].strip()
    else:
        # The "Issued" line is mixed together with other items, so use a
        # regex to find it. The pattern is deliberately loose, but extracts
        # only the date; the time is also present but useless for renaming.
        for box in text_boxes:
            date_match = re.search(
                r"Issued: ([A-Z][a-z]+ [0-9]{1,2}, [0-9]{4}) at [0-9]", box
            )
            if date_match:
                break
        else:
            # Previously this fell through to an assert that always failed
            # (or to an AttributeError under -O). Bail out instead, the same
            # way try_chase does when its period line is missing.
            logger.debug("Unable to find the invoice issue date.")
            return None

        date_str = date_match.group(1)

    bill_date = dateparser.parse(date_str)

    return NameComponents(bill_date, "Scaleway", account_holder, "Invoice")
def _try_old_hyperoptic(text_boxes, logger) -> Optional[NameComponents]:
    """Recognise the 2017-2018 Hyperoptic bill templates.

    Args:
        text_boxes: Ordered sequence of text box strings for the document.
        logger: Logger used for debug output.

    Returns:
        The ``NameComponents`` of the bill, or ``None`` if the Hyperoptic
        website address is not found in one of the expected positions.

    Raises:
        AssertionError: If the customer name is missing, or there is not
            exactly one box starting with "DD Ref:".
    """
    website_variants = ("www.hyperoptic.com\n", "www.hyperoptic.com \n")

    # The customer box position depends on where the website address shows up.
    if text_boxes[0] in website_variants:
        customer_box = text_boxes[1]
    elif len(text_boxes) > 8 and text_boxes[7] == "www.hyperoptic.com \n":
        customer_box = text_boxes[0]
    else:
        return None

    logger.debug("looking for customer name in %r", customer_box)
    name_found = re.search(r"Customer Name: ([^\n]+)\n", customer_box)
    assert name_found
    account_holder_name = name_found.group(1)

    # The bill date lives in a "fake table": one multi-line box with all the
    # labels, immediately followed by one multi-line box with all the values.
    # Finding the labels box is therefore enough to find the values too.
    label_boxes = [box for box in text_boxes if box.startswith("DD Ref:\n")]
    assert len(label_boxes) == 1
    labels = label_boxes[0]
    values = text_boxes[text_boxes.index(labels) + 1]

    document_info = build_dict_from_fake_table(labels, values)
    bill_date = datetime.datetime.strptime(document_info["Invoice date:"], "%d %b %Y")

    return NameComponents(
        bill_date,
        "Hyperoptic",
        account_holder_name,
        "Bill",
    )
def try_hounslow(text_boxes: Sequence[str], parent_logger) -> Optional[NameComponents]:
    """Recognise London Borough of Hounslow council tax bills.

    Args:
        text_boxes: Ordered sequence of text box strings for the document.
        parent_logger: Logger from which a ``hounslow`` child is derived.

    Returns:
        The ``NameComponents`` of the bill, or ``None`` if the document is
        not from Hounslow or is not a council tax bill. Joint account holders
        are returned as a single comma-separated string.
    """
    log = parent_logger.getChild("hounslow")

    if not find_box_starting_with(text_boxes, "London Borough of Hounslow\n"):
        return None

    title_box = text_boxes[2]
    if not title_box.startswith("Council Tax Bill "):
        log.debug("Not a council tax bill, unknown format.")
        return None

    bill_date = dateparser.parse(text_boxes[0], languages=["en"])

    # Older bills carry the address inside the subject box, after its first
    # line; otherwise the address is the following box.
    holder_address = (
        title_box.split("\n", 1)[1] if title_box.count("\n") > 1 else text_boxes[3]
    )

    holders = extract_account_holder_from_address(holder_address)

    # Joint holders are separated by "&": strip each honorific and list them.
    if "&" in holders:
        holders = ", ".join(
            drop_honorific(part.strip()) for part in holders.split("&")
        )

    return NameComponents(
        bill_date,
        "LB Hounslow",
        holders,
        "Council Tax Bill",
    )
def try_hyperoptic(text_boxes, parent_logger) -> Optional[NameComponents]:
    """Recognise Hyperoptic bills and extract their naming components.

    The 2017-2018 templates are delegated to ``_try_old_hyperoptic``; later
    templates are handled here.

    Args:
        text_boxes: Ordered sequence of text box strings for the document.
        parent_logger: Logger from which a ``hyperoptic`` child is derived.

    Returns:
        The ``NameComponents`` of the bill, or ``None`` if the document does
        not look like a Hyperoptic bill.

    Raises:
        ValueError: If the document is recognised but the "Name:" or
            "Bill date:" label boxes are missing.
    """
    log = parent_logger.getChild("hyperoptic")

    # Very old templates (2017 to 2018) have their own parser.
    legacy_result = _try_old_hyperoptic(text_boxes, log)
    if legacy_result:
        return legacy_result

    # Every "Hyperoptic" on the page is a logo rather than text, but a
    # "Hypernews" box is specific enough. Older bills lack it, so fall back to
    # a "DD Ref" label whose value box mentions HYP.
    if "Hypernews\n" not in text_boxes:
        if "DD Ref:\n" not in text_boxes:
            return None
        reference = text_boxes[text_boxes.index("DD Ref:\n") + 1]
        if "HYP" not in reference:
            return None

    # Each label box is immediately followed by its value box.
    account_holder_name = text_boxes[text_boxes.index("Name:\n") + 1].strip()
    raw_date = text_boxes[text_boxes.index("Bill date:\n") + 1]

    bill_date = datetime.datetime.strptime(raw_date, "%d %b %Y\n")

    return NameComponents(
        bill_date,
        "Hyperoptic",
        account_holder_name,
        "Bill",
    )
def try_santander(text_boxes, parent_logger) -> Optional[NameComponents]:
    """Recognise Santander documents and extract their naming components.

    Three families are handled: credit card statements (regular and annual),
    Select / 1|2|3 current account statements, and statements of fees.

    Args:
        text_boxes: Ordered sequence of text box strings for the document.
        parent_logger: Logger from which a ``santander`` child is derived.

    Returns:
        The ``NameComponents`` for the document, or ``None`` when none of the
        known Santander markers are present.

    Raises:
        AssertionError: If a document is recognised but its period line is
            missing, duplicated, or does not match the expected format.
    """
    log = parent_logger.getChild("santander")

    if "Santander Credit Card \n" in text_boxes:
        # The account holder's address is always the second text box.
        account_holder_name = extract_account_holder_from_address(text_boxes[1])

        # An "Annual Statement:" box marks the annual variant.
        annual_boxes = [
            box for box in text_boxes if box.startswith("Annual Statement:")
        ]
        if annual_boxes:
            document_type = "Annual Statement"
            assert len(annual_boxes) == 1
            period_text = annual_boxes[0]
            log.debug("found period specification: %r", period_text)

            period = re.match(
                r"^Annual Statement: [0-9]{1,2}[a-z]{2} [A-Z][a-z]{2} [0-9]{4} to ([0-9]{1,2}[a-z]{2} [A-Z][a-z]{2} [0-9]{4})\n",
                period_text,
            )
        else:
            document_type = "Statement"
            summary_boxes = [
                box for box in text_boxes if box.startswith("Account summary as at:")
            ]
            assert len(summary_boxes) == 1
            period_text = summary_boxes[0]
            log.debug("found period specification: %r", period_text)

            period = re.match(
                r"^Account summary as at: ([0-9]{1,2}[a-z]{2} [A-Z][a-z]+ [0-9]{4}) for card number ending [0-9]{4}\n$",
                period_text,
            )

        assert period
        statement_date = dateparser.parse(period.group(1), languages=["en"])

        return NameComponents(
            statement_date,
            "Santander",
            account_holder_name,
            "Credit Card",
            additional_components=(document_type,),
        )

    is_select = "Select Current Account\n" in text_boxes
    is_123 = "1l2l3 Current Account earnings\n" in text_boxes

    if is_select or is_123:
        # The account holder's address is always the third text box.
        account_holder_name = extract_account_holder_from_address(text_boxes[2])

        summary_boxes = [
            box for box in text_boxes if box.startswith("Your account summary for \n")
        ]
        assert len(summary_boxes) == 1
        period_text = summary_boxes[0]
        log.debug("found period specification: %r", period_text)

        # Only the closing date of the period is captured.
        period = re.match(
            r"^Your account summary for \n[0-9]{1,2}[a-z]{2} [A-Z][a-z]{2} [0-9]{4} to ([0-9]{1,2}[a-z]{2} [A-Z][a-z]{2} [0-9]{4})\n$",
            period_text,
        )
        assert period

        statement_date = dateparser.parse(period.group(1), languages=["en"])

        # Select wins if both markers happen to be present.
        account_type = "Select Current Account" if is_select else "123 Current Account"

        return NameComponents(
            statement_date,
            "Santander",
            account_holder_name,
            account_type,
            additional_components=("Statement",),
        )

    if "Statement of Fees\n" in text_boxes:
        # The account holder's address is always the fourth text box.
        account_holder_name = extract_account_holder_from_address(text_boxes[3])

        # The account this refers to is the box after the "Account" heading.
        account_type = text_boxes[text_boxes.index("Account\n") + 1].strip().title()

        # The issue date is the second box after the "Date" heading.
        raw_date = text_boxes[text_boxes.index("Date\n") + 2]

        # Unlike the other documents, this one uses a plain numeric date.
        statement_date = datetime.datetime.strptime(raw_date, "%d/%m/%Y\n")

        return NameComponents(
            statement_date,
            "Santander",
            account_holder_name,
            account_type,
            additional_components=("Statement of Fees",),
        )

    return None
def try_schwab(text_boxes: Sequence[str], parent_logger) -> Optional[NameComponents]:
    """Recognise Charles Schwab documents and extract their naming components.

    Handles 2016 brokerage statements, 2017+ brokerage statements and
    year-end documents, and letters. Trade confirmations are recognised but
    cannot be renamed.

    Args:
        text_boxes: Ordered sequence of text box strings for the document.
        parent_logger: Logger from which a ``schwab`` child is derived.

    Returns:
        The ``NameComponents`` for the document, or ``None`` when the
        document is not from Schwab or is a trade confirmation.

    Raises:
        AssertionError: If a document is recognised but the account holder or
            an expected date line cannot be extracted.
    """
    logger = parent_logger.getChild("schwab")

    # Older brokerage accounts (2016)
    if text_boxes[0].startswith("Schwab One® International Account\n"):
        logger.debug("Schwab One brokerage account statement (2016).")
        # The address is the box right after the "Mail To" label.
        address_index = text_boxes.index("Mail To\n") + 1
        address_box = text_boxes[address_index]

        account_holder = extract_account_holder_from_address(address_box)
        assert account_holder

        statement_date = _find_statement_date(text_boxes, logger)

        return NameComponents(
            statement_date, "Schwab", account_holder, "Brokerage Statement"
        )

    # Brokerage Accounts, Trade Confirmations and Year-End documents from 2017 onwards.
    if text_boxes[0].startswith("Schwab One® International Account"):
        # The holder's name is the second line of the heading box.
        account_holder = text_boxes[0].split("\n")[1].strip().title()
        assert account_holder

        if text_boxes[2] == "Trade Confirmation\n":
            logger.debug("Schwab One Trade Confirmation")
            logger.warning(
                "Cannot rename this document, as date is not present on the first page!"
            )
            return None

        # Look for different types of year end documents.
        year_end_gain_losses = [
            box for box in text_boxes if "Year-End Schwab Gain/Loss Report" in box
        ]
        year_end_summary = [box for box in text_boxes if "YEAR-END SUMMARY" in box]

        if year_end_gain_losses:
            logger.debug("Year End Gain/Loss Report")
            date_match = re.search(
                r"\nPrepared on ([A-Z][a-z]+ [0-9]{1,2}, [0-9]{4})\n",
                year_end_gain_losses[0],
            )
            assert date_match  # Else we don't have the right document.
            document_date = dateparser.parse(date_match.group(1), languages=["en"])
            document_type = "Year End Gain-Losses Report"
        elif year_end_summary:
            logger.debug("Year End Summary")
            date_box = find_box_starting_with(text_boxes, "Date Prepared: ")
            assert date_box
            date_match = re.search(
                r"^Date Prepared: ([A-Z][a-z]+ [0-9]{1,2}, [0-9]{4})\n$", date_box
            )
            assert date_match
            document_date = dateparser.parse(date_match.group(1), languages=["en"])
            document_type = "Year End Summary"
        else:
            logger.debug("Schwab One brokerage account statement.")
            document_date = _find_statement_date(text_boxes, logger)
            # This used to be misspelled "documen_type", which left
            # document_type unbound and raised NameError on the return below.
            document_type = "Brokerage Statement"

        return NameComponents(document_date, "Schwab", account_holder, document_type)

    # Letters
    if any(
        "Charles Schwab & Co., Inc. All rights reserved." in box for box in text_boxes
    ):
        logger.debug("Letter, possibly.")

        # Newer (2018) letters.
        if "Dear Client,\n" in text_boxes:
            date_str = text_boxes[0].split("\n")[0]
            logger.debug("Found date: %r", date_str)

            letter_date = dateparser.parse(date_str, languages=["en"])

            # The address is located relative to the "Dear Client," box.
            # NOTE(review): the original comment said "two boxes before" but
            # the offset used is 3 — confirm against a sample letter.
            address_index = text_boxes.index("Dear Client,\n") - 3

            account_holder = extract_account_holder_from_address(
                text_boxes[address_index]
            )
        else:
            account_holder = extract_account_holder_from_address(text_boxes[0])
            letter_date = dateparser.parse(text_boxes[1], languages=["en"])

        assert account_holder

        return NameComponents(letter_date, "Schwab", account_holder, "Letter")

    return None