def test_pdf_to_str(self):
    """Check that pdf_to_str converts the Portland bizportal fixture into text."""
    fixture_path = os.path.join(
        TEST_DIR, "fixtures", "test_portland_bizportal_bill.pdf"
    )
    output = pdfparser.pdf_to_str(fixture_path)

    # assertIsInstance is the idiomatic type check and reports a clearer
    # failure message than comparing type objects for equality.
    self.assertIsInstance(output, str)
    # The string for this pdf should be quite long.
    self.assertGreater(len(output), 100)
    # Some text from this PDF
    self.assertIn("Service Period", output)
def test_parse_old_pdf3(self):
    """Check period, cost and usage parsed from old-format fixture 3; no peak is expected."""
    bill_text = pdfparser.pdf_to_str("private_fixtures/duke_bill_old_3.pdf")
    parsed = pdf_parser.parse_old_pdf(bill_text)

    # Billing period
    self.assertEqual(date(2020, 3, 20), parsed.start)
    self.assertEqual(date(2020, 4, 20), parsed.end)

    # Cost and usage, compared to two decimal places
    self.assertAlmostEqual(29.89, parsed.cost, places=2)
    self.assertAlmostEqual(155, parsed.used, places=2)

    # This bill carries no demand value
    self.assertIsNone(parsed.peak)
def test_parse_old_pdf4(self):
    """Check period, cost and usage parsed from old-format fixture 4.

    This fixture is meant to exercise date extraction around a year boundary;
    no peak is expected.
    """
    bill_text = pdfparser.pdf_to_str("private_fixtures/duke_bill_old_4.pdf")
    parsed = pdf_parser.parse_old_pdf(bill_text)

    # Billing period
    self.assertEqual(date(2019, 11, 27), parsed.start)
    self.assertEqual(date(2019, 12, 30), parsed.end)

    # Cost and usage, compared to two decimal places
    self.assertAlmostEqual(19.7, parsed.cost, places=2)
    self.assertAlmostEqual(156, parsed.used, places=2)

    # This bill carries no demand value
    self.assertIsNone(parsed.peak)
def test_parse_old_pdf2(self):
    """Check period, cost, usage and peak demand parsed from old-format fixture 2."""
    bill_text = pdfparser.pdf_to_str("private_fixtures/duke_bill_old_2.pdf")
    parsed = pdf_parser.parse_old_pdf(bill_text)

    # Billing period
    self.assertEqual(date(2018, 10, 29), parsed.start)
    self.assertEqual(date(2018, 12, 3), parsed.end)

    # Cost, usage and peak, compared to two decimal places
    self.assertAlmostEqual(685.16, parsed.cost, places=2)
    self.assertAlmostEqual(5581.0, parsed.used, places=2)
    self.assertAlmostEqual(30.0, parsed.peak, places=2)
def test_parse_new_pdf3(self):
    """Check period, cost, usage and peak demand parsed from new-format fixture 3."""
    bill_text = pdfparser.pdf_to_str("private_fixtures/duke_bill_new_3.pdf")
    parsed = pdf_parser.parse_new_pdf(bill_text)

    # Billing period
    self.assertEqual(date(2020, 4, 21), parsed.start)
    self.assertEqual(date(2020, 5, 23), parsed.end)

    # Cost and usage, compared to two decimal places
    self.assertAlmostEqual(3048.76, parsed.cost, places=2)
    self.assertAlmostEqual(38320, parsed.used, places=2)

    # Peak uses assertAlmostEqual's default precision
    self.assertAlmostEqual(86, parsed.peak)
def test_parse_new_pdf1(self):
    """Check period, cost, usage and peak demand parsed from the June 2020+ PDF layout."""
    bill_text = pdfparser.pdf_to_str("private_fixtures/duke_bill_new_1.pdf")
    parsed = pdf_parser.parse_new_pdf(bill_text)

    # Billing period
    self.assertEqual(date(2020, 5, 10), parsed.start)
    self.assertEqual(date(2020, 6, 9), parsed.end)

    # Cost, usage and peak, compared to two decimal places
    self.assertAlmostEqual(123113.66, parsed.cost, places=2)
    self.assertAlmostEqual(1806000.0, parsed.used, places=2)
    self.assertAlmostEqual(3840.0, parsed.peak, places=2)
def process_pdf(
    utility: str,
    utility_account_id: str,
    service_id: str,
    statement_dt: date,
    pdf_filename: str,
) -> BillingDatum:
    """Turn a downloaded bill PDF into a BillingDatum and upload the PDF to S3.

    Cost, usage, demand, billing dates and line items are extracted from the
    PDF text. The extracted dates are then corrected against the statement
    date, and the end date is moved back one day.

    Args:
        utility: utility identifier passed through to the S3 upload.
        utility_account_id: account identifier passed through to the S3 upload.
        service_id: service identifier used when hashing the bill.
        statement_dt: statement date of the bill.
        pdf_filename: path of the PDF file on disk.

    Returns:
        A BillingDatum with the uploaded PDF as its only attachment.
    """
    log.info("Parsing text from PDF %s", pdf_filename)
    bill_text = pdfparser.pdf_to_str(pdf_filename)

    cost = extract_cost(bill_text)
    used = extract_used(bill_text)
    demand = extract_demand(bill_text)
    period_start, period_end = extract_dates(bill_text)

    # if the start date is in the wrong year, replace year
    # (start_date = 12/1, statement_dt=12/15/2020)
    if period_start > statement_dt:
        statement_year = statement_dt.year
        period_start = period_start.replace(year=statement_year)
        period_end = period_end.replace(year=statement_year)

    # end_date must be after start date (end_date = 1/5, start_date = 12/1)
    if period_end < period_start:
        period_end = period_end.replace(year=period_end.year + 1)

    # adjust end date because SVP bills overlap on start/end dates
    period_end -= timedelta(days=1)

    line_items: List[BillingDatumItemsEntry] = extract_line_items(bill_text)

    bill_key = hash_bill(service_id, period_start, period_end, cost, demand, used)

    with open(pdf_filename, "rb") as pdf_file:
        pdf_bytes = BytesIO(pdf_file.read())
        attachment = upload_bill_to_s3(
            pdf_bytes,
            bill_key,
            source="mua.santaclaraca.gov",
            statement=period_end,
            utility=utility,
            utility_account_id=utility_account_id,
        )

    return BillingDatum(
        start=period_start,
        end=period_end,
        statement=statement_dt,
        cost=cost,
        used=used,
        peak=demand,
        items=line_items,
        attachments=[attachment],
        utility_code=None,
    )
def test_parse_old_pdf1(self):
    """Check period, cost, usage and peak demand parsed from the pre-June-2020 PDF layout.

    The fixture bill contains a prior balance due line item, which should not
    be counted in the bill total.
    """
    bill_text = pdfparser.pdf_to_str("private_fixtures/duke_bill_old_1.pdf")
    parsed = pdf_parser.parse_old_pdf(bill_text)

    # Billing period
    self.assertEqual(date(2020, 4, 10), parsed.start)
    self.assertEqual(date(2020, 5, 9), parsed.end)

    # Cost, usage and peak, compared to two decimal places
    self.assertAlmostEqual(89972.41, parsed.cost, places=2)
    self.assertAlmostEqual(1312000.0, parsed.used, places=2)
    self.assertAlmostEqual(3000.0, parsed.peak, places=2)
def extract_bill_period(pdf_filename):
    """Return the (earlier, later) service-period dates printed on a bill PDF.

    The PDF is converted to text and searched for a "Service Period" heading
    followed by two dates on separate lines. Returns (None, None) when the
    file cannot be parsed as a PDF or no such block is found.
    """
    try:
        bill_text = pdf_to_str(pdf_filename)
    except PDFSyntaxError:
        log.exception("Downloaded bill file failed to parse as a PDF.")
        return None, None

    found = re.search(r"Service Period\n(\d+/\d+/\d+)\n(\d+/\d+/\d+)", bill_text)
    if not found:
        return None, None

    first_date, second_date = (parse_time(raw).date() for raw in found.groups())
    # The two dates are not assumed to be in order; return them sorted.
    return min(first_date, second_date), max(first_date, second_date)
def parse_pdf(pdf_filename: str, utility: str, utility_account_id: str) -> BillingDatum:
    """Parse a bill PDF, upload it to S3 and return the bill with its attachment.

    The text "Your Energy Bill" selects the new-style parser; any other PDF is
    handled by the old-style parser.

    Args:
        pdf_filename: path of the PDF file on disk.
        utility: utility identifier passed through to the S3 upload.
        utility_account_id: account identifier used for hashing and upload.

    Returns:
        The parsed bill with the uploaded PDF as its only attachment.
    """
    bill_text = pdfparser.pdf_to_str(pdf_filename)

    if "Your Energy Bill" not in bill_text:
        log.info("parsing old-style PDF %s", pdf_filename)
        parsed = parse_old_pdf(bill_text)
    else:
        log.info("parsing new-style PDF %s", pdf_filename)
        parsed = parse_new_pdf(bill_text)

    bill_key = hash_bill(
        utility_account_id,
        parsed.start,
        parsed.end,
        parsed.cost,
        parsed.peak,
        parsed.used,
    )

    with open(pdf_filename, "rb") as pdf_file:
        pdf_bytes = BytesIO(pdf_file.read())
        attachment = upload_bill_to_s3(
            pdf_bytes,
            bill_key,
            source="www.duke-energy.com",
            statement=parsed.end,
            utility=utility,
            utility_account_id=utility_account_id,
        )

    return parsed._replace(attachments=[attachment])
def test_additional_line_items(self):
    """Check the total of the first line item extracted from the svp_bill_3 fixture."""
    bill_text = pdfparser.pdf_to_str("private_fixtures/svp_bill_3.pdf")
    line_items = pdf_parser.extract_line_items(bill_text)

    first_item = line_items[0]
    self.assertEqual(first_item.total, 46.34)
def test_no_power_factor_charge(self):
    """The parser handles a bill without a Power Factor Charge line"""
    text = pdfparser.pdf_to_str("private_fixtures/svp_bill_2.pdf")
    data = pdf_parser.extract_line_items(text)

    # Field 3 of the fourth line item is expected to be absent for this bill.
    # assertIsNone is an identity check, so a value that merely compares
    # equal to None cannot satisfy it.
    self.assertIsNone(data[3][3])
def extract_bill_data(pdf_filename, service_id, utility, utility_account_id) -> Optional[BillingDatum]:
    """Parse one downloaded bill PDF into a BillingDatum.

    Current charges, energy use (kWh) and on-/off-peak demand are read from
    the PDF text; the billing period comes from extract_bill_period. The bill
    has no separate statement date, so the period end is used for it. When the
    S3_BILL_UPLOAD setting is enabled, the PDF is uploaded and attached.

    Returns:
        The parsed bill, or None when the file cannot be parsed as a PDF.
    """
    # this function should upload the file to s3 to set attachments?
    try:
        bill_text = pdf_to_str(pdf_filename)
    except PDFSyntaxError:
        log.exception("Downloaded bill file failed to parse as a PDF.")
        return None

    charges_section = re.search(
        "Current Charges(.*?)Cycle", bill_text, re.DOTALL
    ).group(1)
    for raw_line in charges_section.split("\n"):
        stripped = raw_line.strip()
        # Each numeric-looking line overwrites the previous one, so the last
        # number in the section is the one that is kept.
        if re.match(r"[\d,\.]", stripped):
            current_charges = stripped.replace(",", "")

    period_start, period_end = extract_bill_period(pdf_filename)

    usage = re.search(r"Energy Charges \((\d*) kWh\)", bill_text).group(1)
    on_peak_demand = re.search(
        r"On-Peak Demand \((\d+\.\d+)\ KW", bill_text
    ).group(1)
    offpeak_demand = re.search(
        r"Off-Peak Demand \((\d+\.\d+)\ KW", bill_text
    ).group(1)

    attachments = []
    if config.enabled("S3_BILL_UPLOAD"):
        log.info("S3_BILL_UPLOAD is enabled")
        with open(pdf_filename, "rb") as pdf_file:
            bill_key = hash_bill(
                service_id,
                period_start,
                period_end,
                _format_number(current_charges),
                0,
                _format_number(usage),
            )
            # no statement date; use end date
            uploaded = upload_bill_to_s3(
                pdf_file,
                bill_key,
                source="portlandgeneral.com",
                statement=period_end,
                utility=utility,
                utility_account_id=utility_account_id,
            )
            attachments.append(uploaded)
        log.info("Uploaded bill %s to s3", attachments)

    # Peak is the larger of the on-peak and off-peak demand readings.
    peak_demand = max(float(on_peak_demand), float(offpeak_demand))

    return BillingDatum(
        start=period_start,
        end=period_end,
        statement=period_end,
        cost=_format_number(current_charges),
        used=_format_number(usage),
        peak=peak_demand,
        items=[],
        attachments=attachments,
        utility_code=None,
    )
def parse_poway_pdf(pdf_filename: str, account_id: str) -> BillingDatum:
    """Parse a City of Poway water bill PDF into a BillingDatum.

    Read dates, statement date, consumption and water charges are pulled from
    the PDF text with regular expressions. When the S3_BILL_UPLOAD setting is
    enabled, the PDF is uploaded and attached to the bill.

    Args:
        pdf_filename: path of the PDF file on disk.
        account_id: account identifier used for hashing and for the upload.

    Returns:
        A BillingDatum whose end date is one day before the parsed read end
        date.

    Raises:
        InvalidMeterDataException: if the dates, usage or cost cannot be
            found in the PDF text.
    """
    text = pdfparser.pdf_to_str(pdf_filename)

    used_pattern = r"Consumption (?P<units_used>[\d\.,]+) @"
    cost_pattern = r"(?P<water_charges>[\d\.,]+)\s+WATERBasic Service @"

    # date format: m/d/yyyy
    date_pattern = r"\d{1,2}\/\d{1,2}\/\d{4}"
    dates_pattern = (
        r"Total Current Charges.+?"
        fr"(?P<read_date_start>{date_pattern}) - (?P<read_date_end>{date_pattern})"
        fr"(?P<due_date>{date_pattern})"
        fr"(?P<statement_date>{date_pattern})"
    )

    dates_match = re.search(dates_pattern, text)
    if not dates_match:
        raise InvalidMeterDataException(
            f"Couldn't parse dates from pdf: {text}")

    _dates = dates_match.group("read_date_start", "read_date_end", "statement_date")
    start_date, end_date, statement_date = [
        parse_date(_date).date() for _date in _dates
    ]

    used_match = re.search(used_pattern, text)
    if not used_match:
        # The "f" prefix previously sat inside the quotes, so the message was
        # emitted literally with a stray "f" and without the PDF text.
        raise InvalidMeterDataException(
            f"Couldn't parse usage from pdf: {text}")

    used_text = used_match.group("units_used")
    used = float(used_text.replace(",", "").replace("$", ""))

    cost_match = re.search(cost_pattern, text)
    if not cost_match:
        raise InvalidMeterDataException(
            f"Couldn't parse cost from pdf: {text}")

    cost_text = cost_match.group("water_charges")
    cost = float(cost_text.replace(",", "").replace("$", ""))

    if config.enabled("S3_BILL_UPLOAD"):
        key = hash_bill(account_id, start_date, end_date, cost, 0, used)
        with open(pdf_filename, "rb") as pdf_data:
            attachments = [
                upload_bill_to_s3(
                    BytesIO(pdf_data.read()),
                    key,
                    source="customerconnect.poway.org",
                    statement=statement_date,
                    utility="utility:city-of-poway",
                    utility_account_id=account_id,
                )
            ]
    else:
        attachments = []

    return BillingDatum(
        start=start_date,
        end=end_date - timedelta(days=1),
        statement=statement_date,
        cost=cost,
        peak=None,
        used=used,
        items=None,
        attachments=attachments,
        utility_code=None,
    )