def test_bill_data_from_pdf_8(self):
    """atmos-example-08.pdf yields two consecutive bills for the account/meter."""
    # Expected usage is the raw reading from the bill scaled by this factor.
    usage_factor = 1.036
    # Fields that are never populated for these bills.
    unset = dict(peak=None, items=None, attachments=None, utility_code=None)

    first_end = date(2018, 4, 23)
    second_end = date(2018, 5, 24)
    expected = [
        BillingDatum(
            start=date(2018, 3, 24),
            end=first_end,
            statement=first_end,
            cost=522.13,
            used=750.0 * usage_factor,
            **unset,
        ),
        BillingDatum(
            start=date(2018, 4, 24),
            end=second_end,
            statement=second_end,
            cost=745.26,
            used=1104.0 * usage_factor,
            **unset,
        ),
    ]

    actual = bill_data_from_pdf(
        self.data["atmos-example-08.pdf"], "3024769590", "000214831C"
    )
    self.assertEqual(expected, actual)
def test_multi_bill_electric_cost_only(self):
    """A multi-bill PDF produces one cost-only BillingDatum per billing period."""
    fixture = "datafeeds/scrapers/tests/fixtures/ladwp-2020-12-multi.pdf"
    # Both periods share the same statement date and carry a cost only.
    shared = dict(
        statement=date(2021, 2, 18),
        used=None,
        peak=None,
        items=None,
        attachments=None,
        utility_code=None,
    )
    expected = [
        BillingDatum(
            start=date(2020, 12, 3), end=date(2021, 1, 4), cost=132444.44, **shared
        ),
        BillingDatum(
            start=date(2021, 1, 5), end=date(2021, 2, 2), cost=122005.01, **shared
        ),
    ]

    actual = parse_pdf(fixture, "1BPMYVL000231", "kw")
    self.assertEqual(expected, actual)
def _alternate_section(
    filename: str, bill_date: date, meter_number: str, pdf_text: str
) -> List[BillingDatum]:
    """Extract billing data using the alternate sets of regular expressions.

    Two layouts are tried, in order:

    1. Multi-bill ("alt_3_multi"): the PDF at ``filename`` is re-extracted with
       ``extract_pdf_text`` and every match becomes a cost-only bill.
    2. Single bill ("alt1"/"alt2"): dates, usage, cost and peak are searched
       for in the already-extracted ``pdf_text``.

    In both layouts the matched end date is moved back by one day and
    ``bill_date`` is used as the statement date.

    Args:
        filename: path of the bill PDF on disk.
        bill_date: statement date applied to every returned bill.
        meter_number: meter identifier used to build the regexes.
        pdf_text: text previously extracted from the same PDF.

    Returns:
        A non-empty list of BillingDatum.

    Raises:
        Exception: when neither layout produces a bill with dates and a
            non-zero cost.
    """
    regexes = kw_regexes(meter_number)
    # try multiple bills option first:
    with open(filename, "rb") as f:
        pdf_data = f.read()
    # Use PyPDF2 here to extract the individual bill costs beside their bill dates.
    alt_pdf_text = extract_pdf_text(BytesIO(pdf_data))
    sub_bills = re.findall(regexes["alt_3_multi"], alt_pdf_text)
    if sub_bills:
        billing_data = []
        for bill in sub_bills:
            datum = BillingDatum(
                start=parse_date(bill[0]).date(),
                end=parse_date(bill[1]).date() - timedelta(days=1),
                statement=bill_date,
                cost=str_to_float(bill[2]),
                used=None,
                peak=None,
                attachments=None,
                utility_code=None,
                items=None,
            )
            billing_data.append(datum)
            log.info("alternate regex 3: data=%s", datum)
        return billing_data

    date_usage = re.search(regexes["alt1_date_usage"], pdf_text)
    if date_usage:
        used = str_to_float(date_usage.group(3))
    else:
        # The second date pattern carries no usage figure; report zero use.
        date_usage = re.search(regexes["alt2_date_usage"], pdf_text)
        used = 0

    # A missing cost must fall through to the descriptive error below rather
    # than fail with AttributeError on ``None.group``.
    cost_match = re.search(regexes["alt1_cost"], pdf_text)
    cost = str_to_float(cost_match.group(1)) if cost_match else None
    peak_match = re.search(regexes["alt1_peak"], pdf_text, re.DOTALL)

    if date_usage and cost:
        datum = BillingDatum(
            start=parse_date(date_usage.group(1)).date(),
            end=parse_date(date_usage.group(2)).date() - timedelta(days=1),
            statement=bill_date,
            cost=cost,
            used=used,
            peak=str_to_float(peak_match.group(1)) if peak_match else None,
            attachments=None,
            utility_code=None,
            items=None,
        )
        log.info("alternate regex 1: data=%s", datum)
        return [datum]

    # Exception() does not interpolate logging-style arguments; format here so
    # the message actually names the file and meter.
    raise Exception(
        "Error parsing pdf %s for %s: no billing section found"
        % (filename, meter_number)
    )
def test_alt_regexp(self):
    """Bills that need the alternate regular expressions are still parsed."""
    pattern = "datafeeds/scrapers/tests/fixtures/ladwp-%s.pdf"
    # (fixture suffix, meter number, the single bill expected from that fixture)
    cases = [
        (
            "202010",
            "PMY00219-00010473",
            BillingDatum(
                start=date(2020, 9, 2),
                end=date(2020, 10, 1),
                statement=date(2020, 10, 5),
                cost=386.52,
                used=840,
                peak=6,
                items=None,
                attachments=None,
                utility_code=None,
            ),
        ),
        (
            "202007",
            "PMY00219-00010473",
            BillingDatum(
                start=date(2020, 6, 5),
                end=date(2020, 7, 5),
                statement=date(2020, 7, 7),
                cost=357.98,
                used=720,
                peak=4.8,
                items=None,
                attachments=None,
                utility_code=None,
            ),
        ),
        (
            "202001",
            "00106-00095149",
            BillingDatum(
                start=date(2019, 12, 12),
                end=date(2020, 1, 13),
                statement=date(2020, 1, 14),
                cost=47.48,
                used=0,
                peak=None,
                items=None,
                attachments=None,
                utility_code=None,
            ),
        ),
    ]
    for suffix, meter_number, expected_bill in cases:
        self.assertEqual(
            [expected_bill], parse_pdf(pattern % suffix, meter_number, "kw")
        )
def test_third_party_expected_for_NEM_reverse_flow_channel(self):
    """With reverse-flow interval data present, third_party_expected is None.

    A reverse flow channel means we cannot tell whether the service is bundled
    for the billing period, so the converted bill leaves the flag unset.
    """
    period_start = datetime(2019, 2, 28, 0)
    bill = Bill(
        start=period_start,
        duration=timedelta(days=27),
        used_unit="Wh",
        used=0,
        _line_items=[],
        cost=35.51,
        published=datetime(2017, 2, 1),
        usage_point="test_usage_point",
        subscription="test",
        tariff="B10S",
    )

    # The reading type must be flushed first so that its oid is available
    # when the interval data row is created.
    reverse_channel = ReadingType(
        kind="energy",
        commodity="electricity SecondaryMetered",
        unit_of_measure="Wh",
        flow_direction="reverse",
        self_url="https://api.pge.com/GreenButtonConnect/espi/1_1/resource/ReadingType/test='",
        artifact=self.artifact,
    )
    db.session.add(reverse_channel)
    db.session.flush()

    readings = IntervalData(
        usage_point="test_usage_point",
        subscription="test",
        readings=[1] * 96,
        start=period_start,
        duration=timedelta(days=1),
        reading_type_oid=reverse_channel.oid,
        artifact=self.artifact,
        self_url="https://api.pge.com/GreenButtonConnect/espi/1_1/resource/Subscription/test/UsagePoint/"
        "test_usage_point/MeterReading/test==/IntervalBlock/test",
    )
    db.session.add(readings)

    actual = bill.to_billing_datum(self.meter.utility_service)

    period_end = date(2019, 3, 27)
    expected = BillingDatum(
        start=date(2019, 3, 1),
        end=period_end,
        cost=35.51,
        used=0,
        peak=None,
        items=[],
        statement=period_end,
        attachments=None,
        utility_code="B10S",
        utility_account_id=None,
        utility="utility:pge",
        service_id=None,
        third_party_expected=None,
    )
    self.assertEqual(actual, expected)
def test_third_party_expected_for_NEM_tariff(self):
    """A bill on the "NEM A-1-B" tariff converts with third_party_expected=None."""
    tariff = "NEM A-1-B"
    bill = Bill(
        start=datetime(2019, 2, 28, 0),
        duration=timedelta(days=27),
        used_unit="Wh",
        used=0,
        _line_items=[],
        cost=35.51,
        published=datetime(2017, 2, 1),
        usage_point="test_usage_point",
        subscription="test",
        tariff=tariff,
    )

    actual = bill.to_billing_datum(self.meter.utility_service)

    period_end = date(2019, 3, 27)
    expected = BillingDatum(
        start=date(2019, 3, 1),
        end=period_end,
        cost=35.51,
        used=0,
        peak=None,
        items=[],
        statement=period_end,
        attachments=None,
        utility_code=tariff,
        utility_account_id=None,
        utility="utility:pge",
        service_id=None,
        third_party_expected=None,
    )
    self.assertEqual(actual, expected)
def download_and_attach_pdf(
    self, bill_data: BillingDatum, billing_row: sce_pages.BillingDataRow
) -> BillingDatum:
    """Download the PDF for a billing row and attach it to the bill.

    Existing PDF downloads are cleared first. When a PDF is downloaded it is
    uploaded via ``bill_upload.upload_bill_to_s3`` and a copy of ``bill_data``
    carrying that single attachment is returned; otherwise ``bill_data`` is
    returned unchanged.
    """
    self.clear_pdf_downloads()
    pdf_path = self.download_pdf_for_billing_row(billing_row)

    if not pdf_path:
        log.info(
            "No pdf bill was available for this period: %s to %s",
            bill_data.start,
            bill_data.end,
        )
        return bill_data

    # The upload reads from the open file handle, so it must happen inside
    # the ``with`` block.
    with open(pdf_path, "rb") as pdf_file:
        upload_key = bill_upload.hash_bill_datum(self.service_id, bill_data) + ".pdf"
        attachment = bill_upload.upload_bill_to_s3(
            pdf_file,
            upload_key,
            statement=bill_data.statement,
            source="sce.com",
            utility=self.utility,
            utility_account_id=self.utility_account_id,
        )
        return bill_data._replace(attachments=[attachment])
def test_bill_conversion_gas(self):
    """An SMD gas bill (therms) converts to a BillingDatum with matching fields."""
    bill = Bill(
        start=datetime(2017, 4, 25, 7),
        duration=timedelta(days=29),
        used_unit="therm",
        used=4400,
        _line_items=[],
        cost=4609.48,
    )
    actual = bill.to_billing_datum()

    period_end = date(2017, 5, 24)
    expected = BillingDatum(
        start=date(2017, 4, 26),
        end=period_end,
        cost=4609.48,
        used=4400,
        peak=None,
        items=[],
        statement=period_end,
        attachments=None,
        utility_code="",
    )

    # Only these fields are compared; statement, attachments and utility_code
    # are deliberately left out of the check.
    for field in ("start", "end", "cost", "used", "peak", "items"):
        self.assertEqual(getattr(expected, field), getattr(actual, field))
def test_third_party_expected_for_gas(self):
    """A gas (therm) bill converts with third_party_expected left as None."""
    gas_bill = Bill(
        start=datetime(2017, 1, 1, 7),
        duration=timedelta(days=29),
        used_unit="therm",
        used=679.0,
        _line_items=[],
        cost=625.64,
        published=datetime(2017, 2, 1),
        usage_point="test_usage_point",
        subscription="test",
    )

    actual = gas_bill.to_billing_datum(self.meter.utility_service)

    period_end = date(2017, 1, 30)
    expected = BillingDatum(
        start=date(2017, 1, 2),
        end=period_end,
        cost=625.64,
        used=679.0,
        peak=None,
        items=[],
        statement=period_end,
        attachments=None,
        utility_code=None,
        utility_account_id=None,
        utility="utility:pge",
        service_id=None,
        third_party_expected=None,
    )
    self.assertEqual(actual, expected)
def parse_bill_text(text: str, meter_number: str) -> Optional[BillingDatum]:
    """Build a BillingDatum from the electric-service section for a meter.

    The text is split on "ITEM <n> - " headers; the first section that starts
    with "ELECTRIC SERVICE" and mentions ``meter_number`` is parsed.

    Returns:
        The parsed bill, or None when no such section exists or when the
        amount, period, use or peak could not be extracted. The statement
        date falls back to the period end when it is not found.
    """
    section = next(
        (
            chunk
            for chunk in re.split(r"ITEM \d+ - ", text)
            if chunk.startswith("ELECTRIC SERVICE") and meter_number in chunk
        ),
        None,
    )
    if section is None:
        return None

    amount = extract_amount(section)
    period = extract_period(section, meter_number)
    use = extract_use(section, meter_number)
    peak = extract_peak(section, meter_number)
    statement_date = extract_statement_date(section)
    utility_code = extract_utility_code(section)

    # All four core values are required; statement date and utility code are
    # optional.
    if any(value is None for value in (amount, period, use, peak)):
        return None

    period_start, period_end = period[0], period[1]
    return BillingDatum(
        start=period_start,
        end=period_end,
        statement=period_end if statement_date is None else statement_date,
        cost=amount,
        used=use,
        peak=peak,
        items=None,
        attachments=None,
        utility_code=utility_code,
    )
def process_bill(
    text: str, service_account: str, meter_serial: str
) -> Optional[BillingDatum]:
    """Parse one bill's text into a BillingDatum for the given account and meter.

    Args:
        text: text extracted from a single bill.
        service_account: account number the bill must belong to.
        meter_serial: meter serial that precedes the billing dates in the text.

    Returns:
        The parsed bill, or None when the text is for a different service
        account or when the total due, billing dates or usage cannot be found
        or parsed.
    """
    matches = _service_account_pattern.search(text)
    if not matches or matches.group(1) != service_account:
        return None

    # The serial is matched literally: escape it so that a serial containing a
    # regex metacharacter can neither mis-match nor raise re.error.
    bill_dates_pattern = (
        r"FromToPreviousPresent%s(\d+/\d+/\d\d)(\d+/\d+/\d\d)"
        % re.escape(meter_serial)
    )

    try:
        # A failed search returns None, so ``.group`` raises AttributeError;
        # an unparseable number or date raises ValueError. Either way the bill
        # is skipped.
        total_due = float(_total_due_pattern.search(text).group(1))
        date_match = re.search(bill_dates_pattern, text)
        start = datetime.strptime(date_match.group(1), "%m/%d/%y").date()
        end = datetime.strptime(date_match.group(2), "%m/%d/%y").date()
        use = float(_use_pattern.search(text).group(1)) * 1.036  # Convert CCF to therms.
    except (ValueError, AttributeError):
        return None

    return BillingDatum(
        start=start,
        end=end,
        # statement date is not visible in the bill PDF text; use end date
        statement=end,
        cost=total_due,
        used=use,
        peak=None,
        items=None,
        attachments=None,
        utility_code=None,
    )
def view_generation_usage_action(
    self, page: sce_pages.SceBilledGenerationUsageModal
):
    """Scrape generation bill data and store it on ``self.gen_billing_history``.

    Generation charges are displayed on a different modal than bundled/T&D
    bills. Each entry of ``self.billing_history`` is copied with its cost
    replaced by the generation value found for its end date; entries without
    a generation value are skipped.
    """
    gen_values = page.parse_data()
    log.debug("generation values=%s", gen_values)

    gen_billing_objects: List[BillingDatum] = []
    for item in self.billing_history:
        # get generation cost for this bill date; try +- 1 day
        # (lookup order: end + 1 day, end, end - 1 day; first hit wins)
        candidates = (
            gen_values.get(item.end - timedelta(days=offset))
            for offset in (-1, 0, 1)
        )
        value = next((found for found in candidates if found is not None), None)
        if value is None:
            log.debug("no generation data for %s; skipping", item.end)
            continue

        generation_bill = BillingDatum(
            start=item.start,
            end=item.end,
            statement=item.statement,
            cost=value,
            used=item.used,
            peak=item.peak,
            items=item.items,
            attachments=item.attachments,
            utility_code=self.utility_tariff_code,
            service_id=self.gen_service_id,
            utility="utility:clean-power-alliance",
        )
        gen_billing_objects.append(generation_bill)

    log.info("created %s generation billing objects", len(gen_billing_objects))
    log.debug("gen_billing_objects=%s", gen_billing_objects)
    self.gen_billing_history = gen_billing_objects
def test_update_billing_range(self, index_etl_run):
    """update_billing_range indexes the overall min start / max end of the bills.

    ``index_etl_run`` is the injected stand-in whose calls are inspected
    (presumably a mock supplied by a patch decorator — confirm at the
    decorator site).
    """
    task_id = "abc123"

    # no bills: nothing should be indexed
    bills: BillingData = []
    index.update_billing_range(task_id, bills)
    self.assertEqual(0, index_etl_run.call_count)

    # with bills: three back-to-back 30-day periods, walking backwards in time
    period_end = date.today() - timedelta(days=7)
    for _ in range(3):
        period_start = period_end - timedelta(days=30)
        bills.append(
            BillingDatum(
                start=period_start,
                end=period_end,
                statement=period_end,
                cost=100.0,
                used=100.0,
                peak=10.0,
                items=[],
                attachments=[],
                utility_code="ABC",
            )
        )
        period_end = period_start - timedelta(days=1)

    index.update_billing_range(task_id, bills)

    expected = {
        "billingFrom": min(bill.start for bill in bills),
        "billingTo": max(bill.end for bill in bills),
    }
    # Second positional argument of the most recent call.
    self.assertEqual(expected, index_etl_run.call_args[0][1])
def _capture_current_rows(
    self, search_start: date, search_end: date
) -> List[Tuple[BillingDatum, Optional[bytes]]]:
    """Capture all of the billing data presented in the on-screen table.

    Rows whose period is not fully inside [search_start, search_end] are
    skipped. For every remaining row the statement PDF link is clicked and,
    if exactly one PDF is then present in the driver's download directory,
    its bytes are read and the file is removed.

    Returns:
        A list of (bill, pdf bytes or None) pairs, one per captured row.
    """
    date_format = "%m/%d/%Y"
    captured = []

    for row in self.driver.find_elements_by_xpath("//tbody/tr"):
        log.info("Acquiring bill data.")
        period_parts = row.find_element_by_xpath("./td[3]").text.split(" - ")
        if len(period_parts) != 2:
            continue

        start = datetime.strptime(period_parts[0], date_format).date()
        end = datetime.strptime(period_parts[1], date_format).date()
        if not (search_start <= start and end <= search_end):
            continue  # This bill is irrelevant to the current scraping run.

        charge_text = row.find_element_by_xpath("./td[5]").text
        cost = float(charge_text.translate(str.maketrans("", "", "$,")))
        used = float(row.find_element_by_xpath("./td[9]").text)

        log.info("Downloading PDF.")
        statement_link = row.find_element_by_xpath(".//a")
        statement = datetime.strptime(statement_link.text, date_format).date()
        statement_link.click()

        # PDFs look like <UUID>.pdf
        time.sleep(5)  # Wait for PDF to fully download.
        downloaded = glob(os.path.join(self.driver.download_dir, "*.pdf"))
        pdf_bytes = None
        if len(downloaded) == 1:
            pdf_path = downloaded[0]
            with open(pdf_path, "rb") as pdf_file:
                pdf_bytes = pdf_file.read()
            log.info("Acquired PDF from %s." % pdf_path)
            os.remove(pdf_path)

        captured.append(
            (
                BillingDatum(
                    start=start,
                    end=end,
                    statement=statement,
                    cost=cost,
                    used=used,
                    peak=None,
                    items=None,
                    attachments=None,
                    utility_code=None,
                ),
                pdf_bytes,
            )
        )

    if not captured:
        log.info("No billing history was present.")
    return captured
def make_billing_datum(self, bill_detail: BillPeriodDetails) -> BillingDatum:
    """Convert a billing detail summary from the website to a Gridium BillingDatum object.

    The statement date is read from the ``Date=yyyy-mm-dd`` part of the
    download link; when the link is missing, has no date, or the date cannot
    be parsed, the bill's end date is used. If a PDF can be downloaded and
    uploaded, it is attached to the returned bill.
    """
    # get statement date from link: Date=yyyy-mm-dd
    statement = None
    link = bill_detail.download_link
    found = re.search(r"Date=(\d\d\d\d-\d\d-\d\d)", link) if link else None
    if found:
        raw_date = found.group(1)
        try:
            statement = parse_date(raw_date).date()
        except Exception as exc:
            log.warning("error parsing date %s: %s", raw_date, exc)
    statement = statement or bill_detail.end

    bill_datum = BillingDatum(
        start=bill_detail.start,
        end=bill_detail.end,
        statement=statement,
        cost=bill_detail.total_charges,
        used=bill_detail.total_kwh,
        peak=bill_detail.max_kw,
        items=None,
        attachments=None,
        utility_code=bill_detail.utility_code,
    )

    pdf_bytes = self.download_pdf(bill_detail)
    if not pdf_bytes:
        return bill_datum

    attachment_entry = bill_upload.upload_bill_to_s3(
        BytesIO(pdf_bytes),
        bill_upload.hash_bill_datum(self.account_id, bill_datum),
        source="smud.org",
        statement=statement,
        utility=self.utility,
        utility_account_id=self.account_id,
    )
    if attachment_entry:
        return bill_datum._replace(attachments=[attachment_entry])
    return bill_datum
def _execute(self):
    """Log in, download bill PDFs, parse and upload them, and return the bills.

    Bills that cannot be parsed are dropped. Each start date is moved forward
    by one day before the final ``adjust_bill_dates`` pass.
    """
    home_page = LoginPage(self._driver).login(self.keller_id, self.password)
    self.screenshot("home_page")
    bill_history_page = home_page.to_bill_history()
    self.screenshot("bill_history_page")

    bills = bill_history_page.gather_data(
        self.keller_id, self.start_date, self.end_date
    )
    total_bytes = sum(len(pdf) for pdf in bills)
    log.info("Acquired %d bills (%s bytes total)." % (len(bills), total_bytes))

    parsed_bills = []
    for pdf in bills:
        datum = parse_bill_pdf(BytesIO(pdf))
        if datum is None:
            continue

        # bill doesn't have a statement date; use end date
        attachment_entry = bill_upload.upload_bill_to_s3(
            BytesIO(pdf),
            bill_upload.hash_bill_datum(self.account_number, datum),
            statement=datum.end,
            source="cityofkeller.com",
            utility=self.utility,
            utility_account_id=self.account_number,
        )
        parsed_bills.append(
            datum._replace(attachments=[attachment_entry])
            if attachment_entry
            else datum
        )

    # bill periods overlap; adjust start dates
    adjusted_bill_data = [
        BillingDatum(
            start=bill.start + timedelta(days=1),
            end=bill.end,
            statement=bill.statement,
            cost=bill.cost,
            used=bill.used,
            peak=bill.peak,
            items=bill.items,
            attachments=bill.attachments,
            utility_code=None,
        )
        for bill in parsed_bills
    ]

    final_bills = adjust_bill_dates(adjusted_bill_data)
    show_bill_summary(final_bills, "Final Bill Summary")
    return Results(bills=final_bills)
def process_pdf(
    utility: str,
    utility_account_id: str,
    service_id: str,
    statement_dt: date,
    pdf_filename: str,
) -> BillingDatum:
    """Parse a bill PDF, upload it, and return the resulting BillingDatum.

    Cost, usage, demand, billing dates and line items are extracted from the
    PDF text. The dates are then corrected for year roll-over and the end
    date is moved back one day before the bill is hashed and uploaded.
    """
    log.info("Parsing text from PDF %s", pdf_filename)
    text = pdfparser.pdf_to_str(pdf_filename)

    cost = extract_cost(text)
    used = extract_used(text)
    demand = extract_demand(text)
    period_start, period_end = extract_dates(text)

    # if the start date is in the wrong year, replace year
    # (start_date = 12/1, statement_dt=12/15/2020)
    if period_start > statement_dt:
        statement_year = statement_dt.year
        period_start = period_start.replace(year=statement_year)
        period_end = period_end.replace(year=statement_year)

    # end_date must be after start date (end_date = 1/5, start_date = 12/1)
    if period_end < period_start:
        period_end = period_end.replace(year=period_end.year + 1)

    # adjust end date because SVP bills overlap on start/end dates
    period_end -= timedelta(days=1)

    line_items: List[BillingDatumItemsEntry] = extract_line_items(text)
    key = hash_bill(service_id, period_start, period_end, cost, demand, used)

    with open(pdf_filename, "rb") as pdf_file:
        pdf_bytes = pdf_file.read()
    # NOTE(review): the upload is tagged with the adjusted end date while the
    # returned bill uses statement_dt — confirm this difference is intended.
    attachment_entry = upload_bill_to_s3(
        BytesIO(pdf_bytes),
        key,
        source="mua.santaclaraca.gov",
        statement=period_end,
        utility=utility,
        utility_account_id=utility_account_id,
    )

    return BillingDatum(
        start=period_start,
        end=period_end,
        statement=statement_dt,
        cost=cost,
        used=used,
        peak=demand,
        items=line_items,
        attachments=[attachment_entry],
        utility_code=None,
    )
def test_bill_parse_04(self):
    """self.content[3] parses to the 2019-05-21 .. 2019-06-21 bill."""
    period_end = date(2019, 6, 21)
    expected = BillingDatum(
        start=date(2019, 5, 21),
        end=period_end,
        statement=period_end,  # statement date equals the period end
        cost=332.96,
        used=43500 / 748.052,
        peak=None,
        items=None,
        attachments=None,
        utility_code=None,
    )
    self.assertEqual(expected, parse_bill_pdf(self.content[3]))
def test_bill_parse_06(self):
    """self.content[5] parses to the 2019-03-20 .. 2019-04-22 bill."""
    period_end = date(2019, 4, 22)
    expected = BillingDatum(
        start=date(2019, 3, 20),
        end=period_end,
        statement=period_end,  # statement date equals the period end
        cost=325.54,
        used=42500 / 748.052,
        peak=None,
        items=None,
        attachments=None,
        utility_code=None,
    )
    self.assertEqual(expected, parse_bill_pdf(self.content[5]))
def test_bill_parse_05(self):
    """self.content[4] parses to a zero-usage bill for 2019-04-22 .. 2019-05-21."""
    period_end = date(2019, 5, 21)
    expected = BillingDatum(
        start=date(2019, 4, 22),
        end=period_end,
        statement=period_end,  # statement date equals the period end
        cost=71.44,
        used=0.0,
        peak=None,
        items=None,
        attachments=None,
        utility_code=None,
    )
    self.assertEqual(expected, parse_bill_pdf(self.content[4]))
def test_bill_parse_08(self):
    """self.content[7] parses to a zero-usage bill for 2019-08-21 .. 2019-09-20."""
    period_end = date(2019, 9, 20)
    expected = BillingDatum(
        start=date(2019, 8, 21),
        end=period_end,
        statement=period_end,  # statement date equals the period end
        cost=52.70,
        used=0.0,
        peak=None,
        items=None,
        attachments=None,
        utility_code=None,
    )
    self.assertEqual(expected, parse_bill_pdf(self.content[7]))
def test_bill_parse_02(self):
    """self.content[1] parses to the 2019-04-22 .. 2019-05-21 bill."""
    period_end = date(2019, 5, 21)
    expected = BillingDatum(
        start=date(2019, 4, 22),
        end=period_end,
        statement=period_end,  # statement date equals the period end
        cost=1014.22,
        used=73800 / 748.052,
        peak=None,
        items=None,
        attachments=None,
        utility_code=None,
    )
    self.assertEqual(expected, parse_bill_pdf(self.content[1]))
def test_bill_parse_09(self):
    """self.content[8] parses to the 2019-08-21 .. 2019-09-19 bill."""
    period_end = date(2019, 9, 19)
    expected = BillingDatum(
        start=date(2019, 8, 21),
        end=period_end,
        statement=period_end,  # statement date equals the period end
        cost=1270.05,
        used=104400.0 / 748.052,
        peak=None,
        items=None,
        attachments=None,
        utility_code=None,
    )
    self.assertEqual(expected, parse_bill_pdf(self.content[8]))
def test_bill_parse_05(self):
    """pacific_power_test_04.pdf, meter 13714552: usage is the sum of two readings."""
    pdf = self.data["pacific_power_test_04.pdf"]
    expected = BillingDatum(
        start=date(2017, 2, 1),
        end=date(2017, 3, 1),
        statement=date(2017, 3, 9),
        cost=780622.38,
        used=8271000.0 + 6061000.0,
        peak=35767.0,
        items=None,
        attachments=None,
        utility_code="Schedule 748",
    )
    self.assertEqual(expected, pacific_power.parse_bill_pdf(pdf, "13714552"))
def test_bill_parse_04(self):
    """pacific_power_test_03.pdf, meter 78585187: zero usage with a 5.0 peak."""
    pdf = self.data["pacific_power_test_03.pdf"]
    expected = BillingDatum(
        start=date(2019, 5, 31),
        end=date(2019, 6, 30),
        statement=date(2019, 7, 18),
        cost=185.86,
        used=0.0,
        peak=5.0,
        items=None,
        attachments=None,
        utility_code="Schedule 28",
    )
    self.assertEqual(expected, pacific_power.parse_bill_pdf(pdf, "78585187"))
def test_bill_parse_02(self):
    """pacific_power_test_01.pdf, meter 66887643: Schedule 28 bill is parsed."""
    pdf = self.data["pacific_power_test_01.pdf"]
    expected = BillingDatum(
        start=date(2019, 6, 3),
        end=date(2019, 7, 2),
        statement=date(2019, 7, 10),
        cost=732.86,
        used=6972.0,
        peak=26.0,
        items=None,
        attachments=None,
        utility_code="Schedule 28",
    )
    self.assertEqual(expected, pacific_power.parse_bill_pdf(pdf, "66887643"))
def test_bill_parse_01(self):
    """pacific_power_test_01.pdf, meter 13714552: usage is the sum of two readings."""
    pdf = self.data["pacific_power_test_01.pdf"]
    expected = BillingDatum(
        start=date(2019, 6, 1),
        end=date(2019, 7, 1),
        statement=date(2019, 7, 10),
        cost=850402.64,
        used=11300000 + 9143000,
        peak=29960,
        items=None,
        attachments=None,
        utility_code="Schedule 748",
    )
    self.assertEqual(expected, pacific_power.parse_bill_pdf(pdf, "13714552"))
def test_bill_parse_07(self):
    """pacific_power_test_06.pdf, meter 13714552: bill spanning a year boundary."""
    pdf = self.data["pacific_power_test_06.pdf"]
    expected = BillingDatum(
        start=date(2017, 12, 1),
        end=date(2018, 1, 1),
        statement=date(2018, 1, 12),
        cost=914721.55,
        used=10616000.0 + 9113000.0,
        peak=27278.0,
        items=None,
        attachments=None,
        utility_code="Schedule 748",
    )
    self.assertEqual(expected, pacific_power.parse_bill_pdf(pdf, "13714552"))
def test_bill_parse_06(self):
    """pacific_power_test_05.pdf, meter 66887643: Schedule 23 bill is parsed."""
    pdf = self.data["pacific_power_test_05.pdf"]
    expected = BillingDatum(
        start=date(2017, 7, 3),
        end=date(2017, 8, 2),
        statement=date(2017, 8, 7),
        cost=222.02,
        used=1846,
        peak=11,
        items=None,
        attachments=None,
        utility_code="Schedule 23",
    )
    self.assertEqual(expected, pacific_power.parse_bill_pdf(pdf, "66887643"))
def test_single_account(self, _notify):
    """A single-account bill parses to exactly one BillingDatum."""
    fixture = "datafeeds/scrapers/tests/fixtures/ladwp-single.pdf"
    expected_bills = [
        BillingDatum(
            start=date(2020, 9, 10),
            end=date(2020, 10, 12),
            statement=date(2020, 10, 13),
            cost=115955.98,
            used=571680,
            peak=1215.36,
            items=None,
            attachments=None,
            utility_code=None,
        )
    ]
    actual_bills = parse_pdf(fixture, "PMY2V00231-00001054", "kw")
    self.assertEqual(expected_bills, actual_bills)