def process_pdf(
    utility: str,
    utility_account_id: str,
    service_id: str,
    statement_dt: date,
    pdf_filename: str,
) -> BillingDatum:
    """Build a BillingDatum from a bill PDF and attach the uploaded PDF to it.

    The PDF text is extracted once; cost, usage, demand, the billing period
    and the line items are then pulled from that text by the ``extract_*``
    helpers. The raw PDF bytes are handed to ``upload_bill_to_s3`` and the
    entry it returns becomes the datum's single attachment.

    Args:
        utility: utility identifier forwarded to ``upload_bill_to_s3``.
        utility_account_id: account identifier forwarded to
            ``upload_bill_to_s3``.
        service_id: identifier used as the first component of the bill hash.
        statement_dt: statement date; used to correct the year of the parsed
            period and stored as the datum's ``statement``.
        pdf_filename: path of the PDF to parse and upload.

    Returns:
        A BillingDatum whose ``end`` is one day before the end date printed
        on the bill.
    """
    log.info("Parsing text from PDF %s", pdf_filename)
    bill_text = pdfparser.pdf_to_str(pdf_filename)

    total_cost = extract_cost(bill_text)
    total_used = extract_used(bill_text)
    peak_demand = extract_demand(bill_text)
    period_start, period_end = extract_dates(bill_text)

    # A start date later than the statement date means the parsed year is
    # wrong (e.g. start 12/1 with a 12/15/2020 statement): force both dates
    # into the statement's year.
    if period_start > statement_dt:
        period_start = period_start.replace(year=statement_dt.year)
        period_end = period_end.replace(year=statement_dt.year)

    # The period must not end before it starts (e.g. start 12/1, end 1/5):
    # roll the end date into the following year.
    if period_end < period_start:
        period_end = period_end.replace(year=period_end.year + 1)

    # adjust end date because SVP bills overlap on start/end dates
    period_end -= timedelta(days=1)

    items: List[BillingDatumItemsEntry] = extract_line_items(bill_text)

    bill_key = hash_bill(
        service_id,
        period_start,
        period_end,
        total_cost,
        peak_demand,
        total_used,
    )

    with open(pdf_filename, "rb") as pdf_file:
        pdf_bytes = BytesIO(pdf_file.read())
        attachment = upload_bill_to_s3(
            pdf_bytes,
            bill_key,
            source="mua.santaclaraca.gov",
            statement=period_end,
            utility=utility,
            utility_account_id=utility_account_id,
        )

    return BillingDatum(
        start=period_start,
        end=period_end,
        statement=statement_dt,
        cost=total_cost,
        used=total_used,
        peak=peak_demand,
        items=items,
        attachments=[attachment],
        utility_code=None,
    )
def parse_pdf(pdf_filename: str, utility: str, utility_account_id: str) -> BillingDatum:
    """Parse a bill PDF and return its data with the uploaded PDF attached.

    The PDF text decides which parser runs: text containing
    ``"Your Energy Bill"`` goes to ``parse_new_pdf``, anything else to
    ``parse_old_pdf``. The PDF bytes are then handed to ``upload_bill_to_s3``
    under a key derived from the parsed values.

    Args:
        pdf_filename: path of the PDF to parse and upload.
        utility: utility identifier forwarded to ``upload_bill_to_s3``.
        utility_account_id: account identifier; used both in the bill hash
            and in the upload call.

    Returns:
        The parsed datum with ``attachments`` replaced by a one-element list
        holding the upload result.
    """
    pdf_text = pdfparser.pdf_to_str(pdf_filename)

    if "Your Energy Bill" not in pdf_text:
        log.info("parsing old-style PDF %s", pdf_filename)
        parsed = parse_old_pdf(pdf_text)
    else:
        log.info("parsing new-style PDF %s", pdf_filename)
        parsed = parse_new_pdf(pdf_text)

    bill_key = hash_bill(
        utility_account_id,
        parsed.start,
        parsed.end,
        parsed.cost,
        parsed.peak,
        parsed.used,
    )

    with open(pdf_filename, "rb") as pdf_file:
        pdf_bytes = BytesIO(pdf_file.read())
        attachment = upload_bill_to_s3(
            pdf_bytes,
            bill_key,
            source="www.duke-energy.com",
            statement=parsed.end,
            utility=utility,
            utility_account_id=utility_account_id,
        )

    return parsed._replace(attachments=[attachment])
def get_bills(self, utility: str, utility_account_id: str) -> List[BillingDatum]:
    """Scrape bill data and a PDF for every listed bill date in range.

    For each bill link whose date lies within
    ``[self.start_date, self.end_date]`` this opens the bill view, reads the
    service period, cost, kWh usage and kW readings from the page, prints
    the bill image to a PDF in ``self.download_dir``, renames that file to
    ``bill_<YYYY-MM-DD>.pdf``, hands it to ``upload_bill_to_s3`` and
    navigates back to the bill list.

    Args:
        utility: utility identifier forwarded to ``upload_bill_to_s3``.
        utility_account_id: account identifier; used both in the bill hash
            and in the upload call.

    Returns:
        One BillingDatum per in-range bill, in ascending date order.

    Raises:
        Exception: if ``file_exists_in_dir`` raises for the printed PDF.
    """
    billing_data = []

    # Collect the dates shown on the bill links of the bills table.
    available_dates = self.driver.find_elements(
        By.CSS_SELECTOR, "table.table-alt a.bill-view-link")
    available_dates = [parse_date(i.text).date() for i in available_dates]
    log.info("available dates: %s", [dt.strftime("%Y-%m-%d") for dt in available_dates])

    # XPath expressions for the cost cell, the cells containing KWH, and the
    # cells containing KW but not KWH.
    xpath_locators = {
        "cost": "//table[contains(., 'NEW CHARGES')]/tbody/tr/td[3]",
        "used": "(//table[contains(.,'USAGE')]//tr/td[contains(., 'KWH')])",
        "usage_kw": "//table[contains(.,'USAGE')]//tr/td[contains(.,'KW') and not(contains(.,'KWH'))]",
    }

    # loop through dates in table in ascending order
    # (presumably the table lists newest first, hence reversed() -- confirm)
    for pdf_date in reversed(available_dates):
        # skip if the date isn't in the specified range
        if not (self.start_date <= pdf_date <= self.end_date):
            log.debug("skipping date outside range: %s", pdf_date)
            continue

        # Open the bill view for this date and wait for the bill image.
        view_bill_link = self.driver.find_element_by_xpath(
            '//a[.="%s"]' % pdf_date.strftime("%m/%d/%Y"))
        scroll_to(self.driver, view_bill_link)
        self.driver.sleep(0.5)
        view_bill_link.click()

        self.driver.wait(30).until(
            EC.visibility_of_element_located(
                (By.CSS_SELECTOR, "div.billImage")))

        # Defaults used when a value can't be found on the page.
        start_date = None
        end_date = None
        cost = None
        used = None
        peak = None

        # The service period is printed without a year, e.g.
        # "Service From: NOV 13 to DEC 12 (29 Days)".
        dates_line_text: str = self.driver.find_element_by_xpath(
            "//td[contains(., 'Service From:')]").text
        dates_match = re.search(
            r"Service From: (?P<from>\w+ \d\d) to (?P<to>\w+ \d\d) \(\d\d Days\)",
            dates_line_text,
        )

        if dates_match:
            # if from month is December, use previous year
            year = (pdf_date.year - 1 if "dec" in dates_match.group("from").lower() else pdf_date.year)
            start_date = parse_date("%s %s" % (dates_match.group("from"), year))
            end_date = parse_date(
                dates_match.group("to") + pdf_date.strftime(" %Y"))

        cost_match = self.driver.find(xpath_locators["cost"], xpath=True)
        if cost_match:
            cost = cost_match.text
            cost = float(cost.replace("$", "").replace(",", ""))

        # Total usage is the sum of every KWH cell that sits in a row with
        # reading values.
        kwh_usages = []
        for match in self.driver.find_all(xpath_locators["used"], xpath=True):
            # include only if it has a reading values as siblings; exclude credit line items
            parent = match.find_element_by_xpath("..")
            # meter number, previous reading, current reading
            readings_text = ""
            for idx, child in enumerate(
                    parent.find_elements_by_xpath(".//td")):
                log.debug("\t%s\t%s", idx, child.text.strip())
                readings_text += child.text.strip()
                # only the first three cells are inspected
                if idx == 2:
                    break
            if not readings_text:
                log.info("skipping non-reading line item: %s", parent.text)
                continue
            kwh_value = float(
                match.text.replace("KWH", "").replace(",", "").strip())
            kwh_usages.append(kwh_value)

        if kwh_usages:
            used = sum(kwh_usages)

        # Peak is the largest KW value found.
        kw_usages = []
        for usage_kw_match in self.driver.find_all(
                xpath_locators["usage_kw"], xpath=True):
            kw_usages.append(
                float(
                    usage_kw_match.text.replace("KW", "").replace(",", "").strip()))

        if kw_usages:
            peak = max(kw_usages)

        # NOTE(review): when the "Service From:" regex does not match,
        # end_date is still None here and the subtraction raises TypeError.
        data = BillingDatum(
            start=start_date,
            end=end_date - timedelta(days=1),
            statement=end_date - timedelta(days=1),
            cost=cost,
            peak=peak,
            used=used,
            items=None,
            attachments=None,
            utility_code=None,
        )

        # Open the printable bill image and switch to the most recently
        # opened window.
        self.driver.find("a#billImageToPrint").click()
        self.driver.sleep(1)
        self.driver.switch_to.window(self.driver.window_handles[-1])

        # the filename of the printed pdf is f"{current page title}.pdf"
        self.driver.execute_script("window.print();")

        # NOTE(review): the return value of file_exists_in_dir is discarded;
        # this only detects a failed download if the helper itself raises --
        # confirm against its implementation.
        try:
            file_exists_in_dir(directory=self.download_dir, pattern=r"^Bill View Bill Image.pdf$")
        except Exception:
            raise Exception("Unable to download file for %s" % pdf_date)

        # Rename the printed file so each bill date gets its own PDF.
        curr_path = os.path.join(self.download_dir, "Bill View Bill Image.pdf")
        new_path = os.path.join(
            self.download_dir, f"bill_{pdf_date.strftime('%Y-%m-%d')}.pdf")
        os.rename(curr_path, new_path)

        log.info("parsed bill for %s - %s", data.start, data.end)

        # Close the print view and switch to the last remaining window.
        self.driver.find("a#close").click()
        self.driver.sleep(1)
        self.driver.switch_to.window(self.driver.window_handles[-1])
        self.driver.sleep(1)

        # upload PDF:
        key = hash_bill(
            utility_account_id,
            data.start,
            data.end,
            data.cost,
            data.peak,
            data.used,
        )

        with open(new_path, "rb") as pdf_data:
            attachment_entry = upload_bill_to_s3(
                BytesIO(pdf_data.read()),
                key,
                source="www.duke-energy.com",
                statement=data.end,
                utility=utility,
                utility_account_id=utility_account_id,
            )

        # A falsy upload result leaves attachments as None.
        if attachment_entry:
            data = data._replace(attachments=[attachment_entry])
        billing_data.append(data)

        # Click Bill Information in breadcrumbs to go back to bills list page
        self.driver.find("a#billInformation").click()

    return billing_data
def extract_bill_data(pdf_filename, service_id, utility, utility_account_id) -> Optional[BillingDatum]:
    """Parse a bill PDF into a BillingDatum, optionally uploading the PDF.

    Current charges, energy usage and on-/off-peak demand are read from the
    PDF text with regular expressions; the billing period comes from
    ``extract_bill_period``. When the ``S3_BILL_UPLOAD`` config flag is
    enabled, the open file is passed to ``upload_bill_to_s3`` and the result
    becomes the datum's attachment.

    Args:
        pdf_filename: path of the PDF to parse.
        service_id: identifier used as the first component of the bill hash.
        utility: utility identifier forwarded to ``upload_bill_to_s3``.
        utility_account_id: account identifier forwarded to
            ``upload_bill_to_s3``.

    Returns:
        The parsed BillingDatum, or None if the file raises PDFSyntaxError
        while being converted to text.

    Note:
        Every regex here is assumed to match: a missing match raises
        AttributeError, and if no line of the "Current Charges" section
        starts with a digit, comma or period, ``current_charges`` is never
        assigned.
    """
    # this function should upload the file to s3 to set attachments?
    try:
        pdf_text = pdf_to_str(pdf_filename)
    except PDFSyntaxError:
        log.exception("Downloaded bill file failed to parse as a PDF.")
        return None

    charges_section = re.search(
        "Current Charges(.*?)Cycle", pdf_text, re.DOTALL
    ).group(1)
    for raw_line in charges_section.split("\n"):
        candidate = raw_line.strip()
        # get the last number: each matching line overwrites the previous one
        if re.match(r"[\d,\.]", candidate):
            current_charges = candidate.replace(",", "")

    period_start, period_end = extract_bill_period(pdf_filename)

    usage = re.search(r"Energy Charges \((\d*) kWh\)", pdf_text).group(1)
    on_peak_demand = re.search(
        r"On-Peak Demand \((\d+\.\d+)\ KW", pdf_text
    ).group(1)
    offpeak_demand = re.search(
        r"Off-Peak Demand \((\d+\.\d+)\ KW", pdf_text
    ).group(1)

    attachments = []
    if config.enabled("S3_BILL_UPLOAD"):
        log.info("S3_BILL_UPLOAD is enabled")
        with open(pdf_filename, "rb") as pdf_file:
            bill_key = hash_bill(
                service_id,
                period_start,
                period_end,
                _format_number(current_charges),
                0,
                _format_number(usage),
            )
            # no statement date; use end date
            upload_result = upload_bill_to_s3(
                pdf_file,
                bill_key,
                source="portlandgeneral.com",
                statement=period_end,
                utility=utility,
                utility_account_id=utility_account_id,
            )
            attachments.append(upload_result)
        log.info("Uploaded bill %s to s3", attachments)

    return BillingDatum(
        start=period_start,
        end=period_end,
        statement=period_end,
        cost=_format_number(current_charges),
        used=_format_number(usage),
        # peak is the larger of the on-peak and off-peak demand figures
        peak=max(
            float(on_peak_demand),
            float(offpeak_demand),
        ),
        items=[],
        attachments=attachments,
        utility_code=None,
    )
def parse_poway_pdf(pdf_filename: str, account_id: str) -> BillingDatum:
    """Parse a City of Poway bill PDF into a BillingDatum.

    Read dates, statement date, consumption and the water charge are pulled
    from the PDF text with regular expressions. When the ``S3_BILL_UPLOAD``
    config flag is enabled, the PDF bytes are passed to
    ``upload_bill_to_s3`` and the result becomes the datum's attachment.

    Args:
        pdf_filename: path of the PDF to parse.
        account_id: utility account identifier; used both in the bill hash
            and in the upload call.

    Returns:
        A BillingDatum whose ``end`` is one day before the end read date
        printed on the bill, with ``peak`` and ``items`` set to None.

    Raises:
        InvalidMeterDataException: if the dates, the usage or the cost
            cannot be found in the PDF text.
    """
    text = pdfparser.pdf_to_str(pdf_filename)

    used_pattern = r"Consumption (?P<units_used>[\d\.,]+) @"
    cost_pattern = r"(?P<water_charges>[\d\.,]+)\s+WATERBasic Service @"

    # date format: m/d/yyyy
    date_pattern = r"\d{1,2}\/\d{1,2}\/\d{4}"
    # After "Total Current Charges" the text holds the read period, then the
    # due date and the statement date with no separator between them.
    dates_pattern = (
        r"Total Current Charges.+?"
        fr"(?P<read_date_start>{date_pattern}) - (?P<read_date_end>{date_pattern})"
        fr"(?P<due_date>{date_pattern})"
        fr"(?P<statement_date>{date_pattern})"
    )

    dates_match = re.search(dates_pattern, text)
    if not dates_match:
        raise InvalidMeterDataException(
            f"Couldn't parse dates from pdf: {text}")

    _dates = dates_match.group("read_date_start", "read_date_end", "statement_date")
    start_date, end_date, statement_date = [
        parse_date(_date).date() for _date in _dates
    ]

    used_match = re.search(used_pattern, text)
    if not used_match:
        # Fixed: the "f" prefix used to sit inside the quotes, so the message
        # was emitted literally and the PDF text was never interpolated.
        raise InvalidMeterDataException(
            f"Couldn't parse usage from pdf: {text}")

    used_text = used_match.group("units_used")
    used = float(used_text.replace(",", "").replace("$", ""))

    cost_match = re.search(cost_pattern, text)
    if not cost_match:
        raise InvalidMeterDataException(
            f"Couldn't parse cost from pdf: {text}")

    cost_text = cost_match.group("water_charges")
    cost = float(cost_text.replace(",", "").replace("$", ""))

    if config.enabled("S3_BILL_UPLOAD"):
        # The hash uses the unadjusted end read date and 0 for peak.
        key = hash_bill(account_id, start_date, end_date, cost, 0, used)
        with open(pdf_filename, "rb") as pdf_data:
            attachments = [
                upload_bill_to_s3(
                    BytesIO(pdf_data.read()),
                    key,
                    source="customerconnect.poway.org",
                    statement=statement_date,
                    utility="utility:city-of-poway",
                    utility_account_id=account_id,
                )
            ]
    else:
        attachments = []

    return BillingDatum(
        start=start_date,
        end=end_date - timedelta(days=1),
        statement=statement_date,
        cost=cost,
        peak=None,
        used=used,
        items=None,
        attachments=attachments,
        utility_code=None,
    )
def download_bills(
    self,
    latest: date,
    utility_account: str,
    utility: str,
    gen_utility: Optional[str] = None,
    gen_utility_account_id: Optional[str] = None,
) -> List[BillPdf]:
    """Download and upload bill PDFs newer than ``latest``.

    Opens the billing history widget, expands it, and walks every panel.
    Payment panels are skipped. For each bill panel the issue date and cost
    are read from the page, the PDF is downloaded into
    ``<config.WORKING_DIRECTORY>/current`` and then passed to
    ``upload_bill_to_s3``.

    Args:
        latest: bills whose approximate end date (issue date minus 7 days)
            is on or before this date are skipped.
        utility_account: utility account id recorded on the upload and on
            each returned BillPdf.
        utility: utility identifier forwarded to ``upload_bill_to_s3``.
        gen_utility: optional generation utility identifier forwarded to
            ``upload_bill_to_s3``.
        gen_utility_account_id: optional generation account id forwarded to
            ``upload_bill_to_s3`` and recorded on each returned BillPdf.

    Returns:
        One BillPdf per bill that was downloaded and uploaded. Bills whose
        link click is intercepted or whose download times out are skipped.
    """
    pdfs: List[BillPdf] = []

    log.info("Opening billing history")
    click(self._driver, css_selector="#arrowBillPaymentHistory")
    self.wait_until_ready(self.BillingHistoryTableSel)
    self._driver.screenshot(
        BaseWebScraper.screenshot_path("bill history arrow"))
    wait_for_block_overlay(self._driver)

    log.info("Clicking 'view up to..' link")
    click(self._driver, css_selector=self.ViewMoreHistorySel)
    self.wait_until_ready(self.BillingHistoryTableSel)
    self._driver.screenshot(BaseWebScraper.screenshot_path("panels"))

    panels_count = len(
        self._driver.find_elements_by_css_selector(self.PanelxSel))
    log.info(f"found {panels_count} panels in billing widget")

    # Rather than get all matching elements and iterate through, use index
    # and manually get element each time to help avoid stale element errors
    for i in range(panels_count):
        panel = self._driver.find_elements_by_css_selector(
            self.PanelxSel)[i]

        # check if is a payment panel
        panel_header = panel.find_element_by_css_selector(".panel-title")
        header_text = panel_header.text
        if "Payment" in header_text:
            log.debug(f"Skipping panel {i} (payment)")
            # skip if is a payment panel
            continue

        log.debug(f"Processing panel {i} (bill): {header_text}")
        link_elem = panel.find_element_by_css_selector(
            "div.pge_coc-dashboard-viewPay_billed_history_panel_viewBill_para_block"
            " a.viewBill")

        # Get date from the "data-date" attribute on link to download bill...
        # data-date is in milliseconds
        timestamp = int(link_elem.get_attribute("data-date")) / 1000.0
        # when bill was issued
        bill_date = datetime.fromtimestamp(timestamp).date()
        # bill issued about a week after end date; use this window to match dates
        approx_bill_end = bill_date - timedelta(days=7)
        approx_bill_start = approx_bill_end - timedelta(days=30)
        log.debug(f"bill date={bill_date}")

        # cost is in second column
        cost_text = panel.find_element_by_css_selector(
            "td.text-right").text
        log.debug(f"cost text={cost_text}")
        # cost with $ and commas: $1,234.56 or -$1,234.56
        cost = float(cost_text.replace("$", "").replace(",", ""))

        log.info(f"Found bill issued {bill_date} with cost ${cost}")

        if approx_bill_end <= latest:
            log.info(
                f"ignoring bill, date: {approx_bill_end} already download")
            continue

        try:
            click(self._driver, elem=link_elem)
        except ElementNotInteractableException:
            # Fall back to the link in the bill summary container.
            log.info("Download link not visible; looking for other")
            link_elem = panel.find_element_by_css_selector(
                "div#billSummaryContainer a.viewBill")
            click(self._driver, elem=link_elem)
        except ElementClickInterceptedException as exc:
            log.info("download link failed: %s %s", exc, exc.msg)
            close_modal(self._driver)
            continue

        # Expected download name is built from characters 6-9 of the first
        # "-"-separated part of the account id plus the issue date.
        last4 = self.account_id.split("-")[0][6:10]
        filename = f"{last4}custbill{bill_date.strftime('%m%d%Y')}.pdf"
        download_dir = "%s/current" % config.WORKING_DIRECTORY

        try:
            self._driver.wait(60).until(
                file_exists_in_dir(
                    # end pattern with $ to prevent matching filename.crdownload
                    directory=download_dir,
                    pattern=f"^{filename}$",
                ))
        except TimeoutException:
            log.error(
                f"ERROR waiting for file {filename} to download...skipping"
            )
            # close the download failed modal if there is one
            close_modal(self._driver)
            continue

        with open("%s/%s" % (download_dir, filename), "rb") as f:
            # Usage and peak are not known at this point, so empty strings
            # are hashed in their place.
            key = hash_bill(self.account_id, approx_bill_start,
                            approx_bill_end, cost, "", "")
            upload_bill_to_s3(
                file_handle=f,
                key=key,
                source="pge.com",
                statement=bill_date,
                utility=utility,
                utility_account_id=utility_account,
                gen_utility=gen_utility,
                gen_utility_account_id=gen_utility_account_id,
            )

        log.info(f"Uploaded {filename} to {key}")
        pdfs.append(
            BillPdf(
                utility_account_id=utility_account,
                # Fixed: this used to receive gen_utility (the utility code)
                # instead of the generation account id.
                gen_utility_account_id=gen_utility_account_id,
                start=approx_bill_start,
                end=approx_bill_end,
                statement=bill_date,
                s3_key=key,
            ))

    return pdfs
def _execute(self):
    """Log in, download bill PDFs, then parse and upload them.

    Drives the browser through login, bill history, captcha and account
    selection, then asks the bill history page to download bills for
    ``self.start_date`` .. ``self.end_date``. Every PDF found in
    ``<config.WORKING_DIRECTORY>/current`` afterwards is parsed with
    ``parse_pdf`` and passed to ``upload_bill_to_s3``.

    Returns:
        Results whose ``bills`` holds the parsed bills; when two bills share
        a start date, the later-processed one replaces the earlier one and
        keeps the earlier one's attachments as well.

    Raises:
        Exception: "too many sessions" or "captcha failed", each raised
            after logging out.
    """
    # Direct the driver to the login page
    self._driver.get(self.login_url)

    # Create page helpers
    login_page = LoginPage(self._driver)
    my_account_page = MyAccountPage(self._driver)
    bill_history_page = BillHistoryPage(self._driver)

    try:
        login_page.wait_until_ready()
    except Exception:
        self.screenshot("initial page load failed")
        # try one more time
        self._driver.get(self.login_url)
        login_page.wait_until_ready()

    login_page.login(self.username, self.password)
    self.screenshot("after login")

    my_account_page.wait_until_ready()
    my_account_page.navigate_to_bill_history()
    self.screenshot("bill history")

    if bill_history_page.too_many_sessions():
        # waiting 5 minutes doesn't seem to help
        bill_history_page.logout()
        raise Exception("too many sessions")

    bill_history_page.wait_until_ready()
    # NOTE(review): this screenshot is labelled "after captcha" but is taken
    # before solve_captcha() is called.
    self.screenshot("after captcha")
    if not bill_history_page.solve_captcha():
        bill_history_page.logout()
        raise Exception("captcha failed")

    bill_history_page.wait_until_bills_ready()
    bill_history_page.select_account(
        self._configuration.utility_account_id, self._configuration.account_name)
    bill_history_page.wait_until_bills_ready()
    bill_history_page.download_bills(self.start_date, self.end_date)
    bill_history_page.logout()

    # get bills from download directory and parse
    bills: List[BillingDatum] = []
    prefix = f"{config.WORKING_DIRECTORY}/current"

    log.info("Waiting for downloads to finish")
    # Poll once a second until no partial (.pdf.crdownload) file remains;
    # there is no timeout on this loop.
    while any(".pdf.crdownload" in f for f in os.listdir(prefix)):
        # Wait for downloads to finish
        time.sleep(1)
        continue

    # Start dates of the bills collected so far, used to detect duplicates.
    start_dates: Set[date] = set()

    for filename in sorted(os.listdir(prefix)):
        if ".pdf" not in filename:
            continue

        log.info("parsing file %s" % filename)
        parsed_bills = parse_pdf(f"{prefix}/{filename}", self.meter_number, self.commodity)
        log.info(f"filename {filename} bills={parsed_bills}")

        if not parsed_bills:
            log.warning(f"no billing datum: filename={filename}")
            continue

        # The PDF is uploaded once per file; its key is derived from the
        # first parsed bill only.
        with open(prefix + "/" + filename, "rb") as pdf_data:
            bill = parsed_bills[0]
            key = hash_bill(
                self._configuration.utility_account_id,
                bill.start,
                bill.end,
                bill.cost,
                bill.peak,
                bill.used,
            )
            attachment_entry = upload_bill_to_s3(
                BytesIO(pdf_data.read()),
                key,
                source="www.ladwp.com",
                statement=bill.end,
                utility="utility:ladwp",
                utility_account_id=self._configuration.utility_account_id,
            )

        # Every bill parsed from this file gets the same attachment entry.
        for bill in parsed_bills:
            attachments = [attachment_entry]
            if bill.start in start_dates:
                # if we already have a bill with this start date, replace it
                prev_bill = [b for b in bills if b.start == bill.start][0]
                log.info(
                    "duplicate bill start: prev_bill = %s, bill = %s",
                    prev_bill,
                    bill,
                )
                bills.remove(prev_bill)
                # copy the attachment
                attachments += prev_bill.attachments
            bills.append(bill._replace(attachments=attachments))
            start_dates.add(bill.start)

    return Results(bills=bills)