def download_as_csv(self) -> str:
    """Export the chart data as CSV and return the path of the renamed file.

    Opens the chart's export menu (``g.highcharts-exporting-group``), clicks
    "Download as CSV Spreadsheet", waits for a ``demand_heatmap_*.csv`` file
    to show up in the driver's download directory, and renames it so a later
    download cannot be confused with this one.

    Returns:
        Path of the renamed CSV file inside the download directory.

    Raises:
        Exception: if waiting for the downloaded file fails for any reason;
            the underlying error is chained as the cause.
    """
    download_as_csv_sel = '//*[text()="Download as CSV Spreadsheet"]'
    # click the download options button on top right of chart
    self.find_element("g.highcharts-exporting-group").click()
    # click Download as CSV
    self._driver.find_or_raise(download_as_csv_sel, xpath=True).click()

    log.info("Waiting for file to Download")
    try:
        filename = self._driver.wait(30).until(
            file_exists_in_dir(
                directory=self._driver.download_dir,
                pattern=r"^demand_heatmap_.+?\.csv$",
            ))
    except Exception as exc:
        # Keep the original failure (typically a timeout) attached as the cause.
        raise Exception("Unable to download file...") from exc
    log.info(f"Download complete: {filename}")

    filepath = os.path.join(self._driver.download_dir, filename)
    # rename the file to avoid matching the wrong file in future
    # NOTE(review): `date` is neither a parameter nor a local here, so this
    # formats whatever module-level `date` resolves to -- confirm that is the
    # intended value for the filename prefix.
    new_filename = f"{date}_demand_heatmap.csv"
    new_filepath = os.path.join(self._driver.download_dir, new_filename)
    os.rename(filepath, new_filepath)
    return new_filepath
def download_csv(self) -> str:
    """Click the "Download CSV Data" element and return the downloaded file's path."""
    button = self.driver.find_element_by_xpath(
        "//div[contains(text(), 'Download CSV Data')]")
    button.click()

    target_dir = self.driver.download_dir
    # Block until a file ending in .csv is present in the download directory.
    csv_name = self.driver.wait(60).until(
        file_exists_in_dir(target_dir, r".*\.csv$"))
    return os.path.join(target_dir, csv_name)
def download_file(self, extension: str, timeout: Optional[int] = 60):
    """Wait for a file with the given extension to appear and return its path.

    Args:
        extension: file extension without the leading dot (for example
            ``"csv"``). It is inserted into a regular expression as-is.
        timeout: timeout handed to ``WebDriverWait``.

    Returns:
        Path of the matching file inside the driver's download directory.
    """
    wait = WebDriverWait(self._driver, timeout)
    download_dir = self._driver.download_dir
    # Anchor the pattern at the end of the name so that a partially
    # downloaded file such as "report.csv.crdownload" is not accepted as the
    # finished download.
    filename = wait.until(
        file_exists_in_dir(download_dir, r".*\.{}$".format(extension)))
    return os.path.join(download_dir, filename)
def wait_for_download(driver, timeout=60):
    """Block until the raw report file has been downloaded; return its path."""
    target_dir = driver.download_dir
    # Match the report name exactly (anchored at both ends).
    report_pattern = r"^%s$" % RAW_REPORT_NAME
    report_name = WebDriverWait(driver, timeout).until(
        file_exists_in_dir(target_dir, report_pattern))
    return os.path.join(target_dir, report_name)
def _download_zip_file(self, start_date, end_date):
    """Fill in the date range, click Export and wait for the zip download.

    Returns the downloaded file's name with its last four characters (the
    ".zip" extension) removed.
    """
    date_format = "%m/%d/%Y"

    # These input fields come pre-filled and 'clear()' does not work, so the
    # existing text is erased one BACKSPACE at a time.
    for field_id in ("FromDate", "ToDate"):
        for _ in range(10):
            self._driver.find_element_by_id(field_id).send_keys(Keys.BACKSPACE)
    time.sleep(1)

    from_field = self._driver.find_element_by_id("FromDate")
    from_field.send_keys(start_date.strftime(date_format))
    to_field = self._driver.find_element_by_id("ToDate")
    to_field.send_keys(end_date.strftime(date_format))

    # If the scraper tries to go farther back than allowed, an error message
    # is displayed. Clicking the calendar button will then set it to the
    # earliest possible good date.
    try:
        too_early_notice = self._driver.find_element_by_xpath(
            "//p[contains(text(), 'Date should not be before minimal date')]")
        if too_early_notice.is_displayed():
            log.warning(
                "Attempted start date %s before data exists",
                start_date.strftime(date_format),
            )
            # The start and end calendar buttons have this same identifier,
            # so we're assuming the 'before' is found first
            self._driver.find_element_by_xpath(
                "//button[@arialabel='change date']").click()
    except NoSuchElementException:
        pass

    self._driver.find("//span[contains(text(), 'Export')]", xpath=True).click()
    download_dir = self._driver.download_dir

    def unpadded(day):
        # strftime pads with zeros, which doesn't match the site's file names
        return "-".join(
            [str(day.month), str(day.day), str(day.year)[2:]])

    # Filename example: SoCalGas_Gas_60_Minute_7-7-19_8-7-20.zip
    expected_filename = "".join([
        "SoCalGas_Gas_60_Minute_",
        unpadded(start_date),
        "_",
        unpadded(end_date),
        ".zip",
    ])
    log.info("\t\tDownloading file {}".format(expected_filename))
    download_name = self._driver.wait(300).until(
        file_exists_in_dir(download_dir, expected_filename))
    log.info("\t\tFile downloaded")

    # Truncate file extension
    return str(download_name)[:-4]
def wait_for_download(driver, timeout=60):
    """Block until a csv file is present in the download directory.

    Returns the path of the file that matched.
    """
    target_dir = driver.download_dir
    csv_name = WebDriverWait(driver, timeout).until(
        file_exists_in_dir(target_dir, r".*\.csv$"))
    return os.path.join(target_dir, csv_name)
def download_csv(self) -> str:
    """Click the chart's CSV download control and return the downloaded file's path."""
    # The control is in an svg and the div id seems to change, so it is
    # addressed by position beneath the element whose id starts with
    # "highcharts-".
    button_locator = (
        By.XPATH,
        "//*[name() = 'div' and starts-with(@id, 'highcharts-')]"
        "/*[name() = 'div'][3]/*[name() = 'div']/*[name() = 'div'][2]",
    )
    clickable_button = self.driver.wait().until(
        EC.element_to_be_clickable(button_locator))
    clickable_button.click()

    target_dir = self.driver.download_dir
    csv_name = self.driver.wait(60).until(
        file_exists_in_dir(target_dir, r".*\.csv$"))
    return os.path.join(target_dir, csv_name)
def download_pdf_for_billing_row(self, billing_row: sce_pages.BillingDataRow):
    """Download the bill PDF for one billing row.

    Returns the path of the downloaded PDF, or None when the site shows its
    error dialog instead of delivering a file.
    """
    error_indicator = "BILL_NOT_AVAILABLE"
    error_dialog_locator = (
        By.XPATH,
        "//react-energy-manager//div[contains(@class, 'ServiceAcctBillList__dialogboxError')]",
    )

    def download_error_page_visible(driver):
        # Wait condition: yields the sentinel once the error dialog is in the
        # DOM, and None (keep waiting) otherwise.
        if driver.find_elements(*error_dialog_locator):
            return error_indicator
        return None

    billing_row.selector.click()
    time.sleep(2)
    self._driver.find_element_by_id("viewBill").click()

    download_dir = self._driver.download_dir
    # Whichever happens first wins: the error dialog (outcome is the
    # sentinel) or a .pdf showing up (outcome is the file name).
    outcome = WebDriverWait(self._driver, 120).until(
        ec_or(
            download_error_page_visible,
            file_exists_in_dir(download_dir, r".*\.pdf$"),
        )
    )
    if outcome != error_indicator:
        return os.path.join(download_dir, outcome)

    # The error dialog has to be dismissed before continuing.
    close_button_locator = (
        By.XPATH,
        "//react-energy-manager//button[contains(@class, 'sceDialogBox__crossButtonDialogBox')]",
    )
    self._driver.find_element(*close_button_locator).click()
    time.sleep(5)
    return None
def _export_data(self) -> str:
    """Click the export button inside the nested iframes and return the CSV path."""
    log.info("Exporting data.")
    export_locator = (By.XPATH, self.export_data_xpath)

    # The export button lives in "frame3", which is nested in "childFrame".
    with IFrameSwitch(self._driver, "childFrame"), IFrameSwitch(
            self._driver, "frame3"):
        self._driver.wait().until(EC.element_to_be_clickable(export_locator))
        self._driver.sleep(2)
        button = self._driver.find_element_by_xpath(self.export_data_xpath)
        # Click through JavaScript rather than a native WebDriver click.
        self._driver.execute_script("arguments[0].click();", button)

    # Wait for csv to download
    target_dir = self._driver.download_dir
    csv_name = self._driver.wait(60).until(
        file_exists_in_dir(target_dir, r".*\.csv$"))
    return os.path.join(target_dir, csv_name)
def download_data(self, meter_number: str) -> str:
    """Download interval data and return the path of the renamed CSV.

    Clicks the Download Data button, waits for
    ``<config.WORKING_DIRECTORY>/current/15_minute_download.csv`` to appear,
    then renames it to ``<config.WORKING_DIRECTORY>/current/{meter_number}.csv``.

    Args:
        meter_number: used as the base name of the renamed file.

    Returns:
        The path of the renamed csv file.

    Raises:
        Exception: if waiting for the download fails; the underlying error is
            chained as the cause.
    """
    import re  # local import: only needed to escape the file name below

    # wait for the download button to be ready
    self.wait_until_ready(self.DownloadBtnSel)

    log.info("Beginning download...")
    self.find_element(self.DownloadBtnSel).click()

    # download filename is always 15_minute_download.csv for 15 minute intervals
    filename = "15_minute_download.csv"
    download_dir = "%s/current" % config.WORKING_DIRECTORY

    try:
        self._driver.wait(30).until(
            file_exists_in_dir(
                # end pattern with $ to prevent matching
                # filename.crdownload; escape the name so "." only matches a
                # literal dot
                directory=download_dir,
                pattern=f"^{re.escape(filename)}$",
            ))
    except Exception as exc:
        raise Exception("Unable to download file...") from exc

    log.info("Download complete")

    csv_file_path = os.path.join(download_dir, meter_number + ".csv")
    # rename downloaded filename to {meter_number}.csv for
    # avoiding filename conflict in case of multiple accounts
    os.rename(os.path.join(download_dir, filename), csv_file_path)
    return csv_file_path
def wait_for_bill_download(self, bill_date):
    """Wait for document.pdf to download, then rename it after the bill date.

    Args:
        bill_date: value formatted into the new file name
            (``{bill_date}_bill.pdf``).

    Returns:
        Path of the renamed file inside ``<config.WORKING_DIRECTORY>/current``.

    Raises:
        Exception: if waiting for the download fails; the underlying error is
            chained as the cause.
    """
    download_dir = config.WORKING_DIRECTORY + "/current"
    try:
        filename = self.driver.wait(30).until(
            file_exists_in_dir(
                directory=download_dir,
                # Escape the dot so only the literal name "document.pdf" matches.
                pattern=r"^document\.pdf$",
            ))
    except Exception as exc:
        raise Exception("Unable to download file...") from exc

    curr_filepath = os.path.join(download_dir, filename)
    # rename the file to avoid matching the wrong file in future
    filepath = os.path.join(download_dir, f"{bill_date}_bill.pdf")
    os.rename(curr_filepath, filepath)
    return filepath
def download_pdfs(self, start_date: date, end_date: date):
    """Download the eBill PDF of every table row dated strictly between the two dates.

    Each downloaded file is moved into a "downloaded" sub-directory of the
    driver's download directory right after it arrives.
    """
    download_dir = self._driver.download_dir
    archive_dir = os.path.join(download_dir, "downloaded")
    os.makedirs(archive_dir, exist_ok=True)

    table = self.find_element(self.DataTableSelector)
    for row in table.find_elements_by_css_selector("tbody > tr"):
        date_cell = row.find_element_by_css_selector(
            'td[data-title="Bill Date"]')
        bill_date = parse_date(date_cell.text).date()
        # Both bounds are exclusive.
        if not (start_date < bill_date < end_date):
            continue

        row.find_element_by_css_selector(
            'td[data-title="View eBill"] > a').click()
        pdf_name = self._driver.wait().until(
            file_exists_in_dir(download_dir, r".*\.pdf$"))

        archived_path = os.path.join(archive_dir, pdf_name)
        os.rename(os.path.join(download_dir, pdf_name), archived_path)
        log.info(f"file downloaded: {archived_path}")
def download_bills(
    self,
    latest: date,
    utility_account: str,
    utility: str,
    gen_utility: Optional[str] = None,
    gen_utility_account_id: Optional[str] = None,
) -> List[BillPdf]:
    """Download and upload bill PDFs newer than ``latest``.

    Opens the billing history widget, walks its panels, and for every bill
    panel whose approximate end date is after ``latest`` clicks the download
    link, waits for the PDF, and uploads it with ``upload_bill_to_s3``.

    Args:
        latest: bills whose approximate end date is on or before this date
            are skipped.
        utility_account: utility account id recorded with each upload.
        utility: utility identifier recorded with each upload.
        gen_utility: optional generation utility identifier.
        gen_utility_account_id: optional generation utility account id.

    Returns:
        One ``BillPdf`` per bill that was downloaded and uploaded.
    """
    pdfs: List[BillPdf] = []

    log.info("Opening billing history")
    click(self._driver, css_selector="#arrowBillPaymentHistory")
    self.wait_until_ready(self.BillingHistoryTableSel)
    self._driver.screenshot(
        BaseWebScraper.screenshot_path("bill history arrow"))
    wait_for_block_overlay(self._driver)

    log.info("Clicking 'view up to..' link")
    click(self._driver, css_selector=self.ViewMoreHistorySel)
    self.wait_until_ready(self.BillingHistoryTableSel)
    self._driver.screenshot(BaseWebScraper.screenshot_path("panels"))

    panels_count = len(
        self._driver.find_elements_by_css_selector(self.PanelxSel))
    log.info(f"found {panels_count} panels in billing widget")

    # Rather than get all matching elements and iterate through, use index
    # and manually get element each time to help avoid stale element errors
    for i in range(0, panels_count):
        panel = self._driver.find_elements_by_css_selector(
            self.PanelxSel)[i]

        # check if is a payment panel
        panel_header = panel.find_element_by_css_selector(".panel-title")
        header_text = panel_header.text
        if "Payment" in header_text:
            log.debug(f"Skipping panel {i} (payment)")
            # skip if is a payment panel
            continue

        log.debug(f"Processing panel {i} (bill): {header_text}")
        link_elem = panel.find_element_by_css_selector(
            "div.pge_coc-dashboard-viewPay_billed_history_panel_viewBill_para_block"
            " a.viewBill")

        # Get date from the "data-date" attribute on link to download bill...
        # data-date is in milliseconds
        timestamp = int(link_elem.get_attribute("data-date")) / 1000.0
        # when bill was issued
        bill_date = datetime.fromtimestamp(timestamp).date()
        # bill issued about a week after end date; use this window to match dates
        approx_bill_end = bill_date - timedelta(days=7)
        approx_bill_start = approx_bill_end - timedelta(days=30)
        log.debug(f"bill date={bill_date}")

        # cost is in second column
        cost_text = panel.find_element_by_css_selector(
            "td.text-right").text
        log.debug(f"cost text={cost_text}")
        # cost with $ and commas: $1,234.56 or -$1,234.56
        cost = float(cost_text.replace("$", "").replace(",", ""))
        log.info(f"Found bill issued {bill_date} with cost ${cost}")

        if approx_bill_end <= latest:
            log.info(
                f"ignoring bill, date: {approx_bill_end} already download")
            continue

        try:
            click(self._driver, elem=link_elem)
        except ElementNotInteractableException:
            log.info("Download link not visible; looking for other")
            link_elem = panel.find_element_by_css_selector(
                "div#billSummaryContainer a.viewBill")
            click(self._driver, elem=link_elem)
        except ElementClickInterceptedException as exc:
            log.info("download link failed: %s %s", exc, exc.msg)
            close_modal(self._driver)
            continue

        # Expected download name: four characters of the account id, then
        # "custbill", then the bill date as MMDDYYYY.
        last4 = self.account_id.split("-")[0][6:10]
        filename = f"{last4}custbill{bill_date.strftime('%m%d%Y')}.pdf"
        download_dir = "%s/current" % config.WORKING_DIRECTORY

        try:
            self._driver.wait(60).until(
                file_exists_in_dir(
                    # end pattern with $ to prevent matching filename.crdownload
                    directory=download_dir,
                    pattern=f"^{filename}$",
                ))
        except TimeoutException:
            log.error(
                f"ERROR waiting for file {filename} to download...skipping"
            )
            # close the download failed modal if there is one
            close_modal(self._driver)
            continue

        with open("%s/%s" % (download_dir, filename), "rb") as f:
            key = hash_bill(self.account_id, approx_bill_start,
                            approx_bill_end, cost, "", "")
            upload_bill_to_s3(
                file_handle=f,
                key=key,
                source="pge.com",
                statement=bill_date,
                utility=utility,
                utility_account_id=utility_account,
                gen_utility=gen_utility,
                gen_utility_account_id=gen_utility_account_id,
            )

        log.info(f"Uploaded {filename} to {key}")
        pdfs.append(
            BillPdf(
                utility_account_id=utility_account,
                # Fixed: this previously received `gen_utility` (the utility
                # name) instead of the generation account id.
                gen_utility_account_id=gen_utility_account_id,
                start=approx_bill_start,
                end=approx_bill_end,
                statement=bill_date,
                s3_key=key,
            ))

    return pdfs
def get_bills(self, utility: str,
              utility_account_id: str) -> List[BillingDatum]:
    """Scrape each bill in the configured date range and upload its PDF.

    For every bill link whose date lies within ``self.start_date`` ..
    ``self.end_date`` this opens the bill view, reads the service period,
    cost, kWh usage and peak kW from the page, prints the bill image to a PDF,
    uploads that PDF, and returns to the bill list.

    Args:
        utility: utility identifier passed to the upload.
        utility_account_id: account id used for the upload and the bill hash.

    Returns:
        One ``BillingDatum`` per bill processed, oldest link first.

    Raises:
        Exception: if the printed PDF does not appear in ``self.download_dir``.
    """
    billing_data = []

    available_dates = self.driver.find_elements(
        By.CSS_SELECTOR, "table.table-alt a.bill-view-link")
    available_dates = [parse_date(i.text).date() for i in available_dates]
    log.info("available dates: %s",
             [dt.strftime("%Y-%m-%d") for dt in available_dates])

    xpath_locators = {
        # third cell of the rows in the table that contains 'NEW CHARGES'
        "cost": "//table[contains(., 'NEW CHARGES')]/tbody/tr/td[3]",
        # cells mentioning KWH in the USAGE table
        "used": "(//table[contains(.,'USAGE')]//tr/td[contains(., 'KWH')])",
        # cells mentioning KW (but not KWH) in the USAGE table
        "usage_kw": "//table[contains(.,'USAGE')]//tr/td[contains(.,'KW') and not(contains(.,'KWH'))]",
    }

    # loop through dates in table in ascending order
    for pdf_date in reversed(available_dates):
        # skip if the date isn't in the specified range
        if not (self.start_date <= pdf_date <= self.end_date):
            log.debug("skipping date outside range: %s", pdf_date)
            continue

        view_bill_link = self.driver.find_element_by_xpath(
            '//a[.="%s"]' % pdf_date.strftime("%m/%d/%Y"))
        scroll_to(self.driver, view_bill_link)
        self.driver.sleep(0.5)
        view_bill_link.click()

        self.driver.wait(30).until(
            EC.visibility_of_element_located(
                (By.CSS_SELECTOR, "div.billImage")))

        start_date = None
        end_date = None
        cost = None
        used = None
        peak = None

        dates_line_text: str = self.driver.find_element_by_xpath(
            "//td[contains(., 'Service From:')]").text
        dates_match = re.search(
            r"Service From: (?P<from>\w+ \d\d) to (?P<to>\w+ \d\d) \(\d\d Days\)",
            dates_line_text,
        )
        if dates_match:
            # if from month is December, use previous year
            year = (pdf_date.year - 1
                    if "dec" in dates_match.group("from").lower() else
                    pdf_date.year)
            start_date = parse_date("%s %s" %
                                    (dates_match.group("from"), year))
            end_date = parse_date(
                dates_match.group("to") + pdf_date.strftime(" %Y"))

        cost_match = self.driver.find(xpath_locators["cost"], xpath=True)
        if cost_match:
            cost = cost_match.text
            cost = float(cost.replace("$", "").replace(",", ""))

        kwh_usages = []
        for match in self.driver.find_all(xpath_locators["used"],
                                          xpath=True):
            # include only if it has a reading values as siblings; exclude credit line items
            parent = match.find_element_by_xpath("..")
            # meter number, previous reading, current reading
            readings_text = ""
            for idx, child in enumerate(
                    parent.find_elements_by_xpath(".//td")):
                log.debug("\t%s\t%s", idx, child.text.strip())
                readings_text += child.text.strip()
                if idx == 2:
                    break
            if not readings_text:
                log.info("skipping non-reading line item: %s", parent.text)
                continue
            kwh_value = float(
                match.text.replace("KWH", "").replace(",", "").strip())
            kwh_usages.append(kwh_value)
        if kwh_usages:
            used = sum(kwh_usages)

        kw_usages = []
        for usage_kw_match in self.driver.find_all(
                xpath_locators["usage_kw"], xpath=True):
            kw_usages.append(
                float(
                    usage_kw_match.text.replace("KW",
                                                "").replace(",",
                                                            "").strip()))
        if kw_usages:
            peak = max(kw_usages)

        # NOTE(review): if the "Service From" line did not match above,
        # end_date is still None here and the subtraction raises TypeError.
        data = BillingDatum(
            start=start_date,
            end=end_date - timedelta(days=1),
            statement=end_date - timedelta(days=1),
            cost=cost,
            peak=peak,
            used=used,
            items=None,
            attachments=None,
            utility_code=None,
        )

        self.driver.find("a#billImageToPrint").click()
        self.driver.sleep(1)
        self.driver.switch_to.window(self.driver.window_handles[-1])
        # the filename of the printed pdf is f"{current page title}.pdf"
        self.driver.execute_script("window.print();")
        try:
            # Actually wait for the printed PDF: building the condition
            # without passing it to a wait never checks the directory.
            self.driver.wait(30).until(
                file_exists_in_dir(
                    directory=self.download_dir,
                    pattern=r"^Bill View Bill Image\.pdf$",
                ))
        except Exception as exc:
            raise Exception("Unable to download file for %s" %
                            pdf_date) from exc

        curr_path = os.path.join(self.download_dir,
                                 "Bill View Bill Image.pdf")
        new_path = os.path.join(
            self.download_dir, f"bill_{pdf_date.strftime('%Y-%m-%d')}.pdf")
        os.rename(curr_path, new_path)

        log.info("parsed bill for %s - %s", data.start, data.end)

        self.driver.find("a#close").click()
        self.driver.sleep(1)
        self.driver.switch_to.window(self.driver.window_handles[-1])
        self.driver.sleep(1)

        # upload PDF:
        key = hash_bill(
            utility_account_id,
            data.start,
            data.end,
            data.cost,
            data.peak,
            data.used,
        )
        with open(new_path, "rb") as pdf_data:
            attachment_entry = upload_bill_to_s3(
                BytesIO(pdf_data.read()),
                key,
                source="www.duke-energy.com",
                statement=data.end,
                utility=utility,
                utility_account_id=utility_account_id,
            )
        if attachment_entry:
            data = data._replace(attachments=[attachment_entry])

        billing_data.append(data)

        # Click Bill Information in breadcrumbs to go back to bills list page
        self.driver.find("a#billInformation").click()

    return billing_data
def energy_manager_basic_usage_action(
        self, page: sce_pages.SceEnergyManagerBasicUsagePage):
    """Download interval data week by week and store it on the scraper.

    Selects the configured service id, then for each 7-day slice of
    ``self.start_date`` .. ``self.end_date`` generates the report, downloads
    its CSV and inserts the parsed readings into a ``Timeline``. The result
    is stored in ``self.interval_data_timeline``.

    Raises:
        sce_errors.EnergyManagerReportException: if generating or
            downloading a report fails.
        TimeoutException: if the CSV does not appear in the download
            directory in time.
    """
    sce_pages.detect_and_close_survey(self._driver)
    rval = page.select_service_id(self.service_id)
    log.info("Result of select service id %s: %s", self.service_id, rval)
    self.screenshot("select_service_id")
    page.configure_report()

    date_range = DateRange(self.start_date, self.end_date)
    # the website seems to time out when trying to get more than this amount of data
    interval_size = relativedelta(days=7)
    timeline = Timeline(self.start_date, self.end_date)

    for idx, subrange in enumerate(
            date_range.split_iter(delta=interval_size)):
        log.info("Requesting interval data for dates: %s", subrange)
        start = subrange.start_date
        end = subrange.end_date

        page.set_time_range(start, end)
        self.screenshot("set_time_range")

        try:
            page.generate_report()
            time.sleep(5)
            WebDriverWait(self._driver, 180).until(
                EC.invisibility_of_element_located(
                    sce_pages.GenericBusyIndicatorLocator))
            self.screenshot(f"interval{idx}")
        except Exception as e:
            raise sce_errors.EnergyManagerReportException(
                "Failed to load data from Energy Manager") from e

        try:
            page.raise_on_report_error()
        except sce_errors.EnergyManagerDataNotFoundException:
            log.info("No data found for this time range, continuing...")
            # If a given date range has no interval data, just move on to the next one
            continue

        log.info("Downloading the interval data report.")
        self.clear_csv_downloads()

        try:
            page.download_report()
        except Exception as e:
            raise sce_errors.EnergyManagerReportException(
                "Failed to load data from Energy Manager") from e

        try:
            # Wait two minutes for the download to finish
            wait = WebDriverWait(self._driver, 120)
            # Anchor with $ so an in-progress "*.csv.crdownload" file is not
            # picked up and parsed before the download completes.
            csv_file_name = wait.until(
                file_exists_in_dir(self._driver.download_dir, r".*\.csv$"))
            csv_file_path = os.path.join(self._driver.download_dir,
                                         csv_file_name)
            for reading in parse_sce_csv_file(csv_file_path,
                                              self.service_id):
                timeline.insert(reading.dt, reading.value)
        except TimeoutException as exc:
            raise TimeoutException(
                "Downloading interval data from Energy Manager failed."
            ) from exc

    self.interval_data_timeline = timeline
def get_bills(self, account_id: str, start: date,
              end: date) -> List[BillingDatum]:
    """Get bills from the usage table and download each bill's PDF.

    For each table row (hidden cells are ignored):
      - statement date comes from column 1
      - end date comes from column 2 (month/day) plus a year derived from
        the statement date
      - start date is end date - (days in column 7 - 1)
      - rows whose period does not overlap ``start`` .. ``end`` are skipped
      - peak comes from column 3, used is the sum of columns 4-6, cost comes
        from column 8
      - the link in column 0 is clicked and the PDF awaited in
        ``self.driver.download_dir``; it is uploaded when S3_BILL_UPLOAD is
        enabled.

    Returns:
        One ``BillingDatum`` per overlapping row, with ``attachments`` set
        when an upload produced an attachment entry.

    Raises:
        Exception: if a bill PDF cannot be downloaded; the underlying error
            is chained as the cause.
    """
    WebDriverWait(self.driver, 10).until(
        EC.presence_of_element_located(self.UsageTableBodyLocator))
    usage_table_rows = self.driver.find_elements(
        *self.UsageTableRowsLocator)

    bill_data: List[BillingDatum] = []
    self.driver.screenshot(BaseWebScraper.screenshot_path("bill table"))

    for row in usage_table_rows:
        cols = row.find_elements_by_tag_name("td")
        cols = [
            c for c in cols
            if "display: none" not in c.get_attribute("style")
        ]

        def col(index):
            """Return the text of the visible cell at ``index``."""
            return cols[index].text

        def to_float(index):
            """Parse the cell's digits and dots as a float; 0 when none."""
            digits = "".join(ch for ch in col(index)
                             if ch.isdigit() or ch == ".")
            return float(digits) if digits else 0

        log.debug(f"statement={col(1)} end={col(2)} days={col(7)}")

        # statement date
        statement_date = date_parser.parse(col(1)).date()

        # bill end: a January statement with a period ending in month 12
        # belongs to the previous year
        period_year = statement_date.year
        if statement_date.month == 1 and col(2).startswith("12"):
            period_year = statement_date.year - 1
        end_str = f"{col(2)}/{period_year}"
        bill_end = date_parser.parse(end_str).date()

        # bill start
        bill_start = bill_end - timedelta(days=int(to_float(7)) - 1)
        log.debug(f"start={bill_start} end={bill_end}")

        if not self._overlap(start, end, bill_start, bill_end):
            log.info(
                f"skipping bill {bill_start} - {bill_end}: does not overlap requested range {start} - {end}"
            )
            continue

        # cost
        new_charges = to_float(8)
        # used
        used = to_float(4) + to_float(5) + to_float(6)
        # peak
        peak = to_float(3)

        bill_datum = BillingDatum(
            start=bill_start,
            end=bill_end,
            statement=statement_date,
            cost=new_charges,
            used=used,
            peak=peak,
            items=None,
            attachments=None,
            utility_code=None,
        )

        # Built before the try so the error message below can always use it.
        bill_pdf_name = "SRPbill{}{}.pdf".format(
            statement_date.strftime("%B"), statement_date.year)
        try:
            pdf_download_link = cols[0].find_element_by_tag_name("a")
            scroll_to(self.driver, pdf_download_link)
            pdf_download_link.click()
            log.info("looking for %s in %s", bill_pdf_name,
                     self.driver.download_dir)
            self.driver.wait(60).until(
                file_exists_in_dir(self.driver.download_dir,
                                   bill_pdf_name))
        except Exception as e:
            raise Exception(
                f"Failed to download bill {bill_pdf_name} for statement date {statement_date}:\n {e}"
            ) from e

        log.info(
            f"Bill {bill_pdf_name} for statement date {statement_date} downloaded successfully"
        )

        attachment_entry = None
        # open downloaded PDF and upload
        if config.enabled("S3_BILL_UPLOAD"):
            key = hash_bill_datum(account_id, bill_datum)
            with open(f"{self.driver.download_dir}/{bill_pdf_name}",
                      "rb") as pdf_data:
                attachment_entry = upload_bill_to_s3(
                    BytesIO(pdf_data.read()),
                    key,
                    source="myaccount.srpnet.com",
                    statement=bill_datum.statement,
                    utility="utility:salt-river-project",
                    utility_account_id=account_id,
                )

        if attachment_entry:
            bill_data.append(
                bill_datum._replace(attachments=[attachment_entry]))
        else:
            bill_data.append(bill_datum)

    return bill_data
def export_csv(self, service_id, start: date, end: date) -> str:
    """Export CSV file and return path to downloaded file.

    Steps:
      - select the meter whose dropdown entry contains ``service_id``
      - open the export dropdown and click the export-CSV link
      - fill in the from date (mm/dd/yyyy)
      - read the date in the to-date field's placeholder and use the later
        of that date and ``end`` as the to date
      - click Download and wait for a ``.csv`` file

    Args:
        service_id: text identifying the meter in the dropdown.
        start: first date to request.
        end: last date to request (may be replaced, see above).

    Returns:
        Path of the downloaded csv file.

    Raises:
        DataSourceConfigurationError: if no dropdown entry matches
            ``service_id``.
    """
    self.driver.wait().until(
        EC.invisibility_of_element_located(
            (By.CSS_SELECTOR, "div.spinner-container")))
    self.driver.sleep(2)
    self.driver.find(self.MeterDropdownSelector).click()
    # wait for loading
    self.driver.sleep(5)

    meter_dropdown_selector = f'//table[@id="sdp_selector_table"]//a[contains(@class,"sdp-dropdown") and contains(.,"{service_id}")]'
    meter_id_dropdown_option = self.driver.find(meter_dropdown_selector,
                                                xpath=True)
    # Check for a missing option BEFORE touching the element; scrolling to a
    # missing element would fail before this error could be raised.
    if not meter_id_dropdown_option:
        raise DataSourceConfigurationError(
            f"No meter found with service_id: {service_id}")
    scroll_to(self.driver, meter_id_dropdown_option)
    meter_id_dropdown_option.click()

    self.driver.wait().until(
        EC.invisibility_of_element_located(
            (By.CSS_SELECTOR, "div.spinner-container")))
    self.driver.sleep(2)
    self.driver.find_element(*self.ExportCSVDropDownButtonLocator).click()
    self.driver.sleep(2)
    self.driver.find(self.ExportCSVLinkSelector).click()

    self.driver.wait().until(
        EC.visibility_of_element_located(
            (By.CSS_SELECTOR, self.FromDateInputSelector)))
    self.driver.sleep(2)

    from_date_input_field = self.driver.find(self.FromDateInputSelector)
    from_date_input_field.clear()
    from_date_input_field.send_keys(start.strftime("%m/%d/%Y"))

    to_date_input_field = self.driver.find(self.ToDateInputSelector)
    max_available_to_date = parse_date(
        to_date_input_field.get_attribute("placeholder")).date()
    end = max(max_available_to_date, end)
    to_date_input_field.clear()
    to_date_input_field.send_keys(end.strftime("%m/%d/%Y"))

    self.driver.find(self.DownloadButtonSelector).click()

    # Wait for csv to download
    download_dir = self.driver.download_dir
    filename = self.driver.wait().until(
        file_exists_in_dir(download_dir, r".*\.{}$".format("csv")))
    return os.path.join(download_dir, filename)
def handle_pdfs(
    self,
    service_id,
    start: date,
    end: date,
    utility,
    utility_account_id,
    first_page=False,
) -> List[BillingDatum]:
    """Download the "View Bill" PDFs on the current page and extract bill data.

    When ``first_page`` is true, the first link (the most recent bill) is
    handled separately: it opens a page with its own download button rather
    than downloading directly. The remaining links are clicked in order;
    processing stops at the first bill that ends before ``start``.

    Args:
        service_id: passed through to ``extract_bill_data``.
        start: earliest date of interest.
        end: latest date of interest.
        utility: passed through to ``extract_bill_data``.
        utility_account_id: passed through to ``extract_bill_data``.
        first_page: whether the page's first link is the special
            most-recent-bill link.

    Returns:
        The result of ``_adjust_bill_dates`` applied to the collected bills.
    """
    pdf_links_xpath = "//a[contains(text(), 'View Bill')]"
    download_dir = self.driver.download_dir
    bill_data: Optional[List[BillingDatum]]
    bill_data = []

    # The most recent bill link is a special case.
    # It does not download directly but opens a new page with a download link.
    first_link_found = False
    log.info("first_page is %s", first_page)
    if not first_page:
        first_link_found = True

    if not first_link_found:
        log.debug("looking for pdf_link_1 %s", pdf_links_xpath)
        pdf_link_1 = WebDriverWait(self.driver, 15).until(
            ec.presence_of_element_located((By.XPATH, pdf_links_xpath)))
        log.info("Downloading most recent bill; scroll to %s",
                 pdf_link_1.location["y"])
        self.driver.execute_script("window.scrollTo(0," +
                                   str(pdf_link_1.location["y"]) + ")")
        WebDriverWait(self.driver, 15).until(
            ec.element_to_be_clickable((By.XPATH, pdf_links_xpath)))
        self.driver.screenshot(
            BaseWebScraper.screenshot_path("most recent bill"))
        pdf_link_1.click()

        if (self.driver.current_url ==
                "https://portlandgeneral.com/secure/view-bill"):
            download_bill_button_xpath = (
                "//span[contains(text(), 'Download bill (PDF)')]")
            log.debug("scroll to scrollHeight/2")
            self.driver.execute_script(
                "window.scrollTo(0, window.scrollY+(document.body.scrollHeight/2))"
            )
            time.sleep(2)
            log.debug("looking for download button %s",
                      download_bill_button_xpath)
            download_bill_button = WebDriverWait(self.driver, 25).until(
                ec.presence_of_element_located(
                    (By.XPATH, download_bill_button_xpath)))
            try:
                log.debug("clicking download")
                download_bill_button.click()
                # div[role="alert"] with text No bill found.
            except ElementClickInterceptedException as exc:
                log.debug("click intercepted: %s", exc)
                close_modal(self.driver)
                download_bill_button.click()
            time.sleep(1)

            filename = self.driver.wait(60).until(
                file_exists_in_dir(download_dir, r".*\.pdf$"))
            file_path = os.path.join(download_dir, filename)
            log.info("Processing most recent bill: %s", filename)
            single_bill = extract_bill_data(file_path, service_id, utility,
                                            utility_account_id)
            bill_data.append(single_bill)
            log.info(
                "first bill: %s - %s cost=%s",
                single_bill.start,
                single_bill.end,
                single_bill.cost,
            )

            bill_history_button_xpath = (
                "//span[contains(text(), 'Billing and payment history')]")
            bill_history_button = WebDriverWait(self.driver, 25).until(
                ec.element_to_be_clickable(
                    (By.XPATH, bill_history_button_xpath)))
            log.info("Returning to bill history page")
            bill_history_button.click()

    pdf_links = WebDriverWait(self.driver, 25).until(
        ec.presence_of_all_elements_located((By.XPATH, pdf_links_xpath)))
    log.info("Found %s pdfs on page", len(pdf_links))
    self.driver.screenshot(BaseWebScraper.screenshot_path("found pdfs"))

    for link in pdf_links:
        # Skip the first link when it was already handled above.
        if not first_link_found:
            first_link_found = True
            continue

        self.driver.execute_script("window.scrollTo(0," +
                                   str(link.location["y"]) + ")")
        time.sleep(2)

        if not self.seen_survey and close_survey(self.driver):
            self.seen_survey = True
        # NOTE(review): this second call runs on every iteration regardless
        # of seen_survey -- confirm whether it is still needed.
        close_survey(self.driver)

        # get sibling node for date range text: 12/10/2020 - 01/12/2021
        match = re.match(
            r"(\d+/\d+/\d+) - (\d+/\d+/\d+)",
            link.find_element_by_xpath("../p").text,
        )
        from_dt = parse_time(match.group(1))
        to_dt = parse_time((match.group(2)))
        if to_dt < start:
            log.info("stoppinng: %s bill is before start", to_dt)
            break

        # filename is View_Bill-Dec. 10, 2020_Jan. 12, 2021.pdf
        filename = "View_Bill-%s_%s.pdf" % (
            from_dt.strftime("%b. %d, %Y"),
            to_dt.strftime("%b. %d, %Y"),
        )
        link.click()
        self.driver.wait(90).until(
            file_exists_in_dir(download_dir, filename))
        file_path = os.path.join(download_dir, filename)

        period_start, period_end = extract_bill_period(file_path)

        # Check for a missing period first: comparing None against a date
        # below would raise before this guard could ever run.
        if not period_start or not period_end:
            log.info(
                "Could not determine bill period for pdf %s. Skipping" %
                file_path)
            continue

        # If the bill starts after our end date, skip it
        if period_start > end:
            continue

        # If the bill ends before our start date, break and return (finding where to end)
        if period_end < start:
            break

        single_bill = extract_bill_data(file_path, service_id, utility,
                                        utility_account_id)
        bill_data.append(single_bill)
        log.info(
            "added bill: %s - %s cost=%s",
            single_bill.start,
            single_bill.end,
            single_bill.cost,
        )

    non_overlapping_bills = _adjust_bill_dates(bill_data)
    return non_overlapping_bills
def reports_page_action(
        self, reports_page: saltriver_pages.SaltRiverReportsPage):
    """Navigate to the interval download page and pull data 30 days at a time.

    Finds the meter and channel matching ``self.meter_id`` /
    ``self.channel_id``, opens the interval download page, and for each
    30-day slice of the requested range downloads a CSV and inserts its
    readings into a ``Timeline``. The result is stored in
    ``self.interval_data_timeline``.

    Raises:
        InvalidDateRangeError: if the (possibly adjusted) start date is
            after the end date.
        TimeoutException: if a CSV does not appear in the download directory
            in time.
    """
    log.info("goto_meter_profiles")
    reports_page.goto_meter_profiles()
    meter_page = saltriver_pages.MeterProfilesPage(self._driver)
    WebDriverWait(self._driver, 30).until(page_is_ready(meter_page))
    self.screenshot("meter profiles")

    log.info("get meters")
    meters = meter_page.get_meters()
    meter, channel = self.find_matching_meter_and_channel(
        meters, self.meter_id, self.channel_id)
    self.screenshot("meter and channel")

    log.info("goto reports")
    meter_page.goto_reports()
    WebDriverWait(self._driver, 30).until(page_is_ready(reports_page))
    time.sleep(10)

    log.info("looking for interval download")
    reports_page.goto_interval_download()
    interval_download_page = saltriver_pages.IntervalDownloadPage(
        self._driver)
    WebDriverWait(self._driver,
                  30).until(page_is_ready(interval_download_page))
    self.screenshot("interval download")
    log.info("interval download page is ready")

    interval_download_page.basic_configuration()
    interval_download_page.select_meter_by_id(meter.meter_id)

    start = self.start_date
    end = self.end_date

    # Snap the scraper start date to the data start date for the selected meter/channel.
    if start < channel.data_start:
        start = channel.data_start

    if start > end:
        raise InvalidDateRangeError(
            "The start date must be before the end date (start='{}', end='{}')"
            .format(start, end))

    # Pull out data 30 days at a time
    date_range = DateRange(start, end)
    interval_size = relativedelta(days=30)
    timeline = Timeline(start, end)

    for sub_range in date_range.split_iter(delta=interval_size):
        log.info("downloading %s", sub_range)
        self.clear_csv_downloads()
        interval_download_page.set_date_range(sub_range.start_date,
                                              sub_range.end_date)
        interval_download_page.download_interval_data()
        self.screenshot("download %s" %
                        sub_range.end_date.strftime("%Y%m%d"))

        try:
            wait = WebDriverWait(self._driver, 180)
            # Anchor with $ so an in-progress "*.csv.crdownload" file is not
            # picked up and parsed before the download completes.
            csv_file_name = wait.until(
                file_exists_in_dir(self._driver.download_dir, r".*\.csv$"))
            csv_file_path = os.path.join(self._driver.download_dir,
                                         csv_file_name)
            for (when, reading) in parse_spatia_interval_csv(
                    csv_file_path, channel.id):
                # The CSV file reports readings at the end of each fifteen minute interval. So the first reading
                # of the day occurs at 00:15. and the last at midnight. We want to report the readings at the
                # _start_ of each interval, thus we subtract 15 minutes here.
                when = when - timedelta(minutes=15)
                timeline.insert(when, reading)
        except TimeoutException as exc:
            raise TimeoutException(
                "Downloading interval data from SPATIA failed.") from exc

    self.interval_data_timeline = timeline