def upload_readings(
    transforms, meter_oid: int, scraper: str, task_id: str, readings
) -> Status:
    updated: List[MeterReading] = []
    if readings:
        readings = interval_transform.transform(
            transforms, task_id, scraper, meter_oid, readings
        )
        log.info("writing interval data to the database for %s %s", scraper, meter_oid)
        updated = MeterReading.merge_readings(
            MeterReading.from_json(meter_oid, readings)
        )

    if task_id and config.enabled("ES_INDEX_JOBS"):
        index.set_interval_fields(task_id, updated)

    log.info("Final Interval Summary")
    for when, intervals in readings.items():
        none_count = sum(1 for x in intervals if x is None)
        factor = (24 / len(intervals)) if len(intervals) > 0 else 1.0
        kWh = sum(x for x in intervals if x is not None) * factor
        log.info(
            "%s: %d intervals. %.1f net kWh, %d null values."
            % (when, len(intervals), kWh, none_count)
        )

    path = os.path.join(config.WORKING_DIRECTORY, "readings.csv")
    with open(path, "w") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["Service", "Date", "Readings"])
        for when, intervals in readings.items():
            writer.writerow([meter_oid, str(when)] + [str(x) for x in intervals])
    log.info("Wrote interval data to %s." % path)

    if updated:
        return Status.SUCCEEDED
    return Status.COMPLETED

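# Illustrative sketch (not part of the pipeline) of the summary math above. It
# assumes `readings` maps a date key to a fixed-width list of interval demand
# values (kW), with None for gaps; 24 / len(intervals) is then the hours
# represented by each interval, so sum * factor yields net kWh for the day.
# All values below are hypothetical.
def _demo_interval_summary() -> None:
    sample_readings = {"2021-06-01": [1.5, None, 2.0, 2.5]}  # four 6-hour intervals
    for when, intervals in sample_readings.items():
        factor = 24 / len(intervals)  # hours per interval
        kwh = sum(x for x in intervals if x is not None) * factor
        print(f"{when}: {kwh} net kWh")  # 2021-06-01: 36.0 net kWh
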
def make_attachments(
    source_urls: List[str],
    statement: date,
    utility: Optional[str] = None,
    account_id: Optional[str] = None,
    gen_utility: Optional[str] = None,
    gen_utility_account_id: Optional[str] = None,
):
    if not config.enabled("S3_BILL_UPLOAD"):
        return None
    if not source_urls:
        return None
    s3_keys = [statement_to_s3(url) for url in source_urls]
    attachments = [
        AttachmentEntry(
            key=key,
            kind="bill",
            format="PDF",
            source="urjanet",
            statement=statement.strftime("%Y-%m-%d"),
            utility=utility,
            utility_account_id=account_id,
            gen_utility=gen_utility,
            gen_utility_account_id=gen_utility_account_id,
        )
        for key in s3_keys
        if key is not None
    ]
    if attachments:
        return attachments
    return None

def upload_file_to_s3(body, bucket, key, file_display_name=None, content_type=None):
    """Upload a file to S3.

    Stores a PDF into an S3 bucket, under the specified key, if that key
    doesn't already exist in the bucket.

    Args:
        body: The contents of the PDF file. Should be a binary file-like
            object (e.g. the result of opening a file in binary mode).
        bucket: The name of the bucket to upload the file into.
        key: The key under which to store the file.
        file_display_name: The "original" filename, placed in the
            "content-disposition" metadata of the upload. This argument is
            optional, and if not specified defaults to the key name.
        content_type: Set as ContentType metadata for the file; defaults to
            application/pdf.

    Returns:
        The name of the key where the file is stored. Should be equal to the
        "key" argument.
    """
    log.debug(
        "S3 Upload Requested: key=%s, bucket=%s, display_name=%s",
        key,
        bucket,
        file_display_name,
    )
    if not config.enabled("S3_BILL_UPLOAD"):
        log.debug("Bill upload disabled, skipping S3 upload.")
        return None

    # see if already fetched/uploaded
    if s3_key_exists(bucket, key):
        log.debug("Key %s already exists in bucket %s.", key, bucket)
        return key

    if file_display_name is None:
        file_display_name = key
    if content_type is None:
        # Apply the default promised by the docstring; boto3 rejects a None
        # ContentType.
        content_type = "application/pdf"

    client = boto3.client("s3")
    resp = client.put_object(
        Body=body,
        Bucket=bucket,
        ContentDisposition="inline; filename=%s" % file_display_name,
        ContentType=content_type,
        Key=key,
        StorageClass="STANDARD_IA",
    )
    log.debug("Attempted S3 upload to %s %s: %s", bucket, key, resp)
    return key

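# Minimal usage sketch for the function above; the bucket name, key, and local
# file are hypothetical. Returns the key on success, or None when the
# S3_BILL_UPLOAD feature flag is disabled.
def _demo_upload_file() -> None:
    with open("bill.pdf", "rb") as f:  # hypothetical local file
        key = upload_file_to_s3(
            f,
            "my-bill-bucket",  # hypothetical bucket
            "bills/2021-06-bill.pdf",
            file_display_name="2021-06-bill.pdf",
            content_type="application/pdf",
        )
    print(key)
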
def remove_file_from_s3(bucket: str, key: str) -> None:
    if not config.enabled("S3_BILL_UPLOAD"):
        log.debug("Bill upload disabled, skipping S3 remove.")
        return
    client = boto3.client("s3")
    try:
        client.delete_object(Bucket=bucket, Key=key)
    except:  # noqa: E722
        log.exception("Request to remove file %s/%s from S3 failed.", bucket, key)

def s3_key_exists(bucket, key):
    """Determine if a key exists in an S3 bucket."""
    if not config.enabled("S3_BILL_UPLOAD"):
        return False
    client = boto3.client("s3")
    try:
        client.head_object(Bucket=bucket, Key=key)
        return True
    except:  # noqa: E722
        return False

def _execute(self):
    config: SmdPartialBillingScraperConfiguration = self._configuration
    meter = config.meter
    usage_points = relevant_usage_points(meter)
    log.info("Identified %s relevant usage point(s): %s", len(usage_points), usage_points)

    query = db.session.query(SmdBill).filter(SmdBill.usage_point.in_(usage_points))
    if self.start_date:
        start = self.start_date
        end = max(start, self.end_date or date.today())
        if end - self.start_date <= timedelta(days=60):
            start = start - timedelta(days=60)
            log.info("Adjusting start date to %s.", start)
        query = query.filter(start <= SmdBill.start)
    if self.end_date:
        query = query.filter(SmdBill.start <= self.end_date)
    query = query.order_by(SmdBill.published)
    log.info("Identified %d raw SMD bills relevant to this meter.", query.count())

    # It often happens that we receive several versions of the same bill across
    # multiple files. The first thing we need to do is order the bills by
    # publication date, so we can decide which SmdBill record is the correct
    # one for our chosen date.
    unified_bills: List[SmdBill] = SmdBill.unify_bills(query)
    adjusted_bills: List[SmdBill] = SmdBill.adjust_single_day_bills(unified_bills)

    partial_bills = [b.to_billing_datum(self.service) for b in adjusted_bills]

    if partial_bills:
        log.debug(
            "Identified %s partial bills in Share My Data for meter %s (%s).",
            len(partial_bills),
            meter.name,
            meter.oid,
        )
        if datafeeds_config.enabled("S3_BILL_UPLOAD"):
            partial_bills = self.attach_corresponding_urja_pdfs(partial_bills)

    return Results(tnd_bills=partial_bills)

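# Hypothetical illustration of the window widening above: when the requested
# range covers 60 days or fewer, the query start is pushed back 60 days,
# presumably so short windows still capture at least one full billing cycle.
# Dates are illustrative only.
def _demo_smd_window() -> None:
    from datetime import date as _date, timedelta as _td

    start = _date(2021, 6, 1)
    end = _date(2021, 6, 30)
    if end - start <= _td(days=60):
        start = start - _td(days=60)
    assert start == _date(2021, 4, 2)
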
def upload_partial_bills(
    meter: Meter,
    task_id: str,
    billing_data: BillingData,
    bill_type: PartialBillProviderType,
) -> Status:
    """Goes through billing_data and uploads new partial bills directly to the partial bills table.

    If a new partial bill differs from an existing partial bill, a new bill is
    created, rather than overwriting the old one. New partial bills are written
    directly to the db; they do not go through platform.
    """
    log.info("Starting processing of partial bill scraper results.")
    processor = PartialBillProcessor(meter, bill_type, billing_data)
    status = processor.process_partial_bills()
    processor.log_summary()
    if task_id and config.enabled("ES_INDEX_JOBS"):
        log.info("Updating billing range in Elasticsearch.")
        index.update_billing_range(task_id, billing_data)
    return status

def upload_bills(
    meter_oid: int,
    service_id: str,
    task_id: str,
    scraper: str,
    billing_data: BillingData,
) -> Status:
    cur_most_recent = _latest_closing(service_id)
    _, change_records = _upload_bills_to_services(service_id, billing_data)
    if task_id and config.enabled("ES_INDEX_JOBS"):
        log.info("Updating billing range in Elasticsearch.")
        index.update_billing_range(task_id, billing_data)
        if change_records:
            index.index_bill_records(scraper, change_records)

    billing_data = verify_bills(meter_oid, billing_data)
    title = "Final Scraped Summary"
    show_bill_summary(billing_data, title)

    path = os.path.join(config.WORKING_DIRECTORY, "bills.csv")
    end = date(year=1900, month=1, day=1)
    with open(path, "w") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["Service ID", "Start", "End", "Cost", "Used", "Peak"])
        for b in billing_data:
            writer.writerow([service_id, b.start, b.end, b.cost, b.used, b.peak])
            if isinstance(b.end, datetime):
                end = max(b.end.date(), end)
            elif b.end > end:
                end = b.end
    log.info("Wrote bill data to %s." % path)

    if cur_most_recent and (end > cur_most_recent):
        return Status.SUCCEEDED
    return Status.COMPLETED

def get_bills(self, account_id: str, start: date, end: date) -> List[BillingDatum]:
    """Get bills from the table.

    For each row:
      - get the end date from the Read date column (date)
      - get the start date from the end date - (Days column - 1)
      - get the statement date from the Bill date column (date)
      - if the bill's start - end range doesn't overlap the passed-in
        start / end, continue
      - get peak from On-peak Billed kW (float)
      - get used from Off-peak kWh + Shoulder kWh + On-peak kWh (float)
      - get cost from New charges (float)
      - click the eye icon to download the PDF; wait for the download to
        complete to self.driver.download_dir
    """
    WebDriverWait(self.driver, 10).until(
        EC.presence_of_element_located(self.UsageTableBodyLocator)
    )
    usage_table_rows = self.driver.find_elements(*self.UsageTableRowsLocator)

    bill_data: List[BillingDatum] = []
    self.driver.screenshot(BaseWebScraper.screenshot_path("bill table"))
    for row in usage_table_rows:
        cols = row.find_elements_by_tag_name("td")
        cols = [c for c in cols if "display: none" not in c.get_attribute("style")]

        col = lambda x: cols[x].text
        to_num = lambda x: "".join(d for d in col(x) if d.isdigit() or d == ".")
        to_float = lambda x: float(to_num(x)) if len(to_num(x)) > 0 else 0

        log.debug(f"statement={col(1)} end={col(2)} days={col(7)}")
        # statement date
        statement_date = date_parser.parse(col(1)).date()
        # bill end: the Read date column omits the year, so infer it from the
        # statement date; a December read date on a January statement belongs
        # to the prior year
        period_year = statement_date.year
        if statement_date.month == 1 and col(2).startswith("12"):
            period_year = statement_date.year - 1
        end_str = f"{col(2)}/{period_year}"
        bill_end = date_parser.parse(end_str).date()
        # bill start
        bill_start = bill_end - timedelta(days=int(to_float(7)) - 1)
        log.debug(f"start={bill_start} end={bill_end}")
        if not self._overlap(start, end, bill_start, bill_end):
            log.info(
                f"skipping bill {bill_start} - {bill_end}: "
                f"does not overlap requested range {start} - {end}"
            )
            continue
        # cost
        new_charges = to_float(8)
        # used
        used = to_float(4) + to_float(5) + to_float(6)
        # peak
        peak = to_float(3)

        bill_datum = BillingDatum(
            start=bill_start,
            end=bill_end,
            statement=statement_date,
            cost=new_charges,
            used=used,
            peak=peak,
            items=None,
            attachments=None,
            utility_code=None,
        )

        try:
            bill_pdf_name = "SRPbill{}{}.pdf".format(
                statement_date.strftime("%B"), statement_date.year
            )
            pdf_download_link = cols[0].find_element_by_tag_name("a")
            scroll_to(self.driver, pdf_download_link)
            pdf_download_link.click()
            log.info("looking for %s in %s", bill_pdf_name, self.driver.download_dir)
            self.driver.wait(60).until(
                file_exists_in_dir(self.driver.download_dir, bill_pdf_name)
            )
        except Exception as e:
            raise Exception(
                f"Failed to download bill {bill_pdf_name} for statement date "
                f"{statement_date}:\n {e}"
            )
        log.info(
            f"Bill {bill_pdf_name} for statement date {statement_date} "
            f"downloaded successfully"
        )

        attachment_entry = None
        # open downloaded PDF and upload
        if config.enabled("S3_BILL_UPLOAD"):
            key = hash_bill_datum(account_id, bill_datum)
            with open(f"{self.driver.download_dir}/{bill_pdf_name}", "rb") as pdf_data:
                attachment_entry = upload_bill_to_s3(
                    BytesIO(pdf_data.read()),
                    key,
                    source="myaccount.srpnet.com",
                    statement=bill_datum.statement,
                    utility="utility:salt-river-project",
                    utility_account_id=account_id,
                )

        if attachment_entry:
            bill_data.append(bill_datum._replace(attachments=[attachment_entry]))
        else:
            bill_data.append(bill_datum)
    return bill_data

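# Worked example (hypothetical values) of the year inference above: for a
# statement dated 2021-01-05 whose Read date column shows "12/28", the bill
# period ended in the previous calendar year, so the parsed end date should be
# 12/28/2020.
def _demo_period_year() -> None:
    from datetime import date as _date

    statement_date = _date(2021, 1, 5)  # hypothetical January statement
    read_date = "12/28"  # Read date column value, no year
    period_year = statement_date.year
    if statement_date.month == 1 and read_date.startswith("12"):
        period_year = statement_date.year - 1
    assert f"{read_date}/{period_year}" == "12/28/2020"
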
def attach_bill_pdfs(
    meter_oid: int,
    task_id: str,
    meter_only: bool,
    pdfs: List[BillPdf],
) -> Status:
    """Attach a list of bill PDF files uploaded to S3 to bill records."""
    if not pdfs:
        return Status.COMPLETED

    count = 0
    unused = []
    for pdf in pdfs:
        log.info(
            "bill PDF for utility_account_id=%s statement=%s",
            pdf.utility_account_id,
            pdf.statement,
        )
        if meter_only:
            # Attach PDFs to bills on this meter's service only. Matches up PDF
            # start dates with PartialBill/Bill start dates with a small buffer.
            bill_query = db.session.query(Bill).filter(
                Bill.service == Meter.service,
                Meter.oid == meter_oid,
                Bill.initial >= pdf.start - timedelta(days=1),
                Bill.initial <= pdf.start + timedelta(days=1),
            )
            partial_bill_query = db.session.query(PartialBill).filter(
                PartialBill.service == Meter.service,
                Meter.oid == meter_oid,
                PartialBill.initial >= pdf.start - timedelta(days=1),
                PartialBill.initial <= pdf.start + timedelta(days=1),
                PartialBill.superseded_by.is_(None),
                PartialBill.visible.is_(True),
                PartialBill.provider_type == PartialBillProviderType.TND_ONLY.value,
            )
            if not bill_query.count() and not partial_bill_query.count():
                log.warning(
                    "no bills found for utility_account_id %s %s-%s",
                    pdf.utility_account_id,
                    pdf.start,
                    pdf.end,
                )
            bill_attach_status = add_attachment_to_bills(pdf, bill_query)
            partial_attach_status = add_attachment_to_bills(pdf, partial_bill_query)
            if (
                AttachStatus.best([bill_attach_status, partial_attach_status])
                == AttachStatus.ATTACHED
            ):
                # Only increase count if attachments were updated. Not adding
                # any PDFs to "unused" because the attachment could be in use
                # on another meter.
                count += 1
        else:
            # Attach PDFs to potentially multiple bills on multiple services
            # with the same utility account id: match bills on the account that
            # ended recently before the statement date.
            query = (
                db.session.query(Bill)
                .filter(UtilityService.utility_account_id == pdf.utility_account_id)
                .filter(UtilityService.oid == Bill.service)
                .filter(Bill.closing > pdf.statement - timedelta(days=14))
                .filter(Bill.closing <= pdf.statement)
            )
            bill_count = query.count()
            if not bill_count:
                log.warning(
                    "no bills found for utility_account_id %s %s-%s",
                    pdf.utility_account_id,
                    pdf.start,
                    pdf.end,
                )
            attached = add_attachment_to_bills(pdf, query)
            if attached == AttachStatus.ATTACHED:
                count += 1
            elif attached == AttachStatus.NOT_ATTACHED:
                unused.append(pdf.s3_key)

    log.info("attached %s/%s pdfs", count, len(pdfs))
    for key in unused:
        remove_file_from_s3(config.BILL_PDF_S3_BUCKET, key)
    if task_id and config.enabled("ES_INDEX_JOBS"):
        log.info("Updating billing range in Elasticsearch.")
        index.update_bill_pdf_range(task_id, meter_oid, pdfs)
    if count:
        return Status.SUCCEEDED
    return Status.COMPLETED

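# Hypothetical illustration of the non-meter_only matching window above: a PDF
# attaches to bills whose closing date falls in the 14 days up to and including
# the statement date. Dates are illustrative only.
def _demo_pdf_window() -> None:
    from datetime import date as _date, timedelta as _td

    statement = _date(2021, 6, 15)
    closing = _date(2021, 6, 5)  # hypothetical bill closing date
    assert statement - _td(days=14) < closing <= statement
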
def extract_bill_data(
    pdf_filename, service_id, utility, utility_account_id
) -> Optional[BillingDatum]:
    # this function should upload the file to s3 to set attachments?
    try:
        text = pdf_to_str(pdf_filename)
    except PDFSyntaxError:
        log.exception("Downloaded bill file failed to parse as a PDF.")
        return None

    current_charges_pattern = "Current Charges(.*?)Cycle"
    for line in (
        re.search(current_charges_pattern, text, re.DOTALL).group(1).split("\n")
    ):
        # get the last number
        if re.match(r"[\d,\.]", line.strip()):
            current_charges = line.strip().replace(",", "")

    period_start, period_end = extract_bill_period(pdf_filename)

    usage_pattern = r"Energy Charges \((\d*) kWh\)"
    usage = re.search(usage_pattern, text).groups()[0]

    on_peak_demand_pattern = r"On-Peak Demand \((\d+\.\d+)\ KW"
    on_peak_demand = re.search(on_peak_demand_pattern, text).groups()[0]

    offpeak_demand_pattern = r"Off-Peak Demand \((\d+\.\d+)\ KW"
    offpeak_demand = re.search(offpeak_demand_pattern, text).groups()[0]

    bill_attachment = []
    if config.enabled("S3_BILL_UPLOAD"):
        log.info("S3_BILL_UPLOAD is enabled")
        with open(pdf_filename, "rb") as f:
            key = hash_bill(
                service_id,
                period_start,
                period_end,
                _format_number(current_charges),
                0,
                _format_number(usage),
            )
            # no statement date; use end date
            bill_attachment.append(
                upload_bill_to_s3(
                    f,
                    key,
                    source="portlandgeneral.com",
                    statement=period_end,
                    utility=utility,
                    utility_account_id=utility_account_id,
                )
            )
            log.info("Uploaded bill %s to s3", bill_attachment)

    bill = BillingDatum(
        start=period_start,
        end=period_end,
        statement=period_end,
        cost=_format_number(current_charges),
        used=_format_number(usage),
        peak=max(float(on_peak_demand), float(offpeak_demand)),
        items=[],
        attachments=bill_attachment,
        utility_code=None,
    )
    return bill

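# Illustrative sketch of the "Current Charges" extraction above, run against a
# hypothetical fragment of bill text. The loop keeps the last numeric line in
# the section, which is why current_charges ends up holding the total.
def _demo_current_charges() -> None:
    import re

    text = "Current Charges\nElectric Service\n1,234.56\nCycle 12"
    section = re.search("Current Charges(.*?)Cycle", text, re.DOTALL).group(1)
    current_charges = None
    for line in section.split("\n"):
        if re.match(r"[\d,\.]", line.strip()):
            current_charges = line.strip().replace(",", "")
    assert current_charges == "1234.56"
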
def parse_poway_pdf(pdf_filename: str, account_id: str) -> BillingDatum:
    text = pdfparser.pdf_to_str(pdf_filename)

    used_pattern = r"Consumption (?P<units_used>[\d\.,]+) @"
    cost_pattern = r"(?P<water_charges>[\d\.,]+)\s+WATERBasic Service @"
    # date format: m/d/yyyy
    date_pattern = r"\d{1,2}\/\d{1,2}\/\d{4}"
    dates_pattern = (
        r"Total Current Charges.+?"
        fr"(?P<read_date_start>{date_pattern}) - (?P<read_date_end>{date_pattern})"
        fr"(?P<due_date>{date_pattern})"
        fr"(?P<statement_date>{date_pattern})"
    )

    dates_match = re.search(dates_pattern, text)
    if not dates_match:
        raise InvalidMeterDataException(f"Couldn't parse dates from pdf: {text}")
    _dates = dates_match.group("read_date_start", "read_date_end", "statement_date")
    start_date, end_date, statement_date = [
        parse_date(_date).date() for _date in _dates
    ]

    used_match = re.search(used_pattern, text)
    if not used_match:
        raise InvalidMeterDataException(f"Couldn't parse usage from pdf: {text}")
    used_text = used_match.group("units_used")
    used = float(used_text.replace(",", "").replace("$", ""))

    cost_match = re.search(cost_pattern, text)
    if not cost_match:
        raise InvalidMeterDataException(f"Couldn't parse cost from pdf: {text}")
    cost_text = cost_match.group("water_charges")
    cost = float(cost_text.replace(",", "").replace("$", ""))

    if config.enabled("S3_BILL_UPLOAD"):
        key = hash_bill(account_id, start_date, end_date, cost, 0, used)
        with open(pdf_filename, "rb") as pdf_data:
            attachments = [
                upload_bill_to_s3(
                    BytesIO(pdf_data.read()),
                    key,
                    source="customerconnect.poway.org",
                    statement=statement_date,
                    utility="utility:city-of-poway",
                    utility_account_id=account_id,
                )
            ]
    else:
        attachments = []

    return BillingDatum(
        start=start_date,
        end=end_date - timedelta(days=1),
        statement=statement_date,
        cost=cost,
        peak=None,
        used=used,
        items=None,
        attachments=attachments,
        utility_code=None,
    )

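# Hypothetical example of the date extraction above: in the PDF text the
# billing period, due date, and statement date run together with no separators,
# and the named groups pull them apart. The sample text is illustrative only.
def _demo_poway_dates() -> None:
    import re

    date_pattern = r"\d{1,2}\/\d{1,2}\/\d{4}"
    dates_pattern = (
        r"Total Current Charges.+?"
        fr"(?P<read_date_start>{date_pattern}) - (?P<read_date_end>{date_pattern})"
        fr"(?P<due_date>{date_pattern})"
        fr"(?P<statement_date>{date_pattern})"
    )
    text = "Total Current Charges Amount Due 5/1/2021 - 6/1/20216/21/20216/2/2021"
    m = re.search(dates_pattern, text)
    assert m.group("read_date_start", "read_date_end", "statement_date") == (
        "5/1/2021",
        "6/1/2021",
        "6/2/2021",
    )
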
def run_datafeed(
    scraper_class,
    account: SnapmeterAccount,
    meter: Meter,
    datasource: MeterDataSource,
    params: dict,
    configuration=None,
    task_id=None,
    transforms: Optional[List[Transforms]] = None,
    disable_login_on_error: Optional[bool] = False,
    notify_on_login_error: Optional[bool] = True,
    meter_only: Optional[bool] = False,
) -> Status:
    transforms = [] if transforms is None else transforms
    bill_handler = ft.partial(
        upload_bills,
        meter.oid,
        meter.utility_service.service_id,
        task_id,
        datasource.name,
    )
    readings_handler = ft.partial(
        upload_readings, transforms, meter.oid, datasource.name, task_id
    )
    pdfs_handler = ft.partial(attach_bill_pdfs, meter.oid, task_id, meter_only)
    partial_bill_handler = ft.partial(upload_partial_bills, meter, task_id)

    date_range = DateRange(
        *iso_to_dates(params.get("data_start"), params.get("data_end"))
    )

    parent: Optional[AccountDataSource] = None
    if datasource.account_data_source:
        parent = datasource.account_data_source
        credentials = Credentials(parent.username, parent.password)
        if not datasource.account_data_source.enabled:
            raise DataSourceConfigurationError(
                "%s scraper for %s is disabled"
                % (datasource.account_data_source.name, meter.oid)
            )
    else:
        credentials = Credentials(None, None)

    if task_id and config.enabled("ES_INDEX_JOBS"):
        log.info("Uploading task information to Elasticsearch.")
        doc = index.starter_doc(meter.oid, datasource)
        doc["start_date"] = date_range.start_date
        doc["end_date"] = date_range.end_date
        doc["meter_data_source"] = datasource.oid
        if configuration:
            doc.update(
                {
                    "billScraper": configuration.scrape_bills
                    or configuration.scrape_partial_bills,
                    "intervalScraper": configuration.scrape_readings,
                }
            )
        index.index_etl_run(task_id, doc)

    index_doc: Dict[str, str] = {}
    # create a non-persisted copy
    utility_service = UtilityService.copy_from(meter.utility_service)
    try:
        with scraper_class(credentials, date_range, configuration) as scraper:
            scraper.utility_service = utility_service
            scraper_status = scraper.scrape(
                readings_handler=readings_handler,
                bills_handler=bill_handler,
                pdfs_handler=pdfs_handler,
                partial_bills_handler=partial_bill_handler,
            )
            if scraper_status == Status.SUCCEEDED:
                # Avoid muddying Elasticsearch results
                index_doc = {"status": "SUCCESS"}
            else:
                index_doc = {"status": scraper_status.name}
            if scraper_status in [Status.SUCCEEDED, Status.COMPLETED]:
                retval = Status.SUCCEEDED
            else:
                retval = Status.FAILED
            # sce-metascraper needs to be able to get the completed status back
            if scraper.metascraper:
                retval = scraper_status
    except Exception as exc:
        log.exception("Scraper run failed.")
        retval = Status.FAILED
        index_doc = {
            "status": "FAILED",
            "error": repr(exc),
            "exception": type(exc).__name__,
        }
        # disable the login if scraping threw a LoginError, the caller
        # requested disabling on error, and the meter data source has a parent
        # account data source
        if isinstance(exc, LoginError) and disable_login_on_error and parent:
            parent.enabled = False
            db.session.add(parent)
            log.warning("disabling %s login %s", parent.source_account_type, parent.oid)
            if notify_on_login_error:
                alert.disable_logins(parent)

    index_doc.update(update_utility_service(meter.utility_service, utility_service))

    if task_id and config.enabled("ES_INDEX_JOBS"):
        log.info("Uploading final task status to Elasticsearch.")
        index.index_etl_run(task_id, index_doc)

    return retval

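# Hypothetical illustration of the handler binding used above: ft.partial
# pre-binds the meter/task arguments, so the scraper only supplies the scraped
# data when it invokes the handler. The handler and values here are
# illustrative only; they mirror upload_bills's parameter order.
def _demo_partial_binding() -> None:
    import functools as ft

    def handler(meter_oid, service_id, task_id, scraper, billing_data):
        return meter_oid, len(billing_data)

    bound = ft.partial(handler, 1234, "svc-1", "task-1", "demo-scraper")
    assert bound(["bill-a", "bill-b"]) == (1234, 2)
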
def setUp(self) -> None:
    self.upload_enabled_before = project_config.enabled("S3_BILL_UPLOAD")
    project_config.FEATURE_FLAGS.add("S3_BILL_UPLOAD")

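# A plausible counterpart to the setUp above (not in the original source),
# assuming FEATURE_FLAGS is a plain set: restore the flag to its prior state so
# other tests are unaffected.
def tearDown(self) -> None:
    if not self.upload_enabled_before:
        project_config.FEATURE_FLAGS.discard("S3_BILL_UPLOAD")
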
def _launch_meter_datasource(mds: MeterDataSource, start: date, end: date):
    if mds is None:
        log.error("No data source. Aborting.")
        sys.exit(1)

    account = None
    if mds.account_data_source is not None:
        ads = mds.account_data_source
        account = ads.account
    meter = mds.meter

    scraper_fn = scraper_functions.get(mds.name)
    if scraper_fn is None:
        log.error(
            'No scraping procedure associated with the identifier "%s". Aborting.',
            mds.name,
        )
        sys.exit(1)

    parameters = {
        "data_start": start.strftime("%Y-%m-%d"),
        "data_end": end.strftime("%Y-%m-%d"),
    }

    task_id = os.environ.get("AWS_BATCH_JOB_ID", str(uuid.uuid4()))

    log.info("Scraper Launch Settings:")
    log.info("Enabled Features: %s", config.FEATURE_FLAGS)
    log.info("Meter Data Source OID: %s", mds.oid)
    log.info("Meter: %s (%s)", meter.name, meter.oid)
    log.info("Scraper: %s", mds.name)
    log.info("Date Range: %s - %s", start, end)
    log.info("Task ID: %s", task_id)
    log.info(
        "Elasticsearch Host/Port: %s : %s",
        config.ELASTICSEARCH_HOST,
        config.ELASTICSEARCH_PORT,
    )
    log.debug(
        "Elasticsearch Credentials: %s : %s",
        config.ELASTICSEARCH_USER,
        config.ELASTICSEARCH_PASSWORD,
    )
    log.info("Platform Host/Port: %s : %s", config.PLATFORM_HOST, config.PLATFORM_PORT)

    cleanup_workdir()
    try:
        status = scraper_fn(account, meter, mds, parameters, task_id=task_id)  # type: ignore[operator] # noqa
        if config.enabled("S3_ARTIFACT_UPLOAD"):
            archive_run(task_id)
        if config.enabled("ES_INDEX_LOGS"):
            index_logs(task_id)
    except:  # noqa: E722
        log.exception("The scraper run has failed due to an unhandled exception.")
        status = Status.FAILED

    db.session.commit()
    db.session.close()
    sys.exit(status.value)

def _execute(self):
    if self.end_date - self.start_date < timedelta(days=MINIMUM_BILL_DAYS):
        log.info(f"Expanding date range to a minimum of {MINIMUM_BILL_DAYS} days.")
        self.start_date = self.end_date - timedelta(days=MINIMUM_BILL_DAYS)

    start_date = max(self.start_date, (datetime.now() - relativedelta(years=10)).date())
    end_date = min(self.end_date, datetime.now().date())
    log.info("Final date range to search: %s - %s" % (start_date, end_date))

    login_page = LoginPage(self._driver)
    home_page = login_page.login(self.username, self.password)
    self.screenshot("home_screen")
    log.info("Login successful.")

    bill_history_page = home_page.to_bill_history()
    self.screenshot("bill_history_page")
    log.info("Loaded bill history.")

    bill_history_page.select_account(self.account_number)
    self.screenshot("account_selected")
    log.info("Selected account.")

    bill_history_page.set_dates(start_date, end_date)
    self.screenshot("dates_selected")
    log.info("Selected dates.")

    raw_pdfs = bill_history_page.gather_data()

    log.info("PDF bills captured: %s" % len(raw_pdfs))
    log.info("Net bill pdf bytes captured: %s" % sum(len(x) for x in raw_pdfs))

    bill_data = []
    for ii, b in enumerate(raw_pdfs, start=1):
        bill_datum = parse_bill_pdf(BytesIO(b), self.meter_number)
        if bill_datum is None:
            log.info("There was a problem parsing bill PDF #%d." % ii)
            continue

        attachment_entry = None
        if config.enabled("S3_BILL_UPLOAD"):
            key = bill_upload.hash_bill_datum(self.meter_number, bill_datum)
            attachment_entry = bill_upload.upload_bill_to_s3(
                BytesIO(b),
                key,
                source="pacificpower.net",
                statement=bill_datum.statement,
                utility=self.utility,
                utility_account_id=self.account_number,
            )
        if attachment_entry:
            bill_data.append(bill_datum._replace(attachments=[attachment_entry]))
        else:
            bill_data.append(bill_datum)

    final_bills = adjust_bill_dates(bill_data)
    show_bill_summary(final_bills, "Final Bill Summary")
    return Results(bills=final_bills)