Example #1
def upload_readings(transforms, meter_oid: int, scraper: str, task_id: str,
                    readings) -> Status:
    updated: List[MeterReading] = []
    if readings:
        readings = interval_transform.transform(transforms, task_id, scraper,
                                                meter_oid, readings)
        log.info("writing interval data to the database for %s %s", scraper,
                 meter_oid)
        updated = MeterReading.merge_readings(
            MeterReading.from_json(meter_oid, readings))

    if task_id and config.enabled("ES_INDEX_JOBS"):
        index.set_interval_fields(task_id, updated)

    log.info("Final Interval Summary")
    for when, intervals in readings.items():
        none_count = sum(1 for x in intervals if x is None)
        factor = (24 / len(intervals)) if len(intervals) > 0 else 1.0
        kWh = sum(x for x in intervals if x is not None) * factor
        log.info("%s: %d intervals. %.1f net kWh, %d null values." %
                 (when, len(intervals), kWh, none_count))

    path = os.path.join(config.WORKING_DIRECTORY, "readings.csv")
    with open(path, "w") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["Service", "Date", "Readings"])
        for when, intervals in readings.items():
            writer.writerow([meter_oid, str(when)] +
                            [str(x) for x in intervals])
    log.info("Wrote interval data to %s." % path)

    if updated:
        return Status.SUCCEEDED
    return Status.COMPLETED
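# Usage sketch (assumed context): run_datafeed in Example #13 below binds the first
# four arguments with functools.partial and hands the result to the scraper as its
# readings_handler. Meter OID, scraper name, and reading values are hypothetical;
# `readings` maps each day to its list of interval values, with None marking a
# missing interval, matching the summary loop above.
import functools as ft
from datetime import date

readings_handler = ft.partial(upload_readings, [], 12345, "demo-scraper", "task-123")
status = readings_handler({date(2021, 1, 4): [1.5, None, 2.0, 2.5]})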
Example #2
def make_attachments(
    source_urls: List[str],
    statement: date,
    utility: Optional[str] = None,
    account_id: Optional[str] = None,
    gen_utility: Optional[str] = None,
    gen_utility_account_id: Optional[str] = None,
):
    if not config.enabled("S3_BILL_UPLOAD"):
        return None

    if not source_urls:
        return None

    s3_keys = [statement_to_s3(url) for url in source_urls]
    attachments = [
        AttachmentEntry(
            key=key,
            kind="bill",
            format="PDF",
            source="urjanet",
            statement=statement.strftime("%Y-%m-%d"),
            utility=utility,
            utility_account_id=account_id,
            gen_utility=gen_utility,
            gen_utility_account_id=gen_utility_account_id,
        ) for key in s3_keys if key is not None
    ]
    if attachments:
        return attachments

    return None
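# Usage sketch: with S3_BILL_UPLOAD enabled, each source URL is mirrored to S3 via
# statement_to_s3 and wrapped in an AttachmentEntry; the function returns None when
# the flag is off, the URL list is empty, or no upload succeeded. The URL, utility,
# and account values below are hypothetical.
from datetime import date

entries = make_attachments(
    ["https://example.com/bills/2021-01.pdf"],
    statement=date(2021, 1, 31),
    utility="utility:pge",
    account_id="1234567890",
)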
Example #3
def upload_file_to_s3(body,
                      bucket,
                      key,
                      file_display_name=None,
                      content_type=None):
    """Upload a file to s3

    Stores a PDF into a S3 bucket, under the specified key, if that
    key doesn't already exist in the bucket.

    Args:
        body: The contents of the pdf file. Should be a binary file-like
            object (e.g. the result of opening a file in binary mode).
        bucket: The name of the bucket to upload the file into.
        key: The key under which to store the file
        file_display_name: The "original" filename, placed in the
            "content-disposition" metadata of the upload. This
            argument is optional, and if not specified defaults to
            the key name.
        content_type: set as ContentType metadata for the file; defaults to application/pdf

    Returns:
        The name of the key where the file is stored. Should be equal
        to the "key" argument.
    """
    log.debug(
        "S3 Upload Requested: key=%s, bucket=%s, display_name=%s",
        key,
        bucket,
        file_display_name,
    )

    if not config.enabled("S3_BILL_UPLOAD"):
        log.debug("Bill upload disabled, skipping S3 upload.")
        return None

    # see if already fetched/uploaded
    if s3_key_exists(bucket, key):
        log.debug("Key %s already exists in bucket %s.", key, bucket)
        return key

    if file_display_name is None:
        file_display_name = key

    client = boto3.client("s3")
    resp = client.put_object(
        Body=body,
        Bucket=bucket,
        ContentDisposition="inline; filename=%s" % file_display_name,
        ContentType=content_type,
        Key=key,
        StorageClass="STANDARD_IA",
    )

    log.debug("Attempted S3 upload to %s %s: %s", bucket, key, resp)

    return key
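# Usage sketch: per the docstring, body should be a binary file-like object; the
# bucket and key names below are hypothetical.
with open("bill.pdf", "rb") as f:
    stored_key = upload_file_to_s3(
        f,
        bucket="my-bill-bucket",
        key="abc123.pdf",
        content_type="application/pdf",
    )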
Example #4
def remove_file_from_s3(bucket: str, key: str) -> None:
    if not config.enabled("S3_BILL_UPLOAD"):
        log.debug("Bill upload disabled, skipping S3 remove.")
        return
    client = boto3.client("s3")
    try:
        client.delete_object(Bucket=bucket, Key=key)
    except:  # noqa: E722
        log.exception("Request to remove file %s/%s from S3 failed.", bucket,
                      key)
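# Usage sketch (hypothetical bucket and key); note that failures are logged via
# log.exception rather than raised to the caller.
remove_file_from_s3("my-bill-bucket", "abc123.pdf")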
Example #5
def s3_key_exists(bucket, key):
    """Determine if a key exists in an S3 bucket."""

    if not config.enabled("S3_BILL_UPLOAD"):
        return False

    client = boto3.client("s3")
    try:
        client.head_object(Bucket=bucket, Key=key)
        return True
    except:  # noqa: E722
        return False
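# Usage sketch: head_object raises when the key is absent, so any exception maps to
# False. Because of the feature-flag check, this also returns False whenever
# S3_BILL_UPLOAD is disabled. Bucket and key are hypothetical.
if not s3_key_exists("my-bill-bucket", "abc123.pdf"):
    print("key not present (or uploads disabled)")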
Example #6
    def _execute(self):
        config: SmdPartialBillingScraperConfiguration = self._configuration
        meter = config.meter

        usage_points = relevant_usage_points(meter)
        log.info("Identified %s relevant usage point(s): %s",
                 len(usage_points), usage_points)
        query = db.session.query(SmdBill).filter(
            SmdBill.usage_point.in_(usage_points))

        if self.start_date:
            start = self.start_date
            end = max(start, self.end_date or date.today())
            if end - self.start_date <= timedelta(days=60):
                start = start - timedelta(days=60)
                log.info("Adjusting start date to %s.", start)
            query = query.filter(start <= SmdBill.start)

        if self.end_date:
            query = query.filter(SmdBill.start <= self.end_date)

        query = query.order_by(SmdBill.published)

        log.info("Identified %d raw SMD bills relevant to this meter.",
                 query.count())
        # It often happens that we receive several versions of the same bill across multiple files.
        # The first thing we need to do is order the bills by publication date, so we can decide
        # which SmdBill record is the correct one for our chosen date.
        unified_bills: List[SmdBill] = SmdBill.unify_bills(query)
        adjusted_bills: List[SmdBill] = SmdBill.adjust_single_day_bills(
            unified_bills)
        partial_bills = [
            b.to_billing_datum(self.service) for b in adjusted_bills
        ]

        if partial_bills:
            log.debug(
                "Identified %s partial bills in Share My Data for meter %s (%s).",
                len(partial_bills),
                meter.name,
                meter.oid,
            )
            if datafeeds_config.enabled("S3_BILL_UPLOAD"):
                partial_bills = self.attach_corresponding_urja_pdfs(
                    partial_bills)

        return Results(tnd_bills=partial_bills)
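# Worked sketch of the date-window adjustment above: when the requested window is 60
# days or shorter, the start date is pushed back 60 days so enough SmdBill rows are
# captured for unify_bills to deduplicate. Dates are hypothetical.
from datetime import date, timedelta

start, end = date(2021, 3, 1), date(2021, 4, 1)  # 31-day request
if end - start <= timedelta(days=60):
    start = start - timedelta(days=60)  # -> date(2020, 12, 31)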
Example #7
def upload_partial_bills(
    meter: Meter,
    task_id: str,
    billing_data: BillingData,
    bill_type: PartialBillProviderType,
) -> Status:
    """
    Goes through billing_data and uploads new partial bills directly to the partial bills table.
    If a new partial bill differs from an existing partial bill,
    a new bill is created, rather than overwriting the old one.

    New partial bills are written directly to the db; they do not go through platform.
    """
    log.info("Starting processing of partial bill scraper results.")
    processor = PartialBillProcessor(meter, bill_type, billing_data)
    status = processor.process_partial_bills()
    processor.log_summary()
    if task_id and config.enabled("ES_INDEX_JOBS"):
        log.info("Updating billing range in Elasticsearch.")
        index.update_billing_range(task_id, billing_data)

    return status
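# Usage sketch: Example #13 below binds meter and task_id with functools.partial, and
# the scraper supplies the remaining arguments. `meter` and `billing_data` are assumed
# to be in scope; the task id is hypothetical.
import functools as ft

partial_bill_handler = ft.partial(upload_partial_bills, meter, "task-123")
status = partial_bill_handler(billing_data, PartialBillProviderType.TND_ONLY)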
Example #8
def upload_bills(
    meter_oid: int,
    service_id: str,
    task_id: str,
    scraper: str,
    billing_data: BillingData,
) -> Status:
    cur_most_recent = _latest_closing(service_id)

    _, change_records = _upload_bills_to_services(service_id, billing_data)

    if task_id and config.enabled("ES_INDEX_JOBS"):
        log.info("Updating billing range in Elasticsearch.")
        index.update_billing_range(task_id, billing_data)
        if change_records:
            index.index_bill_records(scraper, change_records)
    billing_data = verify_bills(meter_oid, billing_data)

    title = "Final Scraped Summary"
    show_bill_summary(billing_data, title)

    path = os.path.join(config.WORKING_DIRECTORY, "bills.csv")
    end = date(year=1900, month=1, day=1)
    with open(path, "w") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["Service ID", "Start", "End", "Cost", "Used", "Peak"])
        for b in billing_data:
            writer.writerow(
                [service_id, b.start, b.end, b.cost, b.used, b.peak])
            if type(b.end) == datetime:
                end = max(b.end.date(), end)  # type: ignore
            else:
                if b.end > end:
                    end = b.end
    log.info("Wrote bill data to %s." % path)
    if cur_most_recent and (end > cur_most_recent):
        return Status.SUCCEEDED
    return Status.COMPLETED
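# Usage sketch mirroring Example #13's binding: the closing comparison above means the
# run reports SUCCEEDED only when a scraped bill closes later than the most recent
# bill already on record for the service. Identifiers are hypothetical and
# billing_data is assumed to be in scope.
import functools as ft

bill_handler = ft.partial(upload_bills, 12345, "service-abc", "task-123", "demo-scraper")
status = bill_handler(billing_data)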
Example #9
    def get_bills(self, account_id: str, start: date,
                  end: date) -> List[BillingDatum]:
        """Get bills from the table.

        for each row:
          get end from Read date column (date)
          get start date from end date - (Days column (date) - 1)
          get statement date from Bill date column (date)
          if not start - end overlaps passed in start / end, continue
          get peak from On-peak Billed kW (float)
          get used from (Off-peak kWh + Shoulder kWh + On-peak kWh) (float)
          get cost from New charges (float)
          click eye icon to download PDF; wait for download to complete to self.driver.download_dir
        """
        WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located(self.UsageTableBodyLocator))
        usage_table_rows = self.driver.find_elements(
            *self.UsageTableRowsLocator)

        bill_data: List[BillingDatum] = []
        self.driver.screenshot(BaseWebScraper.screenshot_path("bill table"))
        for row in usage_table_rows:
            cols = row.find_elements_by_tag_name("td")
            cols = [
                c for c in cols
                if "display: none" not in c.get_attribute("style")
            ]

            col = lambda x: cols[x].text
            to_num = lambda x: "".join(d for d in col(x)
                                       if d.isdigit() or d == ".")
            to_float = lambda x: float(to_num(x)) if len(to_num(x)) > 0 else 0

            log.debug(f"statement={col(1)} end={col(2)} days={col(7)}")
            # statement date
            statement_date = date_parser.parse(col(1)).date()

            # bill end
            period_year = statement_date.year
            if statement_date.month == 1 and col(2).startswith("12"):
                period_year = statement_date.year - 1
            end_str = f"{col(2)}/{period_year}"
            bill_end = date_parser.parse(end_str).date()

            # bill start
            bill_start = bill_end - timedelta(days=int(to_float(7)) - 1)
            log.debug(f"start={bill_start} end={bill_end}")

            if not self._overlap(start, end, bill_start, bill_end):
                log.info(
                    f"skipping bill {bill_start} - {bill_end}: does not overlap requested range {start} - {end}"
                )
                continue

            # cost
            new_charges = to_float(8)
            # used
            used = to_float(4) + to_float(5) + to_float(6)
            # peak
            peak = to_float(3)

            bill_datum = BillingDatum(
                start=bill_start,
                end=bill_end,
                statement=statement_date,
                cost=new_charges,
                used=used,
                peak=peak,
                items=None,
                attachments=None,
                utility_code=None,
            )

            try:
                bill_pdf_name = "SRPbill{}{}.pdf".format(
                    statement_date.strftime("%B"), statement_date.year)
                pdf_download_link = cols[0].find_element_by_tag_name("a")
                scroll_to(self.driver, pdf_download_link)
                pdf_download_link.click()
                log.info("looking for %s in %s", bill_pdf_name,
                         self.driver.download_dir)
                self.driver.wait(60).until(
                    file_exists_in_dir(self.driver.download_dir,
                                       bill_pdf_name))
            except Exception as e:
                raise Exception(
                    f"Failed to download bill {bill_pdf_name} for statement date {statement_date}:\n {e}"
                )
            log.info(
                f"Bill {bill_pdf_name} for statement date {statement_date} downloaded successfully"
            )

            attachment_entry = None
            # open downloaded PDF and upload
            if config.enabled("S3_BILL_UPLOAD"):
                key = hash_bill_datum(account_id, bill_datum)
                with open(f"{self.driver.download_dir}/{bill_pdf_name}",
                          "rb") as pdf_data:
                    attachment_entry = upload_bill_to_s3(
                        BytesIO(pdf_data.read()),
                        key,
                        source="myaccount.srpnet.com",
                        statement=bill_datum.statement,
                        utility="utility:salt-river-project",
                        utility_account_id=account_id,
                    )
            if attachment_entry:
                bill_data.append(
                    bill_datum._replace(attachments=[attachment_entry]))
            else:
                bill_data.append(bill_datum)
        return bill_data
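# Worked sketch of the column helpers above: to_num keeps only digits and dots, so a
# cell such as "1,234.5 kWh" parses to 1234.5, and a cell with no digits falls back
# to 0.
cell = "1,234.5 kWh"
num = "".join(d for d in cell if d.isdigit() or d == ".")
value = float(num) if len(num) > 0 else 0  # -> 1234.5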
Example #10
def attach_bill_pdfs(
    meter_oid: int,
    task_id: str,
    meter_only: bool,
    pdfs: List[BillPdf],
) -> Status:
    """Attach a list of bill PDF files uploaded to S3 to bill records."""
    if not pdfs:
        return Status.COMPLETED

    count = 0
    unused = []
    for pdf in pdfs:
        log.info(
            "bill PDF for utility_account_id=%s statement=%s",
            pdf.utility_account_id,
            pdf.statement,
        )

        if meter_only:
            # Attach PDFs to bills on this meter's service only.  Matches up PDF start dates with PartialBill/
            # Bill start dates with a small buffer.
            bill_query = db.session.query(Bill).filter(
                Bill.service == Meter.service,
                Meter.oid == meter_oid,
                Bill.initial >= pdf.start - timedelta(days=1),
                Bill.initial <= pdf.start + timedelta(days=1),
            )
            partial_bill_query = db.session.query(PartialBill).filter(
                PartialBill.service == Meter.service,
                Meter.oid == meter_oid,
                PartialBill.initial >= pdf.start - timedelta(days=1),
                PartialBill.initial <= pdf.start + timedelta(days=1),
                PartialBill.superseded_by.is_(None),
                PartialBill.visible.is_(True),
                PartialBill.provider_type ==
                PartialBillProviderType.TND_ONLY.value,
            )
            if not bill_query.count() and not partial_bill_query.count():
                log.warning(
                    "no bills found for utility_account_id %s %s-%s",
                    pdf.utility_account_id,
                    pdf.start,
                    pdf.end,
                )
            bill_attach_status = add_attachment_to_bills(pdf, bill_query)
            partial_attach_status = add_attachment_to_bills(
                pdf, partial_bill_query)
            if (AttachStatus.best([bill_attach_status, partial_attach_status
                                   ]) == AttachStatus.ATTACHED):
                # Only increase count if attachments were updated.
                # Not adding any PDF's to "unused" because attachment could be in use on another meter.
                count += 1

        else:
            # Attach PDF's to potentially multiple bills on multiple services with the same utility account id.
            # Attach PDFs to bills on account that ended recently before the statement date:
            query = (db.session.query(Bill).filter(
                UtilityService.utility_account_id == pdf.utility_account_id
            ).filter(UtilityService.oid == Bill.service).filter(
                Bill.closing > pdf.statement - timedelta(days=14)).filter(
                    Bill.closing <= pdf.statement))
            bill_count = query.count()
            if not bill_count:
                log.warning(
                    "no bills found for utility_account_id %s %s-%s",
                    pdf.utility_account_id,
                    pdf.start,
                    pdf.end,
                )
            attached = add_attachment_to_bills(pdf, query)
            if attached == AttachStatus.ATTACHED:
                count += 1
            elif attached == AttachStatus.NOT_ATTACHED:
                unused.append(pdf.s3_key)
    log.info("attached %s/%s pdfs", count, len(pdfs))
    for key in unused:
        remove_file_from_s3(config.BILL_PDF_S3_BUCKET, key)
    if task_id and config.enabled("ES_INDEX_JOBS"):
        log.info("Updating billing range in Elasticsearch.")
        index.update_bill_pdf_range(task_id, meter_oid, pdfs)

    if count:
        return Status.SUCCEEDED
    return Status.COMPLETED
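# Worked sketch of the meter_only matching window above: a PDF is attached to bills
# whose initial date falls within one day of the PDF's start date. Dates are
# hypothetical.
from datetime import date, timedelta

pdf_start = date(2021, 1, 1)
lo, hi = pdf_start - timedelta(days=1), pdf_start + timedelta(days=1)
# matches bills with Bill.initial between 2020-12-31 and 2021-01-02, inclusive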
Example #11
def extract_bill_data(pdf_filename, service_id, utility,
                      utility_account_id) -> Optional[BillingDatum]:
    # this function should upload the file to s3 to set attachments?
    try:
        text = pdf_to_str(pdf_filename)
    except PDFSyntaxError:
        log.exception("Downloaded bill file failed to parse as a PDF.")
        return None

    current_charges_pattern = "Current Charges(.*?)Cycle"
    for line in (re.search(current_charges_pattern, text,
                           re.DOTALL).group(1).split("\n")):
        # get the last number
        if re.match(r"[\d,\.]", line.strip()):
            current_charges = line.strip().replace(",", "")

    period_start, period_end = extract_bill_period(pdf_filename)

    usage_pattern = r"Energy Charges \((\d*) kWh\)"
    usage = re.search(usage_pattern, text).groups()[0]

    on_peak_demand_pattern = r"On-Peak Demand \((\d+\.\d+)\ KW"
    on_peak_demand = re.search(on_peak_demand_pattern, text).groups()[0]

    offpeak_demand_pattern = r"Off-Peak Demand \((\d+\.\d+)\ KW"
    offpeak_demand = re.search(offpeak_demand_pattern, text).groups()[0]

    bill_attachment = []
    if config.enabled("S3_BILL_UPLOAD"):
        log.info("S3_BILL_UPLOAD is enabled")
        with open(pdf_filename, "rb") as f:
            key = hash_bill(
                service_id,
                period_start,
                period_end,
                _format_number(current_charges),
                0,
                _format_number(usage),
            )
            # no statement date; use end date
            bill_attachment.append(
                upload_bill_to_s3(
                    f,
                    key,
                    source="portlandgeneral.com",
                    statement=period_end,
                    utility=utility,
                    utility_account_id=utility_account_id,
                ))
            log.info("Uploaded bill %s to s3", bill_attachment)

    bill = BillingDatum(
        start=period_start,
        end=period_end,
        statement=period_end,
        cost=_format_number(current_charges),
        used=_format_number(usage),
        peak=max(
            float(on_peak_demand),
            float(offpeak_demand),
        ),
        items=[],
        attachments=bill_attachment,
        utility_code=None,
    )

    return bill
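# Worked sketch of the usage regex above against a hypothetical fragment of extracted
# bill text:
import re

sample = "Energy Charges (1200 kWh)"
usage = re.search(r"Energy Charges \((\d*) kWh\)", sample).groups()[0]  # -> "1200"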
Example #12
def parse_poway_pdf(pdf_filename: str, account_id: str) -> BillingDatum:
    text = pdfparser.pdf_to_str(pdf_filename)

    used_pattern = r"Consumption (?P<units_used>[\d\.,]+) @"
    cost_pattern = r"(?P<water_charges>[\d\.,]+)\s+WATERBasic Service @"

    # date format: m/d/yyyy
    date_pattern = r"\d{1,2}\/\d{1,2}\/\d{4}"
    dates_pattern = (
        r"Total Current Charges.+?"
        fr"(?P<read_date_start>{date_pattern}) - (?P<read_date_end>{date_pattern})"
        fr"(?P<due_date>{date_pattern})"
        fr"(?P<statement_date>{date_pattern})")

    dates_match = re.search(dates_pattern, text)
    if not dates_match:
        raise InvalidMeterDataException(
            f"Couldn't parse dates from pdf: {text}")

    _dates = dates_match.group("read_date_start", "read_date_end",
                               "statement_date")
    start_date, end_date, statement_date = [
        parse_date(_date).date() for _date in _dates
    ]

    used_match = re.search(used_pattern, text)
    if not used_match:
        raise InvalidMeterDataException(
            f"Couldn't parse usage from pdf: {text}")

    used_text = used_match.group("units_used")
    used = float(used_text.replace(",", "").replace("$", ""))

    cost_match = re.search(cost_pattern, text)
    if not cost_match:
        raise InvalidMeterDataException(
            f"Couldn't parse cost from pdf: {text}")

    cost_text = cost_match.group("water_charges")
    cost = float(cost_text.replace(",", "").replace("$", ""))

    if config.enabled("S3_BILL_UPLOAD"):
        key = hash_bill(account_id, start_date, end_date, cost, 0, used)
        with open(pdf_filename, "rb") as pdf_data:
            attachments = [
                upload_bill_to_s3(
                    BytesIO(pdf_data.read()),
                    key,
                    source="customerconnect.poway.org",
                    statement=statement_date,
                    utility="utility:city-of-poway",
                    utility_account_id=account_id,
                )
            ]
    else:
        attachments = []
    return BillingDatum(
        start=start_date,
        end=end_date - timedelta(days=1),
        statement=statement_date,
        cost=cost,
        peak=None,
        used=used,
        items=None,
        attachments=attachments,
        utility_code=None,
    )
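# Worked sketch of the dates regex above on a hypothetical run of extracted PDF text,
# where the due and statement dates run together with no separator:
import re

date_pattern = r"\d{1,2}\/\d{1,2}\/\d{4}"
sample = "Total Current Charges xx 1/1/2021 - 1/31/20212/15/20212/5/2021"
m = re.search(
    r"Total Current Charges.+?"
    fr"(?P<read_date_start>{date_pattern}) - (?P<read_date_end>{date_pattern})"
    fr"(?P<due_date>{date_pattern})"
    fr"(?P<statement_date>{date_pattern})",
    sample,
)
# m.group("read_date_start", "read_date_end", "statement_date")
# -> ("1/1/2021", "1/31/2021", "2/5/2021")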
Example #13
def run_datafeed(
    scraper_class,
    account: SnapmeterAccount,
    meter: Meter,
    datasource: MeterDataSource,
    params: dict,
    configuration=None,
    task_id=None,
    transforms: Optional[List[Transforms]] = None,
    disable_login_on_error: Optional[bool] = False,
    notify_on_login_error: Optional[bool] = True,
    meter_only: Optional[bool] = False,
) -> Status:
    transforms = [] if transforms is None else transforms
    bill_handler = ft.partial(
        upload_bills,
        meter.oid,
        meter.utility_service.service_id,
        task_id,
        datasource.name,
    )
    readings_handler = ft.partial(upload_readings, transforms, meter.oid,
                                  datasource.name, task_id)
    pdfs_handler = ft.partial(attach_bill_pdfs, meter.oid, task_id, meter_only)
    partial_bill_handler = ft.partial(upload_partial_bills, meter, task_id)

    date_range = DateRange(
        *iso_to_dates(params.get("data_start"), params.get("data_end")))

    parent: AccountDataSource = None
    if datasource.account_data_source:
        parent = datasource.account_data_source
        credentials = Credentials(parent.username, parent.password)
        if not datasource.account_data_source.enabled:
            raise DataSourceConfigurationError(
                "%s scraper for %s is disabled" %
                (datasource.account_data_source.name, meter.oid))
    else:
        credentials = Credentials(None, None)

    if task_id and config.enabled("ES_INDEX_JOBS"):
        log.info("Uploading task information to Elasticsearch.")
        doc = index.starter_doc(meter.oid, datasource)
        doc["start_date"] = date_range.start_date
        doc["end_date"] = date_range.end_date
        doc["meter_data_source"] = datasource.oid
        if configuration:
            doc.update({
                "billScraper":
                configuration.scrape_bills
                or configuration.scrape_partial_bills,
                "intervalScraper":
                configuration.scrape_readings,
            })
        index.index_etl_run(task_id, doc)

    index_doc: Dict[str, str] = {}
    # create a non-persisted copy
    utility_service = UtilityService.copy_from(meter.utility_service)
    try:
        with scraper_class(credentials, date_range, configuration) as scraper:
            scraper.utility_service = utility_service
            scraper_status = scraper.scrape(
                readings_handler=readings_handler,
                bills_handler=bill_handler,
                pdfs_handler=pdfs_handler,
                partial_bills_handler=partial_bill_handler,
            )
            if scraper_status == Status.SUCCEEDED:
                # Avoid muddying Elasticsearch results
                index_doc = {"status": "SUCCESS"}
            else:
                index_doc = {"status": scraper_status.name}
            if scraper_status in [Status.SUCCEEDED, Status.COMPLETED]:
                retval = Status.SUCCEEDED
            else:
                retval = Status.FAILED
            # sce-metascraper needs to be able to get the completed status back
            if scraper.metascraper:
                retval = scraper_status

    except Exception as exc:
        log.exception("Scraper run failed.")
        retval = Status.FAILED
        index_doc = {
            "status": "FAILED",
            "error": repr(exc),
            "exception": type(exc).__name__,
        }
        # disable the login if scraping threw a LoginError, caller requested disabling on error,
        # and meter data source has a parent account data source
        if isinstance(exc, LoginError) and disable_login_on_error and parent:
            parent.enabled = False
            db.session.add(parent)
            log.warning("disabling %s login %s", parent.source_account_type,
                        parent.oid)
            if notify_on_login_error:
                alert.disable_logins(parent)

    index_doc.update(
        update_utility_service(meter.utility_service, utility_service))
    if task_id and config.enabled("ES_INDEX_JOBS"):
        log.info("Uploading final task status to Elasticsearch.")
        index.index_etl_run(task_id, index_doc)

    return retval
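# Usage sketch (assumed context): a concrete scraper module would typically wrap
# run_datafeed with its own scraper class and configuration. MyScraper and
# MyScraperConfiguration are hypothetical stand-ins.
def datafeed(account, meter, datasource, params, task_id=None):
    return run_datafeed(
        MyScraper,
        account,
        meter,
        datasource,
        params,
        configuration=MyScraperConfiguration(meter=meter),
        task_id=task_id,
    )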
Example #14
    def setUp(self) -> None:
        self.upload_enabled_before = project_config.enabled("S3_BILL_UPLOAD")
        project_config.FEATURE_FLAGS.add("S3_BILL_UPLOAD")
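    # A matching tearDown sketch: restore the flag to its recorded state so other
    # tests are unaffected (assumes FEATURE_FLAGS is a set, as setUp above implies).
    def tearDown(self) -> None:
        if not self.upload_enabled_before:
            project_config.FEATURE_FLAGS.discard("S3_BILL_UPLOAD")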
Example #15
def _launch_meter_datasource(mds: MeterDataSource, start: date, end: date):
    if mds is None:
        log.error("No data source. Aborting.")
        sys.exit(1)

    account = None
    if mds.account_data_source is not None:
        ads = mds.account_data_source
        account = ads.account

    meter = mds.meter

    scraper_fn = scraper_functions.get(mds.name)

    if scraper_fn is None:
        log.error(
            'No scraping procedure associated with the identifier "%s". Aborting',
            mds.name,
        )
        sys.exit(1)

    parameters = {
        "data_start": start.strftime("%Y-%m-%d"),
        "data_end": end.strftime("%Y-%m-%d"),
    }

    task_id = os.environ.get("AWS_BATCH_JOB_ID", str(uuid.uuid4()))

    log.info("Scraper Launch Settings:")
    log.info("Enabled Features: %s", config.FEATURE_FLAGS)
    log.info("Meter Data Source OID: %s", mds.oid)
    log.info("Meter: %s (%s)", meter.name, meter.oid)
    log.info("Scraper: %s", mds.name)
    log.info("Date Range: %s - %s", start, end)
    log.info("Task ID: %s", task_id)
    log.info(
        "Elasticsearch Host/Port: %s : %s",
        config.ELASTICSEARCH_HOST,
        config.ELASTICSEARCH_PORT,
    )
    log.debug(
        "Elasticsearch Credentials: %s : %s",
        config.ELASTICSEARCH_USER,
        config.ELASTICSEARCH_PASSWORD,
    )
    log.info("Platform Host/Port: %s : %s", config.PLATFORM_HOST,
             config.PLATFORM_PORT)

    cleanup_workdir()
    try:
        status = scraper_fn(account, meter, mds, parameters,
                            task_id=task_id)  # type: ignore[operator] # noqa

        if config.enabled("S3_ARTIFACT_UPLOAD"):
            archive_run(task_id)
        if config.enabled("ES_INDEX_LOGS"):
            index_logs(task_id)
    except:  # noqa: E722
        log.exception(
            "The scraper run has failed due to an unhandled exception.")
        status = Status.FAILED

    db.session.commit()
    db.session.close()
    sys.exit(status.value)
Example #16
    def _execute(self):
        if self.end_date - self.start_date < timedelta(days=MINIMUM_BILL_DAYS):
            log.info(
                f"Expanding date range to a minimum of {MINIMUM_BILL_DAYS} days."
            )
            self.start_date = self.end_date - timedelta(days=MINIMUM_BILL_DAYS)

        start_date = max(self.start_date,
                         (datetime.now() - relativedelta(years=10)).date())
        end_date = min(self.end_date, (datetime.now().date()))

        log.info("Final date range to search: %s - %s" %
                 (start_date, end_date))

        login_page = LoginPage(self._driver)
        home_page = login_page.login(self.username, self.password)
        self.screenshot("home_screen")
        log.info("Login successful.")

        bill_history_page = home_page.to_bill_history()
        self.screenshot("bill_history_page")
        log.info("Loaded bill history.")

        bill_history_page.select_account(self.account_number)
        self.screenshot("account_selected")
        log.info("Selected account.")

        bill_history_page.set_dates(start_date, end_date)
        self.screenshot("dates_selected")
        log.info("Selected dates.")

        raw_pdfs = bill_history_page.gather_data()

        log.info("PDF bills captured: %s" % len(raw_pdfs))
        log.info("Net bill pdf bytes captured: %s" %
                 (sum(len(x) for x in raw_pdfs)))

        ii = 0
        bill_data = []
        for b in raw_pdfs:
            ii += 1
            bill_datum = parse_bill_pdf(BytesIO(b), self.meter_number)

            if bill_datum is None:
                log.info("There was a problem parsing a bill PDF #%d." % ii)
                continue

            attachment_entry = None
            if config.enabled("S3_BILL_UPLOAD"):
                key = bill_upload.hash_bill_datum(self.meter_number,
                                                  bill_datum)
                attachment_entry = bill_upload.upload_bill_to_s3(
                    BytesIO(b),
                    key,
                    source="pacificpower.net",
                    statement=bill_datum.statement,
                    utility=self.utility,
                    utility_account_id=self.account_number,
                )

            if attachment_entry:
                bill_data.append(
                    bill_datum._replace(attachments=[attachment_entry]))
            else:
                bill_data.append(bill_datum)

        final_bills = adjust_bill_dates(bill_data)
        show_bill_summary(final_bills, "Final Bill Summary")
        return Results(bills=final_bills)
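# Note on the attachment step above: BillingDatum behaves like a NamedTuple, so
# _replace returns a copy with only the attachments field changed. Minimal sketch
# with a hypothetical stand-in type:
from typing import List, NamedTuple, Optional

class Datum(NamedTuple):
    cost: float
    attachments: Optional[List[str]]

d = Datum(cost=10.0, attachments=None)
d2 = d._replace(attachments=["entry"])  # d itself is unchanged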