Python pdf_to_str Examples, datafeeds.parsers.pdfparser.pdf_to_str Python Examples

Example #1

0

Show file

File: test_pdfparser.py Project: gnoose/datafeeds-shared

 def test_pdf_to_str(self):
     """Check that pdf_to_str turns the fixture bill into a long string of its text."""
     fixture_path = os.path.join(
         TEST_DIR, "fixtures", "test_portland_bizportal_bill.pdf")
     output = pdfparser.pdf_to_str(fixture_path)
     # assertIsInstance is the idiomatic type check and reports a clearer
     # failure message than comparing type objects for equality.
     self.assertIsInstance(output, str)
     # The string for this pdf should be quite long.
     self.assertGreater(len(output), 100)
     # Some text from this PDF
     self.assertIn("Service Period", output)

Example #2

0

Show file

 def test_parse_old_pdf3(self):
     """Parse a third old-format bill: dates, cost, and use are read; peak is None."""
     parsed = pdf_parser.parse_old_pdf(
         pdfparser.pdf_to_str("private_fixtures/duke_bill_old_3.pdf"))
     # Billing period.
     self.assertEqual(date(2020, 3, 20), parsed.start)
     self.assertEqual(date(2020, 4, 20), parsed.end)
     # Amounts are compared to two decimal places.
     self.assertAlmostEqual(29.89, parsed.cost, places=2)
     self.assertAlmostEqual(155, parsed.used, places=2)
     self.assertIsNone(parsed.peak)

Example #3

0

Show file

 def test_parse_old_pdf4(self):
     """Parse an old-format bill whose period ends on Dec 30; peak is None."""
     parsed = pdf_parser.parse_old_pdf(
         pdfparser.pdf_to_str("private_fixtures/duke_bill_old_4.pdf"))
     # Billing period.
     self.assertEqual(date(2019, 11, 27), parsed.start)
     self.assertEqual(date(2019, 12, 30), parsed.end)
     # Amounts are compared to two decimal places.
     self.assertAlmostEqual(19.7, parsed.cost, places=2)
     self.assertAlmostEqual(156, parsed.used, places=2)
     self.assertIsNone(parsed.peak)

Example #4

0

Show file

 def test_parse_old_pdf2(self):
     """Parse a second old-format bill and check its dates, cost, use, and peak."""
     parsed = pdf_parser.parse_old_pdf(
         pdfparser.pdf_to_str("private_fixtures/duke_bill_old_2.pdf"))
     # Billing period.
     self.assertEqual(date(2018, 10, 29), parsed.start)
     self.assertEqual(date(2018, 12, 3), parsed.end)
     # Amounts are compared to two decimal places.
     self.assertAlmostEqual(685.16, parsed.cost, places=2)
     self.assertAlmostEqual(5581.0, parsed.used, places=2)
     self.assertAlmostEqual(30.0, parsed.peak, places=2)

Example #5

0

Show file

 def test_parse_new_pdf3(self):
     """Parse a third new-format bill and check its dates, cost, use, and peak."""
     parsed = pdf_parser.parse_new_pdf(
         pdfparser.pdf_to_str("private_fixtures/duke_bill_new_3.pdf"))
     # Billing period.
     self.assertEqual(date(2020, 4, 21), parsed.start)
     self.assertEqual(date(2020, 5, 23), parsed.end)
     # Cost and use are compared to two decimal places.
     self.assertAlmostEqual(3048.76, parsed.cost, places=2)
     self.assertAlmostEqual(38320, parsed.used, places=2)
     # No precision is given here, so unittest's default (7 places) applies.
     self.assertAlmostEqual(86, parsed.peak)

Example #6

0

Show file

 def test_parse_new_pdf1(self):
     """Parse the June 2020+ bill layout and check its dates, cost, use, and peak."""
     parsed = pdf_parser.parse_new_pdf(
         pdfparser.pdf_to_str("private_fixtures/duke_bill_new_1.pdf"))
     # Billing period.
     self.assertEqual(date(2020, 5, 10), parsed.start)
     self.assertEqual(date(2020, 6, 9), parsed.end)
     # Amounts are compared to two decimal places.
     self.assertAlmostEqual(123113.66, parsed.cost, places=2)
     self.assertAlmostEqual(1806000.0, parsed.used, places=2)
     self.assertAlmostEqual(3840.0, parsed.peak, places=2)

Example #7

0

Show file

def process_pdf(
    utility: str,
    utility_account_id: str,
    service_id: str,
    statement_dt: date,
    pdf_filename: str,
) -> BillingDatum:
    """Turn a bill PDF into a BillingDatum and upload the PDF as its attachment.

    Cost, use, demand, the billing period, and line items are pulled from the
    PDF text by the ``extract_*`` helpers. The period's years are then
    reconciled against ``statement_dt`` and the end date is moved back one
    day before the bill is hashed, uploaded, and returned.

    Args:
        utility: Utility identifier forwarded to the upload.
        utility_account_id: Account identifier forwarded to the upload.
        service_id: Service identifier used when hashing the bill.
        statement_dt: Statement date; recorded on the result and used to
            correct the year of the parsed period.
        pdf_filename: Path of the bill PDF on disk.

    Returns:
        The BillingDatum for the bill, with the uploaded PDF attached.
    """
    log.info("Parsing text from PDF %s", pdf_filename)
    text = pdfparser.pdf_to_str(pdf_filename)

    cost = extract_cost(text)
    used = extract_used(text)
    demand = extract_demand(text)
    start_date, end_date = extract_dates(text)

    # A period that starts after the statement date carries the wrong year
    # (e.g. start_date = 12/1, statement_dt = 12/15/2020): pin both ends of
    # the period to the statement's year.
    if start_date > statement_dt:
        start_date = start_date.replace(year=statement_dt.year)
        end_date = end_date.replace(year=statement_dt.year)
    # The period must not end before it starts (e.g. end_date = 1/5,
    # start_date = 12/1): the end belongs to the following year.
    if end_date < start_date:
        end_date = end_date.replace(year=end_date.year + 1)

    # adjust end date because SVP bills overlap on start/end dates
    end_date -= timedelta(days=1)

    line_items: List[BillingDatumItemsEntry] = extract_line_items(text)
    bill_key = hash_bill(service_id, start_date, end_date, cost, demand, used)

    # NOTE(review): the upload is tagged with statement=end_date while the
    # returned record uses statement_dt — confirm this difference is intended.
    with open(pdf_filename, "rb") as pdf_data:
        pdf_bytes = BytesIO(pdf_data.read())
        attachment = upload_bill_to_s3(
            pdf_bytes,
            bill_key,
            source="mua.santaclaraca.gov",
            statement=end_date,
            utility=utility,
            utility_account_id=utility_account_id,
        )

    return BillingDatum(
        start=start_date,
        end=end_date,
        statement=statement_dt,
        cost=cost,
        used=used,
        peak=demand,
        items=line_items,
        attachments=[attachment],
        utility_code=None,
    )

Example #8

0

Show file

    def test_parse_old_pdf1(self):
        """Parse the pre-June-2020 bill layout and check dates, cost, use, and peak.

        The fixture bill carries a prior-balance-due line item, which must not
        be counted in the bill total.
        """
        parsed = pdf_parser.parse_old_pdf(
            pdfparser.pdf_to_str("private_fixtures/duke_bill_old_1.pdf"))
        # Billing period.
        self.assertEqual(date(2020, 4, 10), parsed.start)
        self.assertEqual(date(2020, 5, 9), parsed.end)
        # Amounts are compared to two decimal places.
        self.assertAlmostEqual(89972.41, parsed.cost, places=2)
        self.assertAlmostEqual(1312000.0, parsed.used, places=2)
        self.assertAlmostEqual(3000.0, parsed.peak, places=2)

Example #9

0

Show file

File: portland_bizportal.py Project: gnoose/datafeeds-shared

def extract_bill_period(pdf_filename):
    """Read the service period printed on a bill PDF.

    The PDF is converted to text and searched for a "Service Period" heading
    followed by two dates on consecutive lines.

    Returns:
        An (earlier, later) tuple of dates, or (None, None) when the file is
        not a valid PDF or the service period cannot be found.
    """
    try:
        text = pdf_to_str(pdf_filename)
    except PDFSyntaxError:
        log.exception("Downloaded bill file failed to parse as a PDF.")
        return None, None

    match = re.search(r"Service Period\n(\d+/\d+/\d+)\n(\d+/\d+/\d+)", text)
    if not match:
        return None, None

    # The two dates are ordered here rather than trusting their order in the PDF.
    first, second = (parse_time(raw).date() for raw in match.groups())
    return min(first, second), max(first, second)

Example #10

0

Show file

File: pdf_parser.py Project: gnoose/datafeeds-shared

def parse_pdf(pdf_filename: str, utility: str,
              utility_account_id: str) -> BillingDatum:
    """Parse a bill PDF in either layout and attach the uploaded PDF to the result.

    Text containing "Your Energy Bill" is handled by ``parse_new_pdf``;
    anything else goes to ``parse_old_pdf``. The PDF is then uploaded and the
    parsed record is returned with that upload as its only attachment.

    Args:
        pdf_filename: Path of the bill PDF on disk.
        utility: Utility identifier forwarded to the upload.
        utility_account_id: Account identifier, used for both the bill hash
            and the upload.
    """
    text = pdfparser.pdf_to_str(pdf_filename)

    if "Your Energy Bill" not in text:
        log.info("parsing old-style PDF %s", pdf_filename)
        data = parse_old_pdf(text)
    else:
        log.info("parsing new-style PDF %s", pdf_filename)
        data = parse_new_pdf(text)

    bill_key = hash_bill(
        utility_account_id,
        data.start,
        data.end,
        data.cost,
        data.peak,
        data.used,
    )
    with open(pdf_filename, "rb") as pdf_data:
        pdf_bytes = BytesIO(pdf_data.read())
        attachment = upload_bill_to_s3(
            pdf_bytes,
            bill_key,
            source="www.duke-energy.com",
            statement=data.end,
            utility=utility,
            utility_account_id=utility_account_id,
        )

    return data._replace(attachments=[attachment])

Example #11

0

Show file

File: test_svp_parser.py Project: gnoose/datafeeds-shared

 def test_additional_line_items(self):
     # The first line item extracted from this fixture should total 46.34.
     line_items = pdf_parser.extract_line_items(
         pdfparser.pdf_to_str("private_fixtures/svp_bill_3.pdf"))
     first_item = line_items[0]
     self.assertEqual(first_item.total, 46.34)

Example #12

0

Show file

File: test_svp_parser.py Project: gnoose/datafeeds-shared

 def test_no_power_factor_charge(self):
     """The parser handles a bill without a Power Factor Charge line"""
     text = pdfparser.pdf_to_str("private_fixtures/svp_bill_2.pdf")
     data = pdf_parser.extract_line_items(text)
     # Element [3] of the fourth line item must be None for this bill.
     # assertIsNone checks identity, so a value that merely compares equal
     # to None cannot slip through, and the failure message is clearer.
     self.assertIsNone(data[3][3])

Example #13

0

Show file

File: portland_bizportal.py Project: gnoose/datafeeds-shared

def extract_bill_data(pdf_filename, service_id, utility,
                      utility_account_id) -> Optional[BillingDatum]:
    """Parse a portlandgeneral.com bill PDF into a BillingDatum.

    The PDF text is searched for the current charges, the energy use (kWh),
    and the on-peak and off-peak demand (KW); the larger demand value is
    reported as the peak. When the ``S3_BILL_UPLOAD`` feature is enabled, the
    PDF is also uploaded and attached to the result.

    Args:
        pdf_filename: Path of the bill PDF on disk.
        service_id: Service identifier used when hashing the bill.
        utility: Utility identifier forwarded to the upload.
        utility_account_id: Account identifier forwarded to the upload.

    Returns:
        The parsed BillingDatum, or None when the file is not a valid PDF or
        one of the expected fields cannot be found in its text.
    """
    # this function should upload the file to s3 to set attachments?
    try:
        text = pdf_to_str(pdf_filename)
    except PDFSyntaxError:
        log.exception("Downloaded bill file failed to parse as a PDF.")
        return None

    # re.search returns None when a pattern is absent. Run every lookup up
    # front so a bill with unexpected text is reported once, instead of
    # failing with AttributeError on the first missing field.
    matches = {
        "current charges": re.search(
            "Current Charges(.*?)Cycle", text, re.DOTALL),
        "usage": re.search(r"Energy Charges \((\d*) kWh\)", text),
        "on-peak demand": re.search(
            r"On-Peak Demand \((\d+\.\d+)\ KW", text),
        "off-peak demand": re.search(
            r"Off-Peak Demand \((\d+\.\d+)\ KW", text),
    }
    missing = [label for label, found in matches.items() if found is None]
    if missing:
        log.error("Bill %s is missing expected field(s): %s", pdf_filename,
                  ", ".join(missing))
        return None

    # The charges section spans several lines; keep the last line that
    # starts with a digit, comma, or period.
    current_charges = None
    for line in matches["current charges"].group(1).split("\n"):
        candidate = line.strip()
        if re.match(r"[\d,\.]", candidate):
            current_charges = candidate.replace(",", "")
    if current_charges is None:
        # Previously this case surfaced later as an UnboundLocalError.
        log.error("Bill %s has no amount in its current charges section.",
                  pdf_filename)
        return None

    # NOTE(review): extract_bill_period converts the PDF to text a second
    # time and may return (None, None); those values are passed on unchanged.
    period_start, period_end = extract_bill_period(pdf_filename)

    usage = matches["usage"].group(1)
    on_peak_demand = matches["on-peak demand"].group(1)
    offpeak_demand = matches["off-peak demand"].group(1)

    # Format each value once and reuse it for both the hash and the record.
    cost = _format_number(current_charges)
    used = _format_number(usage)

    bill_attachment = []
    if config.enabled("S3_BILL_UPLOAD"):
        log.info("S3_BILL_UPLOAD is enabled")
        with open(pdf_filename, "rb") as f:
            key = hash_bill(
                service_id,
                period_start,
                period_end,
                cost,
                0,
                used,
            )
            # no statement date; use end date
            bill_attachment.append(
                upload_bill_to_s3(
                    f,
                    key,
                    source="portlandgeneral.com",
                    statement=period_end,
                    utility=utility,
                    utility_account_id=utility_account_id,
                ))
            log.info("Uploaded bill %s to s3", bill_attachment)

    return BillingDatum(
        start=period_start,
        end=period_end,
        statement=period_end,
        cost=cost,
        used=used,
        peak=max(
            float(on_peak_demand),
            float(offpeak_demand),
        ),
        items=[],
        attachments=bill_attachment,
        utility_code=None,
    )

Example #14

0

Show file

def parse_poway_pdf(pdf_filename: str, account_id: str) -> BillingDatum:
    """Parse a City of Poway water bill PDF into a BillingDatum.

    Usage, water charges, the read-date range, and the statement date are
    pulled from the PDF text with regular expressions. When the
    ``S3_BILL_UPLOAD`` feature is enabled, the PDF is also uploaded and
    attached to the result.

    Args:
        pdf_filename: Path of the bill PDF on disk.
        account_id: Account identifier used for the bill hash and the upload.

    Returns:
        A BillingDatum whose end date is one day before the parsed read end
        date. The bill hash uses the unadjusted end date.

    Raises:
        InvalidMeterDataException: If the dates, usage, or cost cannot be
            found in the PDF text.
    """
    text = pdfparser.pdf_to_str(pdf_filename)

    used_pattern = r"Consumption (?P<units_used>[\d\.,]+) @"
    cost_pattern = r"(?P<water_charges>[\d\.,]+)\s+WATERBasic Service @"

    # date format: m/d/yyyy
    date_pattern = r"\d{1,2}\/\d{1,2}\/\d{4}"
    # The due date and statement date follow the read range with no
    # separator between them in the extracted text.
    dates_pattern = (
        r"Total Current Charges.+?"
        fr"(?P<read_date_start>{date_pattern}) - (?P<read_date_end>{date_pattern})"
        fr"(?P<due_date>{date_pattern})"
        fr"(?P<statement_date>{date_pattern})")

    dates_match = re.search(dates_pattern, text)
    if not dates_match:
        raise InvalidMeterDataException(
            f"Couldn't parse dates from pdf: {text}")

    _dates = dates_match.group("read_date_start", "read_date_end",
                               "statement_date")
    start_date, end_date, statement_date = [
        parse_date(_date).date() for _date in _dates
    ]

    used_match = re.search(used_pattern, text)
    if not used_match:
        # The f prefix belongs outside the quotes; inside them the message
        # started with a stray "f" and {text} was never interpolated.
        raise InvalidMeterDataException(
            f"Couldn't parse usage from pdf: {text}")

    used_text = used_match.group("units_used")
    used = float(used_text.replace(",", "").replace("$", ""))

    cost_match = re.search(cost_pattern, text)
    if not cost_match:
        raise InvalidMeterDataException(
            f"Couldn't parse cost from pdf: {text}")

    cost_text = cost_match.group("water_charges")
    cost = float(cost_text.replace(",", "").replace("$", ""))

    if config.enabled("S3_BILL_UPLOAD"):
        key = hash_bill(account_id, start_date, end_date, cost, 0, used)
        with open(pdf_filename, "rb") as pdf_data:
            attachments = [
                upload_bill_to_s3(
                    BytesIO(pdf_data.read()),
                    key,
                    source="customerconnect.poway.org",
                    statement=statement_date,
                    utility="utility:city-of-poway",
                    utility_account_id=account_id,
                )
            ]
    else:
        attachments = []
    return BillingDatum(
        start=start_date,
        end=end_date - timedelta(days=1),
        statement=statement_date,
        cost=cost,
        peak=None,
        used=used,
        items=None,
        attachments=attachments,
        utility_code=None,
    )