Esempio n. 1
0
def scrape(supplier):
    """Scrape the energizect.com supplier-comparison page and save its HTML.

    Writes the prettified page source to ./data/<supplier>.html and emails
    an alert if the new page differs too much from the previous scrape.

    Parameters
    ----------
    supplier : str
        Utility territory key; "ui" selects the UI radio button,
        anything else leaves the default territory selected.
    """
    # Read the previous scrape first so we can diff against it at the end.
    with open("./data/" + supplier + ".html") as prev:
        oldHTML = prev.read()

    # driver = webdriver.Chrome()
    driver = webdriver.Chrome(
        r'C:/Program Files/Chromedriver/chromedriver.exe',
        chrome_options=options)
    driver.get(
        "https://www.energizect.com/compare-energy-suppliers")  # get the page

    if supplier == "ui":
        driver.find_element_by_id("radioTwo").click()
    # Submit the comparison form.
    driver.find_element_by_class_name("supplier_form_submit").click()

    # TEMPORARY: dismiss the popup about standard prices, if present.
    try:
        WebDriverWait(driver, 20).until(
            EC.visibility_of_element_located(
                (By.CLASS_NAME, "ui-dialog-titlebar-close")))
        driver.find_element_by_class_name("ui-dialog-titlebar-close").click()
    except Exception:
        print("no seasonal popup")

    # Wait *up to* 20 seconds for the disclaimer popup to show up.
    try:
        WebDriverWait(driver, 20).until(
            EC.visibility_of_element_located((By.CLASS_NAME, "close_anchor")))
    except Exception:
        email_error.send_email("no close anchor")

    # Click the x for the disclaimer ("clostPopup" is the site's own class name).
    driver.find_element_by_class_name("clostPopup").click()

    # Get the html and prettify it so successive scrapes diff cleanly.
    soup = bs.BeautifulSoup(driver.page_source, 'html.parser')
    html = soup.prettify()

    # Write char-by-char, skipping characters the platform encoding rejects.
    with open("./data/" + supplier + ".html", "w") as out:
        for ch in html:
            try:
                out.write(ch)
            except Exception:
                pass  # skip unencodable characters
    with open("./data/" + supplier + ".html") as new:
        newHTML = new.read()

    # Similarity ratio between this and the last relevant HTML file.
    matcher = SequenceMatcher(None, oldHTML, newHTML).quick_ratio()
    if matcher < 0.5:
        email_error.send_email("difference between HTML files is: ", matcher)
    print("percent match:", matcher)
Esempio n. 2
0
def run(supplier):
    """Parse the saved <supplier>_PVD.html page and export its plans to CSV.

    Any failure is caught, printed, and reported by email rather than
    propagated to the caller.
    """
    try:
        path = "./data/" + supplier + "_PVD.html"
        with open(path) as page:
            soup = bs.BeautifulSoup(page, 'html.parser')
        parsed = []
        fill_suppliers(soup, parsed)
        write_to_csv(supplier, parsed)
    except Exception as e:
        tb = traceback.extract_tb(e.__traceback__)
        print("error encountered: " + str(tb))
        email_error.send_email("general error: " + str(tb))
Esempio n. 3
0
def billingCycle(row):
    """Extract the "Billing Cycle" digit strings from one plan row.

    Parameters
    ----------
    row : bs4.Tag
        A plan row from the comparison table.

    Returns
    -------
    list[str]
        Digit-only strings from each "Billing Cycle" entry in the row's
        second div.mobilerate; empty when that div is missing.
    """
    try:
        mobilerate_div = row.findAll("div", class_="mobilerate")[1]
    except IndexError:
        email_error.send_email("There is no div with value mobilerate.")
        # Bug fix: the original fell through and raised NameError on the
        # undefined variable; bail out with an empty result instead.
        return []
    contract_terms = []
    for elem in mobilerate_div.find("div", class_="companyShortData").contents:
        if "Billing Cycle" in elem:
            contract_terms.append(''.join(x for x in elem if x.isdigit()))
    if len(contract_terms) > 2:
        email_error.send_email("more than two fixed tiers")
    return contract_terms
Esempio n. 4
0
def varRate(row):
    """Return the row's supply rates as 4-decimal dollar strings.

    Each <b class="supply_rate"> value (in cents) is cleaned, divided by
    100 and formatted like '0.1050'. An alert email is sent when the row
    has no supply_rate elements at all.
    """
    supply_rates = row.findAll("b", class_="supply_rate")
    if supply_rates == []:
        email_error.send_email(
            "Empty array for supply_rate class (no variable rate corresponding to this)."
        )
    formatted = []
    for entry in supply_rates:
        text = str(entry.contents[0]).replace("\n", "").strip()
        numeric = ''.join(ch for ch in text if ch.isdigit() or ch == '.')
        formatted.append('{:,.4f}'.format(float(numeric) / 100))
    return formatted
Esempio n. 5
0
def getNum(row, attribute, value):
    """Return the integer formed by every digit inside a matching element.

    Finds the first element of *row* whose *attribute* equals *value*,
    stringifies its contents, and concatenates all digit characters.

    Returns
    -------
    int
        The concatenated digits as an int; 0 when no digits are present
        or no such element exists.
    """
    try:
        # row.find(...) returns None on no match, so .contents raises
        # AttributeError in that case.
        content = str(row.find(attrs={attribute: value}).contents)
    except AttributeError:
        email_error.send_email(
            "No such attribute and value exist. Attribute: " + attribute +
            " Value: " + value)
        # Bug fix: the original continued and raised NameError on the
        # undefined `content`; treat a missing element as 0.
        return 0
    s = ''.join(x for x in content
                if x.isdigit())  # gets all numbers within the contents
    if s:
        return int(s)
    return 0
Esempio n. 6
0
def diff_check(supplier):
    """Diff the two newest CSVs for *supplier* and drop the new one if redundant.

    Finds the two most recently modified ./data/*.csv files whose names
    contain *supplier*, diffs them by plan_id, and deletes the newer file
    when the diff shows no added or no removed rows.
    """
    files = sorted([x for x in os.listdir("./data/") if x.endswith(".csv")],
                   key=lambda x: os.path.getmtime("./data/" + x),
                   reverse=True)
    if len(files) < 2:
        email_error.send_email("not enough files to compare")
        return
    now = recent = None
    for i in range(len(files)):
        if supplier in str(files[i]):
            now = files[i]
            for j in range(i + 1, len(files)):
                if supplier in str(files[j]):
                    recent = files[j]
                    break
            break
    # Bug fix: the original raised NameError here when fewer than two
    # files matched the supplier; bail out with an alert instead.
    if now is None or recent is None:
        email_error.send_email("not enough files to compare")
        return
    # Close both files deterministically (the original leaked the handles).
    with open("./data/" + now) as f_now, open("./data/" + recent) as f_recent:
        diff = compare(load_csv(f_now, key="plan_id"),
                       load_csv(f_recent, key="plan_id"))
    # NOTE(review): `or` deletes the new file when EITHER side of the diff
    # is empty — confirm `and` ("nothing changed at all") wasn't intended.
    if diff['added'] == [] or diff['removed'] == []:
        os.remove("./data/" + now)
        print('deleted')
Esempio n. 7
0
def fill_suppliers(soup, suppliers):
    """Parse the rate-comparison table in *soup* into Supplier objects.

    For each visible, non-duplicate plan row, builds an info dict with
    the service territory, supplier name, plan id and 12 months of
    lagged low/high rate values, then appends Supplier(info) to
    *suppliers* (mutated in place). Returns None.
    """
    # Assumes the first table with exactly this class string is the rates
    # table — TODO confirm against the current page markup.
    table = soup.find_all(
        'table',
        class_="nice_table responsive highlight_table display nowrap")[0]
    first = True  # first data row gets the territory itself as supplier
    planNum = 0
    iterator = iter(table.find_all('tr'))
    year = datetime.date.today().year - 1  # look for last year's entries
    duplicate = []  # friendly names already processed (repeat rows skipped)
    next(iterator)  #skip first entry, which is a header
    for row in iterator:
        counter = 0  # NOTE(review): never read — dead variable
        info = {}
        rowString = str(row)
        info["date_downloaded"] = date.today()
        # NOTE(review): raises KeyError when a row has no style attribute;
        # presumably every row carries one — verify against the page.
        if row.attrs['style'] == "display: none;":
            continue
        service = getValue(rowString, "data-ratetitle")
        # Normalize the territory label to one of the two known utilities.
        if "Eversource" in service:
            service = "Eversource"
        elif "UI" in service:
            service = "UI"
        info["TDU_service_territory"] = service
        if first:
            # First row: the "supplier" is the territory itself.
            info["supplier_name"] = info["TDU_service_territory"]
        elif getValue(rowString, "data-friendly-name") in duplicate:
            # print(getValue(rowString, "data-friendly-name"))
            continue
        else:
            duplicate.append(getValue(rowString, "data-friendly-name"))
            info["supplier_name"] = getValue(rowString, "data-friendly-name")
        info["plan_id"] = getValue(rowString, "id=\"plan-", 0)
        curr_id = info["plan_id"]
        # Hidden inputs low_value_<id> / high_value_<id> hold the plan's
        # historical monthly rates as one encoded attribute string.
        curr_low = soup.find(id="low_value_" + curr_id)
        if curr_low and curr_low['value'].find(str(year)) != -1:
            # print("1")
            # Positions of every "<year>" / "<year+1>" marker in the blob.
            indexes = find_all_indexes(curr_low['value'], str(year))
            indexes_2 = find_all_indexes(curr_low['value'], str(year + 1))
            low_list = []
            # The [i+19 : i+24] window presumably skips past the date text
            # to the cents value after each marker — TODO confirm offsets.
            for i in indexes:
                low_list.append('{0:g}'.format(
                    float(
                        re.findall('\d*\.?\d+',
                                   curr_low['value'][i + 19:i + 24])[0]) /
                    100))
            for i in indexes_2:
                low_list.append('{0:g}'.format(
                    float(
                        re.findall('\d*\.?\d+',
                                   curr_low['value'][i + 19:i + 24])[0]) /
                    100))
            # Keep exactly the 12 most recent months, padding with 'N/A'.
            if len(low_list) > 12:
                low_list = low_list[-12:]
            while len(low_list) < 12:
                low_list.append('N/A')
            # Same extraction for the matching high-value blob.
            curr_high = soup.find(id="high_value_" + curr_id)['value']
            indexes = find_all_indexes(curr_high, str(year))
            indexes_2 = find_all_indexes(curr_high, str(year + 1))
            high_list = []
            for i in indexes:
                try:
                    high_list.append('{0:g}'.format(
                        float(
                            re.findall('\d*\.?\d+',
                                       curr_high[i + 19:i + 24])[0]) / 100))
                except Exception as e:
                    email_error.send_email(
                        "Format of website changed, the value of high value is not numeric"
                    )
            for i in indexes_2:
                try:
                    high_list.append('{0:g}'.format(
                        float(
                            re.findall('\d*\.?\d+',
                                       curr_high[i + 19:i + 24])[0]) / 100))
                except Exception as e:
                    email_error.send_email(
                        "Format of website changed, the value of low value is not numeric"
                    )
            if len(high_list) > 12:
                high_list = high_list[-12:]
            while len(high_list) < 12:
                high_list.append('N/A')
            # Lag 1 is the most recent month, lag 12 the oldest.
            for i in range(12):
                info["Low_lag" + str(i + 1)] = low_list[11 - i]
                info["High_lag" + str(i + 1)] = high_list[11 - i]
            planNum += 1
            first = False
            # NOTE(review): low_list holds strings ('0.123' / 'N/A'), so
            # the int 0 can never be a member and this check always
            # passes — presumably the string '0' was intended. Confirm.
            if 0 not in low_list:
                suppliers.append(Supplier(info))
Esempio n. 8
0
def scrape(supplier):
    """Scrape past-variable-rate details and save them to <supplier>_PVD.html.

    Loads the energizect.com comparison page, expands every plan's
    details ("compare_button1" elements), writes the prettified page
    source to ./data/<supplier>_PVD.html, and emails an alert when the
    new page differs too much from the previous scrape.

    Parameters
    ----------
    supplier : str
        Utility territory key; "UI" selects the UI radio button.
    """
    # driver = webdriver.Chrome()
    driver = webdriver.Chrome(
        r'C:/Program Files/Chromedriver/chromedriver.exe',
        chrome_options=options)

    driver.get(
        "https://www.energizect.com/compare-energy-suppliers")  # get the page

    if supplier == "UI":
        driver.find_element_by_id("radioTwo").click()
    # Submit the comparison form.
    driver.find_element_by_class_name("supplier_form_submit").click()

    # TEMPORARY: dismiss the popup about standard prices, if present.
    try:
        WebDriverWait(driver, 20).until(
            EC.visibility_of_element_located(
                (By.CLASS_NAME, "ui-dialog-titlebar-close")))
        driver.find_element_by_class_name("ui-dialog-titlebar-close").click()
    except Exception:
        print("no seasonal popup")

    # Wait *up to* 20 seconds for the disclaimer popup to show up.
    try:
        WebDriverWait(driver, 20).until(
            EC.visibility_of_element_located((By.CLASS_NAME, "close_anchor")))
    except Exception:
        email_error.send_email("no close anchor")

    # Click the x for the disclaimer ("clostPopup" is the site's own class name).
    driver.find_element_by_class_name("clostPopup").click()
    action = ActionChains(driver)

    buttons = driver.find_elements_by_class_name("compare_button1")
    # Bug fix: default to "" so the diff below cannot hit a NameError when
    # there is no previous scrape (a low match ratio then sends an alert).
    oldHTML = ""
    try:
        with open("./data/" + supplier + "_PVD.html") as prev:
            oldHTML = prev.read()
    except OSError:
        print("didn't find old")

    # Expand every plan's detail section so its HTML ends up in the page.
    count = 0
    for button in buttons:
        try:
            action.move_to_element(button).perform()
            button.click()
        except Exception:
            time.sleep(5)  # give the page time to settle, then carry on
            print("slept")
        count += 1
        # print(count)

    # Prettify so successive scrapes diff cleanly.
    soup = bs.BeautifulSoup(driver.page_source, 'html.parser')
    html = soup.prettify()
    print(html)
    # Write char-by-char, skipping characters the platform encoding rejects.
    with open("./data/" + supplier + "_PVD.html", "w") as out:
        for ch in html:
            try:
                out.write(ch)
            except Exception:
                print("exception 1+1")
    with open("./data/" + supplier + "_PVD.html") as new:
        newHTML = new.read()

    # Similarity vs. the previous scrape; alert when under 50%.
    matcher = SequenceMatcher(None, oldHTML, newHTML).quick_ratio()
    if matcher < 0.5:
        email_error.send_email("difference between HTML files is: ", matcher)
    print("percent match:", matcher)
Esempio n. 9
0
    # Check whether we have already scraped past variable rates today **note: shoudl add diff checker at some pt and will have to change this
    scraped = os.path.exists(
            "./data/"+ "PVD_ES_"+ str(Dt.date.today()) + ".csv") and os.path.exists(
                    "./data/"+ "PVD_UI_"+ str(Dt.date.today()) + ".csv")
    
    # Scrape and parse past variable rates (if the hour is 5am or if the previous attempt failed)
    if (x.hour >= 5) and not(scraped):
        pvd_total()

# Run everything; on any unhandled error, email the traceback.

try:
    run_all()
    timestamp = dt.today().strftime('%m/%d/%y %H:%M:%S')
    if Path('run_history.txt').is_file():
        # Append this run on its own line below the previous ones.
        with open('run_history.txt', 'a', newline='') as run_file:
            run_file.write("\n" + timestamp)
    else:
        # First ever run: create the history file.
        with open('run_history.txt', 'w') as run_file:
            run_file.write(timestamp)
except Exception as e:
    error_traceback = traceback.extract_tb(e.__traceback__)
    email_error.send_email(error=f"Traceback at {dt.today().strftime('%m/%d/%y %H:%M:%S')} from Scheduler: {error_traceback}")

# OLD
#from threading import Timer
# Specify when to run the past variable rates scraper
#y=x.replace(day=x.day+1, hour=6, minute=0, second=0, microsecond=0)
#delta_t=y-x
#secs=delta_t.seconds+1
#t = Timer(secs, pvd_total())
#t.start()