def scrape(event, context):
    """Scrape the Wait But Why home page and report its top story link.

    The signature looks like a serverless handler (presumably AWS Lambda --
    TODO confirm); neither ``event`` nor ``context`` is read.

    Returns:
        dict: ``{"statusCode": 200, "body": <JSON string>}``. The JSON body
        has ``"success": "true"`` and the top story's ``href`` when the link
        is found, otherwise ``"success": "false"`` with an error message.
    """
    driver = Scraper()
    try:
        page = driver.scrape_page('https://waitbutwhy.com/')

        # Business logic for specific scrape job.
        # Each lookup is guarded: if the page layout changes and a lookup
        # comes back empty we want the "Oops" response below, not an
        # AttributeError from calling .find on None.
        post = page.find("div", {"class": "mainPost"})
        header = post.find("h1") if post is not None else None
        link = header.find('a', href=True) if header is not None else None

        if link:
            data = {
                "success": "true",
                "result": {
                    "message":
                    "Congrats!! Your Headless Chrome initialized and we found the top story on Wait But Why",
                    "topStoryLink": link['href']
                }
            }
        else:
            data = {
                "success": "false",
                "result": {
                    "message": "Oops, something went wrong"
                }
            }
    finally:
        # Always release the browser, even when scraping raised.
        driver.close()
        driver.quit()

    return {"statusCode": 200, "body": json.dumps(data)}
Example #2
0
def run():
    """Collect the story links for Leonora Carrington and save them to disk.

    Fetches the links from the author's ciudadseva.com index page via
    ``Scraper.get_links`` and writes them, one per line, to
    ``links_carrington.txt`` in the current working directory.

    Returns:
        Whatever ``Scraper.get_links`` returned (the collection of links).
    """
    author = "carrington"
    url = "https://ciudadseva.com/autor/leonora-carrington/cuentos/"

    scraper = Scraper()
    try:
        text = scraper.get_links(url=url)
        with open("links_" + author + ".txt", "w") as f:
            f.writelines(link + "\n" for link in text)
        print("Written links to ", f)
    finally:
        # Close the scraper even if fetching or writing the links failed.
        scraper.close()
    return text
Example #3
0
def run(*, save=True):
    """Scrape Eduardo Gudino Kieffer's story links and build a text dataset.

    The links are written one per line to
    ``./datasets/links/links_gudino_kieffer.txt``; that file is then passed
    to ``build_text_dataset`` and the result is wrapped in a DataFrame.

    Args:
        save: When true, pickle the DataFrame to
            ``./datasets/gudino_kieffer_full_texts.pkl``.

    Returns:
        pandas.DataFrame: The dataset built from the scraped links.
    """
    author_name = "gudino_kieffer"
    url = "https://ciudadseva.com/autor/eduardo-gudino-kieffer/cuentos/"
    links_path = "./datasets/links/links_"
    # Built once so the writer and the dataset builder cannot drift apart.
    links_file = links_path + author_name + ".txt"

    scraper = Scraper()
    try:
        text = scraper.get_links(url=url)
        with open(links_file, "w") as f:
            f.writelines(link + "\n" for link in text)
        print("Written links to ", f)
    finally:
        # Close the scraper even if fetching or writing the links failed.
        scraper.close()

    ds = build_text_dataset(links_file=links_file)
    df = pd.DataFrame(ds)
    if save:
        with open("./datasets/" + author_name + "_full_texts.pkl", "wb") as f:
            pickle.dump(df, f)
            print("[INFO] Saved dataset in: ", f)
    return df
Example #4
0
def test():
    """Smoke-test the scraper and table parser against a fixed test page.

    Reads the chromedriver path from ``sys.argv[1]``, opens the test URL in a
    headless scraper, grabs the outer HTML of the table body and runs it
    through ``Parser.extract_data_from_table``.

    Raises:
        Exception: If fewer than two command-line arguments were supplied.
    """
    if len(sys.argv) < 3:
        raise Exception("Script must be called with two arguments, the path to chromedriver and the path to firebase config")

    chromedriver_path = sys.argv[1]
    timer = Elapsed()
    scraper = Scraper(chromedriver_path, headless=True)

    target_url = "https://96hpr.csb.app"
    table_body_xpath = "/html/body/div/div/table/tbody"
    # Column labels "A" through "O", one per expected table column.
    column_labels = list("ABCDEFGHIJKLMNO")

    try:
        scraper.open_page(target_url)
        table_html = scraper.get_outerhtml(By.XPATH, table_body_xpath)
        table_parser = Parser(table_html, log_each_n=10)
        table_parser.extract_data_from_table(column_labels, [0], True)
        print_time("Extracted data")
    finally:
        # Release the browser first, then stop the timer.
        scraper.close()
        timer.end()
Example #5
0
			emit_match(match)

		db_session.commit()

		# We don't really need to delete the matches, they're not that big,
		# but lets flush them when they get fairly large...
		if len(matches) >= MATCHES_FLUSH_AT:
			parsed_match_ids = set([match['match_id'] for match in parsed_matches])
			stored_match_ids = set(matches.keys())

			matches_to_del = stored_match_ids - parsed_matches

			for match_id in matches_to_del:
				del matches[match_id]

		sleep_duration = random.randint(
			POLL_FREQUENCY_SECS - POLL_FREQUENCY_VARIANCE,
			POLL_FREQUENCY_SECS + POLL_FREQUENCY_VARIANCE
		)

		time.sleep(sleep_duration)

# Build the scraper before entering the try block: if the constructor itself
# fails there is nothing to close, and the real error must not be masked by
# a NameError on `s` in the finally clause.
s = Scraper(SCRAPE_TARGET)
try:
	s.open()
	run(s)
finally:
	# Always release the scraper, whether polling ended normally or raised.
	s.close()
Example #6
0
# -*- coding: utf-8 -*-
# filename          : microcenter.py
# description       : Get RAM for LOW prices
# author            : LikeToAccess
# email             : [email protected]
# date              : 08-06-2021
# version           : v1.0
# usage             : python microcenter.py
# notes             :
# license           : MIT
# py version        : 3.8.2 (must run on 3.6 or higher)
#==============================================================================
from scraper import Scraper

# Shared scraper instance, created at import time with minimize=True.
scraper = Scraper(minimize=True)

# Search-result pages to scrape, keyed by store.
url = {
    "amazon":
    "https://www.amazon.com/s?k=OLED65C1PUB&i=electronics&rh=n%3A6463520011%2Cp_n_size_browse-bin%3A1232883011%2Cp_n_feature_nine_browse-bin%3A23478599011%2Cp_89%3ALG%2Cp_n_feature_six_browse-bin%3A2807397011&dc&qid=1633101031&rnid=2807395011&ref=sr_nr_p_n_feature_six_browse-bin_2",
    "best_buy":
    "https://www.bestbuy.com/site/searchpage.jsp?id=pcat17071&qp=brand_facet%3DBrand~LG%5Ecategory_facet%3DAll%20Flat-Screen%20TVs~abcat0101001%5Efeatures_facet%3DFeatures~Dolby%20Atmos%5Eparent_tvscreensizeplus_facet%3DTV%20Screen%20Size~65%22%20-%2074%22%5Evoiceassistant_facet%3DVoice%20Assistant%20Built-in~Google%20Assistant&st=6453312"
}

if __name__ == "__main__":
    try:
        # scraper.microcenter(url)
        scraper.best_buy(url["best_buy"])
        scraper.amazon(url["amazon"])
    finally:
        # Close the scraper even if one of the store scrapes raised.
        scraper.close()
    # Explicit exit, as before. `quit()` is a site-module helper for the
    # interactive shell and is missing under `python -S` or frozen builds;
    # raising SystemExit is the equivalent that is always available.
    raise SystemExit