import json

# Assumed import: other snippets in this collection pull Scraper from a
# local scraper module.
from scraper import Scraper


def scrape(event, context):
    """AWS Lambda handler: find the top story on the Wait But Why homepage."""
    driver = Scraper()
    page = driver.scrape_page('https://waitbutwhy.com/')

    # Business logic for the specific scrape job
    post = page.find("div", {"class": "mainPost"})
    header = post.find("h1")
    link = header.find('a', href=True)

    if link:
        data = {
            "success": "true",
            "result": {
                "message": "Congrats!! Your Headless Chrome initialized and we found the top story on Wait But Why",
                "topStoryLink": link['href'],
            },
        }
    else:
        data = {
            "success": "false",
            "result": {
                "message": "Oops, something went wrong",
            },
        }

    driver.close()
    driver.quit()

    return {"statusCode": 200, "body": json.dumps(data)}
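# The handler above assumes a local Scraper class that drives headless Chrome
# and hands back a BeautifulSoup-style parse tree. A minimal sketch of that
# assumed interface (class and method names come from the snippet; the body
# is illustrative, not the project's actual implementation):
from bs4 import BeautifulSoup
from selenium import webdriver


class Scraper:
    def __init__(self):
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        self._driver = webdriver.Chrome(options=options)

    def scrape_page(self, url):
        # Load the page and return a parse tree that supports .find() queries.
        self._driver.get(url)
        return BeautifulSoup(self._driver.page_source, "html.parser")

    def close(self):
        self._driver.close()

    def quit(self):
        self._driver.quit()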
def run():
    """Collect story links for Leonora Carrington from Ciudad Seva and save them."""
    scraper = Scraper()
    url = "https://ciudadseva.com/autor/leonora-carrington/cuentos/"
    author = "carrington"

    links = scraper.get_links(url=url)

    out_path = "links_" + author + ".txt"
    with open(out_path, "w") as f:
        for link in links:
            f.write(link + "\n")
        print("Written links to", out_path)

    scraper.close()
    return links
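# get_links() is assumed to gather the href of every story anchor on the
# author page. A hypothetical standalone sketch of that behavior (the
# requests call and link filter are guesses, not the project's code):
import requests
from bs4 import BeautifulSoup


def get_links(url):
    soup = BeautifulSoup(requests.get(url).text, "html.parser")
    # Keep only absolute links, which is where the story pages live.
    return [a["href"] for a in soup.find_all("a", href=True)
            if a["href"].startswith("http")]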
import pickle

import pandas as pd


def run(*, save=True):
    """Scrape Eduardo Gudino Kieffer's story links, build the text dataset,
    and optionally pickle it."""
    author_name = "gudino_kieffer"
    url = "https://ciudadseva.com/autor/eduardo-gudino-kieffer/cuentos/"
    links_file = "./datasets/links/links_" + author_name + ".txt"

    scraper = Scraper()
    links = scraper.get_links(url=url)

    with open(links_file, "w") as f:
        for link in links:
            f.write(link + "\n")
        print("Written links to", links_file)
    scraper.close()

    ds = build_text_dataset(links_file=links_file)
    df = pd.DataFrame(ds)

    if save:
        out_path = "./datasets/" + author_name + "_full_texts.pkl"
        with open(out_path, "wb") as f:
            pickle.dump(df, f)
            print("[INFO] Saved dataset in:", out_path)

    return df
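# build_text_dataset() is assumed to fetch each saved link and return records
# that pd.DataFrame can consume. A hypothetical sketch (the field names and
# extraction logic are illustrative assumptions):
import requests
from bs4 import BeautifulSoup


def build_text_dataset(*, links_file):
    records = []
    with open(links_file) as f:
        for url in (line.strip() for line in f if line.strip()):
            soup = BeautifulSoup(requests.get(url).text, "html.parser")
            records.append({"url": url, "text": soup.get_text(separator="\n")})
    return records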
import sys

from selenium.webdriver.common.by import By

# Elapsed, Scraper, Parser, and print_time are assumed to come from this
# project's local modules.


def test():
    if len(sys.argv) < 3:
        raise Exception("Script must be called with two arguments, the path "
                        "to chromedriver and the path to firebase config")
    chromedriver = sys.argv[1]

    elapsed = Elapsed()
    scraper = Scraper(chromedriver, headless=True)
    test_url = "https://96hpr.csb.app"
    try:
        scraper.open_page(test_url)
        html = scraper.get_outerhtml(By.XPATH, "/html/body/div/div/table/tbody")
        parsed = Parser(html, log_each_n=10)
        template = ["A", "B", "C", "D", "E", "F", "G", "H",
                    "I", "J", "K", "L", "M", "N", "O"]
        parsed.extract_data_from_table(template, [0], True)
        print_time("Extracted data")
    finally:
        scraper.close()
        elapsed.end()
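# A minimal sketch of what the Elapsed timing helper used above is assumed
# to look like (the name comes from the snippet; the body is a guess):
import time


class Elapsed:
    def __init__(self):
        # Start the clock as soon as the object is created.
        self._start = time.perf_counter()

    def end(self):
        print(f"Elapsed: {time.perf_counter() - self._start:.2f}s")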
import random
import time

# ... inside the polling loop of run(scraper):
        emit_match(match)
        db_session.commit()

    # We don't really need to delete the matches, they're not that big,
    # but let's flush them when they get fairly large...
    if len(matches) >= MATCHES_FLUSH_AT:
        parsed_match_ids = {match['match_id'] for match in parsed_matches}
        stored_match_ids = set(matches.keys())
        matches_to_del = stored_match_ids - parsed_match_ids
        for match_id in matches_to_del:
            del matches[match_id]

    # Jitter the poll interval so successive requests are staggered.
    sleep_duration = random.randint(
        POLL_FREQUENCY_SECS - POLL_FREQUENCY_VARIANCE,
        POLL_FREQUENCY_SECS + POLL_FREQUENCY_VARIANCE,
    )
    time.sleep(sleep_duration)


# Entry point: construct the scraper before the try block so cleanup in
# finally never references an unbound name.
s = Scraper(SCRAPE_TARGET)
try:
    s.open()
    run(s)
finally:
    s.close()
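# The module-level names used above (emit_match, db_session, matches,
# parsed_matches, and the constants) are assumed to be defined elsewhere.
# Illustrative values for the assumed config, not the project's real numbers:
MATCHES_FLUSH_AT = 1000        # flush the in-memory match cache past this size
POLL_FREQUENCY_SECS = 60       # base poll interval, in seconds
POLL_FREQUENCY_VARIANCE = 10   # +/- jitter applied to each sleep
SCRAPE_TARGET = "https://example.com/matches"  # hypothetical target URL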
# -*- coding: utf-8 -*-
# filename    : microcenter.py
# description : Get RAM for LOW prices
# author      : LikeToAccess
# email       : [email protected]
# date        : 08-06-2021
# version     : v1.0
# usage       : python microcenter.py
# notes       :
# license     : MIT
# py version  : 3.8.2 (must run on 3.6 or higher)
#==============================================================================
from scraper import Scraper

scraper = Scraper(minimize=True)
url = {
    "amazon": "https://www.amazon.com/s?k=OLED65C1PUB&i=electronics&rh=n%3A6463520011%2Cp_n_size_browse-bin%3A1232883011%2Cp_n_feature_nine_browse-bin%3A23478599011%2Cp_89%3ALG%2Cp_n_feature_six_browse-bin%3A2807397011&dc&qid=1633101031&rnid=2807395011&ref=sr_nr_p_n_feature_six_browse-bin_2",
    "best_buy": "https://www.bestbuy.com/site/searchpage.jsp?id=pcat17071&qp=brand_facet%3DBrand~LG%5Ecategory_facet%3DAll%20Flat-Screen%20TVs~abcat0101001%5Efeatures_facet%3DFeatures~Dolby%20Atmos%5Eparent_tvscreensizeplus_facet%3DTV%20Screen%20Size~65%22%20-%2074%22%5Evoiceassistant_facet%3DVoice%20Assistant%20Built-in~Google%20Assistant&st=6453312",
}

if __name__ == "__main__":
    # scraper.microcenter(url)
    scraper.best_buy(url["best_buy"])
    scraper.amazon(url["amazon"])
    scraper.close()
    quit()
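# best_buy() and amazon() are assumed to follow one pattern: load the search
# URL in the browser and read prices out of the results page. A hypothetical
# standalone sketch of that pattern (the function name and CSS selector are
# illustrative, not the real module's code):
from selenium.webdriver.common.by import By


def check_prices(driver, url, price_selector):
    driver.get(url)
    for element in driver.find_elements(By.CSS_SELECTOR, price_selector):
        print(element.text)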