def collect_earnings_over_time(self, playerNum, file_obj, default_url, str_year, searchedPlayers):
    """Walk the ranking table and record each player's marital status and earnings for str_year."""
    for i in tqdm(range(1, playerNum + 1)):
        row_xpath = ("//*[@id='rankingDetailAjaxContainer']/table/tbody/tr["
                     + str(i) + "]/td[4]/a")
        player = WebDriverWait(self.browser, 20).until(
            expected_conditions.visibility_of_element_located((By.XPATH, row_xpath)))
        file_obj.write(player.text + ",")
        # Re-locate the link before clicking, in case the first reference went stale.
        WebDriverWait(self.browser, 20).until(
            expected_conditions.visibility_of_element_located((By.XPATH, row_xpath))).click()
        url = self.browser.current_url
        bio_url = from_overview_to_bio(url)
        playerActivity_url = from_overview_to_playerActivity(url, str_year)
        bio_scraper = Scraper(bio_url)
        playerActivity_scraper = Scraper(playerActivity_url)
        marital_status = bio_scraper.check_player_marital_status()
        file_obj.write(marital_status + ",")
        tournament_earnings_str = playerActivity_scraper.tournament_earnings()
        file_obj.write(tournament_earnings_str + ",")
        file_obj.write(str(i) + ",")
        file_obj.write(str_year + ",")
        file_obj.write("\n")
        self.browser.get(default_url)
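# The URL helpers called above are defined elsewhere in the project; a minimal
# hypothetical sketch, assuming ATP-style player pages ending in ".../overview"
# (the real site layout and query parameters may differ):
def from_overview_to_bio(url):
    # ".../players/<name>/<id>/overview" -> ".../players/<name>/<id>/bio" (assumed)
    return url.replace("/overview", "/bio")

def from_overview_to_playerActivity(url, str_year):
    # Assumed activity path and year parameter; adjust to the site's actual scheme.
    return url.replace("/overview", "/player-activity?year=" + str_year)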
def test_get_product_price(self):
    # Test scraping an Amazon product's price.
    url = 'https://www.amazon.com/AOC-U2790VQ-3840x2160-Frameless-DisplayPort/dp/B07LBM2DCC/ref=pd_rhf_se_p_img_10?_encoding=UTF8&psc=1&refRID=Q4E347Q2WW3WKJJSF5RZ'
    priceSelector = ('span#priceblock_saleprice, span#priceblock_ourprice, '
                     'span#priceblock_dealprice')
    scraper = Scraper(url)
    self.assertEqual(scraper.getProductPrice(priceSelector), 0)

    url = 'https://www.amazon.com/dp/B074PK4R2H/ref=psdc_1292115011_t2_B07LBM2DCC'
    scraper = Scraper(url)
    self.assertEqual(scraper.getProductPrice(priceSelector), '263.00')

    # Has 'See price in cart' instead of an actual price.
    url = 'https://www.amazon.com/Xbox-All-Digital-Console-Disc-Free-Gaming/dp/B07XQXZXJC/ref=sr_1_1?keywords=xbox&qid=1584756939&s=electronics&sr=1-1'
    scraper = Scraper(url)
    self.assertEqual(scraper.getProductPrice(priceSelector), '174.95')

    url = 'https://www.amazon.com/YI-Waterproof-Surveillance-Detection-Deterrent/dp/B01CW49AGG/ref=zg_bs_photo_home_2?_encoding=UTF8&psc=1&refRID=5P9471P94RVGPXPFEBDT'
    scraper = Scraper(url)
    self.assertEqual(scraper.getProductPrice(priceSelector), '45.88')

    url = 'https://www.amazon.com/LORGDFDF-Microphone-Creative-Integrated-Bluetooth/dp/B0851WN4RG/ref=sr_1_1?keywords=howl+conference&qid=1584773845&s=electronics&sr=1-1'
    scraper = Scraper(url)
    self.assertEqual(scraper.getProductPrice(priceSelector), '113.75')

    url = 'https://www.amazon.com/Generation-Dell-Corei7-9750H-GeForce-InfinityEdge/dp/B07T3FWD22/ref=sr_1_1?keywords=xps&qid=1584773944&s=electronics&sr=1-1'
    scraper = Scraper(url)
    self.assertEqual(scraper.getProductPrice(priceSelector), 0)

    url = 'https://www.amazon.com/PlayStation-Portable-Core-PSP-1000-sony/dp/B000F2DE8S/ref=sr_1_10?keywords=psp&qid=1584774683&s=electronics&sr=1-10'
    scraper = Scraper(url)
    self.assertEqual(scraper.getProductPrice(priceSelector), 0)

    url = 'https://www.amazon.com/AmazonBasics-Pound-Neoprene-Dumbbells-Weights/dp/B01LR5RO5U?ref_=ast_sto_dp&th=1&psc=1'
    scraper = Scraper(url)
    self.assertEqual(scraper.getProductPrice(priceSelector), 0)

    url = 'https://www.amazon.com/Primitives-Kathy-Sign-3-Inch-Love/dp/B00HU7WRZC/ref=bbp_bb_5e8416_st_8174_w_0?psc=1&smid=ATVPDKIKX0DER'
    scraper = Scraper(url)
    self.assertEqual(scraper.getProductPrice(priceSelector), '6.43')

    url = 'https://www.amazon.com/dp/B07PWCWQ4Z/ref=cm_gf_aagc_iaaa_d_p0_qd0____________________cYfsB7BVGRjKYIoPUijy'
    scraper = Scraper(url)
    self.assertEqual(scraper.getProductPrice(priceSelector), '35.99')
def bfs(self):
    # this to be commented later
    # searchScraper = SearchScraper()
    scraper = Scraper()
    root = self.root
    dq = collections.deque()
    dq.appendleft(root)
    visited = set()
    i = 1
    while dq:
        # this will be removed in future
        if i == 30:
            return
        if i % 15 == 0:
            print("Hey!! We are gonna pull a little sneaky on Google. "
                  "I am starting a new session")
            del scraper
            scraper = Scraper()
        curr = dq.popleft()
        if curr in curr.children:
            curr.children.remove(curr)  # Remove the node from its own child set
        print(curr.name)
        if (curr is not self.root) and (not curr._visited):
            i += 1
            links = curr.get_links(scraper)
            with open("search_links.txt", "a+") as f:
                f.write(curr.name + "\n")
                f.write(str(links) + "\n")
            sleep(randint(2, 10))
        else:
            print("already-visited")
        currCh = curr.children
        filteredCh = set()
        for child in currCh:
            # Only enqueue children that actually list this node as their parent.
            if child.parent == curr.name:
                dq.appendleft(child)
                filteredCh.add(child)
        curr.children = filteredCh
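# bfs() above assumes a tree node exposing name, parent (the parent's name as a
# string), children, _visited, and get_links(); a minimal hypothetical sketch of
# that interface, with an assumed Scraper.search() call:
class Node:
    def __init__(self, name, parent_name=None):
        self.name = name
        self.parent = parent_name   # compared against curr.name in bfs()
        self.children = set()
        self._visited = False

    def get_links(self, scraper):
        # Fetch outgoing links via the scraper and mark this node as visited.
        self._visited = True
        return scraper.search(self.name)  # hypothetical method name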
def __init__(self):
    if not os.path.exists("logs/"):
        os.makedirs("logs/")
    logging.config.dictConfig(logging_config)
    self.logger = logging.getLogger(__name__)
    FORMAT_ERROR = "scraper_settings format is invalid, please refer to the project doc"
    try:
        self.interval = float(scraper_settings["scraping_interval"])
        self.do_email_alert = scraper_settings["do_email_alert"]
        start_date_parts = scraper_settings["start_date"].split('-')
        end_date_parts = scraper_settings["end_date"].split('-')
        self.scraper = Scraper(
            date(int(start_date_parts[0]), int(start_date_parts[1]), int(start_date_parts[2])),
            date(int(end_date_parts[0]), int(end_date_parts[1]), int(end_date_parts[2])),
            scraper_settings["do_email_alert"],
            scraper_settings["city"])
    except (KeyError, ValueError, IndexError) as exc:
        # Narrowed from a bare `except:`, which would also swallow KeyboardInterrupt/SystemExit.
        raise ValueError(FORMAT_ERROR) from exc
    self.alert = SeatAlertEmailer()
def crawl(timeout, crawl_delay, threadID):
    """
    Do the crawling on a URL.

    In an (almost) never-ending loop, try to get a URL out of the frontline
    queue, crawl it, and add new URLs to the frontline queue.

    Parameters
    ----------
    timeout: float
        The timeout time (epoch seconds), past which crawling will be stopped.
    crawl_delay: int
        Delay between two crawling attempts.
    threadID: int or str
        The ID of the thread that is using this method; for logging purposes only.
    """
    while True:
        if time.time() > timeout:
            break
        if frontlineQ.empty():
            time.sleep(crawl_delay)
            continue
        url = frontlineQ.get()
        try:
            print('Thread', threadID, 'scraping', url)
            sc = Scraper(url)
            sc.writeJSON(jsons_dir)
            outgoings = sc.getBookLinks()
            crawledQ.put(url)
            for u in outgoings:
                if not crawledQ.contains(u):
                    frontlineQ.put(u)
        except Exception:
            # Mark the URL as crawled even on failure so it is not retried forever.
            crawledQ.put(url)
        frontlineQ.task_done()
        time.sleep(crawl_delay / 2)
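# crawledQ above needs both put() and contains(); the standard queue.Queue has
# no contains() method, so a thread-safe set wrapper along these lines is assumed:
import threading

class CrawledCollection:
    def __init__(self):
        self._seen = set()
        self._lock = threading.Lock()

    def put(self, url):
        with self._lock:
            self._seen.add(url)

    def contains(self, url):
        with self._lock:
            return url in self._seen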
def main():
    """Main entry point: parse CLI arguments and run a flight search."""
    try:
        parser = argparse.ArgumentParser()
        parser.add_argument("departure_IATA", type=validation_iata,
                            help="Specify the IATA code of your departure airport")
        parser.add_argument("destination_IATA", type=validation_iata,
                            help="Specify the IATA code of your destination airport")
        parser.add_argument("outbound_date", type=validation_date_str,
                            help="Specify the outbound date")
        parser.add_argument("return_date", nargs="?", default=0,
                            type=validation_date_str,
                            help="If you don't want to fly one-way, specify the return date")
        args = parser.parse_args()
        if check_dates(args.outbound_date, args.return_date):
            srap = Scraper(args.departure_IATA, args.destination_IATA,
                           args.outbound_date, args.return_date)
            srap.make_search()
            return 0
    except (ScraperError, ValidationError) as err:
        sys.stderr.write(err.value)
        return 1
def create_job():
    worker = Scraper(FREQUENCY, TAB, UNIT, FINAL_YEAR)
    while True:
        item = q.get()
        worker.do_work(item)
        print(item + ' is downloaded | ' + str(q.qsize()) + ' item(s) left')
        q.task_done()
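# A sketch of how create_job() might be driven, assuming `q` is a module-level
# queue.Queue that has already been filled with work items (the worker count is
# illustrative only):
#
#     import threading
#     for _ in range(4):
#         threading.Thread(target=create_job, daemon=True).start()
#     q.join()  # block until every queued item has been marked done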
def __init__(self):
    '''Initializes bot class.'''
    self.scraper = Scraper()
    self.validator = Validator()
    self.courier = Courier()
    self.loopInterval = 300  # In seconds
    # Set properly by self.modeSelect() in the run method; defaults to
    # Environment.TEST in case someone removes the modeSelect() call.
    self.currentMode = Environment.TEST
def setUpClass(start):
    city = "wroclaw"
    rooms = "two"
    price_start = "1400"
    price_end = "2000"
    page = "1"
    start.scraper_main = Scraper(city, rooms, price_start, price_end, page)
def handler(event, context):
    """Lambda-style entry point: scrape event['url'] for each query in event['queries']."""
    base_url = event['url']
    queries = event['queries']
    scraper = Scraper(base_url)
    response = scraper.start(queries)
    return response

# Example invocation:
# event = \
#     {
#         "url": "https://www.monster.com",
#         "queries": [{"keyword": "Data Science", "location": "New York"}]
#     }
# handler(event, 2)

# Stub version kept for local testing:
# import json
# def handler(event, context):
#     response = {
#         "statusCode": 200,
#         "body": json.dumps({'message': 'Successfully scraped.'})
#     }
#     return response
def test_get_details_wrong_url():
    """Test case: when given a wrong url."""
    url = 'https://nbgdhckkdoo.com'
    with pytest.raises(URLLoadError):
        Scraper(url, BasicExtractor()).get_details([NameAttribute])
def test_get_sentence_translation_and_audio():
    sentence = '他会说很多种语言,比如汉语,日语,德语。'
    s = Scraper()
    retval = s.get_sentence_translation_and_audio(sentence)
    assert retval[0] == \
        'He can speak many languages, such as Chinese, Japanese and German.'
    assert retval[1]  # the audio component is non-empty
def __init__(self, start_url, goal_url):
    """Save the two URLs to link."""
    super(Problem, self).__init__()
    self.start_url = start_url
    self.goal_url = goal_url
    self.scraper = Scraper()
    self.base_url = 'https://fr.wikipedia.org'
def update_stock_data(ticker):
    """
    Finds the stock data for a given ticker, and updates/inserts it into the database.

    :param ticker: str
    """
    scraper = Scraper()
    stock = get_stock(ticker)
    if not stock:
        name = scraper.get_stock_name(ticker)
        add_stock(name, ticker)
        stock = get_stock(ticker)
    # NOTE: skip the first entry of eps_growth and average the remaining four.
    avg_eps_growth = sum(scraper.get_eps_growth(ticker)[1:]) / 4
    qoq_eps_growth = scraper.get_qoq_growth(ticker, 'eps')
    # NOTE: skip the first entry of sales_growth and average the remaining four.
    avg_sales_growth = sum(scraper.get_sales_growth(ticker)[1:]) / 4
    qoq_sales_growth = scraper.get_qoq_growth(ticker, 'revenue')
    stock.avg_eps_growth = avg_eps_growth
    stock.qoq_eps_growth = qoq_eps_growth
    stock.avg_sales_growth = avg_sales_growth
    stock.qoq_sales_growth = qoq_sales_growth
    update_stock(stock)
    update_mos(ticker)
    update_eps_data(ticker)
    update_revenue_data(ticker)
def gather():
    logger.info("gather")
    storage = Persistor(SCRAPPED_FILE)
    scrapper = Scraper(storage)
    # Scrape every year from 1903 up to (but not including) the current year.
    for year in range(1903, datetime.datetime.now().year):
        scrapper.scrape(year)
def calculate(body):
    url = json.loads(body)['url']
    scraper = Scraper()
    result = scraper.scrape(url.strip())
    time.sleep(10)
    j = json.dumps(result.__dict__)
    publish_result(j)
def test_Al2O3_scraper(self):
    """Tests whether all 4 records for "Al2O3" are scraped correctly."""
    # Initialize a `scraper.Scraper` instance.
    scraper = Scraper()
    scraper.get_landing_page()
    # Is the landing page correct?
    self.assertIn('NIST-JANAF Thermochemical Tables', scraper.browser.title)
    # Enter "Al2O3" in the form, submit it.
    scraper.send_query('Al2O3')
    scraper.select_state()
    scraper.submit_query()
    # Get all records resulting from the above query.
    query_records = scraper.all_query_records
    # Verify the number of records scraped.
    self.assertEqual(len(query_records), 4)
    # Check if the scraped data is OK.
    self.assertIn('aluminum_oxide__kappa', query_records)
    self.assertEqual(query_records['aluminum_oxide__alpha']['CAS'], '1344-28-1')
    self.assertEqual(query_records['aluminum_oxide__delta']['formula'], 'Al2O3')
    self.assertEqual(query_records['aluminum_oxide__gamma']['link'],
                     'http://kinetics.nist.gov/janaf/html/Al-098.txt')
    # Terminate the session cleanly.
    scraper.terminate_session()
def execute_scraper(self, container):
    try:
        result = Scraper(self, container).execute()
    except Exception:
        self.logger.exception("Unknown exception occurred in scraper")
        return False
    print("Container {} extracted in {} seconds.".format(container['code'], result[1]))
    # In case an unknown exception occurred, finish execution.
    if result[0] is None:
        return False
    # In case there was an error scraping a container, restart the driver.
    if result[0] is False:
        # Add to the failure count.
        print("Scraper for container {} was unsuccessful".format(container['code']))
        self.fail_counter += 1
        # Create a new driver.
        self.create_driver(True)
        if self.fail_backoff <= self.MAX_BACKOFF:
            self.fail_backoff *= 2
        # Continue execution.
        return True
    # No error was found: add to the scrape counters and reset the failure backoff.
    self.fail_backoff = 1
    self.total_counter += 1
    self.round_counter += 1
    if self.round_counter >= ScraperConfig.ROUNDS_RESTART:
        self.create_driver(False)
        self.round_counter = 0
    return True
def init_weather_ui(self):
    # Get the address from the text entry box.
    self.address = self.address_entry_box.get(
        '1.0', 'end-1c') if self.address is None else self.address
    # Clear the canvas.
    self.clear_canvas()
    # Set up the scraper.
    self.web_scraper = Scraper(self.address)
    # Create text objects to display the location and weather.
    self.location_text = self.canvas.create_text(
        500, 100, text=self.web_scraper.get_location())
    if self.web_scraper.get_location() != 'Error: Invalid Address':
        self.forecast_text = self.canvas.create_text(
            500, 115, text=self.web_scraper.get_forecast())
        # Celsius readout, then Fahrenheit readout.
        self.temp_c_text = self.canvas.create_text(
            500, 130, text=self.web_scraper.get_temp_c())
        self.temp_f_text = self.canvas.create_text(
            500, 145, text=self.web_scraper.get_temp_f())
    # Initialize the reset button.
    self.reset_button = tk.Button(self.canvas, text='Reset Slot', bg='white',
                                  command=self.init_entry_ui)
    self.reset_button.place(width=300, height=100, x=175, y=300)
    # Initialize the refresh button.
    self.refresh_button = tk.Button(self.canvas, text='Refresh Forecast', bg='white',
                                    command=self.refresh_weather)
    self.refresh_button.place(width=300, height=100, x=525, y=300)
def collect_basic_info(self, playerNum, file_obj, default_url, str_year, searchedPlayers):
    """Record turned-pro year, weight, and height for each player not already searched."""
    for i in tqdm(range(1, playerNum + 1)):
        row_xpath = ("//*[@id='rankingDetailAjaxContainer']/table/tbody/tr["
                     + str(i) + "]/td[4]/a")
        player = WebDriverWait(self.browser, 20).until(
            expected_conditions.visibility_of_element_located((By.XPATH, row_xpath)))
        player_name = player.text
        if player_name in searchedPlayers:
            self.browser.get(default_url)
        else:
            file_obj.write(player_name + ",")
            # Re-locate the link before clicking, in case the first reference went stale.
            WebDriverWait(self.browser, 20).until(
                expected_conditions.visibility_of_element_located((By.XPATH, row_xpath))).click()
            url = self.browser.current_url
            info_scraper = Scraper(url)
            basic_info = info_scraper.find_basic_info()
            file_obj.write(basic_info["turned_pro"] + ",")
            file_obj.write(basic_info["weight"] + ",")
            file_obj.write(basic_info["height"] + ",")
            file_obj.write("\n")
            searchedPlayers.append(str(player_name))
            self.browser.get(default_url)
def test_elements_found():
    obj = Scraper("https://www.google.com/", "bar.csv",
                  ["foo1", "foo2", "foo3"], True)
    obj.go_to(obj.base_url)
    elements = WebDriverWait(obj.webdriver, 3).until(
        ElementHasCssSelector("input[value*='Feeling Lucky']"))
    assert elements
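# ElementHasCssSelector is not a built-in Selenium condition; a minimal custom
# wait condition with the behavior the test relies on could look like this:
from selenium.webdriver.common.by import By

class ElementHasCssSelector:
    """Wait condition: truthy once elements matching `selector` are present."""

    def __init__(self, selector):
        self.selector = selector

    def __call__(self, driver):
        elements = driver.find_elements(By.CSS_SELECTOR, self.selector)
        return elements if elements else False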
def main():
    questions = [
        inquirer.List('user_option',
                      message="in order to proceed please choose an option",
                      choices=["scraper", "immediate data"])
    ]
    question_scraper = [
        inquirer.Text('symbol',
                      message='To scrape a specific symbol, enter the symbol; '
                              'enter "ALL" for all symbols'),
        inquirer.Text('saving flag',
                      message='To save the data locally, enter True (default - False)'),
    ]
    answers = inquirer.prompt(questions)
    if answers.get("user_option") == "scraper":
        # TODO fix scraper - cannot scrape data while choosing "scraper"
        answer_scraper = inquirer.prompt(question_scraper)
        symbol = answer_scraper.get("symbol")
        # inquirer.Text returns a string, so compare against "True" to get a bool
        # (any non-empty answer would otherwise be truthy).
        saving_flag = answer_scraper.get("saving flag") == "True"
        scraper = Scraper(save=saving_flag)
        scraper.scrape_all(symbol_choice=symbol)
    elif answers.get("user_option") == "immediate data":
        get_data_from_api()
def scrape(event, context):
    driver = Scraper()
    page = driver.scrape_page('https://waitbutwhy.com/')
    # Business logic for specific scrape job
    post = page.find("div", {"class": "mainPost"})
    header = post.find("h1")
    link = header.find('a', href=True)
    if link:
        data = {
            "success": "true",
            "result": {
                "message": ("Congrats!! Your Headless Chrome initialized and we "
                            "found the top story on Wait But Why"),
                "topStoryLink": link['href']
            }
        }
    else:
        data = {
            "success": "false",
            "result": {
                "message": "Oops, something went wrong"
            }
        }
    driver.close()
    driver.quit()
    response = {"statusCode": 200, "body": json.dumps(data)}
    return response
def __init__(self, look_ahead=4, skip_to_page=0, feats=10, max_q_sz=100,
             base_url="http://www.xvideos.com/c/{0}/anal-12"):
    # Let's set this up so gathering new videos can happen in the background.
    self.scraped_videos = {}
    gather_args = (look_ahead, skip_to_page, feats, max_q_sz)
    self.gather_process = Thread(target=self.gather, args=gather_args, daemon=True)
    self.scr = Scraper(base_url=base_url, pg_n=skip_to_page)
    self.db = Database()
    self.ai = Brain(self)
    self.win = Window(self)
    self.currently_loaded_video_data = {}
    self.feats = feats
    self.q = PriorityQueue(maxsize=max_q_sz)
    self.lock = RLock()
    # If a pickled brain exists on disk, run training before fetching the first item.
    if "brain.pkl" in os.listdir():
        self.train()
    self.get_next()
def run_once(self):
    with open("config/campusdual.json") as json_file:
        data = json.load(json_file)
    username = data["username"]
    password = data["password"]
    worker = Scraper(username)
    login = worker.login(password)
    print("login result", login)
    if login != 0:
        worker.exit()
        exit(login)
    if self.killer.kill_now:
        print("login successful, but program is being terminated. "
              "not downloading schedule. NOT pushing to calendar. exiting")
        worker.exit()
        exit()
    worker.download_full_schedule()
    schedule_fixer.repair("data/" + username + "/schedule.json",
                          "data/" + username + "/schedule-fixed.json")
    if self.killer.kill_now:
        print("download completed, but program is being terminated. "
              "NOT pushing to calendar. exiting")
        worker.exit()
        exit()
    worker.exit()
    calendar = calendar_api.CalendarApi()
    with open("data/" + username + "/schedule-fixed.json", "r") as f:
        sch = json.load(f)
    calendar.sync_schedule([s for s in sch if s["date"] not in FORBIDDEN_DATES])
class MainApp:
    """Entry-point shell; the scraping loop below runs when executed as a script."""

if __name__ == '__main__':
    startUrl = "http://scn.sap.com/community/abap/content?filterID=contentstatus[published]~objecttype~objecttype[thread]&start="
    storing = DataStoring()
    # Read the input param (page index to resume from).
    i = storing.read_index_from_file()
    completeUrl = ""
    print("\n\n-------- SCRAPER STARTED ---\n")
    while i < 5000:
        # String concatenation to get the complete URL (20 threads per page).
        completeUrl = startUrl + str(20 * i)
        # Threads scraped from the URL.
        threads = []
        print("------ SCRAPING NEW WEB PAGE (PAGE " + str(i) + ") ---\n")
        SCNScraper = Scraper(completeUrl)
        # Get the threads.
        threads = SCNScraper.scraping()
        # Save the content into a JSON file.
        storing.insert_items_into_file(threads)
        # Save the content into the DB.
        storing.insert_items_into_db(threads)
        i = i + 1
        # Update the index file.
        storing.write_index_into_file(i)
def main():
    style = "=+" * 20
    if url_checker(args.url) is False:  # URL format check
        raise argparse.ArgumentTypeError(
            'Value has to be in full url format http:// or https://')
    print(style)
    print("Box.com PDF Downloader by @lfasmpao")
    box_object = Scraper(args.url, args.driver_location,
                         args.use_x11, args.wait_time)
    print("Please wait for about {} seconds...".format(args.wait_time))
    box_object.load_url()
    dl_name = box_object.get_download_title()
    print(style)
    print("DATA TO BE DOWNLOADED\nTitle: {}\nBox.com URL: {}".format(dl_name, args.url))
    print(style)
    dl_url = box_object.get_download_url()
    print("Download URL:", dl_url)
    print(style)
    box_object.clean()  # Clean up the driver session.
    # Make the output directory if it does not exist.
    directory = os.path.dirname(args.output_location)
    if not os.path.exists(directory):
        os.makedirs(directory)
    print("Downloading..\nFile will be saved as:",
          str(args.output_location + dl_name + ".pdf"))
    download_file(url=dl_url, path=str(args.output_location + dl_name + ".pdf"))
def process(self, json_path, initial_url=INITIAL_URL, location="local"):
    """Process the data.

    :param json_path: String
    :param initial_url: String
    :param location: String
    """
    rules = self.__get_local_json_data(json_path)
    item_parser = ItemParser(rules[str(self._initial_action)])
    scraper = Scraper(auth="auth", user="******", passw="Scraper",
                      initial_url=initial_url)
    response = scraper.start_request()
    current_page = self._initial_action
    while True:
        scraped_item = scraper.parse_item(response, item_parser)
        if scraped_item is None:
            print("ALERT - Can't move to page {prev_page}: page {current_page} "
                  "link has been malevolently tampered with!!".format(
                      prev_page=item_parser.get_next_parser(),
                      current_page=current_page))
            break
        print("Move to page {current_page}".format(current_page=current_page))
        next_parser = scraped_item['next_parser']
        next_url = scraped_item['next_url']
        item_parser = ItemParser(rules[next_parser])
        response = scraper.start_request(url=next_url)
        current_page = next_parser
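# The rules file loaded by process() is assumed to map parser names to per-page
# parsing rules; a purely illustrative shape (keys invented for the example):
#
#     {
#         "page_1": {"...parsing rules...": "...", "next": "page_2"},
#         "page_2": {"...parsing rules...": "..."}
#     }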
def main():
    logger = VerboseScraperLogger()
    scraper = Scraper(database=DATABASE, logger=logger,
                      num_threads=THREADS, force=FORCE)
    scraper.scrape(url=SEARCH_URL)
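# main() above reads module-level configuration; the names come from the call
# site, while the values here are purely illustrative:
#
#     DATABASE = "scraped.db"
#     THREADS = 4
#     FORCE = False
#     SEARCH_URL = "https://example.com/search?page=1"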
def main():
    categories = []
    main_url = 'https://www.allrecipes.com/recipes/'
    # Scrape the list of categories to scrape recipes for.
    source = requests.get(main_url).text
    soup = BeautifulSoup(source, 'lxml')
    category_containers = soup.find_all('div', class_='all-categories-col')
    for container in category_containers:
        for section in container.find_all('section'):
            title = section.h3.text
            title = '-'.join(title.lower().split())
            for li in section.ul.find_all('li'):
                cat_name = li.a.text
                cat_name = '-'.join(cat_name.lower().split())
                cat_url = li.a['href']
                categories.append({'title': title, 'category': cat_name, 'url': cat_url})
    # Run a scraper per category, collecting up to max_recipe_num recipes each.
    max_recipe_num = 30
    for cat in categories:
        scraper = Scraper(cat['title'] + '__' + cat['category'],
                          max_recipe_num=max_recipe_num)
        scraper.get_list_of_categories(cat['url'])
        scraper.parse_category_list_for_recipes()
        scraper.save_to_csv('./data/')