class BasePage(object): def __init__(self): self.driver = DriverHelper().get_driver() self.wait = WebDriverWait(self.driver, 30) def wait_for_element_to_be_visible(self, locator): return self.wait.until(EC.presence_of_element_located(locator)) def is_element_visible(self, locator): # this method is waiting for the element to appear only 2 seconds # It's done to speed up script # This method could be used only when you are 100% sure that page elready loaded wait = WebDriverWait(self.driver, 2) try: wait.until(EC.presence_of_element_located(locator)) return True except TimeoutException: return False def wait_for_element_and_get_text(self, locator): return self.wait.until(EC.presence_of_element_located(locator)).text def click_on_element(self, locator): print("clicking on element with locator: {0} {1}".format( locator[0], locator[1])) element = self.wait.until(EC.presence_of_element_located(locator)) self.driver.execute_script("arguments[0].click();", element) def get_all_elements(self, locator): return self.driver.find_elements(locator[0], locator[1]) def wait_for_one_element_to_be_presented(self, locator1, locator2): self.wait.until( lambda driver: driver.find_elements(locator1[0], locator1[ 1]) or driver.find_elements(locator2[0], locator2[1]))
def open_it_via_url(search_term): # TODO: remover hardcoded link and move parameters to config # here is hardcoded link for search with parameters: # -sale type: Sofort-kaufen # -state: Gebraucht # -sorting: by publishing date (new first) # -category: Handy & Smartphone SEARCH_LINK_PHONE = "https://www.ebay.de/sch/i.html?_from=R40&_nkw={search_term}&_sacat=9355&LH_BIN=1&_sop=10&rt=nc&LH_ItemCondition=3000" driver = DriverHelper().get_driver() driver.get(SEARCH_LINK_PHONE.format(search_term=search_term)) return SearchPage()
def check_active_listings(search_term): flag_old_30_days_old_listings(search_term) mongo_helper = MongoHelper() mongo_helper.start_client_and_connect() # we are taking only active listings, which are not marked as old and or multiitem active_listings = mongo_helper.get_active_singleitem_and_not_flaged_as_old_listings( search_term) print("There are {0} active listings in db".format( active_listings.count())) driver_helper = DriverHelper() driver_helper.start_driver() #TODO: replace threshold with retry after testing number_of_failed_tries_to_scrape_listing = 0 error_threshold = 20 for listing in active_listings.batch_size(10): try: retry(update_active_listing, listing) except Exception as e: trace = traceback.format_exc() print("Failed to scrape the listing {0}".format(listing["link"])) driver_helper.take_screenshot() print(e) print(trace) number_of_failed_tries_to_scrape_listing += 1 if (number_of_failed_tries_to_scrape_listing > error_threshold): raise Exception( "More than {0} scraping failures for the search term {1}". format(error_threshold, search_term)) continue driver_helper.quit_driver() mongo_helper.close_connection()
def update_active_listing(listing): print("Checking listing {0}".format(listing["link"])) driver_helper = DriverHelper() mongo_helper = MongoHelper() try: listing_page = ListingPage.open_url(listing["link"]) except: ListingNotFoundPage() mongo_helper.update_listing(listing["_id"], "page_not_found", True) return today = datetime.datetime.now().strftime('%d. %b. %Y') if listing_page.is_active(): print("The listing {0} page is ACTIVE".format(listing["link"])) print("Updating listing {0}".format(listing["_id"])) mongo_helper.update_listing(listing["_id"], "last_update", today) else: print("The listing {0} page is NOT ACTIVE".format(listing["link"])) closure_date = None try: closure_date = listing_page.get_closure_date() except: # it's fine if the product is just out of stock print( "there is no closure data for this closed listing. Checking if it's just out of stock" ) reason = listing_page.get_closure_reason() print(reason.encode('utf-8')) if (reason != 'Dieser Artikel ist nicht vorrätig.'): raise Exception( "Closure reason is ont 'out of stock' and closur date not provided" ) listing_page.open_original_listing_if_the_link_is_available() closure_reason = listing_page.get_closure_reason() final_price = listing_page.get_price() fianl_shipping_cost = listing_page.get_shipping_cost() print("Updating listing {0}".format(listing["_id"])) mongo_helper.update_listing(listing["_id"], "last_update", today) mongo_helper.update_listing(listing["_id"], "final_price", final_price) mongo_helper.update_listing(listing["_id"], "fianl_shipping_cost", fianl_shipping_cost) mongo_helper.update_listing(listing["_id"], "closure_reason", closure_reason) mongo_helper.update_listing(listing["_id"], "closure_date", closure_date) mongo_helper.update_listing(listing["_id"], "active", False)
def scrape_new_listings(search_term): # open initial search page for search_term with predefined search criterias. See search criterias in SearchPage driver_helper = DriverHelper() driver_helper.start_driver() mongo_helper = MongoHelper() mongo_helper.start_client_and_connect() print("Searching new listings for '{0}' keyword".format(search_term)) search_page = SearchPage.open_it_via_url(search_term) there_are_new_listings = True there_are_more_pages = True while (there_are_new_listings & there_are_more_pages): there_are_new_listings = scrape_search_results_from_the_currenst_search_page( search_term) there_are_more_pages = search_page.go_to_the_next_page_if_available() print( "There are new pages available: {0}\nThere are new listings: {1}". format(there_are_more_pages, there_are_new_listings)) driver_helper.quit_driver() mongo_helper.close_connection()
def scrape_search_results_from_the_currenst_search_page(search_term): search_page = SearchPage() driver_helper = DriverHelper() mongo_helper = MongoHelper() advert_links_and_dates = search_page.get_listing_links_and_dates() there_are_new_listings = True if len(advert_links_and_dates) > 0 else False # Could be that scraping of the listing page will fail because of some edge cases and network issues # I'd like to have info about the fils in the logs, but still continue scraping if the amount of failures is small # TODO: replace threshold with retry after testing number_of_failed_tries_to_scrape_listing = 0 error_per_serch_page_threshold = 2 # total number of listings on the search page is 50 # Sometimes after navigation to the next page we see the last listing form the previos page on it # Adding threshold to avoid false conclusion that there are only already saved listings on the page number_of_existing_listings_on_the_page = 0 existing_listings_on_the_page_threshold = 3 for link_and_date in advert_links_and_dates: print("Listing date: {}".format(link_and_date["date"])) #next line was used for initial run #if (("Mai" in link_and_date["date"]) or ("Apr" in link_and_date["date"])): #return False link = link_and_date["link"] date = "{0} {1}".format(link_and_date["date"], datetime.datetime.now().year) try: print("Scraping listing {0}".format(link)) listing_page = ListingPage.open_url_in_the_new_tab(link) listing = listing_page.get_listing_data() except Exception as e: trace = traceback.format_exc() print("Failed to scrape the listing {0}".format(link)) driver_helper.take_screenshot() print(e) print(trace) number_of_failed_tries_to_scrape_listing += 1 if (number_of_failed_tries_to_scrape_listing > error_per_serch_page_threshold): raise Exception( "More than {0} scraping failures for the search result page" .format(error_per_serch_page_threshold)) driver_helper.close_current_tab_and_go_to_the_first_one() continue listing.creation_date = date listing.search_term = search_term driver_helper.close_current_tab_and_go_to_the_first_one() if ((len(mongo_helper.get_listing_by_id(listing._id)) > 0) & (number_of_existing_listings_on_the_page > existing_listings_on_the_page_threshold)): print("This listing is already in the db") print( "Number of the listings that were already saved in the db on the current search page is {0}" .format(number_of_existing_listings_on_the_page)) there_are_new_listings = False break elif (mongo_helper.get_listing_by_id(listing._id)): print("This listing is already in the db: {0}".format(link)) if (mongo_helper.get_listing_by_id( listing._id)[0]["search_term"] == search_term): number_of_existing_listings_on_the_page += 1 else: print( "This listing was saved in database for the different search term" ) continue mongo_helper.insert_listing(listing) return there_are_new_listings
def __init__(self): self.driver = DriverHelper().get_driver() self.wait = WebDriverWait(self.driver, 30)
def open_url(link): driver = DriverHelper().get_driver() driver.get(link) return ListingPage()
def open_url_in_the_new_tab(link): DriverHelper().open_url_in_the_new_tab(link) return ListingPage()