import collections
import random
import time
import traceback

from selenium.common.exceptions import MoveTargetOutOfBoundsException, TimeoutException
from selenium.webdriver.common.action_chains import ActionChains

# Project-level helpers and settings (scroll_down, loggingclient, price_utilities,
# NUM_MOUSE_MOVES, RANDOM_SLEEP_LOW, RANDOM_SLEEP_HIGH) are defined or imported
# elsewhere in this module.


def bot_mitigation(webdriver):
    """Performs three optional commands for bot-detection mitigation when getting a site."""
    # bot mitigation 1: move the mouse randomly around a number of times
    window_size = webdriver.get_window_size()
    num_moves = 0
    num_fails = 0
    while num_moves < NUM_MOUSE_MOVES + 1 and num_fails < NUM_MOUSE_MOVES:
        try:
            if num_moves == 0:
                # move to the center of the screen
                x = int(round(window_size['height'] / 2))
                y = int(round(window_size['width'] / 2))
            else:
                # move a random amount in some direction
                move_max = random.randint(0, 500)
                x = random.randint(-move_max, move_max)
                y = random.randint(-move_max, move_max)
            action = ActionChains(webdriver)
            action.move_by_offset(x, y)
            action.perform()
            num_moves += 1
        except MoveTargetOutOfBoundsException:
            num_fails += 1
            # print "[WARNING] - Mouse movement out of bounds, trying a different offset..."
            pass

    # bot mitigation 2: scroll down the page in random intervals
    scroll_down(webdriver)

    # bot mitigation 3: randomly wait so that page visits appear at irregular intervals
    time.sleep(random.randrange(RANDOM_SLEEP_LOW, RANDOM_SLEEP_HIGH))
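# The bot_mitigation routine above relies on module-level constants and a scroll_down
# helper whose definitions are not part of this snippet. The sketch below shows one
# possible shape for them; the concrete values and the scrolling loop are assumptions,
# not the original definitions.

NUM_MOUSE_MOVES = 4      # assumed: number of random mouse movements per visit
RANDOM_SLEEP_LOW = 1     # assumed: lower bound (seconds) of the random post-visit wait
RANDOM_SLEEP_HIGH = 7    # assumed: upper bound (seconds) of the random post-visit wait


def scroll_down(driver):
    """Assumed helper: scroll the page down in small random steps with short pauses."""
    at_bottom = False
    while random.random() > 0.20 and not at_bottom:
        # scroll by a random amount and pause, so scrolling looks less mechanical
        driver.execute_script("window.scrollBy(0, %d)" % random.randint(200, 600))
        at_bottom = driver.execute_script(
            "return (window.innerHeight + window.scrollY) >= document.body.scrollHeight")
        time.sleep(random.uniform(0.5, 1.5))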
def click_on_links(browser_params, manager_params, driver, odict, number_of_results_to_click):
    """Click on n links, where n depends on the value of number_of_results_to_click.

    Args:
        number_of_results_to_click: number of links to click; can be positive or negative:
            + positive, e.g. number_of_results_to_click=3: click on the top 3 most expensive products
            + negative, e.g. number_of_results_to_click=-3: click on the top 3 cheapest products
    """
    logger = loggingclient(*manager_params['logger_address'])
    # get the top number_of_results_to_click cheapest or most expensive links
    most_expensive = price_utilities.get_top_n_most_expensive_product(
        odict, number_of_results_to_click)
    # print(most_expensive)
    # store which links were clicked, and the prices of those links
    clicked_link_dict = collections.OrderedDict()
    for k in most_expensive.keys():
        # e.g. k = 'shopping/product/12137993817137030622@_@Garden of Life Perfect Food Raw Organic Green Super Food Powder - 8.5 oz jar', with value 30.25
        link_text = k.split('@_@')[-1].strip()
        if link_text.endswith('...'):
            link_text = link_text[:-4].strip()
        # print(link_text)
        try:
            if len(driver.find_elements_by_partial_link_text(link_text)) < 1:
                continue
            driver.find_elements_by_partial_link_text(link_text)[-1].click()
            logger.info('BROWSER %i: Click on: %s' % (browser_params['crawl_id'], link_text))
            time.sleep(3)
            # Google search results must be clicked a second time to actually follow them
            driver.find_elements_by_partial_link_text(link_text)[-1].click()
            time.sleep(3)
            scroll_down(driver)
            time.sleep(3)
            clicked_link_dict[k] = most_expensive[k]
            driver.back()
        except Exception as e:
            print(e)
            traceback.print_exc()
        finally:
            pass
    # the source snippet ends at the finally clause; returning the clicked links is an
    # assumed completion that mirrors click_links_in_training_sites below
    return clicked_link_dict
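# For context, a minimal sketch of how click_on_links might be invoked. The product
# dictionary mirrors the 'link@_@link text' -> price format shown in the comment above;
# the concrete keys, prices, and parameter values are illustrative assumptions.

example_odict = collections.OrderedDict([
    ('shopping/product/111@_@Example Blender 1500W', 129.99),
    ('shopping/product/222@_@Example Hand Mixer', 24.50),
])
example_browser_params = {'crawl_id': 0}                           # assumed structure
example_manager_params = {'logger_address': ('127.0.0.1', 5555)}   # assumed structure

# with a live Selenium driver on the search-results page:
# clicked = click_on_links(example_browser_params, example_manager_params,
#                          driver, example_odict, number_of_results_to_click=2)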
def click_links_in_training_sites(manager_params, webdriver, browser_params, odict,
                                  train=True, no_click=False, average_price=2000000):
    """Click on a link if it is in the training websites list.

    Args:
        - no_click: if True, the function returns immediately
        - odict: dictionary produced by get_all_products_prices_links(html_content)
        - train: default is True; True for the training phase, False for the testing phase.
          If train is True, click on the link when it is inside the visited-websites list,
          otherwise do not click.

    Return:
        - a dictionary of the links which are inside the visited-websites list;
          by default, does not click on any links
    """
    logger = loggingclient(*manager_params['logger_address'])
    clicked_link_dict = collections.OrderedDict()
    if no_click is True:
        # do not click
        return clicked_link_dict
    driver = webdriver
    links_dict = odict  # price_utilities.get_all_products_prices_links(html_content)
    # e.g. key = 'google.com@_@Google Search@_@20'
    links = [key.lower() for key in links_dict.keys()]
    _url = webdriver.current_url
    training_sites = [site.strip().lower() for site in open(browser_params["training_sites"])]
    # print(training_sites)
    # print(links)
    nu = 0  # click only for 2 links
    for site in training_sites:
        for link in links:
            if site in link:
                if train is False:
                    clicked_link_dict[link] = links_dict[link]
                if train is True:
                    if links_dict[link] < average_price:
                        return clicked_link_dict
                    # self.my_log.debug('visit this %s', link)
                    try:
                        if nu > 1:
                            # only click on the first links which are also inside the training sites
                            break
                        nu = nu + 1
                        if link.endswith('...'):
                            link = link[:-4].strip()
                        driver.find_elements_by_partial_link_text(
                            link.split("@_@")[-1])[-1].click()
                        time.sleep(5)
                        # second click, as with Google search results
                        try:
                            driver.find_elements_by_partial_link_text(
                                link.split("@_@")[-1])[-1].click()
                        except TimeoutException:
                            driver.back()
                        time.sleep(5)
                        clicked_link_dict[link] = links_dict[link]
                        scroll_down(driver)
                        time.sleep(2)
                        driver.back()
                    except Exception:
                        # recover by going back to the original results page
                        driver.get(_url)
                        break
    return clicked_link_dict
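# A similar hedged sketch for click_links_in_training_sites: browser_params is assumed to
# carry a 'training_sites' path pointing at a plain-text file with one site per line, e.g.
#
#     example-shop.com
#     another-store.net
#
# With a live Selenium driver on a results page and `products` produced by
# price_utilities.get_all_products_prices_links(), a call could look like:
#
# clicked = click_links_in_training_sites(example_manager_params, driver,
#                                         {'crawl_id': 0, 'training_sites': 'training_sites.txt'},
#                                         products, train=True, no_click=False)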