コード例 #1
0
def bot_mitigation(webdriver):
    """Perform three bot-detection mitigation steps after getting a site.

    1. Move the mouse by half the window size (towards the centre), then by
       up to ``NUM_MOUSE_MOVES`` random offsets.
    2. Scroll down the page via ``scroll_down``.
    3. Sleep for a random number of seconds drawn from
       ``[RANDOM_SLEEP_LOW, RANDOM_SLEEP_HIGH)``.

    Args:
        webdriver: Selenium WebDriver instance controlling the browser.
    """
    # bot mitigation 1: move the mouse around randomly a number of times
    window_size = webdriver.get_window_size()
    num_moves = 0
    num_fails = 0
    # One extra iteration for the initial move to the centre; give up once
    # NUM_MOUSE_MOVES attempts have landed out of bounds.
    while num_moves < NUM_MOUSE_MOVES + 1 and num_fails < NUM_MOUSE_MOVES:
        if num_moves == 0:
            # Move towards the centre of the window. move_by_offset takes
            # (xoffset, yoffset): x is horizontal, so it comes from the
            # width, and y is vertical, so it comes from the height.
            x = int(round(window_size['width'] / 2))
            y = int(round(window_size['height'] / 2))
        else:
            # move a random amount in some direction
            move_max = random.randint(0, 500)
            x = random.randint(-move_max, move_max)
            y = random.randint(-move_max, move_max)
        try:
            action = ActionChains(webdriver)
            action.move_by_offset(x, y)
            action.perform()
        except MoveTargetOutOfBoundsException:
            # The offset left the viewport; count the failure and retry.
            num_fails += 1
        else:
            num_moves += 1

    # bot mitigation 2: scroll in random intervals down page
    scroll_down(webdriver)

    # bot mitigation 3: randomly wait so that page visits appear at irregular intervals
    time.sleep(random.randrange(RANDOM_SLEEP_LOW, RANDOM_SLEEP_HIGH))
コード例 #2
0
def bot_mitigation(webdriver):
    """Perform three bot-detection mitigation steps after getting a site.

    1. Move the mouse by half the window size (towards the centre), then by
       up to ``NUM_MOUSE_MOVES`` random offsets.
    2. Scroll down the page via ``scroll_down``.
    3. Sleep for a random number of seconds drawn from
       ``[RANDOM_SLEEP_LOW, RANDOM_SLEEP_HIGH)``.

    Args:
        webdriver: Selenium WebDriver instance controlling the browser.
    """
    # bot mitigation 1: move the mouse around randomly a number of times
    window_size = webdriver.get_window_size()
    num_moves = 0
    num_fails = 0
    # One extra iteration for the initial move to the centre; give up once
    # NUM_MOUSE_MOVES attempts have landed out of bounds.
    while num_moves < NUM_MOUSE_MOVES + 1 and num_fails < NUM_MOUSE_MOVES:
        if num_moves == 0:
            # Move towards the centre of the window. move_by_offset takes
            # (xoffset, yoffset): x is horizontal, so it comes from the
            # width, and y is vertical, so it comes from the height.
            x = int(round(window_size['width'] / 2))
            y = int(round(window_size['height'] / 2))
        else:
            # move a random amount in some direction
            move_max = random.randint(0, 500)
            x = random.randint(-move_max, move_max)
            y = random.randint(-move_max, move_max)
        try:
            action = ActionChains(webdriver)
            action.move_by_offset(x, y)
            action.perform()
        except MoveTargetOutOfBoundsException:
            # The offset left the viewport; count the failure and retry.
            num_fails += 1
        else:
            num_moves += 1

    # bot mitigation 2: scroll in random intervals down page
    scroll_down(webdriver)

    # bot mitigation 3: randomly wait so that page visits appear at irregular intervals
    time.sleep(random.randrange(RANDOM_SLEEP_LOW, RANDOM_SLEEP_HIGH))
コード例 #3
0
def click_on_links(browser_params, manager_params, driver, odict,
                   number_of_results_to_click):
    """Click on result links selected by price and record the clicked ones.

    The links to click are chosen by
    ``price_utilities.get_top_n_most_expensive_product(odict,
    number_of_results_to_click)``. For each selected key, the text after the
    last ``@_@`` separator is used as partial link text; the last matching
    element is clicked twice, the page is scrolled with ``scroll_down`` and
    the browser navigates back. Exceptions raised while clicking are printed
    together with their traceback.

    Args:
        browser_params: mapping; ``browser_params['crawl_id']`` is used in
            the log message.
        manager_params: mapping; ``manager_params['logger_address']`` is
            unpacked into ``loggingclient``.
        driver: Selenium WebDriver used to find and click the links.
        odict: mapping handed to the price helper; presumably link key ->
            price (see the example key in the loop) -- TODO confirm.
        number_of_results_to_click: number of links to click; can be positive
            or negative (semantics as described by the original author, they
            are implemented in ``price_utilities``):
                + positive, e.g. number_of_results_to_click=3: click on the top 3 most expensive products
                + negative, e.g. number_of_results_to_click=-3: click on the top 3 cheapest products
    """
    logger = loggingclient(*manager_params['logger_address'])
    # select the links to click (top cheapest or top most expensive,
    # depending on the sign of number_of_results_to_click)
    most_expensive = price_utilities.get_top_n_most_expensive_product(
        odict, number_of_results_to_click)
    # records which links were clicked, together with their prices
    clicked_link_dict = collections.OrderedDict()

    for k in most_expensive.keys():
        # example key (its value would be the price, e.g. 30.25):
        # 'shopping/product/12137993817137030622@_@Garden of Life Perfect Food Raw Organic Green Super Food Powder - 8.5 oz jar'
        link_text = k.split('@_@')[-1].strip()
        if link_text.endswith('...'):
            # NOTE(review): this drops four characters although '...' is only
            # three long; possibly deliberate to shorten a truncated title
            # a little further -- confirm.
            link_text = link_text[:-4].strip()
        try:
            # skip keys for which no link is found on the current page
            if len(driver.find_elements_by_partial_link_text(link_text)) < 1:
                continue
            driver.find_elements_by_partial_link_text(link_text)[-1].click()
            logger.info('BROWSER %i: Click on: %s' %
                        (browser_params['crawl_id'], link_text))
            time.sleep(3)
            driver.find_elements_by_partial_link_text(link_text)[-1].click(
            )  # deliberate second click: per the original author, Google requires clicking twice
            time.sleep(3)
            scroll_down(driver)
            time.sleep(3)
            clicked_link_dict[k] = most_expensive[k]
            driver.back()
        except Exception, e:
            print(e)
            traceback.print_exc()
        finally:
            # NOTE(review): the body of this ``finally`` clause and the rest of
            # the function are not visible in this excerpt.
コード例 #4
0
def click_links_in_training_sites(manager_params,
                                  webdriver,
                                  browser_params,
                                  odict,
                                  train=True,
                                  no_click=False,
                                  average_price=2000000):
    """Click on (or just record) links that belong to the training sites.

    Every key of ``odict`` is matched case-insensitively against each
    non-blank line of the file at ``browser_params["training_sites"]``; a key
    matches when a training-site line is a substring of it.

    Args:
        manager_params: mapping; ``manager_params['logger_address']`` is
            unpacked into ``loggingclient``. Only used when clicking is
            enabled.
        webdriver: Selenium WebDriver used to locate and click the links.
        browser_params: mapping; ``browser_params["training_sites"]`` is the
            path of a text file listing one training site per line.
        odict: mapping of link key -> price, e.g. the result of
            ``get_all_products_prices_links(html_content)``. The text after
            the last ``@_@`` of a key is used as partial link text.
        train: ``True`` for the training phase (matching links are clicked,
            at most two click attempts per call); ``False`` for the testing
            phase (matching links are only recorded, nothing is clicked).
        no_click: when ``True`` the function returns an empty dictionary
            immediately.
        average_price: in the training phase, the first matching link priced
            below this value makes the function stop.

    Returns:
        An ``OrderedDict`` mapping the lowercased key of every recorded link
        to its price. ``None`` is returned instead when, in the training
        phase, a matching link is priced below ``average_price`` (legacy
        behaviour, kept for backward compatibility).
    """
    clicked_link_dict = collections.OrderedDict()
    if no_click is True:  # do not click
        return clicked_link_dict

    logger = loggingclient(*manager_params['logger_address'])
    driver = webdriver
    start_url = driver.current_url

    # Keep the original key next to its lowercased form: the lowercased form
    # is only for case-insensitive site matching (and for the keys of the
    # returned dictionary), while price lookups and link-text searches need
    # the original spelling.
    links = [(key.lower(), key) for key in odict]

    # Close the file deterministically; strip the trailing newline of every
    # line (otherwise "site in key" can practically never match) and skip
    # blank lines (an empty string would match every key).
    with open(browser_params["training_sites"]) as sites_file:
        training_sites = [
            line.strip().lower() for line in sites_file if line.strip()
        ]

    click_attempts = 0
    for site in training_sites:
        for lowered_key, key in links:
            if site not in lowered_key:
                continue
            price = odict[key]
            if train is False:
                clicked_link_dict[lowered_key] = price
            if train is True:
                if price < average_price:
                    # Legacy early exit: returns None, not the dictionary.
                    return None
                if click_attempts > 1:  # click on two links at most
                    break
                click_attempts += 1

                link_text = key
                if link_text.endswith('...'):
                    # Drops the ellipsis plus one more character, exactly as
                    # the original code did; partial matching tolerates it.
                    link_text = link_text[:-4].strip()
                link_text = link_text.split("@_@")[-1]
                try:
                    driver.find_elements_by_partial_link_text(
                        link_text)[-1].click()
                    time.sleep(5)
                    # the link is clicked a second time, as in the original flow
                    try:
                        driver.find_elements_by_partial_link_text(
                            link_text)[-1].click()
                    except TimeoutException:
                        driver.back()
                    time.sleep(5)
                    clicked_link_dict[lowered_key] = price
                    scroll_down(driver)
                    time.sleep(2)
                    driver.back()
                except Exception as exc:
                    # Best effort: report the failure, go back to the page we
                    # started from and stop trying links for this site.
                    logger.info('Failed to click on training link %r: %r' %
                                (link_text, exc))
                    driver.get(start_url)
                    break
    return clicked_link_dict