import random
import time

def browse_website(url, num_links, webdriver, proxy_queue, browser_params):
    """
    calls get_website before visiting <num_links> links present on the page
    NOTE: top_url will NOT be properly labeled for requests to subpages
          these will still have the top_url set to the url passed as a parameter
          to this function.
    """
    # First get the site
    get_website(url, webdriver, proxy_queue, browser_params)

    # Then visit a few subpages
    for i in range(num_links):
        links = get_intra_links(webdriver, url)
        links = [x for x in links if x.is_displayed()]
        if len(links) == 0:
            break
        r = int(random.random() * len(links))  # uniform random index into links
        print("BROWSE: visiting link to %s" % links[r].get_attribute("href"))
        
        try:
            links[r].click()
            wait_until_loaded(webdriver, 300)
            time.sleep(1)
            if browser_params['bot_mitigation']:
                bot_mitigation(webdriver)
            webdriver.back()
        except Exception:
            pass
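The helper get_intra_links is not shown in these snippets. A minimal sketch of what it plausibly does, assuming the Selenium 3-style element API and treating any displayed anchor whose href resolves to the same hostname as the crawled URL as internal (the hostname comparison is an assumption, not the original implementation):

from urllib.parse import urlparse

def get_intra_links(webdriver, url):
    # Sketch only: an "intra" link is taken to be any anchor whose
    # href points back at the same hostname as the page under crawl.
    host = urlparse(url).hostname or ''
    intra = []
    for anchor in webdriver.find_elements_by_tag_name('a'):  # Selenium 3 API
        href = anchor.get_attribute('href')
        if href and (urlparse(href).hostname or '') == host:
            intra.append(anchor)
    return intra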
Example 2
def browse_website(url, num_links, webdriver, proxy_queue, browser_params, manager_params, extension_socket):
    """
    calls get_website before visiting <num_links> links present on the page
    NOTE: top_url will NOT be properly labeled for requests to subpages
          these will still have the top_url set to the url passed as a parameter
          to this function.
    """
    # First get the site
    get_website(url, webdriver, proxy_queue, browser_params, extension_socket)

    # Connect to logger
    logger = loggingclient(*manager_params['logger_address'])

    # Then visit a few subpages
    for i in range(num_links):
        links = get_intra_links(webdriver, url)
        links = [x for x in links if x.is_displayed()]
        if len(links) == 0:
            break
        r = int(random.random() * len(links))
        logger.info("BROWSER %i: visiting internal link %s" % (browser_params['crawl_id'], links[r].get_attribute("href")))
        
        try:
            links[r].click()
            wait_until_loaded(webdriver, 300)
            time.sleep(1)
            if browser_params['bot_mitigation']:
                bot_mitigation(webdriver)
            webdriver.back()
        except Exception:
            pass
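loggingclient is also external to these snippets; the call sites unpack manager_params['logger_address'] into its arguments and expect back a logger-like object with .info() and .error(). A hedged sketch, assuming the address is a (host, port) pair and standing in a standard-library SocketHandler for whatever transport the original used:

import logging
import logging.handlers

def loggingclient(host, port):
    # Sketch only: return a logger that ships records to a central
    # logging server at (host, port); the real transport is unknown.
    logger = logging.getLogger('browser')
    if not logger.handlers:
        logger.addHandler(logging.handlers.SocketHandler(host, port))
        logger.setLevel(logging.INFO)
    return logger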
Example 3
def browse_website(url, num_links, sleep, visit_id, webdriver, proxy_queue,
                   browser_params, manager_params, extension_socket):
    """Calls get_website before visiting <num_links> present on the page.

    Note: the site_url in the site_visits table for the links visited will
    be the site_url of the original page and NOT the url of the links visited.
    """
    # First get the site
    get_website(url, sleep, visit_id, webdriver, proxy_queue, browser_params, extension_socket)

    # Connect to logger
    logger = loggingclient(*manager_params['logger_address'])

    # Then visit a few subpages
    for i in range(num_links):
        links = get_intra_links(webdriver, url)
        links = [x for x in links if x.is_displayed()]
        if len(links) == 0:
            break
        r = int(random.random() * len(links))
        logger.info("BROWSER %i: visiting internal link %s" % (browser_params['crawl_id'], links[r].get_attribute("href")))

        try:
            links[r].click()
            wait_until_loaded(webdriver, 300)
        time.sleep(max(1, sleep))
            if browser_params['bot_mitigation']:
                bot_mitigation(webdriver)
            webdriver.back()
        except Exception:
            pass
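Every example calls wait_until_loaded(webdriver, 300) after a click. A minimal sketch of that helper, assuming the second argument is a timeout in seconds and that polling document.readyState is an acceptable readiness check:

import time

def wait_until_loaded(webdriver, timeout, period=0.25):
    # Sketch only: poll until the DOM reports 'complete' or the
    # timeout (in seconds) expires; return whether the page loaded.
    deadline = time.time() + timeout
    while time.time() < deadline:
        if webdriver.execute_script('return document.readyState') == 'complete':
            return True
        time.sleep(period)
    return False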
Example 5
def browse_website(url, num_links, webdriver, proxy_queue, browser_params,
                   manager_params, extension_socket):
    """
    calls get_website before visiting <num_links> links present on the page
    NOTE: top_url will NOT be properly labeled for requests to subpages
          these will still have the top_url set to the url passed as a parameter
          to this function.
    """
    # First get the site
    get_website(url, webdriver, proxy_queue, browser_params, extension_socket)

    # Connect to logger
    logger = loggingclient(*manager_params['logger_address'])

    # Then visit a few subpages
    for i in range(num_links):
        links = get_intra_links(webdriver, url)
        links = [x for x in links if x.is_displayed()]
        if len(links) == 0:
            break
        r = int(random.random() * len(links))
        logger.info(
            "BROWSER %i: visiting internal link %s" %
            (browser_params['crawl_id'], links[r].get_attribute("href")))

        try:
            links[r].click()
            wait_until_loaded(webdriver, 300)
            time.sleep(1)
            if browser_params['bot_mitigation']:
                bot_mitigation(webdriver)
            webdriver.back()
        except Exception:
            pass
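When browser_params['bot_mitigation'] is set, the examples call bot_mitigation(webdriver) before navigating back. A sketch of one plausible implementation, assuming the goal is to mimic light human activity with a few random mouse movements, a scroll, and a pause (the specific gestures here are assumptions):

import random
import time

from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.action_chains import ActionChains

def bot_mitigation(webdriver, num_moves=4):
    # Sketch only: small random mouse moves, a random scroll, and a
    # short pause, to look less like an automated visitor.
    for _ in range(num_moves):
        try:
            ActionChains(webdriver).move_by_offset(
                random.randrange(0, 10), random.randrange(0, 10)).perform()
        except WebDriverException:
            pass  # the offset may land outside the viewport
        time.sleep(random.uniform(0.2, 0.6))
    webdriver.execute_script(
        'window.scrollBy(0, %d);' % random.randrange(100, 400))
    time.sleep(random.uniform(0.5, 1.5))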
Example 6
def browse_website(url, num_links, sleep, visit_id, webdriver, proxy_queue,
                   browser_params, manager_params, extension_socket):
    """Calls get_website before visiting <num_links> present on the page.

    Note: the site_url in the site_visits table for the links visited will
    be the site_url of the original page and NOT the url of the links visited.
    """
    # First get the site
    get_website(url, sleep, visit_id, webdriver, proxy_queue, browser_params,
                extension_socket)

    # Connect to logger
    logger = loggingclient(*manager_params['logger_address'])

    # Then visit a few subpages
    for i in range(num_links):
        links = get_intra_links(webdriver, url)
        links = [x for x in links if x.is_displayed()]
        if len(links) == 0:
            break
        r = int(random.random() * len(links))
        logger.info(
            "BROWSER %i: visiting internal link %s" %
            (browser_params['crawl_id'], links[r].get_attribute("href")))

        try:
            links[r].click()
            wait_until_loaded(webdriver, 300)
            time.sleep(max(1, sleep))
            if browser_params['bot_mitigation']:
                bot_mitigation(webdriver)
            webdriver.back()
            wait_until_loaded(webdriver, 300)
        except Exception:
            pass
Example 7
import random
import time

from selenium.common.exceptions import (ElementNotVisibleException,
                                        WebDriverException)

def browse_and_dump_source(url, num_links, sleep, visit_id, webdriver,
                           proxy_queue, browser_params, manager_params,
                           extension_sockets):
    """Calls get_website before visiting <num_links> present on the page.

    Each link visited will do a recursive page source dump.
    """
    # First get the site
    get_website(url, sleep, visit_id, webdriver, proxy_queue, browser_params,
                extension_sockets)
    recursive_dump_page_source(visit_id, webdriver, manager_params, suffix='0')

    # Connect to logger
    logger = loggingclient(*manager_params['logger_address'])

    # Then visit a few subpages
    already_clicked = set()
    for i in range(num_links):
        all_links = get_intra_links(webdriver, url)
        disp_links = [x for x in all_links if is_displayed(x)]
        links = [x for x in disp_links
                 if _filter_out_clicks(x, already_clicked)]
        if len(links) == 0:
            break
        random.shuffle(links)
        clicked = False
        for link in links:
            try:
                href = link.get_attribute('href')
                already_clicked.add(href)
                logger.info("BROWSER %i: Trying to click %s out of "
                            "%i links" %
                            (browser_params['crawl_id'], href, len(links)))
                link.click()
            except ElementNotVisibleException:
                continue
            except WebDriverException:
                continue
            except Exception as e:
                logger.error("BROWSER %i: Exception trying to visit %s, %s" %
                             (browser_params['crawl_id'],
                              link.get_attribute("href"), str(e)))
                continue
            logger.info("BROWSER %i: visiting internal link %s" %
                        (browser_params['crawl_id'], href))
            wait_until_loaded(webdriver, 300)
            time.sleep(max(1, sleep))
            recursive_dump_page_source(visit_id,
                                       webdriver,
                                       manager_params,
                                       suffix=str(i + 1))
            webdriver.back()
            time.sleep(max(1, sleep))
            wait_until_loaded(webdriver, 300)
            clicked = True
            break
        if not clicked:
            break
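Example 7 additionally relies on _filter_out_clicks, which is not shown. Given that already_clicked accumulates hrefs, a plausible reading is a predicate that drops links whose href has already been tried:

from selenium.common.exceptions import WebDriverException

def _filter_out_clicks(link, already_clicked):
    # Sketch only: keep a link if its href has not been clicked yet
    # in this browse session; treat stale/unreadable elements as unusable.
    try:
        return link.get_attribute('href') not in already_clicked
    except WebDriverException:
        return False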