Esempio n. 1
0
def browse_website(url, num_links, webdriver, proxy_queue, browser_params, manager_params, extension_socket):
    """
    Calls get_website, then clicks up to <num_links> internal links on the page.

    On every round the links returned by ``get_intra_links`` are reduced to
    the displayed ones, one of them is picked uniformly at random, clicked,
    and the browser then navigates back. The loop stops early as soon as no
    displayed internal link is available.

    NOTE: top_url will NOT be properly labeled for requests to subpages
          these will still have the top_url set to the url passed as a parameter
          to this function.

    Args:
        url: page to load first; also handed to ``get_intra_links``.
        num_links: maximum number of internal links to click.
        webdriver: browser driver used for all page interaction.
        proxy_queue: forwarded unchanged to ``get_website``.
        browser_params: mapping; ``'crawl_id'`` is used in log messages and
            ``'bot_mitigation'`` toggles the call to ``bot_mitigation``.
        manager_params: mapping; ``'logger_address'`` is unpacked into
            ``loggingclient``.
        extension_socket: forwarded unchanged to ``get_website``.
    """
    # First get the site
    get_website(url, webdriver, proxy_queue, browser_params, extension_socket)

    # Connect to logger
    logger = loggingclient(*manager_params['logger_address'])

    # Then visit a few subpages
    for _ in range(num_links):
        # Build a real list so emptiness checks and random selection do not
        # depend on filter() returning a list.
        links = [link for link in get_intra_links(webdriver, url)
                 if link.is_displayed()]
        if not links:
            break

        # random.choice gives every link the same chance of being picked; an
        # index of int(random.random() * len(links) - 1) would practically
        # never reach the last link and would favour the first one.
        link = random.choice(links)
        href = link.get_attribute("href")
        logger.info("BROWSER %i: visiting internal link %s" % (browser_params['crawl_id'], href))

        try:
            link.click()
            wait_until_loaded(webdriver, 300)
            time.sleep(1)
            if browser_params['bot_mitigation']:
                bot_mitigation(webdriver)
            webdriver.back()
        except Exception as e:
            # Best effort: a link that cannot be visited must not abort the
            # crawl, but the failure is recorded instead of silently dropped.
            logger.error("BROWSER %i: failed to visit internal link %s: %s" %
                         (browser_params['crawl_id'], href, str(e)))
Esempio n. 2
0
def browse_website(url, num_links, sleep, visit_id, webdriver, proxy_queue,
                   browser_params, manager_params, extension_socket):
    """Calls get_website before visiting <num_links> present on the page.

    On every round the links returned by ``get_intra_links`` are reduced to
    the displayed ones, one of them is picked uniformly at random, clicked,
    and the browser then navigates back. The loop stops early as soon as no
    displayed internal link is available.

    Note: the site_url in the site_visits table for the links visited will
    be the site_url of the original page and NOT the url of the links visited.

    Args:
        url: page to load first; also handed to ``get_intra_links``.
        num_links: maximum number of internal links to click.
        sleep: forwarded to ``get_website``; after each click the function
            sleeps ``max(1, sleep)`` seconds.
        visit_id: forwarded unchanged to ``get_website``.
        webdriver: browser driver used for all page interaction.
        proxy_queue: forwarded unchanged to ``get_website``.
        browser_params: mapping; ``'crawl_id'`` is used in log messages and
            ``'bot_mitigation'`` toggles the call to ``bot_mitigation``.
        manager_params: mapping; ``'logger_address'`` is unpacked into
            ``loggingclient``.
        extension_socket: forwarded unchanged to ``get_website``.
    """
    # First get the site
    get_website(url, sleep, visit_id, webdriver, proxy_queue, browser_params, extension_socket)

    # Connect to logger
    logger = loggingclient(*manager_params['logger_address'])

    # Then visit a few subpages
    for _ in range(num_links):
        # Build a real list so emptiness checks and random selection do not
        # depend on filter() returning a list.
        links = [link for link in get_intra_links(webdriver, url)
                 if link.is_displayed()]
        if not links:
            break

        # random.choice gives every link the same chance of being picked; an
        # index of int(random.random() * len(links) - 1) would practically
        # never reach the last link and would favour the first one.
        link = random.choice(links)
        href = link.get_attribute("href")
        logger.info("BROWSER %i: visiting internal link %s" % (browser_params['crawl_id'], href))

        try:
            link.click()
            wait_until_loaded(webdriver, 300)
            time.sleep(max(1, sleep))
            if browser_params['bot_mitigation']:
                bot_mitigation(webdriver)
            webdriver.back()
        except Exception as e:
            # Best effort: a link that cannot be visited must not abort the
            # crawl, but the failure is recorded instead of silently dropped.
            logger.error("BROWSER %i: failed to visit internal link %s: %s" %
                         (browser_params['crawl_id'], href, str(e)))
def browse_website(url, num_links, webdriver, proxy_queue, browser_params):
    """
    Calls get_website, then clicks up to <num_links> internal links on the page.

    On every round the links returned by ``get_intra_links`` are reduced to
    the displayed ones, one of them is picked uniformly at random, clicked,
    and the browser then navigates back. The loop stops early as soon as no
    displayed internal link is available. Progress is reported on stdout.

    NOTE: top_url will NOT be properly labeled for requests to subpages
          these will still have the top_url set to the url passed as a parameter
          to this function.

    Args:
        url: page to load first; also handed to ``get_intra_links``.
        num_links: maximum number of internal links to click.
        webdriver: browser driver used for all page interaction.
        proxy_queue: forwarded unchanged to ``get_website``.
        browser_params: mapping; ``'bot_mitigation'`` toggles the call to
            ``bot_mitigation``.
    """
    # First get the site
    get_website(url, webdriver, proxy_queue, browser_params)

    # Then visit a few subpages
    for _ in range(num_links):
        # Build a real list so emptiness checks and random selection do not
        # depend on filter() returning a list.
        links = [link for link in get_intra_links(webdriver, url)
                 if link.is_displayed()]
        if not links:
            break

        # random.choice gives every link the same chance of being picked; an
        # index of int(random.random() * len(links) - 1) would practically
        # never reach the last link and would favour the first one.
        link = random.choice(links)
        href = link.get_attribute("href")
        # Single parenthesised argument: prints the same text whether print
        # is a statement or a function.
        print("BROWSE: visiting link to %s" % href)

        try:
            link.click()
            wait_until_loaded(webdriver, 300)
            time.sleep(1)
            if browser_params['bot_mitigation']:
                bot_mitigation(webdriver)
            webdriver.back()
        except Exception as e:
            # Best effort: a link that cannot be visited must not abort the
            # crawl, but the failure is reported instead of silently dropped.
            print("BROWSE: failed to visit link to %s: %s" % (href, e))
def browse_website(url, num_links, webdriver, proxy_queue, browser_params):
    """
    Calls get_website, then clicks up to <num_links> internal links on the page.

    Each round collects the displayed links returned by ``get_intra_links``,
    picks one uniformly at random, clicks it and navigates back. When a
    round finds no displayed internal link the loop ends early. Progress is
    reported on stdout.

    NOTE: top_url will NOT be properly labeled for requests to subpages
          these will still have the top_url set to the url passed as a parameter
          to this function.

    Args:
        url: page to load first; also handed to ``get_intra_links``.
        num_links: maximum number of internal links to click.
        webdriver: browser driver used for all page interaction.
        proxy_queue: forwarded unchanged to ``get_website``.
        browser_params: mapping; ``'bot_mitigation'`` toggles the call to
            ``bot_mitigation``.
    """
    # First get the site
    get_website(url, webdriver, proxy_queue, browser_params)

    # Then visit a few subpages
    for _ in range(num_links):
        # A list comprehension keeps len()-style checks and random selection
        # valid even where filter() yields a lazy iterator.
        links = [link for link in get_intra_links(webdriver, url)
                 if link.is_displayed()]
        if not links:
            break

        # Uniform pick over all candidates. Computing the index as
        # int(random.random() * len(links) - 1) skews towards the first link
        # and practically never selects the last one.
        link = random.choice(links)
        href = link.get_attribute("href")
        # Single parenthesised argument: prints the same text whether print
        # is a statement or a function.
        print("BROWSE: visiting link to %s" % href)

        try:
            link.click()
            wait_until_loaded(webdriver, 300)
            time.sleep(1)
            if browser_params['bot_mitigation']:
                bot_mitigation(webdriver)
            webdriver.back()
        except Exception as e:
            # Best effort: keep crawling after a failed click, but say so.
            print("BROWSE: failed to visit link to %s: %s" % (href, e))
Esempio n. 5
0
def browse_website(url, num_links, webdriver, proxy_queue, browser_params,
                   manager_params, extension_socket):
    """
    Calls get_website, then clicks up to <num_links> internal links on the page.

    Each round collects the displayed links returned by ``get_intra_links``,
    picks one uniformly at random, clicks it and navigates back. When a
    round finds no displayed internal link the loop ends early.

    NOTE: top_url will NOT be properly labeled for requests to subpages
          these will still have the top_url set to the url passed as a parameter
          to this function.

    Args:
        url: page to load first; also handed to ``get_intra_links``.
        num_links: maximum number of internal links to click.
        webdriver: browser driver used for all page interaction.
        proxy_queue: forwarded unchanged to ``get_website``.
        browser_params: mapping; ``'crawl_id'`` is used in log messages and
            ``'bot_mitigation'`` toggles the call to ``bot_mitigation``.
        manager_params: mapping; ``'logger_address'`` is unpacked into
            ``loggingclient``.
        extension_socket: forwarded unchanged to ``get_website``.
    """
    # First get the site
    get_website(url, webdriver, proxy_queue, browser_params, extension_socket)

    # Connect to logger
    logger = loggingclient(*manager_params['logger_address'])

    # Then visit a few subpages
    for _ in range(num_links):
        # A list comprehension keeps len()-style checks and random selection
        # valid even where filter() yields a lazy iterator.
        links = [link for link in get_intra_links(webdriver, url)
                 if link.is_displayed()]
        if not links:
            break

        # Uniform pick over all candidates. Computing the index as
        # int(random.random() * len(links) - 1) skews towards the first link
        # and practically never selects the last one.
        link = random.choice(links)
        href = link.get_attribute("href")
        logger.info(
            "BROWSER %i: visiting internal link %s" %
            (browser_params['crawl_id'], href))

        try:
            link.click()
            wait_until_loaded(webdriver, 300)
            time.sleep(1)
            if browser_params['bot_mitigation']:
                bot_mitigation(webdriver)
            webdriver.back()
        except Exception as e:
            # Best effort: keep crawling after a failed click, but record it.
            logger.error(
                "BROWSER %i: failed to visit internal link %s: %s" %
                (browser_params['crawl_id'], href, str(e)))
Esempio n. 6
0
def browse_website(url, num_links, sleep, visit_id, webdriver, proxy_queue,
                   browser_params, manager_params, extension_socket):
    """Load ``url`` via get_website, then click up to <num_links> of its links.

    For each round, the links returned by ``get_intra_links`` are narrowed to
    those whose ``is_displayed()`` equals True; a random one is logged and
    clicked, the page is given time to load, and the browser goes back.
    Errors raised while clicking/loading/going back are ignored. The loop
    ends early once a round yields no displayed link.

    Note: the site_url in the site_visits table for the links visited will
    be the site_url of the original page and NOT the url of the links visited.
    """
    def _shown(element):
        # Same strict comparison against True as a lambda-based filter.
        return element.is_displayed() == True

    # Landing page comes first
    get_website(url, sleep, visit_id, webdriver, proxy_queue, browser_params,
                extension_socket)

    # Connect to logger
    log = loggingclient(*manager_params['logger_address'])

    # Subpage visits
    for _ in range(num_links):
        visible = filter(_shown, get_intra_links(webdriver, url))
        if len(visible) < 1:
            break

        target = visible[int(random.random() * len(visible))]
        log.info(
            "BROWSER %i: visiting internal link %s" %
            (browser_params['crawl_id'], target.get_attribute("href")))

        try:
            target.click()
            wait_until_loaded(webdriver, 300)
            # never pause for less than one second
            time.sleep(max(1, sleep))
            if browser_params['bot_mitigation']:
                bot_mitigation(webdriver)
            webdriver.back()
            wait_until_loaded(webdriver, 300)
        except Exception:
            # deliberately best-effort: move on to the next round
            pass
Esempio n. 7
0
def browse_and_dump_source(url, num_links, sleep, visit_id, webdriver,
                           proxy_queue, browser_params, manager_params,
                           extension_sockets):
    """Calls get_website before visiting <num_links> present on the page.

    Each link visited will do a recursive page source dump.

    The landing page is dumped with suffix ``'0'``; the page reached in round
    ``i`` (0-based) is dumped with suffix ``str(i + 1)``. In every round the
    displayed internal links that pass ``_filter_out_clicks`` are shuffled and
    tried in order until one click succeeds; the browser then navigates back.
    The function stops early when a round has no candidate links or when none
    of the candidates could be clicked.

    Args:
        url: page to load first; also handed to ``get_intra_links``.
        num_links: maximum number of internal links to visit.
        sleep: forwarded to ``get_website``; the function sleeps
            ``max(1, sleep)`` seconds after each click and after going back.
        visit_id: forwarded to ``get_website`` and to
            ``recursive_dump_page_source``.
        webdriver: browser driver used for all page interaction.
        proxy_queue: forwarded unchanged to ``get_website``.
        browser_params: mapping; ``'crawl_id'`` is used in log messages.
        manager_params: mapping; ``'logger_address'`` is unpacked into
            ``loggingclient`` and the whole mapping is passed to
            ``recursive_dump_page_source``.
        extension_sockets: forwarded unchanged to ``get_website``.
    """
    # First get the site
    get_website(url, sleep, visit_id, webdriver, proxy_queue, browser_params,
                extension_sockets)
    recursive_dump_page_source(visit_id, webdriver, manager_params, suffix='0')

    # Connect to logger
    logger = loggingclient(*manager_params['logger_address'])

    # Then visit a few subpages
    already_clicked = set()
    for i in range(num_links):
        all_links = get_intra_links(webdriver, url)
        # Real lists are required below: random.shuffle() and len() do not
        # accept a lazy filter object.
        disp_links = [link for link in all_links if is_displayed(link)]
        links = [link for link in disp_links
                 if _filter_out_clicks(link, already_clicked)]
        if not links:
            break
        random.shuffle(links)
        clicked = False
        for link in links:
            # Bound up front so the generic handler below can report it even
            # when get_attribute() itself is what failed.
            href = None
            try:
                href = link.get_attribute('href')
                # Recorded before the click so a link that fails is not
                # retried in a later round.
                already_clicked.add(href)
                logger.info("BROWSER %i: Trying to click %s out of "
                            "%i links" %
                            (browser_params['crawl_id'], href, len(links)))
                link.click()
            except ElementNotVisibleException:
                continue
            except WebDriverException:
                continue
            except Exception as e:
                # Use the href captured above instead of querying the element
                # again: a second driver call here could raise inside the
                # handler and abort the whole command.
                logger.error("BROWSER %i: Exception trying to visit %s, %s" %
                             (browser_params['crawl_id'], href, str(e)))
                continue
            logger.info("BROWSER %i: visiting internal link %s" %
                        (browser_params['crawl_id'], href))
            wait_until_loaded(webdriver, 300)
            time.sleep(max(1, sleep))
            recursive_dump_page_source(visit_id,
                                       webdriver,
                                       manager_params,
                                       suffix=str(i + 1))
            webdriver.back()
            time.sleep(max(1, sleep))
            wait_until_loaded(webdriver, 300)
            clicked = True
            break
        if not clicked:
            # Every candidate failed to click; further rounds would only see
            # the same page again.
            break
Esempio n. 8
0
def _find_and_fill_form(webdriver, email_producer, visit_id, debug,
                        browser_params, manager_params, logger):
    """Finds and fills a form, and returns True if accomplished.

    A newsletter form is looked for in the current document and, failing
    that, inside each ``iframe``. When one is found it is filled in and
    submitted. Afterwards one follow-up form is looked for -- first in any
    other open window, then in the current page -- and submitted as well.
    Every window other than the one that was current on entry is closed
    before returning.

    Args:
        webdriver: browser driver to operate on.
        email_producer: callable invoked as
            ``email_producer(current_url, current_site_title)``; its result
            is passed to ``_get_user_info`` and written to the log.
        visit_id: used as the prefix of the debug artifact names.
        debug: when true, page sources and screenshots are saved around each
            submission (with extra 3 second pauses).
        browser_params: forwarded to the dump/screenshot/form helpers.
        manager_params: forwarded to the dump/screenshot/form helpers.
        logger: receives an info record for the initial submission.

    Returns:
        True if a newsletter form was found and submitted, False if no form
        was found on the page or in any of its iframes.
    """
    current_url = webdriver.current_url
    current_site_title = webdriver.title.encode('ascii', 'replace')
    main_handle = webdriver.current_window_handle
    in_iframe = False

    # debug: names for the before/after screenshots and page source dumps
    debug_file_prefix = str(visit_id) + '_'
    debug_form_pre_initial = debug_file_prefix + 'form_initial_presubmit'
    debug_form_post_initial = debug_file_prefix + 'form_initial_result'
    debug_form_pre_followup = debug_file_prefix + 'form_followup_presubmit'
    debug_form_post_followup = debug_file_prefix + 'form_followup_result'
    debug_page_source_initial = debug_file_prefix + 'src_initial'
    debug_page_source_followup = debug_file_prefix + 'src_followup'

    # try to find newsletter form on landing page
    newsletter_form = _find_newsletter_form(webdriver)
    if newsletter_form is None:
        # search for forms in iframes (if present)
        iframes = webdriver.find_elements_by_tag_name('iframe')
        for iframe in iframes:
            try:
                # switch to the iframe
                webdriver.switch_to_frame(iframe)

                # is there a form?
                newsletter_form = _find_newsletter_form(webdriver)
                if newsletter_form is not None:
                    if debug:
                        dump_page_source(debug_page_source_initial, webdriver,
                                         browser_params, manager_params)
                    in_iframe = True
                    break  # form found, stay on the iframe

                # switch back
                webdriver.switch_to_default_content()
            except Exception as e:
                # One broken iframe must not abort the search or leave the
                # driver focused inside that frame: report it (debug only),
                # return to the top-level document and try the next iframe.
                if debug:
                    logger.error('Error while analyzing an iframe: %s' %
                                 str(e))
                # Any form located in this frame is unusable once the frame
                # has been left.
                newsletter_form = None
                webdriver.switch_to_default_content()

        # still no form?
        if newsletter_form is None:
            return False
    elif debug:
        dump_page_source(debug_page_source_initial, webdriver, browser_params,
                         manager_params)

    email = email_producer(current_url, current_site_title)
    user_info = _get_user_info(email)
    _form_fill_and_submit(newsletter_form, user_info, webdriver, False,
                          browser_params, manager_params,
                          debug_form_pre_initial if debug else None)
    logger.info('submitted form on [%s] with email [%s]', current_url, email)
    time.sleep(_FORM_SUBMIT_SLEEP)
    _dismiss_alert(webdriver)
    if debug:
        time.sleep(3)
        save_screenshot(debug_form_post_initial, webdriver, browser_params,
                        manager_params)

    # fill any follow-up forms...
    wait_until_loaded(webdriver, _PAGE_LOAD_TIME)  # wait if we got redirected
    follow_up_form = None

    # first check other windows (ex. pop-ups)
    windows = webdriver.window_handles
    if len(windows) > 1:
        for window in windows:
            if window != main_handle:
                webdriver.switch_to_window(window)

                # find newsletter form; only the first follow-up form found
                # across all windows is submitted
                if follow_up_form is None:
                    follow_up_form = _find_newsletter_form(webdriver)
                    if follow_up_form is not None:
                        if debug:
                            dump_page_source(debug_page_source_followup,
                                             webdriver, browser_params,
                                             manager_params)
                        _form_fill_and_submit(
                            follow_up_form, user_info, webdriver, True,
                            browser_params, manager_params,
                            debug_form_pre_followup if debug else None)
                        time.sleep(_FORM_SUBMIT_SLEEP)
                        _dismiss_alert(webdriver)
                        if debug:
                            time.sleep(3)
                            save_screenshot(debug_form_post_followup,
                                            webdriver, browser_params,
                                            manager_params)

                webdriver.close()
        webdriver.switch_to_window(main_handle)
        time.sleep(1)

    # else check current page
    if follow_up_form is None:
        follow_up_form = _find_newsletter_form(webdriver)
        if follow_up_form is not None:
            if debug:
                time.sleep(3)
                dump_page_source(debug_page_source_followup, webdriver,
                                 browser_params, manager_params)
            _form_fill_and_submit(follow_up_form, user_info, webdriver, True,
                                  browser_params, manager_params,
                                  debug_form_pre_followup if debug else None)
            time.sleep(_FORM_SUBMIT_SLEEP)
            _dismiss_alert(webdriver)
            if debug:
                time.sleep(3)
                save_screenshot(debug_form_post_followup, webdriver,
                                browser_params, manager_params)

    # switch back
    if in_iframe:
        webdriver.switch_to_default_content()

    # close other windows (ex. pop-ups)
    windows = webdriver.window_handles
    if len(windows) > 1:
        for window in windows:
            if window != main_handle:
                webdriver.switch_to_window(window)
                webdriver.close()
        webdriver.switch_to_window(main_handle)
        time.sleep(1)

    return True
Esempio n. 9
0
def fill_forms(url, email_producer, num_links, page_timeout, debug, visit_id,
               webdriver, proxy_queue, browser_params, manager_params,
               extension_socket):
    """Finds a newsletter form on the page. If not found, visits <num_links>
    internal links and scans those pages for a form. Submits the form if found.

    In each round the displayed, not yet visited, internal, non-blacklisted
    links of the current page are matched against ``_LINK_TEXT_RANK``; the
    highest ranked match is clicked and the resulting page (and, after going
    back, any pop-up windows) is searched for a form. The function returns as
    soon as a form has been submitted, when a round finds no matching link,
    or after ``num_links`` rounds.

    Args:
        url: page to load first.
        email_producer: forwarded to ``_find_and_fill_form``.
        num_links: maximum number of link-following rounds.
        page_timeout: passed to ``webdriver.set_page_load_timeout``.
        debug: forwarded to ``_find_and_fill_form``.
        visit_id: forwarded to ``get_website`` and ``_find_and_fill_form``.
        webdriver: browser driver used for all page interaction.
        proxy_queue: forwarded unchanged to ``get_website``.
        browser_params: mapping; ``'bot_mitigation'`` toggles the call to
            ``bot_mitigation`` after each click.
        manager_params: mapping; ``'logger_address'`` is unpacked into
            ``loggingclient``.
        extension_socket: forwarded unchanged to ``get_website``.

    Returns:
        None.
    """
    # load the site
    webdriver.set_page_load_timeout(page_timeout)
    get_website(url, 0, visit_id, webdriver, proxy_queue, browser_params,
                extension_socket)

    # connect to the logger
    logger = loggingclient(*manager_params['logger_address'])

    # try to find a newsletter form on the landing page
    if _find_and_fill_form(webdriver, email_producer, visit_id, debug,
                           browser_params, manager_params, logger):
        return

    # otherwise, scan more pages
    main_handle = webdriver.current_window_handle
    visited_links = set()
    for _ in range(num_links):
        # get all links on the page
        links = webdriver.find_elements_by_tag_name('a')
        random.shuffle(links)

        current_url = webdriver.current_url
        current_ps1 = domain_utils.get_ps_plus_1(current_url)

        # find links to click
        match_links = []
        start_time = timeit.default_timer()
        for link in links:
            try:
                if not link.is_displayed():
                    continue

                # check if link is valid and not already visited
                href = link.get_attribute('href')
                if href is None or href in visited_links:
                    continue

                # check if this is an internal link
                if not _is_internal_link(href, current_url, current_ps1):
                    continue

                link_text = link.text.lower()

                # skip links with blacklisted text
                if any(bl_text in link_text
                       for bl_text in _LINK_TEXT_BLACKLIST):
                    continue

                # should we click this link?
                link_rank = 0
                # ``match_type`` rather than ``type`` so the builtin is not
                # shadowed for the rest of the function.
                for match_type, needle, rank, flags in _LINK_TEXT_RANK:
                    if (match_type == _TYPE_TEXT
                            and needle in link_text) or (
                                match_type == _TYPE_HREF and needle in href):
                        if flags & _FLAG_IN_NEW_URL_ONLY:
                            # don't use this link if the current page URL already matches too
                            if (match_type == _TYPE_HREF
                                    and needle in current_url):
                                continue

                        # link matches!
                        link_rank = rank
                        match_links.append(
                            (link, rank, link_text, href, flags))
                        break
                if link_rank >= _LINK_RANK_SKIP:  # good enough, stop looking
                    break
            except Exception as e:
                # ``except Exception`` instead of a bare ``except`` so that
                # KeyboardInterrupt/SystemExit still stop the crawler.
                logger.error("error while looping through links: %s" % e)

            # quit if too much time passed (for some reason, this is really slow...)
            if match_links and timeit.default_timer(
            ) - start_time > _LINK_MATCH_TIMEOUT:
                break

        # find the best link to click
        if not match_links:
            break  # no more links to click
        # stable sort + last element: among equally ranked links the one
        # matched last wins
        match_links.sort(key=lambda l: l[1])
        next_link = match_links[-1]
        visited_links.add(next_link[3])

        # click the link
        try:
            # load the page
            logger.info("clicking on link '%s' - %s" %
                        (next_link[2], next_link[3]))
            next_link[0].click()
            wait_until_loaded(webdriver, _PAGE_LOAD_TIME)
            if browser_params['bot_mitigation']:
                bot_mitigation(webdriver)

            # find newsletter form
            if _find_and_fill_form(webdriver, email_producer, visit_id, debug,
                                   browser_params, manager_params, logger):
                return

            # should we stay on this page?
            if next_link[4] & _FLAG_STAY_ON_PAGE:
                continue

            # go back
            webdriver.back()
            wait_until_loaded(webdriver, _PAGE_LOAD_TIME)

            # check other windows (ex. pop-ups)
            windows = webdriver.window_handles
            if len(windows) > 1:
                form_found_in_popup = False
                for window in windows:
                    if window != main_handle:
                        webdriver.switch_to_window(window)
                        wait_until_loaded(webdriver, _PAGE_LOAD_TIME)

                        # find newsletter form
                        if _find_and_fill_form(webdriver, email_producer,
                                               visit_id, debug, browser_params,
                                               manager_params, logger):
                            form_found_in_popup = True

                        webdriver.close()
                webdriver.switch_to_window(main_handle)
                time.sleep(1)

                if form_found_in_popup:
                    return
        except Exception as e:
            # Best effort: a link that fails to load must not abort the scan,
            # but the failure is recorded. Narrowed from a bare ``except`` so
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            logger.error("error while visiting link %s: %s" %
                         (next_link[3], e))
def _find_and_fill_form(webdriver, user_data, visit_id, debug, browser_params,
                        manager_params, logger):
    """Finds and fills a form, and returns True if accomplished.

    Search order for the newsletter form: modal dialogs (dismissing up to
    ``_MAX_POPUP_DISMISS`` dialogs that contain no form), then the rest of
    the page, then every ``iframe``/``frame``. When a form is found it is
    filled in and submitted. Afterwards one follow-up form is looked for --
    first in any other open window, then in the current page -- and
    submitted as well. Every window other than the one that was current on
    entry is closed before returning.

    Args:
        webdriver: browser driver to operate on.
        user_data: mapping handed to ``_form_fill_and_submit`` as the user
            info; its ``'email'`` entry is written to the log.
        visit_id: used as the prefix of the debug artifact names and in log
            messages (formatted with ``%d``).
        debug: when true, emits debug log records and saves page sources and
            screenshots around each submission.
        browser_params: forwarded to the dump/screenshot/form helpers.
        manager_params: forwarded to the dump/screenshot/form helpers.
        logger: logger receiving debug/info/error records.

    Returns:
        True if a newsletter form was found and submitted, False if no form
        was found.
    """
    current_url = webdriver.current_url
    main_handle = webdriver.current_window_handle
    in_iframe = False

    if debug:
        logger.debug('The current URL is %s' % current_url)

    # debug: names for the before/after screenshots and page source dumps
    debug_file_prefix = str(visit_id) + '_'
    debug_form_pre_initial = debug_file_prefix + 'form_initial_presubmit'
    debug_form_post_initial = debug_file_prefix + 'form_initial_result'
    debug_form_pre_followup = debug_file_prefix + 'form_followup_presubmit'
    debug_form_post_followup = debug_file_prefix + 'form_followup_result'
    debug_page_source_initial = debug_file_prefix + 'src_initial'
    debug_page_source_followup = debug_file_prefix + 'src_followup'

    newsletter_form = None

    # Search for a modal dialog, and for a form in the modal dialog.
    # At most _MAX_POPUP_DISMISS dialogs are examined.
    try:
        search_count = 0
        while search_count < _MAX_POPUP_DISMISS:
            if debug:
                logger.debug('Round %d of modal dialog search...' %
                             search_count)
            dialog_container = _get_dialog_container(webdriver)
            if not dialog_container:
                if debug:
                    logger.debug('No dialog on the page')
                break

            if debug:
                logger.debug(
                    'Modal dialog found, searching for newsletter form in dialog...'
                )
            newsletter_form = _find_newsletter_form(
                dialog_container, webdriver, debug, logger)

            if newsletter_form is not None:
                if debug:
                    logger.debug('Found a newsletter form in the dialog')
                break

            clicked = _dismiss_dialog(webdriver, dialog_container)
            if int(clicked) > 0:
                if debug:
                    logger.debug(
                        'No newsletter form in dialog, dismissed it')
            else:
                if debug:
                    logger.debug('Made no clicks to dismiss the dialog')
                # The ESC fallback is an action, not diagnostics: it must run
                # whether or not debug output is enabled, otherwise dialogs
                # without a clickable close control stay open in normal runs.
                webdriver.find_element_by_tag_name('html').send_keys(
                    Keys.ESCAPE)
                if debug:
                    logger.debug('Pressed ESC to dismiss the dialog')

            search_count += 1
    except Exception as e:
        logger.error('Error while examining for modal dialogs: %s' % str(e))

    # try to find newsletter forms on landing page after dismissing the dialog
    if newsletter_form is None:
        if debug:
            logger.debug(
                'Searching the rest of the page for a newsletter form')
        newsletter_form = _find_newsletter_form(None, webdriver, debug, logger)

    # Search for newsletter forms in iframes
    if newsletter_form is None:
        if debug:
            logger.debug(
                'No newsletter form found on this page, searching for forms in iframes...'
            )

        # search for forms in iframes (if present)
        iframes = webdriver.find_elements_by_tag_name(
            'iframe') + webdriver.find_elements_by_tag_name('frame')
        if debug:
            logger.debug('Searching in %d iframes' % len(iframes))

        for iframe in iframes:
            try:
                # switch to the iframe
                webdriver.switch_to_frame(iframe)

                # is there a form?
                newsletter_form = _find_newsletter_form(
                    None, webdriver, debug, logger)
                if newsletter_form is not None:
                    if debug:
                        dump_page_source(debug_page_source_initial, webdriver,
                                         browser_params, manager_params)
                        logger.debug(
                            'Found a newsletter in an iframe on this page')
                    in_iframe = True
                    break  # form found, stay on the iframe

                # switch back
                webdriver.switch_to_default_content()
            except Exception as e:
                if debug:
                    logger.error('Error while analyzing an iframe: %s' %
                                 str(e))
                # Any form located in this frame is unusable once the frame
                # has been left, so do not keep a reference to it.
                newsletter_form = None
                webdriver.switch_to_default_content()

        # still no form?
        if newsletter_form is None:
            if debug:
                logger.debug('None of the iframes have newsletter forms')
            return False
    elif debug:
        dump_page_source(debug_page_source_initial, webdriver, browser_params,
                         manager_params)

    email = user_data['email']
    user_info = user_data
    _form_fill_and_submit(newsletter_form, user_info, webdriver, True,
                          browser_params, manager_params,
                          debug_form_pre_initial if debug else None)
    logger.info('Submitted form on [%s] with email [%s] on visit_id [%d]',
                current_url, email, visit_id)
    time.sleep(_FORM_SUBMIT_SLEEP)
    _dismiss_alert(webdriver)

    if debug:
        save_screenshot(debug_form_post_initial, webdriver, browser_params,
                        manager_params)
        logger.debug('The current URL is %s' % webdriver.current_url)
        logger.debug('Filling any follow-up forms on this page...')

    # fill any follow-up forms...
    wait_until_loaded(webdriver, _PAGE_LOAD_TIME)  # wait if we got redirected
    follow_up_form = None

    # first check other windows (ex. pop-ups)
    windows = webdriver.window_handles
    if len(windows) > 1:
        if debug:
            logger.debug('Found %d windows (e.g., popups)' % len(windows))
        for window in windows:
            if window != main_handle:
                webdriver.switch_to_window(window)

                # find newsletter form; only the first follow-up form found
                # across all windows is submitted
                if follow_up_form is None:
                    follow_up_form = _find_newsletter_form(
                        None, webdriver, debug, logger)
                    if follow_up_form is not None:
                        if debug:
                            dump_page_source(debug_page_source_followup,
                                             webdriver, browser_params,
                                             manager_params)
                            logger.debug(
                                'Found a newsletter form in another window')
                        _form_fill_and_submit(
                            follow_up_form, user_info, webdriver, True,
                            browser_params, manager_params,
                            debug_form_pre_followup if debug else None)

                        logger.info(
                            'Submitted form on [%s] with email [%s] on visit_id [%d]',
                            webdriver.current_url, email, visit_id)

                        time.sleep(_FORM_SUBMIT_SLEEP)
                        _dismiss_alert(webdriver)
                        if debug:
                            save_screenshot(debug_form_post_followup,
                                            webdriver, browser_params,
                                            manager_params)

                webdriver.close()
        webdriver.switch_to_window(main_handle)
        time.sleep(1)

    # else check current page
    if follow_up_form is None:
        if debug:
            logger.debug(
                'Found no follow-up forms in other windows, checking current page'
            )
        follow_up_form = _find_newsletter_form(None, webdriver, debug, logger)
        if follow_up_form is not None:
            if debug:
                dump_page_source(debug_page_source_followup, webdriver,
                                 browser_params, manager_params)
                logger.debug('Found a follow-up form in this page')

            _form_fill_and_submit(follow_up_form, user_info, webdriver, True,
                                  browser_params, manager_params,
                                  debug_form_pre_followup if debug else None)

            logger.info(
                'Submitted form on [%s] with email [%s] on visit_id [%d]',
                webdriver.current_url, email, visit_id)

            time.sleep(_FORM_SUBMIT_SLEEP)
            _dismiss_alert(webdriver)
            if debug:
                save_screenshot(debug_form_post_followup, webdriver,
                                browser_params, manager_params)
        else:
            if debug:
                logger.debug('No follow-up forms on the current page')

    # switch back
    if in_iframe:
        if debug:
            logger.debug(
                'We were in an iframe, switching back to the main window')
        webdriver.switch_to_default_content()

    # close other windows (ex. pop-ups)
    windows = webdriver.window_handles
    if len(windows) > 1:
        if debug:
            logger.debug('Closing %d windows (e.g., popups)' % len(windows))
        for window in windows:
            if window != main_handle:
                webdriver.switch_to_window(window)
                webdriver.close()
        webdriver.switch_to_window(main_handle)
        time.sleep(1)

    return True