Esempio n. 1
0
def execute_command(command, webdriver, proxy_queue, browser_settings, browser_params, manager_params, extension_socket):
    """
    Dispatch a single BrowserManager command tuple to its helper function.

    A command has the form (COMMAND, ARG0, ARG1, ...): the first element
    names the action and the remaining elements are handed to the matching
    helper as arguments. A tuple whose name matches none of the actions
    below is silently ignored. Nothing is returned.

    Keep this file limited to imports of helper libraries.
    """
    action = command[0]

    if action == 'GET':
        browser_commands.get_website(
            url=command[1], sleep=command[2], scroll=command[3],
            visit_id=command[4], webdriver=webdriver,
            proxy_queue=proxy_queue, browser_params=browser_params,
            extension_socket=extension_socket)

    elif action == 'BROWSE':
        browser_commands.browse_website(
            url=command[1], num_links=command[2], sleep=command[3],
            visit_id=command[4], webdriver=webdriver,
            proxy_queue=proxy_queue, browser_params=browser_params,
            manager_params=manager_params,
            extension_socket=extension_socket)

    elif action == 'DUMP_FLASH_COOKIES':
        browser_commands.dump_flash_cookies(
            start_time=command[1], visit_id=command[2],
            webdriver=webdriver, browser_params=browser_params,
            manager_params=manager_params)

    elif action == 'DUMP_PROFILE_COOKIES':
        browser_commands.dump_profile_cookies(
            start_time=command[1], visit_id=command[2],
            webdriver=webdriver, browser_params=browser_params,
            manager_params=manager_params)

    elif action == 'DUMP_PROF':
        # flash data is saved only when flash is explicitly not disabled
        profile_commands.dump_profile(
            browser_profile_folder=browser_params['profile_path'],
            manager_params=manager_params,
            browser_params=browser_params,
            tar_location=command[1], close_webdriver=command[2],
            webdriver=webdriver, browser_settings=browser_settings,
            compress=command[3],
            save_flash=browser_params['disable_flash'] is False)

    elif action == 'EXTRACT_LINKS':
        browser_commands.extract_links(webdriver, browser_params,
                                       manager_params)

    elif action == 'SAVE_SCREENSHOT':
        browser_commands.save_screenshot(
            screenshot_name=command[1], webdriver=webdriver,
            browser_params=browser_params, manager_params=manager_params)

    elif action == 'DUMP_PAGE_SOURCE':
        browser_commands.dump_page_source(
            dump_name=command[1], webdriver=webdriver,
            browser_params=browser_params, manager_params=manager_params)

    elif action == 'EXTRACT_ELEMENTS':
        browser_commands.extract_elements(
            selector=command[1], webdriver=webdriver,
            browser_params=browser_params, manager_params=manager_params)

    elif action == 'RUN_CUSTOM_FUNCTION':
        # command[1] is the callable, command[2] its positional arguments;
        # the whole execution context is supplied as keyword arguments.
        context = {"command": command,
                   "driver": webdriver,
                   "proxy_queue": proxy_queue,
                   "browser_settings": browser_settings,
                   "browser_params": browser_params,
                   "manager_params": manager_params,
                   "extension_socket": extension_socket}
        custom_function = command[1]
        custom_function(*command[2], **context)
Esempio n. 2
0
def _form_fill_and_submit(form, user_info, webdriver, clear, browser_params,
                          manager_params, screenshot_filename):
    """Fills out the visible fields of a form and submits it.

    Args:
        form: element whose descendant 'input', 'button' and 'select'
            elements are inspected.
        user_info: mapping of values to type in. Keys read here: 'email',
            'company', 'title', 'first_name', 'last_name', 'user',
            'full_name', 'zip', 'city', 'state', 'street1', 'street2',
            'tel' and 'password'.
        webdriver: only forwarded to save_screenshot.
        clear: forwarded unchanged to _type_in_field.
        browser_params: only forwarded to save_screenshot.
        manager_params: only forwarded to save_screenshot.
        screenshot_filename: when truthy, a screenshot with this name is
            saved after filling and before submitting.

    Returns:
        None. Submission is best-effort: ordinary errors raised by the
        click / ENTER key / submit() attempts are swallowed.
    """
    # try to fill all input fields in the form...
    input_fields = form.find_elements_by_tag_name('input')
    submit_button = None
    text_field = None
    for input_field in input_fields:
        if not input_field.is_displayed():
            continue

        # get_attribute() may return None; an <input> without a type is a
        # text field per the HTML spec, so fall back to 'text' rather than
        # crashing on None.lower().
        field_type = (input_field.get_attribute('type') or 'text').lower()
        if field_type == 'email':
            # using html5 "email" type, this is probably an email field
            _type_in_field(input_field, user_info['email'], clear)
            text_field = input_field
        elif field_type == 'text':
            # try to decipher this based on field attributes
            if _element_contains_text(input_field, 'company'):
                _type_in_field(input_field, user_info['company'], clear)
            elif _element_contains_text(input_field, 'title'):
                _type_in_field(input_field, user_info['title'], clear)
            elif _element_contains_text(input_field, 'name'):
                if _element_contains_text(input_field,
                                          ['first', 'forename', 'fname']):
                    _type_in_field(input_field, user_info['first_name'], clear)
                elif _element_contains_text(input_field,
                                            ['last', 'surname', 'lname']):
                    _type_in_field(input_field, user_info['last_name'], clear)
                elif _element_contains_text(input_field, ['user', 'account']):
                    _type_in_field(input_field, user_info['user'], clear)
                else:
                    _type_in_field(input_field, user_info['full_name'], clear)
            elif _element_contains_text(input_field, ['zip', 'postal']):
                _type_in_field(input_field, user_info['zip'], clear)
            elif _element_contains_text(input_field, 'city'):
                _type_in_field(input_field, user_info['city'], clear)
            elif _element_contains_text(input_field, 'state'):
                _type_in_field(input_field, user_info['state'], clear)
            elif _element_contains_text(input_field, _KEYWORDS_EMAIL):
                _type_in_field(input_field, user_info['email'], clear)
            elif _element_contains_text(input_field, ['street', 'address']):
                if _element_contains_text(input_field, ['2', 'number']):
                    _type_in_field(input_field, user_info['street2'], clear)
                elif _element_contains_text(input_field, '3'):
                    pass  # third address line is left empty
                else:
                    _type_in_field(input_field, user_info['street1'], clear)
            elif _element_contains_text(input_field,
                                        ['phone', 'tel', 'mobile']):
                _type_in_field(input_field, user_info['tel'], clear)
            elif _element_contains_text(input_field, 'search'):
                pass  # never type into search boxes
            else:
                # skip if visibly marked "optional"; default: assume email
                placeholder = input_field.get_attribute('placeholder')
                marked_optional = (placeholder is not None
                                   and 'optional' in placeholder.lower())
                if not marked_optional:
                    _type_in_field(input_field, user_info['email'], clear)
            # remembered so ENTER can be pressed in it when submitting
            text_field = input_field
        elif field_type == 'number':
            if _element_contains_text(input_field, ['phone', 'tel', 'mobile']):
                _type_in_field(input_field, user_info['tel'], clear)
            else:
                # zip/postal fields and any other numeric field get the zip
                _type_in_field(input_field, user_info['zip'], clear)
        elif field_type in ('checkbox', 'radio'):
            # check anything/everything
            if not input_field.is_selected():
                input_field.click()
        elif field_type == 'password':
            _type_in_field(input_field, user_info['password'], clear)
        elif field_type == 'tel':
            _type_in_field(input_field, user_info['tel'], clear)
        elif field_type in ('submit', 'button', 'image'):
            if _element_contains_text(input_field, _KEYWORDS_SUBMIT):
                submit_button = input_field
        elif field_type in ('reset', 'hidden', 'search'):
            # common irrelevant input types
            pass
        else:
            # default: assume email
            _type_in_field(input_field, user_info['email'], clear)

    # find 'button' tags (if necessary)
    if submit_button is None:
        buttons = form.find_elements_by_tag_name('button')
        for button in buttons:
            if not button.is_displayed():
                continue

            # filter out non-submit button types; a missing type attribute
            # (None) is treated as "not filtered"
            button_type = (button.get_attribute('type') or '').lower()
            if button_type in ('reset', 'menu'):
                continue

            # pick first matching button
            if _element_contains_text(button, _KEYWORDS_SUBMIT):
                submit_button = button
                break

    # fill in 'select' fields
    select_fields = form.find_elements_by_tag_name('select')
    for select_field in select_fields:
        if not select_field.is_displayed():
            continue

        # select an appropriate element if possible,
        # otherwise second element (to skip blank fields),
        # falling back on the first
        select = Select(select_field)
        select_options = select.options
        if not select_options:
            # nothing to choose from; index -1 would be invalid
            continue
        selected_index = None
        for i, opt in enumerate(select_options):
            opt_text = opt.text.strip().lower()
            if opt_text in _KEYWORDS_SELECT:
                selected_index = i
                break
        if selected_index is None:
            selected_index = min(1, len(select_options) - 1)
        select.select_by_index(selected_index)

    # debug: save screenshot
    if screenshot_filename:
        save_screenshot(screenshot_filename, webdriver, browser_params,
                        manager_params)

    # submit the form; each attempt is best-effort, so ordinary errors are
    # ignored, but KeyboardInterrupt/SystemExit are allowed to propagate
    if submit_button is not None:
        try:
            submit_button.click()  # trigger javascript events if possible
            return
        except Exception:
            pass
    if text_field is not None:
        try:
            text_field.send_keys(Keys.RETURN)  # press enter
        except Exception:
            pass
    try:
        if form.tag_name.lower() == 'form':
            form.submit()  # submit() form
    except Exception:
        pass
Esempio n. 3
0
def _find_and_fill_form(webdriver, email_producer, visit_id, debug,
                        browser_params, manager_params, logger):
    """Locate a newsletter form, submit it, and handle one follow-up form.

    The form is searched for on the current page first and then inside each
    iframe in turn. Returns False when none is found. Otherwise the form is
    filled using an address obtained from `email_producer(url, title)` and
    submitted; a follow-up form is then looked for in other windows and,
    failing that, on the current page. Finally every window except the
    original one is closed and True is returned.

    When `debug` is truthy, page sources and screenshots whose names start
    with `visit_id` are saved along the way.
    """
    page_url = webdriver.current_url
    page_title = webdriver.title.encode('ascii', 'replace')
    main_handle = webdriver.current_window_handle
    inside_iframe = False

    # debug: names for the before/after screenshots and page sources
    tag = str(visit_id) + '_'
    shot_pre_initial = tag + 'form_initial_presubmit'
    shot_post_initial = tag + 'form_initial_result'
    shot_pre_followup = tag + 'form_followup_presubmit'
    shot_post_followup = tag + 'form_followup_result'
    source_initial = tag + 'src_initial'
    source_followup = tag + 'src_followup'

    # look on the landing page first, then in iframes (if present)
    newsletter_form = _find_newsletter_form(webdriver)
    if newsletter_form is not None:
        if debug:
            dump_page_source(source_initial, webdriver, browser_params,
                             manager_params)
    else:
        for frame in webdriver.find_elements_by_tag_name('iframe'):
            webdriver.switch_to_frame(frame)
            newsletter_form = _find_newsletter_form(webdriver)
            if newsletter_form is None:
                # nothing here, go back to the top-level document
                webdriver.switch_to_default_content()
                continue

            if debug:
                dump_page_source(source_initial, webdriver,
                                 browser_params, manager_params)
            inside_iframe = True
            break  # form found, stay on the iframe

        # still no form?
        if newsletter_form is None:
            return False

    # fill in and submit the initial form
    email = email_producer(page_url, page_title)
    user_info = _get_user_info(email)
    _form_fill_and_submit(newsletter_form, user_info, webdriver, False,
                          browser_params, manager_params,
                          shot_pre_initial if debug else None)
    logger.info('submitted form on [%s] with email [%s]', page_url, email)
    time.sleep(_FORM_SUBMIT_SLEEP)
    _dismiss_alert(webdriver)
    if debug:
        time.sleep(3)
        save_screenshot(shot_post_initial, webdriver, browser_params,
                        manager_params)

    # fill any follow-up forms...
    wait_until_loaded(webdriver, _PAGE_LOAD_TIME)  # wait if we got redirected
    follow_up_form = None

    # first check other windows (ex. pop-ups); each one is closed afterwards
    handles = webdriver.window_handles
    if len(handles) > 1:
        for handle in handles:
            if handle == main_handle:
                continue
            webdriver.switch_to_window(handle)

            # only the first follow-up form found is submitted
            if follow_up_form is None:
                follow_up_form = _find_newsletter_form(webdriver)
                if follow_up_form is not None:
                    if debug:
                        dump_page_source(source_followup, webdriver,
                                         browser_params, manager_params)
                    _form_fill_and_submit(
                        follow_up_form, user_info, webdriver, True,
                        browser_params, manager_params,
                        shot_pre_followup if debug else None)
                    time.sleep(_FORM_SUBMIT_SLEEP)
                    _dismiss_alert(webdriver)
                    if debug:
                        time.sleep(3)
                        save_screenshot(shot_post_followup, webdriver,
                                        browser_params, manager_params)

            webdriver.close()
        webdriver.switch_to_window(main_handle)
        time.sleep(1)

    # else check current page
    if follow_up_form is None:
        follow_up_form = _find_newsletter_form(webdriver)
        if follow_up_form is not None:
            if debug:
                time.sleep(3)
                dump_page_source(source_followup, webdriver,
                                 browser_params, manager_params)
            _form_fill_and_submit(follow_up_form, user_info, webdriver, True,
                                  browser_params, manager_params,
                                  shot_pre_followup if debug else None)
            time.sleep(_FORM_SUBMIT_SLEEP)
            _dismiss_alert(webdriver)
            if debug:
                time.sleep(3)
                save_screenshot(shot_post_followup, webdriver,
                                browser_params, manager_params)

    # switch back out of the iframe the initial form was found in
    if inside_iframe:
        webdriver.switch_to_default_content()

    # close other windows (ex. pop-ups)
    handles = webdriver.window_handles
    if len(handles) > 1:
        for handle in handles:
            if handle != main_handle:
                webdriver.switch_to_window(handle)
                webdriver.close()
        webdriver.switch_to_window(main_handle)
        time.sleep(1)

    return True
def execute_command(command, webdriver, proxy_queue, browser_settings,
                    browser_params, manager_params, extension_sockets):
    """
    Run one BrowserManager command by calling the helper that implements it.

    Commands are tuples of the form (COMMAND, ARG0, ARG1, ...). The first
    element selects the helper; the remaining elements become its
    arguments. A name that is not handled below is ignored. Nothing is
    returned.

    Keep this file limited to imports of helper libraries.
    """
    name = command[0]

    if name == 'GET':
        browser_commands.get_website(
            url=command[1], sleep=command[2], visit_id=command[3],
            webdriver=webdriver, proxy_queue=proxy_queue,
            browser_params=browser_params,
            extension_sockets=extension_sockets
        )
        return

    if name == 'BROWSE':
        browser_commands.browse_website(
            url=command[1], num_links=command[2], sleep=command[3],
            visit_id=command[4], webdriver=webdriver,
            proxy_queue=proxy_queue, browser_params=browser_params,
            manager_params=manager_params,
            extension_sockets=extension_sockets
        )
        return

    if name == 'DUMP_FLASH_COOKIES':
        browser_commands.dump_flash_cookies(
            start_time=command[1], visit_id=command[2],
            webdriver=webdriver, browser_params=browser_params,
            manager_params=manager_params
        )
        return

    if name == 'DUMP_PROFILE_COOKIES':
        browser_commands.dump_profile_cookies(
            start_time=command[1], visit_id=command[2],
            webdriver=webdriver, browser_params=browser_params,
            manager_params=manager_params
        )
        return

    if name == 'DUMP_PROF':
        # flash data is saved only when flash is explicitly not disabled
        profile_commands.dump_profile(
            browser_profile_folder=browser_params['profile_path'],
            manager_params=manager_params,
            browser_params=browser_params,
            tar_location=command[1], close_webdriver=command[2],
            webdriver=webdriver, browser_settings=browser_settings,
            compress=command[3],
            save_flash=browser_params['disable_flash'] is False
        )
        return

    if name == 'EXTRACT_LINKS':
        browser_commands.extract_links(webdriver, browser_params,
                                       manager_params)
        return

    if name == 'SAVE_SCREENSHOT':
        browser_commands.save_screenshot(
            screenshot_name=command[1], webdriver=webdriver,
            browser_params=browser_params, manager_params=manager_params
        )
        return

    if name == 'DUMP_PAGE_SOURCE':
        browser_commands.dump_page_source(
            dump_name=command[1], webdriver=webdriver,
            browser_params=browser_params, manager_params=manager_params
        )
        return

    if name == 'BROWSE_AND_DUMP_SOURCE':
        browser_commands.browse_and_dump_source(
            url=command[1], num_links=command[2], sleep=command[3],
            visit_id=command[4], webdriver=webdriver,
            proxy_queue=proxy_queue, browser_params=browser_params,
            manager_params=manager_params,
            extension_sockets=extension_sockets
        )
        return

    if name == 'RECURSIVE_DUMP_PAGE_SOURCE':
        browser_commands.recursive_dump_page_source(
            visit_id=command[2], driver=webdriver,
            manager_params=manager_params, suffix=command[1]
        )
        return

    if name == 'FACEBOOK_LOGIN':
        facebook_commands.facebook_login(
            driver=webdriver, url=command[1], visit_id=command[2],
            manager_params=manager_params, browser_params=browser_params
        )
        return

    if name == 'REQUEST_FILTER':
        browser_commands.request_filter(
            control_message=command[1], filter_name=command[2],
            crawl_id=browser_params['crawl_id'],
            extension_sockets=extension_sockets,
            manager_params=manager_params
        )
        return

    if name == 'RUN_CUSTOM_FUNCTION':
        # command[1] is the callable, command[2] its positional arguments
        # and command[3] the visit id; the execution context is passed as
        # additional keyword arguments.
        context = {"command": command,
                   "driver": webdriver,
                   "proxy_queue": proxy_queue,
                   "browser_settings": browser_settings,
                   "browser_params": browser_params,
                   "manager_params": manager_params,
                   "extension_sockets": extension_sockets}
        custom_function = command[1]
        custom_function(*command[2], visit_id=command[3], **context)
def execute_command(command, webdriver, browser_settings, browser_params,
                    manager_params, extension_socket):
    """Dispatches a BrowserManager command to its helper function.

    Commands are tuples of the form (COMMAND, ARG0, ARG1, ...); the first
    element picks the helper and the rest are passed to it as arguments.
    A name that is not handled below is ignored. Nothing is returned.
    """
    name = command[0]

    if name == 'GET':
        browser_commands.get_website(
            url=command[1], sleep=command[2], visit_id=command[3],
            webdriver=webdriver, browser_params=browser_params,
            extension_socket=extension_socket)

    elif name == 'BROWSE':
        browser_commands.browse_website(
            url=command[1], num_links=command[2], sleep=command[3],
            visit_id=command[4], webdriver=webdriver,
            browser_params=browser_params, manager_params=manager_params,
            extension_socket=extension_socket)

    elif name == 'DUMP_FLASH_COOKIES':
        browser_commands.dump_flash_cookies(
            start_time=command[1], visit_id=command[2],
            webdriver=webdriver, browser_params=browser_params,
            manager_params=manager_params)

    elif name == 'DUMP_PROFILE_COOKIES':
        browser_commands.dump_profile_cookies(
            start_time=command[1], visit_id=command[2],
            webdriver=webdriver, browser_params=browser_params,
            manager_params=manager_params)

    elif name == 'DUMP_PROF':
        # flash data is saved only when flash is explicitly not disabled
        profile_commands.dump_profile(
            browser_profile_folder=browser_params['profile_path'],
            manager_params=manager_params,
            browser_params=browser_params,
            tar_location=command[1], close_webdriver=command[2],
            webdriver=webdriver, browser_settings=browser_settings,
            compress=command[3],
            save_flash=browser_params['disable_flash'] is False)

    elif name == 'DUMP_PAGE_SOURCE':
        browser_commands.dump_page_source(
            visit_id=command[2], driver=webdriver,
            manager_params=manager_params, suffix=command[1])

    elif name == 'RECURSIVE_DUMP_PAGE_SOURCE':
        browser_commands.recursive_dump_page_source(
            visit_id=command[2], driver=webdriver,
            manager_params=manager_params, suffix=command[1])

    elif name == 'SAVE_SCREENSHOT':
        browser_commands.save_screenshot(
            visit_id=command[2], crawl_id=browser_params['crawl_id'],
            driver=webdriver, manager_params=manager_params,
            suffix=command[1])

    elif name == 'SCREENSHOT_FULL_PAGE':
        browser_commands.screenshot_full_page(
            visit_id=command[2], crawl_id=browser_params['crawl_id'],
            driver=webdriver, manager_params=manager_params,
            suffix=command[1])

    elif name == 'RUN_CUSTOM_FUNCTION':
        # command[1] is the callable, command[2] its positional arguments;
        # the execution context is passed as keyword arguments.
        context = {"command": command,
                   "driver": webdriver,
                   "browser_settings": browser_settings,
                   "browser_params": browser_params,
                   "manager_params": manager_params,
                   "extension_socket": extension_socket}
        custom_function = command[1]
        custom_function(*command[2], **context)
def _find_and_fill_form(webdriver, user_data, visit_id, debug, browser_params,
                        manager_params, logger):
    """Finds and fills a form, and returns True if accomplished.

    Search order for the newsletter form: modal dialogs (dialogs without a
    form are dismissed, for at most _MAX_POPUP_DISMISS rounds), then the
    page itself, then every iframe/frame. If no form is found, False is
    returned. Otherwise the form is filled from `user_data` and submitted,
    one follow-up form is submitted if found (other windows first, then the
    current page), all windows except the original are closed and True is
    returned.

    Args:
        webdriver: the browser driver to operate on.
        user_data: mapping handed to _form_fill_and_submit; its 'email'
            entry is also used in log messages.
        visit_id: used as prefix for debug file names and in log messages
            (formatted with %d).
        debug: when truthy, emit debug log messages and save page sources
            and screenshots. It must not change what the browser does.
        browser_params: forwarded to the screenshot / page-source helpers.
        manager_params: forwarded to the screenshot / page-source helpers.
        logger: object providing debug(), info() and error().
    """
    current_url = webdriver.current_url
    main_handle = webdriver.current_window_handle
    in_iframe = False

    if debug:
        logger.debug('The current URL is %s' % current_url)

    # debug: save before/after screenshots and page source
    debug_file_prefix = str(visit_id) + '_'
    debug_form_pre_initial = debug_file_prefix + 'form_initial_presubmit'
    debug_form_post_initial = debug_file_prefix + 'form_initial_result'
    debug_form_pre_followup = debug_file_prefix + 'form_followup_presubmit'
    debug_form_post_followup = debug_file_prefix + 'form_followup_result'
    debug_page_source_initial = debug_file_prefix + 'src_initial'
    debug_page_source_followup = debug_file_prefix + 'src_followup'

    email = user_data['email']
    user_info = user_data

    def _submit_follow_up(form, found_message):
        """Fill and submit a follow-up form in the currently focused window."""
        if debug:
            dump_page_source(debug_page_source_followup, webdriver,
                             browser_params, manager_params)
            logger.debug(found_message)

        _form_fill_and_submit(form, user_info, webdriver, True,
                              browser_params, manager_params,
                              debug_form_pre_followup if debug else None)

        logger.info(
            'Submitted form on [%s] with email [%s] on visit_id [%d]',
            webdriver.current_url, email, visit_id)

        time.sleep(_FORM_SUBMIT_SLEEP)
        _dismiss_alert(webdriver)
        if debug:
            save_screenshot(debug_form_post_followup, webdriver,
                            browser_params, manager_params)

    newsletter_form = None

    # Search for a modal dialog, and for a form in the modal dialog.
    # The number of rounds is bounded by _MAX_POPUP_DISMISS. Any error here
    # is logged and the search simply moves on to the page itself.
    try:
        search_count = 0
        while search_count < _MAX_POPUP_DISMISS:
            if debug:
                logger.debug('Round %d of modal dialog search...' %
                             search_count)
            dialog_container = _get_dialog_container(webdriver)
            if not dialog_container:
                if debug:
                    logger.debug('No dialog on the page')
                break

            if debug:
                logger.debug(
                    'Modal dialog found, searching for newsletter form in dialog...'
                )
            newsletter_form = _find_newsletter_form(
                dialog_container, webdriver, debug, logger)
            if newsletter_form is not None:
                if debug:
                    logger.debug('Found a newsletter form in the dialog')
                break

            # No form in this dialog: get it out of the way. The ESC
            # fallback must run regardless of the debug flag, otherwise a
            # dialog that cannot be click-dismissed stays open in normal
            # (non-debug) crawls.
            clicked = _dismiss_dialog(webdriver, dialog_container)
            if int(clicked) > 0:
                if debug:
                    logger.debug(
                        'No newsletter form in dialog, dismissed it')
            else:
                if debug:
                    logger.debug('Made no clicks to dismiss the dialog')
                webdriver.find_element_by_tag_name('html').send_keys(
                    Keys.ESCAPE)
                if debug:
                    logger.debug('Pressed ESC to dismiss the dialog')

            search_count += 1
    except Exception as e:
        logger.error('Error while examining for modal dialogs: %s' % str(e))

    # try to find newsletter forms on landing page after dismissing the dialog
    if newsletter_form is None:
        if debug:
            logger.debug(
                'Searching the rest of the page for a newsletter form')
        newsletter_form = _find_newsletter_form(None, webdriver, debug, logger)

    # Search for newsletter forms in iframes
    if newsletter_form is None:
        if debug:
            logger.debug(
                'No newsletter form found on this page, searching for forms in iframes...'
            )

        # search for forms in iframes (if present)
        iframes = webdriver.find_elements_by_tag_name(
            'iframe') + webdriver.find_elements_by_tag_name('frame')
        if debug:
            logger.debug('Searching in %d iframes' % len(iframes))

        for iframe in iframes:
            try:
                # switch to the iframe
                webdriver.switch_to_frame(iframe)

                # is there a form?
                newsletter_form = _find_newsletter_form(
                    None, webdriver, debug, logger)
                if newsletter_form is not None:
                    if debug:
                        dump_page_source(debug_page_source_initial, webdriver,
                                         browser_params, manager_params)
                        logger.debug(
                            'Found a newsletter in an iframe on this page')
                    in_iframe = True
                    break  # form found, stay on the iframe

                # switch back
                webdriver.switch_to_default_content()
            except Exception as e:
                if debug:
                    logger.error('Error while analyzing an iframe: %s' %
                                 str(e))
                # make sure a failed frame does not leave us inside it
                webdriver.switch_to_default_content()

        # still no form?
        if newsletter_form is None:
            if debug:
                logger.debug('None of the iframes have newsletter forms')
            return False
    elif debug:
        dump_page_source(debug_page_source_initial, webdriver, browser_params,
                         manager_params)

    _form_fill_and_submit(newsletter_form, user_info, webdriver, True,
                          browser_params, manager_params,
                          debug_form_pre_initial if debug else None)
    logger.info('Submitted form on [%s] with email [%s] on visit_id [%d]',
                current_url, email, visit_id)
    time.sleep(_FORM_SUBMIT_SLEEP)
    _dismiss_alert(webdriver)

    if debug:
        save_screenshot(debug_form_post_initial, webdriver, browser_params,
                        manager_params)
        logger.debug('The current URL is %s' % webdriver.current_url)
        logger.debug('Filling any follow-up forms on this page...')

    # fill any follow-up forms...
    wait_until_loaded(webdriver, _PAGE_LOAD_TIME)  # wait if we got redirected
    follow_up_form = None

    # first check other windows (ex. pop-ups); each one is closed afterwards
    windows = webdriver.window_handles
    if len(windows) > 1:
        if debug:
            logger.debug('Found %d windows (e.g., popups)' % len(windows))
        for window in windows:
            if window == main_handle:
                continue
            webdriver.switch_to_window(window)

            # only the first follow-up form found is submitted
            if follow_up_form is None:
                follow_up_form = _find_newsletter_form(
                    None, webdriver, debug, logger)
                if follow_up_form is not None:
                    _submit_follow_up(
                        follow_up_form,
                        'Found a newsletter form in another window')

            webdriver.close()
        webdriver.switch_to_window(main_handle)
        time.sleep(1)

    # else check current page
    if follow_up_form is None:
        if debug:
            logger.debug(
                'Found no follow-up forms in other windows, checking current page'
            )
        follow_up_form = _find_newsletter_form(None, webdriver, debug, logger)
        if follow_up_form is not None:
            _submit_follow_up(follow_up_form,
                              'Found a follow-up form in this page')
        elif debug:
            logger.debug('No follow-up forms on the current page')

    # switch back
    if in_iframe:
        if debug:
            logger.debug(
                'We were in an iframe, switching back to the main window')
        webdriver.switch_to_default_content()

    # close other windows (ex. pop-ups)
    windows = webdriver.window_handles
    if len(windows) > 1:
        if debug:
            logger.debug('Closing %d windows (e.g., popups)' % len(windows))
        for window in windows:
            if window != main_handle:
                webdriver.switch_to_window(window)
                webdriver.close()
        webdriver.switch_to_window(main_handle)
        time.sleep(1)

    return True
def fill_forms(url, user_data, num_links, page_timeout, debug, visit_id,
               webdriver, proxy_queue, browser_params, manager_params,
               extension_socket):
    """Finds a newsletter form on the page. If not found, visits <num_links>
    internal links and scans those pages for a form. Submits the form if found.

    The landing page is scanned first. If no form is submitted there, up to
    <num_links> rounds follow: candidate links on the current page are ranked
    against _LINK_TEXT_RANK, the best one is clicked, and the resulting page
    (plus any other windows still open after navigating back) is scanned.
    The function returns as soon as one form has been submitted.

    Parameters:
        url: address of the site to load.
        user_data: passed through to _find_and_fill_form.
        num_links: maximum number of links to click while searching.
        page_timeout: value handed to webdriver.set_page_load_timeout.
        debug: when truthy, emit debug log messages and a landing-page
            screenshot.
        visit_id: identifier of this visit; used in the screenshot name and
            passed through to the helpers.
        webdriver: the Selenium driver controlling the browser.
        proxy_queue, extension_socket: passed through to get_website.
        browser_params: browser settings; 'bot_mitigation' is read here.
        manager_params: manager settings; 'logger_address' is read here.

    Returns None in every case.
    """
    # load the site
    webdriver.set_page_load_timeout(page_timeout)
    get_website(url, 0, visit_id, webdriver, proxy_queue, browser_params,
                extension_socket)

    # connect to the logger
    logger = loggingclient(*manager_params['logger_address'])

    # sleep before proceeding, to let popups (if any) appear
    time.sleep(_PAGE_LOAD_TIME)

    # take a screenshot, and try to find a newsletter form on the landing page
    if debug:
        save_screenshot(
            str(visit_id) + '_landing_page', webdriver, browser_params,
            manager_params)

    if _find_and_fill_form(webdriver, user_data, visit_id, debug,
                           browser_params, manager_params, logger):
        if debug:
            logger.debug('Done searching and submitting forms, exiting')
        return

    if debug:
        logger.debug(
            'Could not find and submit a newsletter form on the landing page; scanning more pages..'
        )

    # otherwise, scan more pages
    main_handle = webdriver.current_window_handle
    visited_links = set()
    for _ in xrange(num_links):
        # get all links on the page, in random order
        links = webdriver.find_elements_by_tag_name('a')
        random.shuffle(links)

        current_url = webdriver.current_url
        current_ps1 = domain_utils.get_ps_plus_1(current_url)

        # find links to click
        match_links = []
        start_time = timeit.default_timer()
        for link in links:
            # Quit if too much time passed (querying elements is slow).
            # Checked at the top of the loop so that links skipped with
            # `continue` below are also subject to the time limit.
            if (match_links and
                    timeit.default_timer() - start_time > _LINK_MATCH_TIMEOUT):
                logger.warning('Too much time passed, quiting')
                break

            try:
                if not link.is_displayed():
                    continue

                # check if link is valid and not already visited
                href = link.get_attribute('href')
                if href is None or href in visited_links:
                    continue

                # only follow internal (or explicitly whitelisted) links
                if not _is_internal_link(
                        href, current_url,
                        current_ps1) and not _whitelisted_links(
                            href, current_url):
                    continue

                link_text = link.get_attribute('text').lower().strip()

                # skip links with blacklisted text
                if any(bl_text in link_text
                       for bl_text in _LINK_TEXT_BLACKLIST):
                    continue

                # should we click this link? take the first matching rule
                link_rank = 0
                for match_type, pattern, rank, flags in _LINK_TEXT_RANK:
                    if not ((match_type == _TYPE_TEXT
                             and pattern in link_text) or
                            (match_type == _TYPE_HREF and pattern in href)):
                        continue

                    # don't use this rule if the current page URL already
                    # matches it too
                    if (flags & _FLAG_IN_NEW_URL_ONLY
                            and match_type == _TYPE_HREF
                            and pattern in current_url):
                        continue

                    # link matches!
                    link_rank = rank
                    match_links.append((link, rank, link_text, href, flags))
                    break
                if link_rank >= _LINK_RANK_SKIP:  # good enough, stop looking
                    break
            except Exception as exc:
                # e.g. the element went stale; skip it and keep scanning
                logger.error('Error while looping through links: %r', exc)

        # find the best link to click
        if not match_links:
            if debug:
                logger.debug('No more links to click')
            break  # no more links to click
        # stable sort: among equally ranked links the last one found wins
        match_links.sort(key=lambda m: m[1])
        next_elem, _rank, next_text, next_href, next_flags = match_links[-1]
        visited_links.add(next_href)

        # click the link
        try:
            # load the page
            logger.info("Clicking on link '%s' - %s", next_text, next_href)
            next_elem.click()
            wait_until_loaded(webdriver, _PAGE_LOAD_TIME)
            if browser_params['bot_mitigation']:
                bot_mitigation(webdriver)

            # find newsletter form
            if _find_and_fill_form(webdriver, user_data, visit_id, debug,
                                   browser_params, manager_params, logger):
                if debug:
                    logger.debug(
                        'Found and submitted newsletter form on this page')
                return

            # should we stay on this page?
            if next_flags & _FLAG_STAY_ON_PAGE:
                continue

            # go back
            if debug:
                logger.debug('Pressing the back button')
            webdriver.back()
            wait_until_loaded(webdriver, _PAGE_LOAD_TIME)

            # check other windows (ex. pop-ups)
            windows = webdriver.window_handles
            if len(windows) > 1:
                form_found_in_popup = False
                for window in windows:
                    if window == main_handle:
                        continue
                    webdriver.switch_to_window(window)
                    wait_until_loaded(webdriver, _PAGE_LOAD_TIME)

                    # find newsletter form
                    if _find_and_fill_form(webdriver, user_data, visit_id,
                                           debug, browser_params,
                                           manager_params, logger):
                        if debug:
                            logger.debug(
                                'Found and submitted newsletter form in a popup on this page'
                            )
                        form_found_in_popup = True

                    webdriver.close()

                if form_found_in_popup:
                    # The popup was just closed; restore focus so the driver
                    # is not left pointing at a dead window. Best-effort: a
                    # failure here must not turn a successful submit into an
                    # error.
                    try:
                        webdriver.switch_to_window(main_handle)
                    except Exception as exc:
                        logger.warning(
                            'Could not switch back to the main window: %r',
                            exc)
                    return

                webdriver.switch_to_window(main_handle)
                time.sleep(1)

        except Exception as exc:
            # best-effort crawl: record the failure and try the next link
            logger.error('Error while visiting link %s: %r', next_href, exc)

    if debug:
        logger.debug('Failed to find and submit a newsletter form')