Ejemplo n.º 1
0
def execute_command(command, webdriver, proxy_queue, browser_settings, browser_params, manager_params, extension_socket):
    """
    Route one BrowserManager command tuple to the helper that implements it.

    Commands are of form (COMMAND, ARG0, ARG1, ...): element 0 selects the
    helper and the remaining elements are forwarded to it as shown in each
    branch below. A command name that matches no branch is ignored.
    The only imports in this file should be imports to helper libraries.
    """
    name = command[0]

    if name == 'GET':
        browser_commands.get_website(
            url=command[1],
            sleep=command[2],
            scroll=command[3],
            visit_id=command[4],
            webdriver=webdriver,
            proxy_queue=proxy_queue,
            browser_params=browser_params,
            extension_socket=extension_socket
        )

    elif name == 'BROWSE':
        browser_commands.browse_website(
            url=command[1],
            num_links=command[2],
            sleep=command[3],
            visit_id=command[4],
            webdriver=webdriver,
            proxy_queue=proxy_queue,
            browser_params=browser_params,
            manager_params=manager_params,
            extension_socket=extension_socket
        )

    elif name == 'DUMP_FLASH_COOKIES':
        browser_commands.dump_flash_cookies(
            start_time=command[1],
            visit_id=command[2],
            webdriver=webdriver,
            browser_params=browser_params,
            manager_params=manager_params
        )

    elif name == 'DUMP_PROFILE_COOKIES':
        browser_commands.dump_profile_cookies(
            start_time=command[1],
            visit_id=command[2],
            webdriver=webdriver,
            browser_params=browser_params,
            manager_params=manager_params
        )

    elif name == 'DUMP_PROF':
        # Flash data is saved only when flash is explicitly not disabled.
        profile_commands.dump_profile(
            browser_profile_folder=browser_params['profile_path'],
            manager_params=manager_params,
            browser_params=browser_params,
            tar_location=command[1],
            close_webdriver=command[2],
            webdriver=webdriver,
            browser_settings=browser_settings,
            compress=command[3],
            save_flash=browser_params['disable_flash'] is False
        )

    elif name == 'EXTRACT_LINKS':
        browser_commands.extract_links(webdriver, browser_params, manager_params)

    elif name == 'SAVE_SCREENSHOT':
        browser_commands.save_screenshot(
            screenshot_name=command[1],
            webdriver=webdriver,
            browser_params=browser_params,
            manager_params=manager_params
        )

    elif name == 'DUMP_PAGE_SOURCE':
        browser_commands.dump_page_source(
            dump_name=command[1],
            webdriver=webdriver,
            browser_params=browser_params,
            manager_params=manager_params
        )

    elif name == 'EXTRACT_ELEMENTS':
        browser_commands.extract_elements(
            selector=command[1],
            webdriver=webdriver,
            browser_params=browser_params,
            manager_params=manager_params
        )

    elif name == 'RUN_CUSTOM_FUNCTION':
        # The custom callable (command[1]) receives its own positional
        # arguments (command[2]) plus the whole execution context as
        # keyword arguments.
        context = {
            "command": command,
            "driver": webdriver,
            "proxy_queue": proxy_queue,
            "browser_settings": browser_settings,
            "browser_params": browser_params,
            "manager_params": manager_params,
            "extension_socket": extension_socket
        }
        custom_function = command[1]
        custom_function(*command[2], **context)
Ejemplo n.º 2
0
def _find_and_fill_form(webdriver, email_producer, visit_id, debug,
                        browser_params, manager_params, logger):
    """Find a newsletter form, fill and submit it, then handle follow-ups.

    The landing page is searched first; if it has no newsletter form, each
    <iframe> is entered in turn and searched. After the initial submission,
    the other open windows and then the current page are checked for one
    follow-up form, which is filled and submitted as well. Every window
    other than the one that was current on entry is closed before returning.

    Args:
        webdriver: Selenium driver positioned on the page to examine.
        email_producer: Callable invoked as ``email_producer(url, title)``;
            its result is logged as the email and handed to
            ``_get_user_info``.
        visit_id: Identifier used as the prefix of debug artifact names.
        debug: When truthy, page sources and screenshots are saved around
            each submission.
        browser_params: Passed through to the dump/screenshot/fill helpers.
        manager_params: Passed through to the dump/screenshot/fill helpers.
        logger: Logger used to record the initial submission.

    Returns:
        False if no newsletter form was found on the page or in any iframe,
        True otherwise.
    """
    current_url = webdriver.current_url
    current_site_title = webdriver.title.encode('ascii', 'replace')
    main_handle = webdriver.current_window_handle
    in_iframe = False

    # debug: save before/after screenshots and page source
    debug_file_prefix = str(visit_id) + '_'
    debug_form_pre_initial = debug_file_prefix + 'form_initial_presubmit'
    debug_form_post_initial = debug_file_prefix + 'form_initial_result'
    debug_form_pre_followup = debug_file_prefix + 'form_followup_presubmit'
    debug_form_post_followup = debug_file_prefix + 'form_followup_result'
    debug_page_source_initial = debug_file_prefix + 'src_initial'
    debug_page_source_followup = debug_file_prefix + 'src_followup'

    # try to find newsletter form on landing page
    newsletter_form = _find_newsletter_form(webdriver)
    if newsletter_form is None:
        # search for forms in iframes (if present)
        iframes = webdriver.find_elements_by_tag_name('iframe')
        for iframe in iframes:
            # switch to the iframe
            webdriver.switch_to_frame(iframe)

            # is there a form?
            newsletter_form = _find_newsletter_form(webdriver)
            if newsletter_form is not None:
                if debug:
                    dump_page_source(debug_page_source_initial, webdriver,
                                     browser_params, manager_params)
                in_iframe = True
                break  # form found, stay on the iframe

            # switch back
            webdriver.switch_to_default_content()

        # still no form?
        if newsletter_form is None:
            return False
    elif debug:
        dump_page_source(debug_page_source_initial, webdriver, browser_params,
                         manager_params)

    email = email_producer(current_url, current_site_title)
    user_info = _get_user_info(email)
    _form_fill_and_submit(newsletter_form, user_info, webdriver, False,
                          browser_params, manager_params,
                          debug_form_pre_initial if debug else None)
    logger.info('submitted form on [%s] with email [%s]', current_url, email)
    time.sleep(_FORM_SUBMIT_SLEEP)
    _dismiss_alert(webdriver)
    if debug:
        time.sleep(3)
        save_screenshot(debug_form_post_initial, webdriver, browser_params,
                        manager_params)

    # fill any follow-up forms...
    wait_until_loaded(webdriver, _PAGE_LOAD_TIME)  # wait if we got redirected
    follow_up_form = None

    # first check other windows (ex. pop-ups)
    windows = webdriver.window_handles
    if len(windows) > 1:
        for window in windows:
            if window != main_handle:
                webdriver.switch_to_window(window)

                # find newsletter form; only the first follow-up form found
                # across all windows is submitted
                if follow_up_form is None:
                    follow_up_form = _find_newsletter_form(webdriver)
                    if follow_up_form is not None:
                        if debug:
                            dump_page_source(debug_page_source_followup,
                                             webdriver, browser_params,
                                             manager_params)
                        _form_fill_and_submit(
                            follow_up_form, user_info, webdriver, True,
                            browser_params, manager_params,
                            debug_form_pre_followup if debug else None)
                        time.sleep(_FORM_SUBMIT_SLEEP)
                        _dismiss_alert(webdriver)
                        if debug:
                            time.sleep(3)
                            save_screenshot(debug_form_post_followup,
                                            webdriver, browser_params,
                                            manager_params)

                # every non-main window is closed, form or not
                webdriver.close()
        webdriver.switch_to_window(main_handle)
        time.sleep(1)

    # else check current page
    if follow_up_form is None:
        follow_up_form = _find_newsletter_form(webdriver)
        if follow_up_form is not None:
            if debug:
                time.sleep(3)
                dump_page_source(debug_page_source_followup, webdriver,
                                 browser_params, manager_params)
            _form_fill_and_submit(follow_up_form, user_info, webdriver, True,
                                  browser_params, manager_params,
                                  debug_form_pre_followup if debug else None)
            time.sleep(_FORM_SUBMIT_SLEEP)
            _dismiss_alert(webdriver)
            if debug:
                time.sleep(3)
                save_screenshot(debug_form_post_followup, webdriver,
                                browser_params, manager_params)

    # switch back
    if in_iframe:
        webdriver.switch_to_default_content()

    # close other windows (ex. pop-ups) that are still open at this point
    windows = webdriver.window_handles
    if len(windows) > 1:
        for window in windows:
            if window != main_handle:
                webdriver.switch_to_window(window)
                webdriver.close()
        webdriver.switch_to_window(main_handle)
        time.sleep(1)

    return True
def execute_command(command, webdriver, proxy_queue, browser_settings,
                    browser_params, manager_params, extension_sockets):
    """
    Route one BrowserManager command tuple to the helper that implements it.

    Commands are of form (COMMAND, ARG0, ARG1, ...): element 0 selects the
    helper and the remaining elements are forwarded to it as shown in each
    branch below. A command name that matches no branch is ignored.
    The only imports in this file should be imports to helper libraries.
    """
    name = command[0]

    if name == 'GET':
        browser_commands.get_website(
            url=command[1],
            sleep=command[2],
            visit_id=command[3],
            webdriver=webdriver,
            proxy_queue=proxy_queue,
            browser_params=browser_params,
            extension_sockets=extension_sockets
        )

    elif name == 'BROWSE':
        browser_commands.browse_website(
            url=command[1],
            num_links=command[2],
            sleep=command[3],
            visit_id=command[4],
            webdriver=webdriver,
            proxy_queue=proxy_queue,
            browser_params=browser_params,
            manager_params=manager_params,
            extension_sockets=extension_sockets
        )

    elif name == 'DUMP_FLASH_COOKIES':
        browser_commands.dump_flash_cookies(
            start_time=command[1],
            visit_id=command[2],
            webdriver=webdriver,
            browser_params=browser_params,
            manager_params=manager_params
        )

    elif name == 'DUMP_PROFILE_COOKIES':
        browser_commands.dump_profile_cookies(
            start_time=command[1],
            visit_id=command[2],
            webdriver=webdriver,
            browser_params=browser_params,
            manager_params=manager_params
        )

    elif name == 'DUMP_PROF':
        # Flash data is saved only when flash is explicitly not disabled.
        profile_commands.dump_profile(
            browser_profile_folder=browser_params['profile_path'],
            manager_params=manager_params,
            browser_params=browser_params,
            tar_location=command[1],
            close_webdriver=command[2],
            webdriver=webdriver,
            browser_settings=browser_settings,
            compress=command[3],
            save_flash=browser_params['disable_flash'] is False
        )

    elif name == 'EXTRACT_LINKS':
        browser_commands.extract_links(webdriver, browser_params, manager_params)

    elif name == 'SAVE_SCREENSHOT':
        browser_commands.save_screenshot(
            screenshot_name=command[1],
            webdriver=webdriver,
            browser_params=browser_params,
            manager_params=manager_params
        )

    elif name == 'DUMP_PAGE_SOURCE':
        browser_commands.dump_page_source(
            dump_name=command[1],
            webdriver=webdriver,
            browser_params=browser_params,
            manager_params=manager_params
        )

    elif name == 'BROWSE_AND_DUMP_SOURCE':
        browser_commands.browse_and_dump_source(
            url=command[1],
            num_links=command[2],
            sleep=command[3],
            visit_id=command[4],
            webdriver=webdriver,
            proxy_queue=proxy_queue,
            browser_params=browser_params,
            manager_params=manager_params,
            extension_sockets=extension_sockets
        )

    elif name == 'RECURSIVE_DUMP_PAGE_SOURCE':
        # Note the tuple order: the suffix comes before the visit id.
        browser_commands.recursive_dump_page_source(
            visit_id=command[2],
            driver=webdriver,
            manager_params=manager_params,
            suffix=command[1]
        )

    elif name == 'FACEBOOK_LOGIN':
        facebook_commands.facebook_login(
            driver=webdriver,
            url=command[1],
            visit_id=command[2],
            manager_params=manager_params,
            browser_params=browser_params
        )

    elif name == 'REQUEST_FILTER':
        browser_commands.request_filter(
            control_message=command[1],
            filter_name=command[2],
            crawl_id=browser_params['crawl_id'],
            extension_sockets=extension_sockets,
            manager_params=manager_params
        )

    elif name == 'RUN_CUSTOM_FUNCTION':
        # The custom callable (command[1]) receives its own positional
        # arguments (command[2]), the visit id (command[3]) and the whole
        # execution context as keyword arguments.
        context = {
            "command": command,
            "driver": webdriver,
            "proxy_queue": proxy_queue,
            "browser_settings": browser_settings,
            "browser_params": browser_params,
            "manager_params": manager_params,
            "extension_sockets": extension_sockets
        }
        custom_function = command[1]
        custom_function(*command[2], visit_id=command[3], **context)
Ejemplo n.º 4
0
def execute_command(command, webdriver, browser_settings, browser_params,
                    manager_params, extension_socket):
    """Route one BrowserManager command tuple to its helper function.

    Commands are of form (COMMAND, ARG0, ARG1, ...): element 0 selects the
    helper and the remaining elements are forwarded to it as shown in each
    branch below. A command name that matches no branch is ignored.
    """
    name = command[0]

    if name == 'GET':
        browser_commands.get_website(
            url=command[1],
            sleep=command[2],
            visit_id=command[3],
            webdriver=webdriver,
            browser_params=browser_params,
            extension_socket=extension_socket
        )

    elif name == 'BROWSE':
        browser_commands.browse_website(
            url=command[1],
            num_links=command[2],
            sleep=command[3],
            visit_id=command[4],
            webdriver=webdriver,
            browser_params=browser_params,
            manager_params=manager_params,
            extension_socket=extension_socket
        )

    elif name == 'DUMP_FLASH_COOKIES':
        browser_commands.dump_flash_cookies(
            start_time=command[1],
            visit_id=command[2],
            webdriver=webdriver,
            browser_params=browser_params,
            manager_params=manager_params
        )

    elif name == 'DUMP_PROFILE_COOKIES':
        browser_commands.dump_profile_cookies(
            start_time=command[1],
            visit_id=command[2],
            webdriver=webdriver,
            browser_params=browser_params,
            manager_params=manager_params
        )

    elif name == 'DUMP_PROF':
        # Flash data is saved only when flash is explicitly not disabled.
        profile_commands.dump_profile(
            browser_profile_folder=browser_params['profile_path'],
            manager_params=manager_params,
            browser_params=browser_params,
            tar_location=command[1],
            close_webdriver=command[2],
            webdriver=webdriver,
            browser_settings=browser_settings,
            compress=command[3],
            save_flash=browser_params['disable_flash'] is False
        )

    # For the four commands below the tuple carries the suffix first
    # (command[1]) and the visit id second (command[2]).
    elif name == 'DUMP_PAGE_SOURCE':
        browser_commands.dump_page_source(
            visit_id=command[2],
            driver=webdriver,
            manager_params=manager_params,
            suffix=command[1]
        )

    elif name == 'RECURSIVE_DUMP_PAGE_SOURCE':
        browser_commands.recursive_dump_page_source(
            visit_id=command[2],
            driver=webdriver,
            manager_params=manager_params,
            suffix=command[1]
        )

    elif name == 'SAVE_SCREENSHOT':
        browser_commands.save_screenshot(
            visit_id=command[2],
            crawl_id=browser_params['crawl_id'],
            driver=webdriver,
            manager_params=manager_params,
            suffix=command[1]
        )

    elif name == 'SCREENSHOT_FULL_PAGE':
        browser_commands.screenshot_full_page(
            visit_id=command[2],
            crawl_id=browser_params['crawl_id'],
            driver=webdriver,
            manager_params=manager_params,
            suffix=command[1]
        )

    elif name == 'RUN_CUSTOM_FUNCTION':
        # The custom callable (command[1]) receives its own positional
        # arguments (command[2]) plus the whole execution context as
        # keyword arguments.
        context = {
            "command": command,
            "driver": webdriver,
            "browser_settings": browser_settings,
            "browser_params": browser_params,
            "manager_params": manager_params,
            "extension_socket": extension_socket
        }
        custom_function = command[1]
        custom_function(*command[2], **context)
def _find_and_fill_form(webdriver, user_data, visit_id, debug, browser_params,
                        manager_params, logger):
    """Find a newsletter form, fill and submit it, then handle follow-ups.

    Search order for the initial form: modal dialogs (dialogs without a
    form are dismissed), then the page itself, then every <iframe>/<frame>.
    After the initial submission, the other open windows and then the
    current page are checked for one follow-up form, which is filled and
    submitted as well. Every window other than the one that was current on
    entry is closed before returning.

    Args:
        webdriver: Selenium driver positioned on the page to examine.
        user_data: Mapping handed to ``_form_fill_and_submit``; its
            ``'email'`` entry is also written to the log.
        visit_id: Identifier used as the prefix of debug artifact names and
            in log messages (formatted with ``%d``).
        debug: When truthy, progress is logged at debug level and page
            sources / screenshots are saved around each submission.
        browser_params: Passed through to the dump/screenshot/fill helpers.
        manager_params: Passed through to the dump/screenshot/fill helpers.
        logger: Logger used for submission, debug and error messages.

    Returns:
        False if no newsletter form was found in a dialog, on the page or in
        any frame; True otherwise.
    """
    current_url = webdriver.current_url
    main_handle = webdriver.current_window_handle
    in_iframe = False

    if debug:
        logger.debug('The current URL is %s' % current_url)

    # debug: save before/after screenshots and page source
    debug_file_prefix = str(visit_id) + '_'
    debug_form_pre_initial = debug_file_prefix + 'form_initial_presubmit'
    debug_form_post_initial = debug_file_prefix + 'form_initial_result'
    debug_form_pre_followup = debug_file_prefix + 'form_followup_presubmit'
    debug_form_post_followup = debug_file_prefix + 'form_followup_result'
    debug_page_source_initial = debug_file_prefix + 'src_initial'
    debug_page_source_followup = debug_file_prefix + 'src_followup'

    newsletter_form = None

    # Search for a modal dialog, and for a form in the modal dialog.
    # At most _MAX_POPUP_DISMISS dialogs are examined. Any error in this
    # phase is logged and the search continues with the rest of the page.
    try:
        search_count = 0
        while search_count < _MAX_POPUP_DISMISS:
            if debug:
                logger.debug('Round %d of modal dialog search...' %
                             search_count)
            dialog_container = _get_dialog_container(webdriver)
            if dialog_container:
                if debug:
                    logger.debug(
                        'Modal dialog found, searching for newsletter form in dialog...'
                    )
                newsletter_form = _find_newsletter_form(
                    dialog_container, webdriver, debug, logger)

                if newsletter_form is None:
                    clicked = _dismiss_dialog(webdriver, dialog_container)
                    if int(clicked) > 0:
                        if debug:
                            logger.debug(
                                'No newsletter form in dialog, dismissed it'
                            )
                    else:
                        # Fallback dismissal: the ESC key press is part of
                        # the crawl behavior and must happen whether or not
                        # debug logging is enabled; only the log lines are
                        # conditional on ``debug``.
                        if debug:
                            logger.debug(
                                'Made no clicks to dismiss the dialog')
                        webdriver.find_element_by_tag_name(
                            'html').send_keys(Keys.ESCAPE)
                        if debug:
                            logger.debug(
                                'Pressed ESC to dismiss the dialog')
                else:
                    if debug:
                        logger.debug('Found a newsletter form in the dialog')
                    break
            else:
                if debug:
                    logger.debug('No dialog on the page')
                break

            search_count += 1
    except Exception as e:
        logger.error('Error while examining for modal dialogs: %s' % str(e))

    # try to find newsletter forms on landing page after dismissing the dialog
    if newsletter_form is None:
        if debug:
            logger.debug(
                'Searching the rest of the page for a newsletter form')
        newsletter_form = _find_newsletter_form(None, webdriver, debug, logger)

    # Search for newsletter forms in iframes
    if newsletter_form is None:
        if debug:
            logger.debug(
                'No newsletter form found on this page, searching for forms in iframes...'
            )

        # search for forms in iframes (if present)
        iframes = webdriver.find_elements_by_tag_name(
            'iframe') + webdriver.find_elements_by_tag_name('frame')
        if debug:
            logger.debug('Searching in %d iframes' % len(iframes))

        for iframe in iframes:
            try:
                # switch to the iframe
                webdriver.switch_to_frame(iframe)

                # is there a form?
                newsletter_form = _find_newsletter_form(
                    None, webdriver, debug, logger)
                if newsletter_form is not None:
                    if debug:
                        dump_page_source(debug_page_source_initial, webdriver,
                                         browser_params, manager_params)
                        logger.debug(
                            'Found a newsletter in an iframe on this page')
                    in_iframe = True
                    break  # form found, stay on the iframe

                # switch back
                webdriver.switch_to_default_content()
            except Exception as e:
                # A failing frame must not abort the search: report it (in
                # debug mode), return to the top-level document and move on.
                if debug:
                    logger.error('Error while analyzing an iframe: %s' %
                                 str(e))
                webdriver.switch_to_default_content()

        # still no form?
        if newsletter_form is None:
            if debug:
                logger.debug('None of the iframes have newsletter forms')
            return False
    elif debug:
        dump_page_source(debug_page_source_initial, webdriver, browser_params,
                         manager_params)

    email = user_data['email']
    user_info = user_data
    _form_fill_and_submit(newsletter_form, user_info, webdriver, True,
                          browser_params, manager_params,
                          debug_form_pre_initial if debug else None)
    logger.info('Submitted form on [%s] with email [%s] on visit_id [%d]',
                current_url, email, visit_id)
    time.sleep(_FORM_SUBMIT_SLEEP)
    _dismiss_alert(webdriver)

    if debug:
        save_screenshot(debug_form_post_initial, webdriver, browser_params,
                        manager_params)
        logger.debug('The current URL is %s' % webdriver.current_url)
        logger.debug('Filling any follow-up forms on this page...')

    # fill any follow-up forms...
    wait_until_loaded(webdriver, _PAGE_LOAD_TIME)  # wait if we got redirected
    follow_up_form = None

    # first check other windows (ex. pop-ups)
    windows = webdriver.window_handles
    if len(windows) > 1:
        if debug:
            logger.debug('Found %d windows (e.g., popups)' % len(windows))
        for window in windows:
            if window != main_handle:
                webdriver.switch_to_window(window)

                # find newsletter form; only the first follow-up form found
                # across all windows is submitted
                if follow_up_form is None:
                    follow_up_form = _find_newsletter_form(
                        None, webdriver, debug, logger)
                    if follow_up_form is not None:
                        if debug:
                            dump_page_source(debug_page_source_followup,
                                             webdriver, browser_params,
                                             manager_params)
                            logger.debug(
                                'Found a newsletter form in another window')
                        _form_fill_and_submit(
                            follow_up_form, user_info, webdriver, True,
                            browser_params, manager_params,
                            debug_form_pre_followup if debug else None)

                        logger.info(
                            'Submitted form on [%s] with email [%s] on visit_id [%d]',
                            webdriver.current_url, email, visit_id)

                        time.sleep(_FORM_SUBMIT_SLEEP)
                        _dismiss_alert(webdriver)
                        if debug:
                            save_screenshot(debug_form_post_followup,
                                            webdriver, browser_params,
                                            manager_params)

                # every non-main window is closed, form or not
                webdriver.close()
        webdriver.switch_to_window(main_handle)
        time.sleep(1)

    # else check current page
    if follow_up_form is None:
        if debug:
            logger.debug(
                'Found no follow-up forms in other windows, checking current page'
            )
        follow_up_form = _find_newsletter_form(None, webdriver, debug, logger)
        if follow_up_form is not None:
            if debug:
                dump_page_source(debug_page_source_followup, webdriver,
                                 browser_params, manager_params)
                logger.debug('Found a follow-up form in this page')

            _form_fill_and_submit(follow_up_form, user_info, webdriver, True,
                                  browser_params, manager_params,
                                  debug_form_pre_followup if debug else None)

            logger.info(
                'Submitted form on [%s] with email [%s] on visit_id [%d]',
                webdriver.current_url, email, visit_id)

            time.sleep(_FORM_SUBMIT_SLEEP)
            _dismiss_alert(webdriver)
            if debug:
                save_screenshot(debug_form_post_followup, webdriver,
                                browser_params, manager_params)
        else:
            if debug:
                logger.debug('No follow-up forms on the current page')

    # switch back
    if in_iframe:
        if debug:
            logger.debug(
                'We were in an iframe, switching back to the main window')
        webdriver.switch_to_default_content()

    # close other windows (ex. pop-ups) that are still open at this point
    windows = webdriver.window_handles
    if len(windows) > 1:
        if debug:
            logger.debug('Closing %d windows (e.g., popups)' % len(windows))
        for window in windows:
            if window != main_handle:
                webdriver.switch_to_window(window)
                webdriver.close()
        webdriver.switch_to_window(main_handle)
        time.sleep(1)

    return True