def execute_command(command, webdriver, proxy_queue, browser_settings,
                    browser_params, manager_params, extension_socket):
    """Route one BrowserManager command tuple to its helper function.

    ``command`` has the form (COMMAND, ARG0, ARG1, ...): ``command[0]``
    selects the helper and the remaining items are forwarded positionally.
    Recognised names are GET, BROWSE, DUMP_FLASH_COOKIES,
    DUMP_PROFILE_COOKIES, DUMP_PROF and EXTRACT_LINKS; any other name is
    ignored.  Helper return values are discarded, so the result is always
    None.

    Keep the imports of this file restricted to the helper libraries.
    """
    action = command[0]

    if action == 'GET':
        browser_commands.get_website(
            command[1], command[2],
            webdriver, proxy_queue, browser_params, extension_socket)
    elif action == 'BROWSE':
        browser_commands.browse_website(
            command[1], command[2], command[3],
            webdriver, proxy_queue, browser_params, manager_params,
            extension_socket)
    elif action == 'DUMP_FLASH_COOKIES':
        browser_commands.dump_flash_cookies(
            command[1], command[2],
            webdriver, browser_params, manager_params)
    elif action == 'DUMP_PROFILE_COOKIES':
        browser_commands.dump_profile_cookies(
            command[1], command[2],
            webdriver, browser_params, manager_params)
    elif action == 'DUMP_PROF':
        # save_flash is True only when 'disable_flash' is the object False
        # (identity test), not for other falsy values.
        profile_commands.dump_profile(
            browser_params['profile_path'], manager_params, browser_params,
            command[1], command[2], webdriver, browser_settings,
            compress=command[3],
            save_flash=browser_params['disable_flash'] is False)
    elif action == 'EXTRACT_LINKS':
        browser_commands.extract_links(
            webdriver, browser_params, manager_params)
def execute_command(command, webdriver, proxy_queue, browser_settings,
                    browser_params):
    """Hand a BrowserManager command tuple to the helper that implements it.

    Commands look like (COMMAND, ARG0, ARG1, ...).  The names handled here
    are GET, DUMP_STORAGE_VECTORS, DUMP_PROF and EXTRACT_LINKS; every other
    name falls through without doing anything.  Nothing is returned.

    Only helper libraries should be imported by this file.
    """
    name = command[0]

    if name == 'GET':
        browser_commands.get_website(
            command[1], webdriver, proxy_queue, browser_params)
        return

    if name == 'DUMP_STORAGE_VECTORS':
        browser_commands.dump_storage_vectors(
            command[1], command[2], webdriver, browser_params)
        return

    if name == 'DUMP_PROF':
        # Flash data is saved only when 'disable_flash' is exactly False.
        profile_commands.dump_profile(
            browser_params['profile_path'],
            command[1],
            command[2],
            webdriver,
            browser_settings,
            save_flash=browser_params['disable_flash'] is False)
        return

    if name == 'EXTRACT_LINKS':
        browser_commands.extract_links(webdriver, browser_params)
def execute_command(command, webdriver, proxy_queue, browser_settings,
                    browser_params):
    """Dispatch a BrowserManager command tuple to a helper library call.

    A command is a sequence (COMMAND, ARG0, ARG1, ...).  ``command[0]`` is
    compared against GET, BROWSE, DUMP_STORAGE_VECTORS, DUMP_PROF,
    EXTRACT_LINKS, FBLOGIN and EXTRACT_FRIENDS; the matching helper receives
    the remaining items together with the browser objects.  Unknown names
    are silently skipped and the function always returns None.

    This file should import nothing but the helper libraries.
    """
    kind = command[0]

    if kind == 'GET':
        browser_commands.get_website(
            command[1], webdriver, proxy_queue, browser_params)
    elif kind == 'BROWSE':
        browser_commands.browse_website(
            command[1], command[2], webdriver, proxy_queue, browser_params)
    elif kind == 'DUMP_STORAGE_VECTORS':
        browser_commands.dump_storage_vectors(
            command[1], command[2], webdriver, browser_params)
    elif kind == 'DUMP_PROF':
        # Identity test: only the literal False enables save_flash.
        profile_commands.dump_profile(
            browser_params['profile_path'], command[1], command[2],
            webdriver, browser_settings,
            save_flash=browser_params['disable_flash'] is False)
    elif kind == 'EXTRACT_LINKS':
        browser_commands.extract_links(webdriver, browser_params)
    elif kind == 'FBLOGIN':
        # Note the argument order: the driver comes first for login().
        facebook_commands.login(
            webdriver, command[1], command[2], browser_params)
    elif kind == 'EXTRACT_FRIENDS':
        facebook_commands.extract_friends(
            command[1], command[2], command[3], webdriver, browser_params)
def execute_command(command, webdriver, proxy_queue, browser_settings,
                    browser_params, manager_params, extension_socket):
    """Run the helper that corresponds to a BrowserManager command tuple.

    Commands are (COMMAND, ARG0, ARG1, ...).  Supported names: GET, BROWSE,
    DUMP_FLASH_COOKIES, DUMP_PROFILE_COOKIES, DUMP_PROF, EXTRACT_LINKS.
    Arguments are passed to the helpers positionally.  A name that matches
    none of these is ignored.  The function has no return value.

    Imports in this file are meant to be helper libraries only.
    """
    opcode = command[0]

    if opcode == 'GET':
        browser_commands.get_website(
            command[1], webdriver, proxy_queue, browser_params,
            extension_socket)
        return

    if opcode == 'BROWSE':
        browser_commands.browse_website(
            command[1], command[2], webdriver, proxy_queue, browser_params,
            manager_params, extension_socket)
        return

    if opcode == 'DUMP_FLASH_COOKIES':
        browser_commands.dump_flash_cookies(
            command[1], command[2], webdriver, browser_params,
            manager_params)
        return

    if opcode == 'DUMP_PROFILE_COOKIES':
        browser_commands.dump_profile_cookies(
            command[1], command[2], webdriver, browser_params,
            manager_params)
        return

    if opcode == 'DUMP_PROF':
        # save_flash becomes True only if 'disable_flash' is exactly False.
        profile_commands.dump_profile(
            browser_params['profile_path'], manager_params, browser_params,
            command[1], command[2], webdriver, browser_settings,
            compress=command[3],
            save_flash=browser_params['disable_flash'] is False)
        return

    if opcode == 'EXTRACT_LINKS':
        browser_commands.extract_links(
            webdriver, browser_params, manager_params)
def execute_command(command, webdriver, proxy_queue, browser_settings,
                    browser_params, manager_params, extension_socket):
    """Dispatch a BrowserManager command tuple to the matching helper.

    ``command`` is (COMMAND, ARG0, ARG1, ...).  Handled names: GET, BROWSE,
    DUMP_FLASH_COOKIES, DUMP_PROFILE_COOKIES, DUMP_PROF, EXTRACT_LINKS,
    SAVE_SCREENSHOT, DUMP_PAGE_SOURCE, EXTRACT_ELEMENTS and
    RUN_CUSTOM_FUNCTION.  Unrecognised names do nothing.  Helper results are
    dropped; the function returns None.

    For RUN_CUSTOM_FUNCTION, ``command[1]`` is a callable and ``command[2]``
    an iterable of positional arguments.  The callable also receives the
    keyword arguments ``command``, ``driver``, ``proxy_queue``,
    ``browser_settings``, ``browser_params``, ``manager_params`` and
    ``extension_socket``.

    Only helper libraries should be imported in this file.
    """
    action = command[0]

    if action == 'GET':
        browser_commands.get_website(
            url=command[1],
            sleep=command[2],
            scroll=command[3],
            visit_id=command[4],
            webdriver=webdriver,
            proxy_queue=proxy_queue,
            browser_params=browser_params,
            extension_socket=extension_socket)
    elif action == 'BROWSE':
        browser_commands.browse_website(
            url=command[1],
            num_links=command[2],
            sleep=command[3],
            visit_id=command[4],
            webdriver=webdriver,
            proxy_queue=proxy_queue,
            browser_params=browser_params,
            manager_params=manager_params,
            extension_socket=extension_socket)
    elif action == 'DUMP_FLASH_COOKIES':
        browser_commands.dump_flash_cookies(
            start_time=command[1],
            visit_id=command[2],
            webdriver=webdriver,
            browser_params=browser_params,
            manager_params=manager_params)
    elif action == 'DUMP_PROFILE_COOKIES':
        browser_commands.dump_profile_cookies(
            start_time=command[1],
            visit_id=command[2],
            webdriver=webdriver,
            browser_params=browser_params,
            manager_params=manager_params)
    elif action == 'DUMP_PROF':
        # save_flash is True only when 'disable_flash' is the object False.
        profile_commands.dump_profile(
            browser_profile_folder=browser_params['profile_path'],
            manager_params=manager_params,
            browser_params=browser_params,
            tar_location=command[1],
            close_webdriver=command[2],
            webdriver=webdriver,
            browser_settings=browser_settings,
            compress=command[3],
            save_flash=browser_params['disable_flash'] is False)
    elif action == 'EXTRACT_LINKS':
        browser_commands.extract_links(
            webdriver, browser_params, manager_params)
    elif action == 'SAVE_SCREENSHOT':
        browser_commands.save_screenshot(
            screenshot_name=command[1],
            webdriver=webdriver,
            browser_params=browser_params,
            manager_params=manager_params)
    elif action == 'DUMP_PAGE_SOURCE':
        browser_commands.dump_page_source(
            dump_name=command[1],
            webdriver=webdriver,
            browser_params=browser_params,
            manager_params=manager_params)
    elif action == 'EXTRACT_ELEMENTS':
        browser_commands.extract_elements(
            selector=command[1],
            webdriver=webdriver,
            browser_params=browser_params,
            manager_params=manager_params)
    elif action == 'RUN_CUSTOM_FUNCTION':
        # Context handed to every custom function as keyword arguments.
        context = {
            "command": command,
            "driver": webdriver,
            "proxy_queue": proxy_queue,
            "browser_settings": browser_settings,
            "browser_params": browser_params,
            "manager_params": manager_params,
            "extension_socket": extension_socket,
        }
        custom_function = command[1]
        positional = command[2]
        custom_function(*positional, **context)
def execute_command(command, webdriver, proxy_queue, browser_settings,
                    browser_params):
    """Call the helper selected by the first item of a command tuple.

    Commands follow the layout (COMMAND, ARG0, ARG1, ...).  This variant
    understands GET, DUMP_STORAGE_VECTORS and DUMP_PROF; anything else is a
    no-op.  There is no return value.

    Restrict this file's imports to the helper libraries.
    """
    selector = command[0]

    if selector == 'GET':
        browser_commands.get_website(
            command[1], webdriver, proxy_queue, browser_params)
        return

    if selector == 'DUMP_STORAGE_VECTORS':
        browser_commands.dump_storage_vectors(
            command[1], command[2], webdriver, browser_params)
        return

    if selector == 'DUMP_PROF':
        # Only an exact False for 'disable_flash' turns save_flash on.
        profile_commands.dump_profile(
            browser_params['profile_path'],
            command[1],
            command[2],
            webdriver,
            browser_settings,
            save_flash=browser_params['disable_flash'] is False)
def fill_forms(url, email_producer, num_links, page_timeout, debug, visit_id,
               webdriver, proxy_queue, browser_params, manager_params,
               extension_socket):
    """Find a newsletter form on ``url`` (or on linked pages) and submit it.

    The landing page is loaded and scanned first.  If no form is filled
    there, up to ``num_links`` internal links are clicked one after another
    and each resulting page, plus any extra windows it opened, is scanned.
    The function stops as soon as ``_find_and_fill_form`` reports success.

    Args:
        url: page to load first.
        email_producer: forwarded unchanged to ``_find_and_fill_form``.
        num_links: maximum number of link-click rounds.
        page_timeout: value passed to ``webdriver.set_page_load_timeout``.
        debug: forwarded unchanged to ``_find_and_fill_form``.
        visit_id: forwarded to ``get_website`` and ``_find_and_fill_form``.
        webdriver: Selenium driver used for all navigation.
        proxy_queue: forwarded to ``get_website``.
        browser_params: browser configuration; ``'bot_mitigation'`` is read
            here to decide whether ``bot_mitigation`` runs after a click.
        manager_params: manager configuration; ``'logger_address'`` is
            unpacked into ``loggingclient``.
        extension_socket: forwarded to ``get_website``.

    Returns:
        None, whether or not a form was found.
    """
    # load the site
    webdriver.set_page_load_timeout(page_timeout)
    get_website(url, 0, visit_id, webdriver, proxy_queue, browser_params,
                extension_socket)

    # connect to the logger
    logger = loggingclient(*manager_params['logger_address'])

    # try to find a newsletter form on the landing page
    if _find_and_fill_form(webdriver, email_producer, visit_id, debug,
                           browser_params, manager_params, logger):
        return

    # otherwise, scan more pages
    main_handle = webdriver.current_window_handle
    visited_links = set()
    for _ in range(num_links):
        # get all links on the page
        links = webdriver.find_elements_by_tag_name('a')
        random.shuffle(links)

        current_url = webdriver.current_url
        current_ps1 = domain_utils.get_ps_plus_1(current_url)

        # find links to click
        match_links = []
        start_time = timeit.default_timer()
        for link in links:
            try:
                if not link.is_displayed():
                    continue

                # check if link is valid and not already visited
                href = link.get_attribute('href')
                if href is None or href in visited_links:
                    continue

                # check if this is an internal link
                if not _is_internal_link(href, current_url, current_ps1):
                    continue

                link_text = link.text.lower()

                # skip links with blacklisted text
                if any(bl_text in link_text
                       for bl_text in _LINK_TEXT_BLACKLIST):
                    continue

                # should we click this link?
                link_rank = 0
                for match_type, s, rank, flags in _LINK_TEXT_RANK:
                    if ((match_type == _TYPE_TEXT and s in link_text) or
                            (match_type == _TYPE_HREF and s in href)):
                        if flags & _FLAG_IN_NEW_URL_ONLY:
                            # don't use this link if the current page URL
                            # already matches too
                            if match_type == _TYPE_HREF and s in current_url:
                                continue

                        # link matches!
                        link_rank = rank
                        match_links.append(
                            (link, rank, link_text, href, flags))
                        break
                if link_rank >= _LINK_RANK_SKIP:
                    # good enough, stop looking
                    break
            except Exception:
                # Element lookups can fail while the page is changing; skip
                # this link but let KeyboardInterrupt/SystemExit propagate.
                logger.error("error while looping through links...")

            # quit if too much time passed (for some reason, this is really
            # slow...)
            if (match_links and
                    timeit.default_timer() - start_time > _LINK_MATCH_TIMEOUT):
                break

        # find the best link to click
        if not match_links:
            break  # no more links to click
        # Stable sort + last element: highest rank wins, and among equal
        # ranks the most recently collected link is chosen.
        match_links.sort(key=lambda entry: entry[1])
        next_link = match_links[-1]
        visited_links.add(next_link[3])

        # click the link
        try:
            # load the page
            logger.info("clicking on link '%s' - %s"
                        % (next_link[2], next_link[3]))
            next_link[0].click()
            wait_until_loaded(webdriver, _PAGE_LOAD_TIME)
            if browser_params['bot_mitigation']:
                bot_mitigation(webdriver)

            # find newsletter form
            if _find_and_fill_form(webdriver, email_producer, visit_id, debug,
                                   browser_params, manager_params, logger):
                return

            # should we stay on this page?
            if next_link[4] & _FLAG_STAY_ON_PAGE:
                continue

            # go back
            webdriver.back()
            wait_until_loaded(webdriver, _PAGE_LOAD_TIME)

            # check other windows (ex. pop-ups)
            windows = webdriver.window_handles
            if len(windows) > 1:
                form_found_in_popup = False
                for window in windows:
                    if window != main_handle:
                        webdriver.switch_to_window(window)
                        wait_until_loaded(webdriver, _PAGE_LOAD_TIME)

                        # find newsletter form
                        if _find_and_fill_form(webdriver, email_producer,
                                               visit_id, debug,
                                               browser_params, manager_params,
                                               logger):
                            form_found_in_popup = True

                        webdriver.close()
                webdriver.switch_to_window(main_handle)
                time.sleep(1)

                if form_found_in_popup:
                    return
        except Exception as e:
            # Best effort: a failed click or navigation must not abort the
            # scan of the remaining links, but it should leave a trace.
            logger.error("error while visiting link '%s': %r"
                         % (next_link[3], e))
def execute_command(command, webdriver, proxy_queue, browser_settings, browser_params, manager_params, extension_socket):
    """
    Executes BrowserManager commands by passing command tuples into the
    necessary helper function.

    Commands are of form (COMMAND, ARG0, ARG1, ...).  command[0] is compared
    against each supported name in turn and the remaining items are handed
    to the matching helper.  Unknown names do nothing.

    Only the DUMP_PROF branch returns a value (whatever
    profile_commands.dump_profile returns); every other path returns None.

    The only imports in this file should be imports to helper libraries.
    """
    if command[0] == 'GET':
        browser_commands.get_website(url=command[1], sleep=command[2], visit_id=command[3],
                                     webdriver=webdriver, proxy_queue=proxy_queue,
                                     browser_params=browser_params, extension_socket=extension_socket)

    if command[0] == 'BROWSE':
        browser_commands.browse_website(url=command[1], num_links=command[2], sleep=command[3],
                                        visit_id=command[4], webdriver=webdriver,
                                        proxy_queue=proxy_queue, browser_params=browser_params,
                                        manager_params=manager_params, extension_socket=extension_socket)

    if command[0] == 'DUMP_FLASH_COOKIES':
        browser_commands.dump_flash_cookies(start_time=command[1], visit_id=command[2],
                                            webdriver=webdriver, browser_params=browser_params,
                                            manager_params=manager_params)

    if command[0] == 'DUMP_PROFILE_COOKIES':
        browser_commands.dump_profile_cookies(start_time=command[1], visit_id=command[2],
                                              webdriver=webdriver, browser_params=browser_params,
                                              manager_params=manager_params)

    if command[0] == 'DUMP_PROF':
        # save_flash is True only when 'disable_flash' is exactly False.
        # The helper's result is returned to the caller.
        return profile_commands.dump_profile(browser_profile_folder=browser_params['profile_path'],
                                             manager_params=manager_params,
                                             browser_params=browser_params,
                                             tar_location=command[1], close_webdriver=command[2],
                                             webdriver=webdriver, browser_settings=browser_settings,
                                             compress=command[3],
                                             save_flash=browser_params['disable_flash'] is False)

    if command[0] == 'EXTRACT_LINKS':
        browser_commands.extract_links(webdriver, browser_params, manager_params)
        # NOTE(review): the '#======' line below looks like a leftover
        # merge-conflict separator.  As written, EXTRACT_LINKS runs both
        # extract_links and extract_links_with_extension -- confirm that
        # this is intended.
        #======
        browser_commands.extract_links_with_extension(webdriver, browser_params, manager_params)

    if command[0] == 'LOGIN':
        #browser_commands.extract_links(webdriver, browser_params, manager_params)
        browser_commands.login_google(webdriver, browser_params, manager_params)

    if command[0] == 'LOAD_PROFILE':
        # Do not use this command! There is built-in function that
        # can be activated by browser_params.json
        profile_commands.load_profile(browser_profile_folder=browser_params['profile_path'],
                                      manager_params=manager_params,
                                      browser_params=browser_params)

    #if command[0] == 'SEARCH_GOOGLE_SHOP':
    #    browser_commands.search_google_shopping(webdriver=webdriver,
    #                                            browser_params=browser_params,
    #                                            manager_params=manager_params,
    #                                            number_of_links_to_click=3,
    #                                            training=True)

    # Search for one term: command = (name, term, number_of_links_to_click, training)
    if command[0] == 'SINGLE_SEARCH_GOOGLE_SHOP':
        browser_commands.single_search_google_shopping(webdriver=webdriver, term = command[1],
                                                       browser_params=browser_params,
                                                       manager_params=manager_params,
                                                       number_of_links_to_click=command[2],
                                                       training=command[3])

    # Search by term index: command = (name, index_of_term, training)
    if command[0] == 'SINGLE_SEARCH_GOOGLE_SHOP_BY_INDEX':
        browser_commands.single_search_google_shopping_by_index(webdriver=webdriver,
                                                                index_of_term = command[1],
                                                                browser_params=browser_params,
                                                                manager_params=manager_params,
                                                                training=command[2])

    # usage: command = (name, number_of_links_to_click, training)
    if command[0] == 'MULTIPLE_SEARCH_GOOGLE_SHOP':
        browser_commands.multiple_search_google_shopping(webdriver=webdriver,
                                                         browser_params=browser_params,
                                                         manager_params=manager_params,
                                                         number_of_links_to_click=command[1],
                                                         training=command[2])

    # command = (name, index_of_url, sleep, visit_id)
    if command[0] == 'BROWSE_TRAINING_SITE_BY_INDEX':
        browser_commands.browser_website_by_index(index_of_url=command[1], sleep=command[2],
                                                  visit_id=command[3], webdriver=webdriver,
                                                  proxy_queue=proxy_queue,
                                                  browser_params=browser_params,manager_params=manager_params,
                                                  extension_socket=extension_socket)
def execute_command(command, webdriver, proxy_queue, browser_settings,
                    browser_params, manager_params, extension_sockets):
    """Dispatch a BrowserManager command tuple to its helper library call.

    ``command`` is (COMMAND, ARG0, ARG1, ...).  Handled names: GET, BROWSE,
    DUMP_FLASH_COOKIES, DUMP_PROFILE_COOKIES, DUMP_PROF, EXTRACT_LINKS,
    SAVE_SCREENSHOT, DUMP_PAGE_SOURCE, BROWSE_AND_DUMP_SOURCE,
    RECURSIVE_DUMP_PAGE_SOURCE, FACEBOOK_LOGIN, REQUEST_FILTER and
    RUN_CUSTOM_FUNCTION.  Other names are ignored.  Helper results are not
    propagated; the function returns None.

    For RUN_CUSTOM_FUNCTION, ``command[1]`` is a callable, ``command[2]`` an
    iterable of positional arguments and ``command[3]`` the value passed as
    ``visit_id``.  The callable also receives ``command``, ``driver``,
    ``proxy_queue``, ``browser_settings``, ``browser_params``,
    ``manager_params`` and ``extension_sockets`` as keyword arguments.

    Only helper libraries should be imported in this file.
    """
    action = command[0]

    if action == 'GET':
        browser_commands.get_website(
            url=command[1], sleep=command[2], visit_id=command[3],
            webdriver=webdriver, proxy_queue=proxy_queue,
            browser_params=browser_params,
            extension_sockets=extension_sockets)
    elif action == 'BROWSE':
        browser_commands.browse_website(
            url=command[1], num_links=command[2], sleep=command[3],
            visit_id=command[4], webdriver=webdriver,
            proxy_queue=proxy_queue, browser_params=browser_params,
            manager_params=manager_params,
            extension_sockets=extension_sockets)
    elif action == 'DUMP_FLASH_COOKIES':
        browser_commands.dump_flash_cookies(
            start_time=command[1], visit_id=command[2],
            webdriver=webdriver, browser_params=browser_params,
            manager_params=manager_params)
    elif action == 'DUMP_PROFILE_COOKIES':
        browser_commands.dump_profile_cookies(
            start_time=command[1], visit_id=command[2],
            webdriver=webdriver, browser_params=browser_params,
            manager_params=manager_params)
    elif action == 'DUMP_PROF':
        # save_flash is True only when 'disable_flash' is the object False.
        profile_commands.dump_profile(
            browser_profile_folder=browser_params['profile_path'],
            manager_params=manager_params, browser_params=browser_params,
            tar_location=command[1], close_webdriver=command[2],
            webdriver=webdriver, browser_settings=browser_settings,
            compress=command[3],
            save_flash=browser_params['disable_flash'] is False)
    elif action == 'EXTRACT_LINKS':
        browser_commands.extract_links(
            webdriver, browser_params, manager_params)
    elif action == 'SAVE_SCREENSHOT':
        browser_commands.save_screenshot(
            screenshot_name=command[1], webdriver=webdriver,
            browser_params=browser_params, manager_params=manager_params)
    elif action == 'DUMP_PAGE_SOURCE':
        browser_commands.dump_page_source(
            dump_name=command[1], webdriver=webdriver,
            browser_params=browser_params, manager_params=manager_params)
    elif action == 'BROWSE_AND_DUMP_SOURCE':
        browser_commands.browse_and_dump_source(
            url=command[1], num_links=command[2], sleep=command[3],
            visit_id=command[4], webdriver=webdriver,
            proxy_queue=proxy_queue, browser_params=browser_params,
            manager_params=manager_params,
            extension_sockets=extension_sockets)
    elif action == 'RECURSIVE_DUMP_PAGE_SOURCE':
        # The tuple carries the suffix before the visit id.
        browser_commands.recursive_dump_page_source(
            visit_id=command[2], driver=webdriver,
            manager_params=manager_params, suffix=command[1])
    elif action == 'FACEBOOK_LOGIN':
        facebook_commands.facebook_login(
            driver=webdriver, url=command[1], visit_id=command[2],
            manager_params=manager_params, browser_params=browser_params)
    elif action == 'REQUEST_FILTER':
        browser_commands.request_filter(
            control_message=command[1], filter_name=command[2],
            crawl_id=browser_params['crawl_id'],
            extension_sockets=extension_sockets,
            manager_params=manager_params)
    elif action == 'RUN_CUSTOM_FUNCTION':
        # Context handed to every custom function as keyword arguments.
        context = {
            "command": command,
            "driver": webdriver,
            "proxy_queue": proxy_queue,
            "browser_settings": browser_settings,
            "browser_params": browser_params,
            "manager_params": manager_params,
            "extension_sockets": extension_sockets,
        }
        custom_function = command[1]
        custom_function(*command[2], visit_id=command[3], **context)
def execute_command(command, webdriver, browser_settings, browser_params,
                    manager_params, extension_socket):
    """Run the helper selected by a BrowserManager command tuple.

    Commands are of form (COMMAND, ARG0, ARG1, ...).  Handled names: GET,
    BROWSE, DUMP_FLASH_COOKIES, DUMP_PROFILE_COOKIES, DUMP_PROF,
    DUMP_PAGE_SOURCE, RECURSIVE_DUMP_PAGE_SOURCE, SAVE_SCREENSHOT,
    SCREENSHOT_FULL_PAGE and RUN_CUSTOM_FUNCTION.  Any other name is
    ignored.  Nothing is returned.

    For RUN_CUSTOM_FUNCTION, ``command[1]`` is called with ``*command[2]``
    plus the keyword arguments ``command``, ``driver``,
    ``browser_settings``, ``browser_params``, ``manager_params`` and
    ``extension_socket``.
    """
    name = command[0]

    if name == 'GET':
        browser_commands.get_website(
            url=command[1],
            sleep=command[2],
            visit_id=command[3],
            webdriver=webdriver,
            browser_params=browser_params,
            extension_socket=extension_socket)
        return

    if name == 'BROWSE':
        browser_commands.browse_website(
            url=command[1],
            num_links=command[2],
            sleep=command[3],
            visit_id=command[4],
            webdriver=webdriver,
            browser_params=browser_params,
            manager_params=manager_params,
            extension_socket=extension_socket)
        return

    if name == 'DUMP_FLASH_COOKIES':
        browser_commands.dump_flash_cookies(
            start_time=command[1],
            visit_id=command[2],
            webdriver=webdriver,
            browser_params=browser_params,
            manager_params=manager_params)
        return

    if name == 'DUMP_PROFILE_COOKIES':
        browser_commands.dump_profile_cookies(
            start_time=command[1],
            visit_id=command[2],
            webdriver=webdriver,
            browser_params=browser_params,
            manager_params=manager_params)
        return

    if name == 'DUMP_PROF':
        # save_flash is True only when 'disable_flash' is exactly False.
        profile_commands.dump_profile(
            browser_profile_folder=browser_params['profile_path'],
            manager_params=manager_params,
            browser_params=browser_params,
            tar_location=command[1],
            close_webdriver=command[2],
            webdriver=webdriver,
            browser_settings=browser_settings,
            compress=command[3],
            save_flash=browser_params['disable_flash'] is False)
        return

    # For the four commands below the tuple is (name, suffix, visit_id).
    if name == 'DUMP_PAGE_SOURCE':
        browser_commands.dump_page_source(
            visit_id=command[2],
            driver=webdriver,
            manager_params=manager_params,
            suffix=command[1])
        return

    if name == 'RECURSIVE_DUMP_PAGE_SOURCE':
        browser_commands.recursive_dump_page_source(
            visit_id=command[2],
            driver=webdriver,
            manager_params=manager_params,
            suffix=command[1])
        return

    if name == 'SAVE_SCREENSHOT':
        browser_commands.save_screenshot(
            visit_id=command[2],
            crawl_id=browser_params['crawl_id'],
            driver=webdriver,
            manager_params=manager_params,
            suffix=command[1])
        return

    if name == 'SCREENSHOT_FULL_PAGE':
        browser_commands.screenshot_full_page(
            visit_id=command[2],
            crawl_id=browser_params['crawl_id'],
            driver=webdriver,
            manager_params=manager_params,
            suffix=command[1])
        return

    if name == 'RUN_CUSTOM_FUNCTION':
        # Context handed to every custom function as keyword arguments.
        context = {
            "command": command,
            "driver": webdriver,
            "browser_settings": browser_settings,
            "browser_params": browser_params,
            "manager_params": manager_params,
            "extension_socket": extension_socket,
        }
        custom_function = command[1]
        custom_function(*command[2], **context)