    browser_params[i]['profile_tar'] = 'Output/Profiles/Facebook_3'  # self-written
    browser_params[i]['scroll_down'] = False
    browser_params[i]['login'] = False
    browser_params[i]['execute_tshark'] = False

# Update TaskManager configuration (use this for crawl-wide settings)
manager_params['data_directory'] = 'Output/Data'
manager_params['log_directory'] = 'Output/Data'
# manager_params['database_name'] = 'output.sqlite'

default_timeout = 60
default_sleep = 5

manager = TaskManager.TaskManager(manager_params, browser_params)

# Visits the sites with all browsers simultaneously
for site in sites_to_crawl:
    # Define crawl actions
    command_sequence_get1 = CommandSequence.CommandSequence(site, reset=False)
    command_sequence_get1.get(step=0, sleep=default_sleep, timeout=default_timeout)
    command_sequence_get1.dump_profile_cookies(timeout=default_timeout)
    command_sequence_get1.dump_flash_cookies(timeout=default_timeout)
    manager.execute_command_sequence(command_sequence_get1, index='**')  # ** = synchronized browsers

# Shuts down the browsers and waits for the data to finish logging
manager.close()
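# The snippet above assumes the usual OpenWPM imports and a list of sites to
# visit. A minimal sketch of that scaffolding (the site list and NUM_BROWSERS
# value are illustrative, not from the original crawl):
from automation import CommandSequence, TaskManager

NUM_BROWSERS = 1
sites_to_crawl = ['http://www.example.com']

# load_default_params returns one manager dict and one dict per browser
manager_params, browser_params = TaskManager.load_default_params(NUM_BROWSERS)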
    'http://www.example.com',
    'http://www.princeton.edu',
    'https://citp.princeton.edu/'
]

# Saves a crawl output DB to the Desktop
db_loc = os.path.expanduser('~/Desktop/openwpm_demo.sqlite')

# Loads 3 copies of the default browser preference dictionaries
browser_params = TaskManager.load_default_params(NUM_BROWSERS)

# Enable Flash for all three browsers
for i in xrange(NUM_BROWSERS):
    browser_params[i]['disable_flash'] = False

# Launch the first browser headless
browser_params[0]['headless'] = True

# Instantiates the measurement platform
# Launches two (non-headless) Firefox instances and one headless instance,
# logging data with MITMProxy
# Commands time out by default after 60 seconds
manager = TaskManager.TaskManager(db_loc, browser_params, NUM_BROWSERS)

# Visits the sites with all three browsers simultaneously, 5 seconds between visits
for site in sites:
    manager.get(site, index='**')  # ** = synchronized browsers
    time.sleep(5)

# Shuts down the browsers and waits for the data to finish logging
manager.close()
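# Once this crawl finishes, the results live in the SQLite file at db_loc.
# A quick inspection sketch -- the table and column names assume the standard
# OpenWPM schema of this era (a site_visits table with visit_id and site_url):
import os
import sqlite3

conn = sqlite3.connect(os.path.expanduser('~/Desktop/openwpm_demo.sqlite'))
for visit_id, site_url in conn.execute(
        'SELECT visit_id, site_url FROM site_visits'):
    print('{} {}'.format(visit_id, site_url))
conn.close()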
    browser_params[i]['navigation_instrument'] = True

    # Launch headless
    browser_params[i]['display_mode'] = 'headless'

cwd = os.getcwd() + "/temp"
manager_params['data_directory'] = cwd
manager_params['log_directory'] = cwd

# Instantiates the measurement platform
# Commands time out by default after 60 seconds
logger_params = {"log_level_console": logging.CRITICAL}
manager = TaskManager.TaskManager(
    manager_params,
    browser_params,
    logger_kwargs=logger_params
)

# Define command sequence
command_sequence = CommandSequence.CommandSequence(
    site,
    reset=True,
    callback=lambda success, val=site: print("CommandSequence {} done".format(val))
)
command_sequence.get(sleep=3, timeout=60)

def search_and_visit(query, n, **kwargs):
    driver = kwargs['driver']
    input_elem = driver.find_element_by_name("q")
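# The custom command above is cut off after locating the search box. A minimal
# sketch of how such a function might continue, assuming the target page has a
# search input named "q" and plain anchor elements as results (the selector and
# the final run_custom_function call are illustrative, not from the original):
from selenium.webdriver.common.keys import Keys

def search_and_visit(query, n, **kwargs):
    driver = kwargs['driver']
    input_elem = driver.find_element_by_name("q")
    # Submit the query
    input_elem.send_keys(query)
    input_elem.send_keys(Keys.RETURN)
    # Collect result URLs first, then navigate; visiting while iterating over
    # the elements would leave them stale after the first driver.get()
    links = driver.find_elements_by_css_selector("a[href^='http']")[:n]
    urls = [link.get_attribute('href') for link in links]
    for url in urls:
        driver.get(url)

command_sequence.run_custom_function(search_and_visit, ('openwpm', 3))
manager.execute_command_sequence(command_sequence)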
            raise
        except Exception:
            continue
        if href is None:
            continue
        href = urlparse.urljoin(driver.current_url, href)
        if (not href.startswith('http')
                or du.get_ps_plus_1(href) != top_ps1):
            continue
        intra_links.add(urlparse.urldefrag(href)[0])
    with open(out_file, 'w') as f:
        json.dump([top_url, list(intra_links)], f)

manager = TaskManager.TaskManager(manager_params, browser_params,
                                  process_watchdog=True)

current_index = 0
for i in range(start_index, end_index):
    current_index = i
    if current_index >= TOTAL_NUM_SITES:
        break
    try:
        url = sites[i]
        cs = CommandSequence.CommandSequence('http://' + url, reset=True)
        cs.get(sleep=10, timeout=120)
        cs.run_custom_function(save_links_to_file, func_args=(url,), timeout=120)
        manager.execute_command_sequence(cs)
        with open(os.path.expanduser('~/.openwpm/current_site_index'),
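# The loop records its position after every visit, so an interrupted crawl can
# pick up where it left off. A sketch of the matching startup logic (the file
# path mirrors the snippet above; treating a missing file as index 0 and
# skipping the last finished site are assumptions):
index_file = os.path.expanduser('~/.openwpm/current_site_index')
if os.path.isfile(index_file):
    with open(index_file) as f:
        start_index = int(f.read().strip()) + 1  # resume after the last finished site
else:
    start_index = 0
end_index = TOTAL_NUM_SITES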
manager_params['log_directory'] = '~/Desktop/%s/' % CRAWL_DIRECTORY
manager_params['output_format'] = 's3'
manager_params['s3_bucket'] = S3_BUCKET
manager_params['s3_directory'] = CRAWL_DIRECTORY

# Allow the use of localstack's mock S3 service
S3_ENDPOINT = os.getenv('S3_ENDPOINT')
if S3_ENDPOINT:
    boto3.DEFAULT_SESSION = LocalS3Session(endpoint_url=S3_ENDPOINT)
    manager_params['s3_bucket'] = local_s3_bucket(boto3.resource('s3'),
                                                  name=S3_BUCKET)

# Instantiates the measurement platform
# Commands time out by default after 60 seconds
manager = TaskManager.TaskManager(manager_params, browser_params,
                                  logger_kwargs=LOGGER_SETTINGS)

# At this point, Sentry should be initialized
if SENTRY_DSN:
    # Add crawler.py-specific context
    with sentry_sdk.configure_scope() as scope:
        # Tags generate breakdown charts and search filters
        scope.set_tag('CRAWL_DIRECTORY', CRAWL_DIRECTORY)
        scope.set_tag('S3_BUCKET', S3_BUCKET)
        scope.set_tag('HTTP_INSTRUMENT', HTTP_INSTRUMENT)
        scope.set_tag('COOKIE_INSTRUMENT', COOKIE_INSTRUMENT)
        scope.set_tag('NAVIGATION_INSTRUMENT', NAVIGATION_INSTRUMENT)
        scope.set_tag('JS_INSTRUMENT', JS_INSTRUMENT)
        scope.set_tag('JS_INSTRUMENT_MODULES', JS_INSTRUMENT_MODULES)
        scope.set_tag('SAVE_CONTENT', SAVE_CONTENT)
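# The comment above notes that Sentry must already be initialized before the
# scope is configured. A minimal sketch of that earlier step, using the
# standard sentry_sdk API (only the DSN is required; reading it from the
# environment is an assumption):
import sentry_sdk

SENTRY_DSN = os.getenv('SENTRY_DSN')
if SENTRY_DSN:
    sentry_sdk.init(dsn=SENTRY_DSN)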