import sqlite3 as sql

from automation import CommandSequence, TaskManager


def main():
    wpm_db = "/home/jason/Desktop/NYT/crawl-data.sqlite"
    conn = sql.connect(wpm_db)
    cur = conn.cursor()
    cur.execute(SQL_Query)
    article_links = cur.fetchall()

    # Loads the manager preferences and the default browser dictionaries
    manager_params, browser_params = TaskManager.load_default_params(1)

    # Update TaskManager configuration (use this for crawl-wide settings)
    manager_params['data_directory'] = '~/Desktop/NYT/analysis'
    manager_params['log_directory'] = '~/Desktop/NYT/analysis'

    manager = TaskManager.TaskManager(manager_params, browser_params)
    for idx, link in enumerate(article_links):
        print(idx)
        print(link)
        command_sequence = CommandSequence.CommandSequence(link[1])
        command_sequence.get(sleep=0, timeout=180)
        command_sequence.dump_page_source("nyt_articles_" + str(idx), 120)
        manager.execute_command_sequence(command_sequence, index="**")
    manager.close()
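# SQL_Query is referenced in main() above but never shown; a minimal sketch,
# assuming it pulls (visit_id, site_url) pairs out of OpenWPM's site_visits
# table so that link[1] is the URL. The exact query is an assumption.
SQL_Query = "SELECT visit_id, site_url FROM site_visits"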
from bs4 import BeautifulSoup

from automation import CommandSequence, TaskManager


def main():
    with open('/home/jason/Desktop/NYT/sources/html.html', 'r') as myfile:
        soup = BeautifulSoup(myfile.read(), 'lxml')

    # Collect the .html targets of every anchor carrying a data-link attribute
    links = []
    with open('/home/jason/Desktop/NYT/sources/links.txt', 'w') as outfile:
        for item in soup.find_all('a', attrs={'data-link': True}):
            if ".html" in item['data-link']:
                outfile.write(item['data-link'])
                outfile.write("\n")
                links.append(item['data-link'])

    # Go and dump the source for each
    manager_params, browser_params = TaskManager.load_default_params(1)

    # Update TaskManager configuration (use this for crawl-wide settings)
    manager_params['data_directory'] = '~/Desktop/NYT/analysis'
    manager_params['log_directory'] = '~/Desktop/NYT/analysis'

    manager = TaskManager.TaskManager(manager_params, browser_params)
    for idx, link in enumerate(links):
        command_sequence = CommandSequence.CommandSequence(link)
        command_sequence.get(sleep=0, timeout=180)
        command_sequence.dump_page_source("nyt_ad_" + str(idx), 120)
        manager.execute_command_sequence(command_sequence, index="**")
    manager.close()
import re
import sqlite3 as sql

from automation import CommandSequence, TaskManager


def main():
    # Skip links that are bare Atlantic section fronts,
    # e.g. https://www.theatlantic.com/politics/
    pattern = re.compile("https?://www.theatlantic.com/[A-Za-z0-9-]*/$")
    wpm_db = "/home/jason/Desktop/crawl-data.sqlite"
    conn = sql.connect(wpm_db)
    cur = conn.cursor()
    cur.execute(SQL_Query)
    native_ad_links = cur.fetchall()

    # Loads the manager preferences and the default browser dictionaries
    manager_params, browser_params = TaskManager.load_default_params(1)

    # Update TaskManager configuration (use this for crawl-wide settings)
    manager_params['data_directory'] = '~/Desktop/analysis'
    manager_params['log_directory'] = '~/Desktop/analysis'

    manager = TaskManager.TaskManager(manager_params, browser_params)
    for idx, link in enumerate(native_ad_links):
        if not pattern.match(link[1]):
            print(idx)
            print(link)
            command_sequence = CommandSequence.CommandSequence(link[1])
            command_sequence.get(sleep=0, timeout=180)
            command_sequence.dump_page_source("ads" + str(idx), 120)
            manager.execute_command_sequence(command_sequence, index="**")
    manager.close()
import os

from automation import CommandSequence, TaskManager
import crawl_utils as cu  # shipped alongside OpenWPM's crawl scripts


def crawl_data(number_of_browsers=1, exit_crawl_after=5, slice_end=1000000):
    NUM_BROWSERS = number_of_browsers
    SITES = ['http://' + x for x in cu.sample_top_sites(
        location=os.path.expanduser('~/Desktop/'),
        slices=[(10000, 0, 10000), (10000, 10000, slice_end)])]

    manager_params, browser_params = TaskManager.load_default_params(NUM_BROWSERS)
    for i in range(NUM_BROWSERS):
        browser_params[i]['cookie_instrument'] = True
        browser_params[i]['js_instrument'] = True
        browser_params[i]['save_javascript'] = True
        browser_params[i]['http_instrument'] = True
        browser_params[i]['headless'] = True
        browser_params[i]['disable_flash'] = False
        browser_params[i]['save_documents'] = True
        browser_params[i]['caching_disabled'] = True

    manager_params['data_directory'] = '~/Desktop/'
    manager_params['log_directory'] = '~/Desktop/'

    count = 0
    manager = TaskManager.TaskManager(manager_params, browser_params)
    for site in SITES[0:exit_crawl_after]:
        command_sequence = CommandSequence.CommandSequence(site, reset=True)
        command_sequence.get(sleep=10, timeout=60)
        command_sequence.scroll_page()
        command_sequence.recursive_dump_page_source()
        manager.execute_command_sequence(command_sequence)
        count += 1
        if count % 1000 == 0:
            print("Total crawled:", count)
    manager.close()
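# Hypothetical stand-alone use of cu.sample_top_sites as called in crawl_data
# above. The reading of each slice as a (num_to_sample, start_rank, end_rank)
# triple over the Alexa top-1m list is an assumption; verify it against
# crawl_utils.py in your OpenWPM checkout.
sample = cu.sample_top_sites(
    location=os.path.expanduser('~/Desktop/'),
    slices=[(100, 0, 100),       # assumed: all of the top 100 sites
            (100, 100, 10000)])  # assumed: 100 sampled from ranks 100-10,000
print(len(sample))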
def run_search_google_training_by_multiple_commands(self, tmpdir):
    """Visit all the training sites, one command per visit.

    We count the training sites and visit each one by its index in the
    list, so that an error while visiting one site cannot stop the whole
    run (as it could if every visit shared a single command); all browsers
    must visit the same number of sites.
    """
    manager_params, browser_params = self.get_config(str(tmpdir))
    manager = TaskManager.TaskManager(manager_params, browser_params)
    with open(browser_params[0]['training_keywords']) as _f:
        _sites = [site for site in _f]
    nu_sites = len(_sites)

    # The URL here is only a placeholder; each appended command picks its
    # own target by index. single_search_google_shopping_by_index is a
    # custom command from this fork, not stock OpenWPM.
    cs = CommandSequence.CommandSequence("http://www.example.com")
    for i in range(0, nu_sites):
        cs.single_search_google_shopping_by_index(i, -1, training=True)
    manager.execute_command_sequence(cs, index="**")
    time.sleep(5)
    manager.close()
    print("finish....")
def crawl_site(site, manager, user_data):
    command_sequence = CommandSequence.CommandSequence(site, reset=True)
    command_sequence.fill_forms(user_data=user_data, num_links=3, timeout=120,
                                page_timeout=8, debug=True)
    manager.execute_command_sequence(command_sequence)
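# Hypothetical driver for crawl_site above. fill_forms is a custom command
# from this fork rather than stock OpenWPM, so the user_data fields below
# are assumptions; adjust them to whatever your fill_forms expects.
from automation import TaskManager

manager_params, browser_params = TaskManager.load_default_params(1)
manager = TaskManager.TaskManager(manager_params, browser_params)
user_data = {'email': 'user@example.com', 'first_name': 'Jane'}  # assumed shape
for site in ['http://example.com', 'http://example.org']:
    crawl_site(site, manager, user_data)
manager.close()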
def _stateless_crawl(self, sites):
    '''Performs a crawl with sites providing login'''
    manager = TaskManager.TaskManager(self.managerpar, [self.browserpar])
    for site in sites:
        params = self._fetch_params(site)
        commandseq = CommandSequence.CommandSequence(site, reset=True)
        commandseq.get(sleep=self.DEF_SLEEP, timeout=self.DEF_TIMEOUT)
        commandseq.login(logindata=params, timeout=self.DEF_TIMEOUT)
        manager.execute_command_sequence(commandseq, index=None)
    manager.close()
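# _fetch_params is used above but not shown; a minimal sketch, assuming
# login credentials live in a JSON file keyed by site URL. The file path
# attribute, its layout, and the logindata shape expected by the custom
# login command are all assumptions.
import json

def _fetch_params(self, site):
    with open(self.login_data_path) as f:  # hypothetical attribute
        all_params = json.load(f)
    return all_params[site]  # e.g. {'user': ..., 'password': ...}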
import time

from automation import CommandSequence, TaskManager


def dump_crawl(sites, profile_name):
    # os.system('sudo sh -c "sync; echo 1 > /proc/sys/vm/drop_caches"')
    # The list of sites that we wish to crawl
    print(sites, profile_name)
    NUM_BROWSERS = 1  # 3

    # Loads the manager preferences and copies of the default browser dictionaries
    manager_params, browser_params = TaskManager.load_default_params(NUM_BROWSERS)

    # Update browser configuration (use this for per-browser settings)
    for i in range(NUM_BROWSERS):
        # Record HTTP Requests and Responses
        # browser_params[i]['http_instrument'] = True
        # Disable Flash for all browsers
        browser_params[i]['disable_flash'] = True
        browser_params[i]['headless'] = True  # Launch all browsers headless, not only browser 0
        browser_params[i]['js_instrument'] = True
        # browser_params[i]['save_javascript'] = True
        # browser_params[i]['random_attributes'] = True
        browser_params[i]['cookie_instrument'] = True
        # browser_params[i]['cp_instrument'] = True
        # browser_params[i]['save_all_content'] = True
        # load_name is never defined in this function, so this branch is
        # effectively dead unless a seed profile is wired in.
        if 'load_name' in locals():
            browser_params[i]['profile_tar'] = load_name
        browser_params[i]['profile_archive_dir'] = "/home/ubuntu/personas/" + profile_name

    # Update TaskManager configuration (use this for crawl-wide settings)
    manager_params['data_directory'] = '~/OpenWPM/'
    manager_params['log_directory'] = '~/OpenWPM/'
    manager_params['database_name'] = "persona.sqlite"

    # Instantiates the measurement platform
    # Commands time out by default after 60 seconds
    manager = TaskManager.TaskManager(manager_params, browser_params)

    # Visits the sites with all browsers simultaneously
    for i in range(0, len(sites)):
        print(sites[i])
        site = sites[i]
        command_sequence = CommandSequence.CommandSequence(site)
        # Start by visiting the page
        command_sequence.get(sleep=0, timeout=300)
        manager.execute_command_sequence(command_sequence, (i % NUM_BROWSERS))
        time.sleep(2)
        # dump_profile_cookies/dump_flash_cookies closes the current tab;
        # a dump stores only the last-seen cookies/sites, not full history.
        # os.system('sudo sh -c "sync; echo 1 > /proc/sys/vm/drop_caches"')
        # command_sequence.dump_profile_cookies(120)
        # command_sequence.dump_profile(dump_folder="~/personas/"+profile_name, close_webdriver=True)

    # Shuts down the browsers and waits for the data to finish logging
    manager.close()
def _statefull_crawl(self, loginsite, sites):
    '''Performs a crawl by logging into one site and then crawling others'''
    manager = TaskManager.TaskManager(self.managerpar, [self.browserpar])

    # Log in to the given page
    params = self._fetch_params(loginsite)
    commandseq = CommandSequence.CommandSequence(loginsite)
    commandseq.get(sleep=self.DEF_SLEEP, timeout=self.DEF_TIMEOUT)
    commandseq.login(logindata=params, timeout=self.DEF_TIMEOUT)
    manager.execute_command_sequence(commandseq, index=None)

    # Proceed to crawl pages with the now-stateful profile
    for site in sites:
        command_sequence = CommandSequence.CommandSequence(site)
        # Start by visiting the page
        command_sequence.get(sleep=self.DEF_SLEEP, timeout=self.DEF_TIMEOUT)
        # dump_profile_cookies/dump_flash_cookies closes the current tab.
        command_sequence.dump_profile_cookies(self.DEF_COOKIE_TIME)
        command_sequence.dump_flash_cookies(self.DEF_COOKIE_TIME)
        manager.execute_command_sequence(command_sequence, index=None)
    manager.close()
from automation import CommandSequence, TaskManager


def callWPM(NUM_BROWSERS, siteslist):
    print("Thread-----------thread-------------thread-----")
    manager_params, browser_params = TaskManager.load_default_params(NUM_BROWSERS)
    # Note: only browser 0 is configured here, even if NUM_BROWSERS > 1
    browser_params[0]['http_instrument'] = True
    browser_params[0]['disable_flash'] = False
    browser_params[0]['headless'] = True
    manager_params['data_directory'] = '../database/requestdata2/'
    manager_params['log_directory'] = '../database/requestdata2/'
    manager = TaskManager.TaskManager(manager_params, browser_params)
    for site in siteslist:
        command_sequence = CommandSequence.CommandSequence(site, reset=True)
        command_sequence.get(sleep=0, timeout=10)
        manager.execute_command_sequence(command_sequence, index='**')
    manager.close()
def run_once(self):
    for stage in self.stages:
        command_sequence = CommandSequence.CommandSequence(stage.site)
        if isinstance(stage.actions, list):
            for action in stage.actions:
                action(command_sequence)
        else:
            stage.actions(command_sequence)
        if stage.group == 'experimental':
            self.manager.execute_command_sequence(command_sequence,
                                                  index='experimental')
        else:
            self.manager.execute_command_sequence(command_sequence, index='**')
    self.manager.close()
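# run_once assumes self.stages holds objects with site, actions, and group
# attributes; a minimal sketch of that structure with hypothetical values.
# Each action is a callable that appends commands to the CommandSequence.
from collections import namedtuple

Stage = namedtuple('Stage', ['site', 'actions', 'group'])
stages = [
    Stage(site='http://example.com',
          actions=lambda cs: cs.get(sleep=3, timeout=60),
          group='control'),
    Stage(site='http://example.org',
          actions=[lambda cs: cs.get(sleep=3, timeout=60),
                   lambda cs: cs.dump_page_source('example_org', 60)],
          group='experimental'),
]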
def browser_training_site(self, tmpdir):
    manager_params, browser_params = self.get_config(str(tmpdir))
    manager = TaskManager.TaskManager(manager_params, browser_params)
    with open(browser_params[0]['training_sites']) as _f:
        _sites = [site for site in _f]
    nu_sites = len(_sites)

    # The URL here is only a placeholder; browse_site_by_index (a custom
    # command from this fork, not stock OpenWPM) picks each target by index.
    cs = CommandSequence.CommandSequence("http://www.example.com")
    for i in range(nu_sites):
        cs.browse_site_by_index(i, 3)
    manager.execute_command_sequence(cs, index="**")
    time.sleep(10)
    manager.close()
def crawl(self, sites):
    '''Runs a crawl producing a dataset for unsupervised tracking detection.

    Sites are expected as a list including the protocol,
    e.g. http://www.hdm-stuttgart.de'''
    self._set_dbname(sites, self.db_prefix, self.bpath, self.CRAWL_TYPE)
    self.browserpar['disable_flash'] = True
    for _ in range(0, self.NUM_USERS):
        manager = TaskManager.TaskManager(self.managerpar, [self.browserpar])
        for site in sites:
            for _ in range(0, self.NUM_VISITS):
                command_sequence = CommandSequence.CommandSequence(site)
                command_sequence.get(sleep=self.DEF_SLEEP,
                                     timeout=self.DEF_TIMEOUT)
                manager.execute_command_sequence(command_sequence, index=None)
        manager.close()
def crawl(self, sites):
    '''Runs a crawl to measure various metrics regarding third-party tracking.

    Sites are expected as a list including the protocol,
    e.g. http://www.hdm-stuttgart.de'''
    self._set_dbname(sites, self.db_prefix, self.bpath, self.CRAWL_TYPE)
    manager = TaskManager.TaskManager(self.managerpar, [self.browserpar])
    for site in sites:
        # We run a stateless crawl (fresh profile for each page)
        command_sequence = CommandSequence.CommandSequence(site, reset=True)
        # Start by visiting the page
        command_sequence.get(sleep=self.DEF_SLEEP, timeout=self.DEF_TIMEOUT)
        # dump_profile_cookies/dump_flash_cookies closes the current tab.
        command_sequence.dump_profile_cookies(self.DEF_COOKIE_TIME)
        command_sequence.dump_flash_cookies(self.DEF_COOKIE_TIME)
        manager.execute_command_sequence(command_sequence, index=None)
    manager.close()
def test(self, tmpdir):
    # Run the test crawl
    manager_params, browser_params = self.get_config(str(tmpdir))
    manager = TaskManager.TaskManager(manager_params, browser_params)

    # Set up the command sequence: one get, then a shopping search.
    # multiple_search_google_shopping is a custom command from this fork,
    # not stock OpenWPM.
    cs = CommandSequence.CommandSequence("http://www.google.com")
    cs.get(sleep=3)
    cs.multiple_search_google_shopping(-1, training=False, sleep_time=2)
    manager.execute_command_sequence(cs, index="**")
    time.sleep(15)
    manager.close(post_process=False)
    print("finish....")
from sys import platform

from automation import CommandSequence, TaskManager


def run_demo(url):
    NUM_BROWSERS = 1
    sites = [str(url)]

    # Loads the manager preferences and the default browser dictionaries
    manager_params, browser_params = TaskManager.load_default_params(
        NUM_BROWSERS)

    # Update browser configuration (use this for per-browser settings)
    for i in range(NUM_BROWSERS):
        # Record HTTP Requests and Responses
        browser_params[i]['http_instrument'] = True
        # Enable Flash for all browsers
        browser_params[i]['disable_flash'] = False
        browser_params[i]['js_instrument'] = True
    if platform != 'darwin':
        browser_params[0]['headless'] = True  # Launch only browser 0 headless

    # Update TaskManager configuration (use this for crawl-wide settings)
    manager_params['data_directory'] = 'feature_extraction/'
    manager_params['log_directory'] = '~/Desktop/'

    # Instantiates the measurement platform
    # Commands time out by default after 60 seconds
    manager = TaskManager.TaskManager(manager_params, browser_params)

    # Visits the sites with all browsers simultaneously
    for site in sites:
        command_sequence = CommandSequence.CommandSequence(site)
        # Start by visiting the page
        command_sequence.get(sleep=0, timeout=60)
        # dump_profile_cookies/dump_flash_cookies closes the current tab.
        command_sequence.dump_profile_cookies(120)
        # index='**' synchronizes visits between the browsers
        manager.execute_command_sequence(command_sequence, index='**')

    # Shuts down the browsers and waits for the data to finish logging
    manager.close()
db_path = '/Users/sanjanaaithal/Desktop/Vanilla'

# Update TaskManager configuration (use this for crawl-wide settings)
manager_params['data_directory'] = db_path
manager_params['log_directory'] = db_path

# Instantiates the measurement platform
# Commands time out by default after 60 seconds
manager = TaskManager.TaskManager(manager_params, browser_params)

# Visit the sites
for numpy_object in top100_df.to_numpy():
    # Turn the numpy row into a site string
    site = 'http://' + str(numpy_object)[2:-2]

    # Parallelize sites over the number of browsers set above
    command_sequence = CommandSequence.CommandSequence(
        site, reset=True,
        callback=lambda success, val=site: print(
            "CommandSequence {} done".format(val)))

    # Start by visiting the page
    command_sequence.get(sleep=3, timeout=60)

    # Run commands across all browsers (simple parallelization)
    manager.execute_command_sequence(command_sequence)

# Shuts down the browsers and waits for the data to finish logging
manager.close()
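# top100_df is assumed to be a single-column pandas DataFrame of hostnames;
# the str(numpy_object)[2:-2] slicing above strips the "['...']" wrapper
# around a one-element row. A minimal sketch of building it from an
# Alexa-style CSV; the file name and column layout are assumptions.
import pandas as pd

top100_df = pd.read_csv('top-1m.csv', header=None, usecols=[1], nrows=100)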
    browser_params[i]['disable_flash'] = True          # Disable Flash for all browsers
    browser_params[i]['js_instrument'] = True          # Enable JS instrumentation
    browser_params[i]['save_javascript'] = True        # Save JS files
    browser_params[i]['headless'] = True               # Run headless
    # The two settings below appear to be custom to this fork, not stock OpenWPM
    browser_params[i]['trigger_sensor_events'] = True  # Fake sensor events
    browser_params[i]['mobile_platform'] = "android"   # or "iphone"

# Update TaskManager configuration (use this for crawl-wide settings)
manager_params['data_directory'] = '~/openwpm_mobile_100k/'
manager_params['log_directory'] = '~/openwpm_mobile_100k/'

# Instantiates the measurement platform
# Commands time out by default after 60 seconds
manager = TaskManager.TaskManager(manager_params, browser_params)

# Visits the sites with all browsers simultaneously
for rank, site in enumerate(sites, 1):
    url = "http://%s" % site
    command_sequence = CommandSequence.CommandSequence(url, reset=True)
    # Start by visiting the page
    command_sequence.get(sleep=10, timeout=60)
    # command_sequence.save_screenshot('%d_%s_screenshot' % (rank, site))
    # dump_profile_cookies/dump_flash_cookies closes the current tab.
    command_sequence.dump_profile_cookies(120)
    manager.execute_command_sequence(command_sequence)

# Shuts down the browsers and waits for the data to finish logging
manager.close()
# Send a Sentry error message (temporarily - to easily be able
# to compare error frequencies to crawl worker instance count)
sentry_sdk.capture_message("Crawl worker started")

# Connect to job queue
job_queue = rediswq.RedisWQ(name=REDIS_QUEUE_NAME, host=REDIS_HOST)
manager.logger.info("Worker with sessionID: %s" % job_queue.sessionID())
manager.logger.info("Initial queue state: empty=%s" % job_queue.empty())

# Crawl sites specified in the job queue until it is empty
while not job_queue.empty():
    job = job_queue.lease(lease_secs=120, block=True, timeout=5)
    if job is None:
        manager.logger.info("Waiting for work")
        time.sleep(5)
    else:
        site_rank, site = job.decode("utf-8").split(',')
        if "://" not in site:
            site = "http://" + site
        manager.logger.info("Visiting %s..." % site)
        command_sequence = CommandSequence.CommandSequence(site, reset=True)
        command_sequence.get(sleep=DWELL_TIME, timeout=TIMEOUT)
        manager.execute_command_sequence(command_sequence)
        job_queue.complete(job)

manager.logger.info("Job queue finished, exiting.")
manager.close()

if SENTRY_DSN:
    sentry_sdk.capture_message("Crawl worker finished")
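# The worker above consumes "rank,site" strings from a Redis work queue. A
# minimal producer sketch, assuming rediswq follows the Kubernetes
# fine-parallel-processing example and serves items from a plain Redis list
# named after the queue; verify the key layout against your rediswq module.
import redis

r = redis.Redis(host='localhost')  # hypothetical host
for rank, site in enumerate(['example.com', 'example.org'], 1):
    r.rpush('crawl-queue', '%d,%s' % (rank, site))  # hypothetical queue name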
manager.logger.info("Initial queue state: empty=%s" % job_queue.empty())

# Crawl sites specified in the job queue until it is empty
while not job_queue.empty():
    job_queue.check_expired_leases()
    job = job_queue.lease(
        lease_secs=TIMEOUT + DWELL_TIME + 30, block=True, timeout=5)
    if job is None:
        manager.logger.info("Waiting for work")
        time.sleep(5)
        continue

    retry_number = job_queue.get_retry_number(job)
    site_rank, site = job.decode("utf-8").split(',')
    if "://" not in site:
        site = "http://" + site
    manager.logger.info("Visiting %s..." % site)
    command_sequence = CommandSequence.CommandSequence(
        site, blocking=True, reset=True, retry_number=retry_number)
    command_sequence.get(sleep=DWELL_TIME, timeout=TIMEOUT)
    manager.execute_command_sequence(command_sequence)
    job_queue.complete(job)

manager.logger.info("Job queue finished, exiting.")
manager.close()

if SENTRY_DSN:
    sentry_sdk.capture_message("Crawl worker finished")
#!/usr/bin/python
from automation import CommandSequence, TaskManager


def run_custom_function(**kwargs):
    driver = kwargs['driver']
    url_title = driver.title
    print("Title: %s" % url_title)
    return


if __name__ == "__main__":
    url_list = ["https://google.com"]
    manager_params, browser_params = TaskManager.load_default_params(1)
    manager = TaskManager.TaskManager(manager_params, browser_params)
    for URL in url_list:
        cs = CommandSequence.CommandSequence(URL)
        cs.get(sleep=10, timeout=60)
        cs.run_custom_function(run_custom_function)
        manager.execute_command_sequence(cs)
    manager.close()
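# A variant of the custom function above taking a positional argument. The
# func_args parameter and the exact keyword arguments OpenWPM injects
# (driver, etc.) vary by version, so treat this as a sketch and check
# CommandSequence.run_custom_function in your checkout.
def print_element_count(tag, **kwargs):
    driver = kwargs['driver']
    print("%s count: %d" % (tag, len(driver.find_elements_by_tag_name(tag))))

# e.g. cs.run_custom_function(print_element_count, func_args=('a',))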
from automation import TaskManager, CommandSequence

# The site to visit
site = 'http://www.tbrandstudio.com/our-work/'

# Loads the manager preferences and the default browser dictionaries
manager_params, browser_params = TaskManager.load_default_params(1)

# Update TaskManager configuration (use this for crawl-wide settings)
manager_params['data_directory'] = '~/Desktop/NYT'
manager_params['log_directory'] = '~/Desktop/NYT'

# Instantiates the measurement platform
# Commands time out by default after 60 seconds
manager = TaskManager.TaskManager(manager_params, browser_params)

command_sequence = CommandSequence.CommandSequence(site)
# Start by visiting the page
command_sequence.get(sleep=0, timeout=180)
# command_sequence.scroll_bottom(timeout=180)
command_sequence.dump_page_source("nyt_ads", 120)
manager.execute_command_sequence(command_sequence, index=None)

# Shuts down the browsers and waits for the data to finish logging
manager.close()
sites = category_sites[category]
if category in set(['Recreation', 'Computers', 'Shopping']):
    continue
f.write('{0}, {1}'.format(i, category))

manager_params['data_directory'] = './{0}_data'.format(category)
manager_params['database_name'] = 'crawl-data.sqlite'
manager_params['log_file'] = 'openwpm.log'
browser_params[0]['profile_archive_dir'] = './{0}_profile'.format(category)
browser_params[0]['profile_tar'] = None

manager = TaskManager.TaskManager(manager_params, browser_params)
for site in sites:
    try:
        site = site.lower()
        if not site.startswith('http'):
            site = 'http://' + site
        command_sequence = CommandSequence.CommandSequence(site, reset=False)
        # Start by visiting the page
        command_sequence.get(sleep=0, timeout=60)
        command_sequence.dump_page_source(site.split('/')[2], timeout=60)
        # dump_profile_cookies/dump_flash_cookies closes the current tab.
        command_sequence.dump_profile_cookies(120)
        command_sequence.dump_profile(category)
        # Execute on browser 0
        manager.execute_command_sequence(command_sequence, index=0)
    except Exception:
        print('failed on site: ' + site)
manager.close()
i += 1
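# The fragment above is a loop body (note the bare continue); a minimal
# sketch of the assumed scaffolding around it. category_sites, f, and i are
# all inferred from how the fragment uses them and are hypothetical.
category_sites = {'News': ['cnn.com'], 'Recreation': ['espn.com']}  # assumed
i = 0
with open('progress.log', 'w') as f:  # hypothetical progress log
    for category in category_sites:
        ...  # the fragment above runs here, once per category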
manager_params, browser_params = TaskManager.load_default_params(NUM_BROWSERS)

prefix = 'test_browse'
manager_params['database_name'] = prefix + '.sqlite'
manager_params['data_directory'] = '~/Desktop/' + prefix
manager_params['log_directory'] = '~/Desktop/' + prefix

# Read the site list
sites = crawl_utils.get_sampled_sites(
    location=manager_params['data_directory'],
    include_rank=True,
    slices=[(100, 0, 100)])

for i in range(NUM_BROWSERS):
    browser_params[i]['js_instrument'] = True
    browser_params[i]['cookie_instrument'] = True
    browser_params[i]['http_instrument'] = True
    browser_params[i]['save_javascript'] = True
    browser_params[i]['record_js_errors'] = True

manager = TaskManager.TaskManager(manager_params, browser_params)

for i in range(len(sites)):
    cs = CommandSequence.CommandSequence(
        sites[i][0], site_rank=sites[i][1], reset=True)
    cs.browse(num_links=5, sleep=5, timeout=120)
    manager.execute_command_sequence(cs)
manager.close()
# Update TaskManager configuration (use this for crawl-wide settings)
manager_params['data_directory'] = './Results/'
manager_params['log_directory'] = './Results/'

# Instantiates the measurement platform
manager = TaskManager.TaskManager(manager_params, browser_params)

fileReader = csv.reader(open('detection/alexa/top-1m.csv'), delimiter=',')
urls = []
for (index, url) in fileReader:
    urls.append(url)
del fileReader

for i in range(0, 1, 1):  # len(urls), 1):
    url = urls[i]
    print("Command creation %s %s" % (i, url))
    # The second positional parameter clears the profile (reset)
    command_sequence = CommandSequence.CommandSequence('http://' + url, True)
    # Start by visiting the page
    command_sequence.get(sleep=3, timeout=120)
    # command_sequence.save_screenshot('EndPrint', 1000)
    # detect_webbot_detection is a custom command from this fork
    command_sequence.detect_webbot_detection(timeout=360)
    manager.execute_command_sequence(command_sequence, index=None)
    del command_sequence

# Shuts down the browsers and waits for the data to finish logging
manager.close()
manager = TaskManager.TaskManager(manager_params, browser_params,
                                  process_watchdog=True)

current_index = 0
for i in range(start_index, end_index):
    current_index = i
    if current_index >= TOTAL_NUM_SITES:
        break
    try:
        try:
            first_party, rank, url = sites[i]
        except ValueError:
            continue
        cs = CommandSequence.CommandSequence(
            url, site_rank=rank, first_party=first_party, reset=True)
        cs.get(sleep=10, timeout=120)
        manager.execute_command_sequence(cs)
        with open(os.path.expanduser('~/.openwpm/current_site_index'), 'w') as f:
            f.write(str(i))
    except CommandExecutionError:
        with open(os.path.expanduser('~/.openwpm/reboot'), 'w') as f:
            f.write(str(1))
        break

print("CLOSING TaskManager after batch")
manager.close()
crawl_utils.clear_tmp_folder()
import time

from automation import CommandSequence, TaskManager


def run_openwpm(sites, data_directory, run_id, data_base_name):
    """Run the OpenWPM framework for the passed sites and other parameters,
    gathering data in the data_base_name db."""
    print('number of passed typo candidates', len(sites))
    NUM_BROWSERS = 3
    try:
        print(data_directory)
        print(run_id)
        # Visit the sites with all browsers simultaneously, in batches of
        # roughly 400 candidates at a time
        picked_typo_candidates = set([])
        for typo_candidate in sites:
            picked_typo_candidates.add("http://" + typo_candidate)
            if len(picked_typo_candidates) % 400 == 399:
                time.sleep(10)
                manager_params, browser_params = TaskManager.load_default_params(
                    NUM_BROWSERS)
                # Update browser configuration (use this for per-browser settings)
                for i in range(NUM_BROWSERS):
                    # Record HTTP Requests and Responses
                    browser_params[i]['http_instrument'] = True
                    # Disable Flash for all three browsers
                    browser_params[i]['disable_flash'] = True
                browser_params[0]['headless'] = True  # Launch only browser 0 headless
                manager_params['data_directory'] = data_directory
                manager_params['log_directory'] = data_directory
                manager_params['run_id'] = run_id
                manager_params['database_name'] = data_base_name
                # Instantiates the measurement platform
                # Commands time out by default after 60 seconds
                manager = TaskManager.TaskManager(manager_params, browser_params)
                for site in picked_typo_candidates:
                    command_sequence = CommandSequence.CommandSequence(site)
                    # Start by visiting the page
                    command_sequence.get(sleep=0, timeout=30)
                    # index='**' synchronizes visits between the three browsers
                    manager.execute_command_sequence(command_sequence, index='**')
                # Shuts down the browsers and waits for the data to finish logging
                manager.close()
                picked_typo_candidates = set([])

        # Crawl whatever is left over after the last full batch
        manager_params, browser_params = TaskManager.load_default_params(
            NUM_BROWSERS)
        for i in range(NUM_BROWSERS):
            browser_params[i]['http_instrument'] = True
            browser_params[i]['disable_flash'] = True
        browser_params[0]['headless'] = True  # Launch only browser 0 headless
        manager_params['data_directory'] = data_directory
        manager_params['log_directory'] = data_directory
        manager_params['run_id'] = run_id
        manager_params['database_name'] = data_base_name
        manager = TaskManager.TaskManager(manager_params, browser_params)
        for site in picked_typo_candidates:
            command_sequence = CommandSequence.CommandSequence(site)
            command_sequence.get(sleep=0, timeout=30)
            manager.execute_command_sequence(command_sequence, index='**')
        manager.close()
        picked_typo_candidates = set([])
    except Exception:
        # Swallow crawl errors; consider logging these instead
        pass
                                EXTENDED_LEASE_TIME):
            manager.logger.error("Unsaved job: %s timed out", unsaved_job)

    job = job_queue.lease(
        lease_secs=TIMEOUT + DWELL_TIME + 30, block=True, timeout=5)
    if job is None:
        manager.logger.info("Waiting for work")
        time.sleep(5)
        continue

    unsaved_jobs.append(job)
    retry_number = job_queue.get_retry_number(job)
    site_rank, site = job.decode("utf-8").split(',')
    if "://" not in site:
        site = "http://" + site
    manager.logger.info("Visiting %s..." % site)
    callback = get_job_completion_callback(
        manager.logger, unsaved_jobs_lock, job_queue, job)
    command_sequence = CommandSequence.CommandSequence(
        site, blocking=True, reset=True, retry_number=retry_number,
        callback=callback, site_rank=site_rank)
    command_sequence.get(sleep=DWELL_TIME, timeout=TIMEOUT)
    manager.execute_command_sequence(command_sequence)

manager.logger.info("Job queue finished, exiting.")
manager.close()

if SENTRY_DSN:
    sentry_sdk.capture_message("Crawl worker finished")
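# get_job_completion_callback is called above but not shown; a minimal
# sketch, assuming it marks the job complete in the queue once OpenWPM
# reports the visit finished and drops it from the unsaved_jobs bookkeeping
# list. The real implementation in OpenWPM's crawler may differ.
def get_job_completion_callback(logger, unsaved_jobs_lock, job_queue, job):
    def callback(success=True):
        with unsaved_jobs_lock:
            if job in unsaved_jobs:  # unsaved_jobs assumed module-level
                unsaved_jobs.remove(job)
        job_queue.complete(job)
        logger.info("Job %r marked complete", job)
    return callback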
    # execute_tshark appears to be custom to this fork, not stock OpenWPM
    browser_params[i]['execute_tshark'] = False

# Update TaskManager configuration (use this for crawl-wide settings)
manager_params['data_directory'] = '/home/OpenWPM/Output/Data'
manager_params['log_directory'] = '/home/OpenWPM/Output/Data'
# manager_params['database_name'] = 'output.sqlite'

default_timeout = 60
default_sleep = 5
manager = TaskManager.TaskManager(manager_params, browser_params)

# Visits the sites with all browsers simultaneously
for site in sites_to_crawl:
    # Define crawl actions. The step argument to get() also appears to be
    # an extension in this fork.
    command_sequence_get1 = CommandSequence.CommandSequence(site, reset=False)
    command_sequence_get1.get(step=0, sleep=default_sleep, timeout=default_timeout)
    command_sequence_get1.dump_profile_cookies(timeout=default_timeout)
    command_sequence_get1.dump_flash_cookies(timeout=default_timeout)

    command_sequence_get2 = CommandSequence.CommandSequence(site + "-sub1", reset=False)
    command_sequence_get2.get(step=1, sleep=default_sleep, timeout=default_timeout)
    command_sequence_get2.dump_profile_cookies(timeout=default_timeout)
    command_sequence_get2.dump_flash_cookies(timeout=default_timeout)

    command_sequence_get3 = CommandSequence.CommandSequence(site + "-sub2",
    start_index = int(f.read()) + 1
    end_index = start_index + NUM_BATCH
else:
    start_index = 0
    end_index = NUM_BATCH + 1

# Start crawling
manager = TaskManager.TaskManager(manager_params, browser_params)
current_index = 0
for i in range(start_index, end_index):
    current_index = i
    if current_index >= TOTAL_NUM_SITES:
        break
    try:
        # sites[i] is assumed to be a (rank, url) pair; the rank is passed
        # via the site_rank keyword so it cannot collide with the reset
        # keyword of CommandSequence.
        command_sequence = CommandSequence.CommandSequence(
            sites[i][1], site_rank=sites[i][0], reset=True)
        command_sequence.get(sleep=10, timeout=60)
        manager.execute_command_sequence(command_sequence)
        with open(os.path.expanduser('~/.openwpm/current_site_index'), 'w') as f:
            f.write(str(i))
    except CommandExecutionError:
        with open(os.path.expanduser('~/.openwpm/stop'), 'w') as f:
            f.write(str(1))
        break

# Shut down and clean up after batch
manager.close()
cu.clear_tmp_folder()