def main():
    pattern = re.compile("https?://www.theatlantic.com/[A-Za-z0-9-]*/$")
    wpm_db = "/home/jason/Desktop/crawl-data.sqlite"
    conn = sql.connect(wpm_db)
    cur = conn.cursor()
    cur.execute(SQL_Query)
    native_ad_links = cur.fetchall()

    # Loads the manager preferences and the default browser dictionaries
    manager_params, browser_params = TaskManager.load_default_params(1)

    # Update TaskManager configuration (use this for crawl-wide settings)
    manager_params['data_directory'] = '~/Desktop/analysis'
    manager_params['log_directory'] = '~/Desktop/analysis'

    manager = TaskManager.TaskManager(manager_params, browser_params)

    for idx, link in enumerate(native_ad_links):
        if not pattern.match(link[1]):
            print idx
            print link
            command_sequence = CommandSequence.CommandSequence(link[1])
            command_sequence.get(sleep=0, timeout=180)
            command_sequence.dump_page_source("ads" + str(idx), 120)
            manager.execute_command_sequence(command_sequence, index="**")

    manager.close()
def extract_via_openwpm(sites):
    """ Utilise the OpenWPM package to extract the browser parameters for provided sites """
    print '########## OpenWPM (start) (Englehardt, 2016) ##########'

    # The number of browsers to use to extract the data
    num_of_browsers = 1

    # Loads the manager preferences and num_of_browsers copies of the default browser dictionaries
    manager_params, browser_params = TaskManager.load_default_params(num_of_browsers)

    # Update browser configuration (use this for per-browser settings)
    for i in xrange(num_of_browsers):
        browser_params[i]['disable_flash'] = False  # Enable flash for every browser
        # browser_params[0]['headless'] = True  # Launch only browser 0 headless

    # Update TaskManager configuration (use this for crawl-wide settings)
    manager_params['data_directory'] = '~/Desktop/'
    manager_params['log_directory'] = '~/Desktop/'

    # Instantiates the measurement platform
    # Commands time out by default after 60 seconds
    manager = TaskManager.TaskManager(manager_params, browser_params)

    # Visit the sites
    for site in sites:
        manager.get(site, index='**')  # ** = synchronized browsers

    # Shuts down the browsers and waits for the data to finish logging
    manager.close()
    print '########## OpenWPM (end) (Englehardt, 2016) ##########'
def init_manager(self):
    # Loads the manager preferences and NUM_BROWSERS copies of the default browser dictionaries
    manager_params, browser_params = TaskManager.load_default_params(self.num_browsers)

    assignments = self.randomized_assignments()
    self.block_assignments.append(assignments)

    # Update browser configuration (use this for per-browser settings)
    for i in range(self.num_browsers):
        # Record HTTP Requests and Responses
        browser_params[i]['http_instrument'] = True
        # Enable flash for all browsers
        browser_params[i]['disable_flash'] = False
        browser_params[i]['headless'] = True
        browser_params[i]['control'] = assignments[i]

    # Update TaskManager configuration (use this for crawl-wide settings)
    manager_params['data_directory'] = self.data_directory
    manager_params['log_directory'] = '~/Desktop/'

    # Instantiates the measurement platform
    # Commands time out by default after 60 seconds
    try:
        manager = TaskManager.TaskManager(manager_params, browser_params)
    except TypeError:
        raise Exception("Failed to start the manager")
    self.manager = manager
def crawl_data(number_of_browsers=1, exit_crawl_after=5, slice_end=1000000):
    NUM_BROWSERS = number_of_browsers
    SITES = ['http://' + x for x in cu.sample_top_sites(
        location=os.path.expanduser('~/Desktop/'),
        slices=[(10000, 0, 10000), (10000, 10000, slice_end)])]

    manager_params, browser_params = TaskManager.load_default_params(NUM_BROWSERS)

    for i in range(NUM_BROWSERS):
        browser_params[i]['cookie_instrument'] = True
        browser_params[i]['js_instrument'] = True
        browser_params[i]['save_javascript'] = True
        browser_params[i]['http_instrument'] = True
        browser_params[i]['headless'] = True
        browser_params[i]['disable_flash'] = False
        browser_params[i]['save_documents'] = True
        browser_params[i]['caching_disabled'] = True

    manager_params['data_directory'] = '~/Desktop/'
    manager_params['log_directory'] = '~/Desktop/'

    count = 0
    manager = TaskManager.TaskManager(manager_params, browser_params)

    for site in SITES[0:exit_crawl_after]:
        command_sequence = CommandSequence.CommandSequence(site, reset=True)
        command_sequence.get(sleep=10, timeout=60)
        command_sequence.scroll_page()
        command_sequence.recursive_dump_page_source()
        manager.execute_command_sequence(command_sequence)
        count += 1
        if count % 1000 == 0:
            print "Total crawled: ", count

    manager.close()
def main():
    with open('/home/jason/Desktop/NYT/sources/html.html', 'r') as myfile:
        soup = BeautifulSoup(myfile.read(), 'lxml')

    links = []
    with open('/home/jason/Desktop/NYT/sources/links.txt', 'w') as outfile:
        for item in soup.find_all('a', attrs={'data-link': True}):
            if "data-link" in item.attrs:
                if ".html" in item['data-link']:
                    outfile.write(item['data-link'])
                    outfile.write("\n")
                    links.append(item['data-link'])

    # Go and dump the source for each link
    manager_params, browser_params = TaskManager.load_default_params(1)

    # Update TaskManager configuration (use this for crawl-wide settings)
    manager_params['data_directory'] = '~/Desktop/NYT/analysis'
    manager_params['log_directory'] = '~/Desktop/NYT/analysis'

    manager = TaskManager.TaskManager(manager_params, browser_params)

    for idx, link in enumerate(links):
        command_sequence = CommandSequence.CommandSequence(link)
        command_sequence.get(sleep=0, timeout=180)
        command_sequence.dump_page_source("nyt_ad_" + str(idx), 120)
        manager.execute_command_sequence(command_sequence, index="**")

    manager.close()
def main():
    wpm_db = "/home/jason/Desktop/NYT/crawl-data.sqlite"
    conn = sql.connect(wpm_db)
    cur = conn.cursor()
    cur.execute(SQL_Query)
    article_links = cur.fetchall()

    # Loads the manager preferences and the default browser dictionaries
    manager_params, browser_params = TaskManager.load_default_params(1)

    # Update TaskManager configuration (use this for crawl-wide settings)
    manager_params['data_directory'] = '~/Desktop/NYT/analysis'
    manager_params['log_directory'] = '~/Desktop/NYT/analysis'

    manager = TaskManager.TaskManager(manager_params, browser_params)

    for idx, link in enumerate(article_links):
        print idx
        print link
        command_sequence = CommandSequence.CommandSequence(link[1])
        command_sequence.get(sleep=0, timeout=180)
        command_sequence.dump_page_source("nyt_articles_" + str(idx), 120)
        manager.execute_command_sequence(command_sequence, index="**")

    manager.close()
def dump_crawl(sites, profile_name):
    # os.system('sudo sh -c "sync; echo 1 > /proc/sys/vm/drop_caches"')
    # The list of sites that we wish to crawl
    print sites, profile_name
    NUM_BROWSERS = 1  # 3

    # Loads the manager preferences and NUM_BROWSERS copies of the default browser dictionaries
    manager_params, browser_params = TaskManager.load_default_params(NUM_BROWSERS)

    # Update browser configuration (use this for per-browser settings)
    for i in range(NUM_BROWSERS):
        # Record HTTP Requests and Responses
        # browser_params[i]['http_instrument'] = True
        browser_params[i]['disable_flash'] = True
        browser_params[i]['headless'] = True  # Launch all browsers headless, not only browser 0
        browser_params[i]['js_instrument'] = True
        # browser_params[i]['save_javascript'] = True
        # browser_params[i]['random_attributes'] = True
        browser_params[i]['cookie_instrument'] = True
        # browser_params[i]['cp_instrument'] = True
        # browser_params[i]['save_all_content'] = True
        if 'load_name' in locals():
            browser_params[i]['profile_tar'] = load_name
        browser_params[i]['profile_archive_dir'] = "/home/ubuntu/personas/" + profile_name

    # Update TaskManager configuration (use this for crawl-wide settings)
    manager_params['data_directory'] = '~/OpenWPM/'
    manager_params['log_directory'] = '~/OpenWPM/'
    manager_params['database_name'] = "persona.sqlite"

    # Instantiates the measurement platform
    # Commands time out by default after 60 seconds
    manager = TaskManager.TaskManager(manager_params, browser_params)

    # Visits the sites with all browsers simultaneously
    for i in range(0, len(sites)):
        print sites[i]
        site = sites[i]
        command_sequence = CommandSequence.CommandSequence(site)
        # Start by visiting the page
        command_sequence.get(sleep=0, timeout=300)
        # index='**' synchronizes visits between the browsers
        # command_sequence.dump_profile_cookies(120)
        # command_sequence.dump_profile(dump_folder="~/personas/", close_webdriver=True)
        manager.execute_command_sequence(command_sequence, (i % NUM_BROWSERS))
        time.sleep(2)
        # dump_profile_cookies/dump_flash_cookies closes the current tab.
        # dump_profile stores history; only the last cookies/sites are stored
        # os.system('sudo sh -c "sync; echo 1 > /proc/sys/vm/drop_caches"')
        # command_sequence.dump_profile_cookies(120)
        # command_sequence.dump_profile(dump_folder="~/personas/"+profile_name, close_webdriver=True, compress, timeout)

    # Shuts down the browsers and waits for the data to finish logging
    manager.close()
def callWPM(NUM_BROWSERS, siteslist):
    print("Thread-----------thread-------------thread-----")
    manager_params, browser_params = TaskManager.load_default_params(NUM_BROWSERS)
    browser_params[0]['http_instrument'] = True
    browser_params[0]['disable_flash'] = False
    browser_params[0]['headless'] = True
    manager_params['data_directory'] = '../database/requestdata2/'
    manager_params['log_directory'] = '../database/requestdata2/'

    manager = TaskManager.TaskManager(manager_params, browser_params)

    for site in siteslist:
        command_sequence = CommandSequence.CommandSequence(site, reset=True)
        command_sequence.get(sleep=0, timeout=10)
        manager.execute_command_sequence(command_sequence, index='**')

    manager.close()
def run_search_google_training_by_multiple_commands(self, tmpdir):
    """Visit all the training sites; each visit is a single command."""
    # Get the number of training sites and visit them one by one using their index
    # in the list; this avoids the problem that an error while visiting one site
    # could stop the whole visiting process (as with a single CommandSequence).
    # All browsers must have the same number of sites to visit.
    manager_params, browser_params = self.get_config(str(tmpdir))
    manager = TaskManager.TaskManager(manager_params, browser_params)
    # manager_params, browser_params = TaskManager.load_default_params(self.NUM_BROWSERS)

    with open(browser_params[0]['training_keywords']) as _f:
        _sites = [site for site in _f]
    nu_sites = len(_sites)

    cs = CommandSequence.CommandSequence("http://www.example.com")
    # cs2 = CommandSequence.CommandSequence("none")  # url is a placeholder
    # cs.get(sleep=3)
    # cs2.login_google()
    # manager.execute_command_sequence(cs2, index="**")

    for i in range(0, nu_sites):
        cs.single_search_google_shopping_by_index(i, -1, training=True)
        manager.execute_command_sequence(cs, index="**")

    # manager.get("http://www.google.com")
    time.sleep(5)
    manager.close()
    print("finish....")
def _stateless_crawl(self, sites):
    '''Performs a crawl with sites providing login'''
    manager = TaskManager.TaskManager(self.managerpar, [self.browserpar])
    for site in sites:
        params = self._fetch_params(site)
        commandseq = CommandSequence.CommandSequence(site, reset=True)
        commandseq.get(sleep=self.DEF_SLEEP, timeout=self.DEF_TIMEOUT)
        commandseq.login(logindata=params, timeout=self.DEF_TIMEOUT)
        manager.execute_command_sequence(commandseq, index=None)
    manager.close()
def run_demo(url):
    NUM_BROWSERS = 1
    sites = [str(url)]

    # Loads the manager preferences and NUM_BROWSERS copies of the default browser dictionaries
    manager_params, browser_params = TaskManager.load_default_params(NUM_BROWSERS)

    # Update browser configuration (use this for per-browser settings)
    for i in range(NUM_BROWSERS):
        # Record HTTP Requests and Responses
        browser_params[i]['http_instrument'] = True
        # Enable flash for all browsers
        browser_params[i]['disable_flash'] = False
        browser_params[i]['js_instrument'] = True
        if platform != 'darwin':
            browser_params[0]['headless'] = True  # Launch only browser 0 headless

    # Update TaskManager configuration (use this for crawl-wide settings)
    manager_params['data_directory'] = 'feature_extraction/'
    manager_params['log_directory'] = '~/Desktop/'

    # Instantiates the measurement platform
    # Commands time out by default after 60 seconds
    manager = TaskManager.TaskManager(manager_params, browser_params)

    # Visits the sites with all browsers simultaneously
    for site in sites:
        command_sequence = CommandSequence.CommandSequence(site)
        # Start by visiting the page
        command_sequence.get(sleep=0, timeout=60)
        # dump_profile_cookies/dump_flash_cookies closes the current tab.
        command_sequence.dump_profile_cookies(120)
        # index='**' synchronizes visits between the browsers
        manager.execute_command_sequence(command_sequence, index='**')

    # Shuts down the browsers and waits for the data to finish logging
    manager.close()
def run_site_crawl(db_path, sites, preferences, dump_location):
    """
    Runs the crawl itself.
    <db_path> is the absolute path of the crawl database
    <preferences> is a dictionary of preferences to initialize the crawler
    """
    manager = TaskManager.TaskManager(db_path, preferences, 1)
    for site in sites:
        manager.get(site)
    if dump_location:
        manager.dump_profile(dump_location, True)
    manager.close()
def browser_training_site(self, tmpdir):
    manager_params, browser_params = self.get_config(str(tmpdir))
    manager = TaskManager.TaskManager(manager_params, browser_params)
    # manager_params, browser_params = TaskManager.load_default_params(self.NUM_BROWSERS)

    with open(browser_params[0]['training_sites']) as _f:
        _sites = [site for site in _f]
    nu_sites = len(_sites)

    cs = CommandSequence.CommandSequence("http://www.example.com")
    # cs.get()
    for i in range(len(_sites)):
        cs.browse_site_by_index(i, 3)
        manager.execute_command_sequence(cs, index="**")

    # manager.get("http://www.google.com")
    time.sleep(10)
    manager.close()
def crawl(self, sites):
    '''Runs a crawl resulting in a dataset for unsupervised tracking detection.

    Sites are expected as a list including protocol, e.g. http://www.hdm-stuttgart.de'''
    self._set_dbname(sites, self.db_prefix, self.bpath, self.CRAWL_TYPE)
    self.browserpar['disable_flash'] = True
    for _ in range(0, self.NUM_USERS):
        manager = TaskManager.TaskManager(self.managerpar, [self.browserpar])
        for site in sites:
            for _ in range(0, self.NUM_VISITS):
                command_sequence = CommandSequence.CommandSequence(site)
                command_sequence.get(sleep=self.DEF_SLEEP, timeout=self.DEF_TIMEOUT)
                manager.execute_command_sequence(command_sequence, index=None)
        manager.close()
def crawl(self, sites):
    '''Runs a crawl to measure various metrics regarding third-party tracking.

    Sites are expected as a list including protocol, e.g. http://www.hdm-stuttgart.de'''
    self._set_dbname(sites, self.db_prefix, self.bpath, self.CRAWL_TYPE)
    manager = TaskManager.TaskManager(self.managerpar, [self.browserpar])
    for site in sites:
        # we run a stateless crawl (fresh profile for each page)
        command_sequence = CommandSequence.CommandSequence(site, reset=True)
        # Start by visiting the page
        command_sequence.get(sleep=self.DEF_SLEEP, timeout=self.DEF_TIMEOUT)
        # dump_profile_cookies/dump_flash_cookies closes the current tab.
        command_sequence.dump_profile_cookies(self.DEF_COOKIE_TIME)
        command_sequence.dump_flash_cookies(self.DEF_COOKIE_TIME)
        manager.execute_command_sequence(command_sequence, index=None)
    manager.close()
def test(self, tmpdir):
    # Run the test crawl
    manager_params, browser_params = self.get_config(str(tmpdir))
    manager = TaskManager.TaskManager(manager_params, browser_params)

    # Set up two sequential get commands to two URLs
    cs = CommandSequence.CommandSequence("http://www.google.com")
    cs.get(sleep=3)
    # cs.get()
    # cs.login_google()
    # cs.search_google_shopping()
    # cs.single_search_google_shopping("food", training=False)
    # time.sleep(2)
    # cs.single_search_google_shopping("baby powder", number of link, training...)
    cs.multiple_search_google_shopping(-1, training=False, sleep_time=2)
    manager.execute_command_sequence(cs, index="**")
    # manager.get("http://www.google.com")

    time.sleep(15)
    manager.close(post_process=False)
    print("finish....")
def _statefull_crawl(self, loginsite, sites):
    '''Performs a crawl by logging into one site and regularly crawling the others'''
    manager = TaskManager.TaskManager(self.managerpar, [self.browserpar])

    # Log in to the given page
    params = self._fetch_params(loginsite)
    commandseq = CommandSequence.CommandSequence(loginsite)
    commandseq.get(sleep=self.DEF_SLEEP, timeout=self.DEF_TIMEOUT)
    commandseq.login(logindata=params, timeout=self.DEF_TIMEOUT)
    manager.execute_command_sequence(commandseq, index=None)

    # Proceed to crawl the remaining pages
    for site in sites:
        command_sequence = CommandSequence.CommandSequence(site)
        # Start by visiting the page
        command_sequence.get(sleep=self.DEF_SLEEP, timeout=self.DEF_TIMEOUT)
        # dump_profile_cookies/dump_flash_cookies closes the current tab.
        command_sequence.dump_profile_cookies(self.DEF_COOKIE_TIME)
        command_sequence.dump_flash_cookies(self.DEF_COOKIE_TIME)
        manager.execute_command_sequence(command_sequence, index=None)

    manager.close()
from automation import CommandSequence, TaskManager
import pandas as pd

# Declare constants
NUM_BROWSERS = 3

# Load a pandas dataframe from the given csv file
top100_df = pd.read_csv('top-1m.csv', header=None, nrows=100, usecols=[1])

# Load the default manager params and NUM_BROWSERS copies of the default browser params
manager_params, browser_params = TaskManager.load_default_params(NUM_BROWSERS)

# Update browser configuration (use this for per-browser settings)
for i in range(NUM_BROWSERS):
    # Record HTTP Requests and Responses
    browser_params[i]['http_instrument'] = True
    # Record cookie changes
    browser_params[i]['cookie_instrument'] = True
    # Record JS Web API calls
    browser_params[i]['js_instrument'] = True
    # Set True for ad-blocking mode, False for vanilla mode
    browser_params[i]['ublock-origin'] = False
    # Do not record the callstack of all WebRequests made
    browser_params[i]['callstack_instrument'] = False
    # Do not record Navigations
    browser_params[i]['navigation_instrument'] = False
    # Launch the browsers headful (with a visible display)
    browser_params[i]['display_mode'] = 'headful'
import sys
from automation import TaskManager, CommandSequence

# Load the list of sites we wish to crawl into a list
# E.g. ['http://www.example.com', 'http://dataskydd.net']
sites = [line.rstrip('\n') for line in open('municipalities_final_urls.txt')]

manager_params, browser_params = TaskManager.load_default_params(1)

browser_params[0]['headless'] = True  # Launch the browser headless
browser_params[0]['http_instrument'] = True  # Record HTTP Requests and Responses
browser_params[0]['cookie_instrument'] = True  # Record both JS cookies and HTTP response cookies to javascript_cookies

manager_params['data_directory'] = './data/'
manager_params['log_directory'] = './data/'

manager = TaskManager.TaskManager(manager_params, browser_params)

for site in sites:
    command_sequence = CommandSequence.CommandSequence(site, reset=True)
    command_sequence.browse(num_links=5, sleep=10, timeout=360)
    command_sequence.dump_profile_cookies(120)
    manager.execute_command_sequence(command_sequence, index='**')

manager.close()
CRAWL_DIRECTORY = os.getenv('CRAWL_DIRECTORY', 'crawl-data')
S3_BUCKET = os.getenv('S3_BUCKET', 'openwpm-crawls')
HTTP_INSTRUMENT = os.getenv('HTTP_INSTRUMENT', '1') == '1'
COOKIE_INSTRUMENT = os.getenv('COOKIE_INSTRUMENT', '1') == '1'
NAVIGATION_INSTRUMENT = os.getenv('NAVIGATION_INSTRUMENT', '1') == '1'
JS_INSTRUMENT = os.getenv('JS_INSTRUMENT', '1') == '1'
JS_INSTRUMENT_MODULES = os.getenv('JS_INSTRUMENT_MODULES', None)
SAVE_CONTENT = os.getenv('SAVE_CONTENT', '')
DWELL_TIME = int(os.getenv('DWELL_TIME', '10'))
TIMEOUT = int(os.getenv('TIMEOUT', '60'))
SENTRY_DSN = os.getenv('SENTRY_DSN', None)
LOGGER_SETTINGS = MPLogger.parse_config_from_env()

# Loads the default manager params
# and NUM_BROWSERS copies of the default browser params
manager_params, browser_params = TaskManager.load_default_params(NUM_BROWSERS)

# Browser configuration
for i in range(NUM_BROWSERS):
    browser_params[i]['http_instrument'] = HTTP_INSTRUMENT
    browser_params[i]['cookie_instrument'] = COOKIE_INSTRUMENT
    browser_params[i]['navigation_instrument'] = NAVIGATION_INSTRUMENT
    browser_params[i]['js_instrument'] = JS_INSTRUMENT
    if JS_INSTRUMENT_MODULES:
        browser_params[i]['js_instrument_modules'] = JS_INSTRUMENT_MODULES
    if SAVE_CONTENT == '1':
        browser_params[i]['save_content'] = True
    elif SAVE_CONTENT == '0':
        browser_params[i]['save_content'] = False
    else:
        browser_params[i]['save_content'] = SAVE_CONTENT
from automation import TaskManager, CommandSequence

# The list of sites that we wish to crawl
NUM_BROWSERS = 3
sites = ['http://www.example.com',
         'http://www.princeton.edu',
         'http://citp.princeton.edu/']

# Loads the manager preferences and 3 copies of the default browser dictionaries
manager_params, browser_params = TaskManager.load_default_params(NUM_BROWSERS)

# Update browser configuration (use this for per-browser settings)
for i in xrange(NUM_BROWSERS):
    browser_params[i]['disable_flash'] = False  # Enable flash for all three browsers
browser_params[0]['headless'] = True  # Launch only browser 0 headless

# Update TaskManager configuration (use this for crawl-wide settings)
manager_params['data_directory'] = '~/Desktop/'
manager_params['log_directory'] = '~/Desktop/'

# Instantiates the measurement platform
# Commands time out by default after 60 seconds
manager = TaskManager.TaskManager(manager_params, browser_params)

# Visits the sites with all browsers simultaneously
for site in sites:
    command_sequence = CommandSequence.CommandSequence(site)
    command_sequence.get(sleep=0, timeout=60)
    command_sequence.dump_profile_cookies(120)
    manager.execute_command_sequence(command_sequence, index='**')  # ** = synchronized browsers

# Shuts down the browsers and waits for the data to finish logging
manager.close()
#!/usr/bin/python
from automation import CommandSequence, TaskManager


def run_custom_function(**kwargs):
    driver = kwargs['driver']
    url_title = driver.title
    print("Title: %s" % url_title)
    return


if __name__ == "__main__":
    url_list = ["https://google.com"]
    manager_params, browser_params = TaskManager.load_default_params(1)
    manager = TaskManager.TaskManager(manager_params, browser_params)
    for URL in url_list:
        cs = CommandSequence.CommandSequence(URL)
        cs.get(sleep=10, timeout=60)
        cs.run_custom_function(run_custom_function)
        manager.execute_command_sequence(cs)
    manager.close()
# manager_params, browser_params = TaskManager.load_default_params(NUM_BROWSERS)
manager_params, browser_params = load_params()

# Update browser configuration (use this for per-browser settings)
for i in xrange(NUM_BROWSERS):
    browser_params[i]['disable_flash'] = False  # Enable flash for all browsers
browser_params[0]['headless'] = False  # Launch browser 0 with a visible window

# Update TaskManager configuration (use this for crawl-wide settings)
manager_params['data_directory'] = '~/Desktop/wpm'
manager_params['log_directory'] = '~/Desktop/wpm'

# Instantiates the measurement platform
# Commands time out by default after 60 seconds
manager = TaskManager.TaskManager(manager_params, browser_params)

import time

# Visits the sites with all browsers simultaneously
for site in sites:
    command_sequence = CommandSequence.CommandSequence(site)
    command_sequence.get(sleep=3, timeout=60)
    manager.execute_command_sequence(command_sequence, index='**')  # ** = synchronized browsers
    time.sleep(3)

time.sleep(30)

# Shuts down the browsers and waits for the data to finish logging
manager.close()
def analyze_sites(sites):
    # The list of sites that we wish to crawl
    NUM_BROWSERS = 2
    # sites = [
    #     "https://www.cnn.com",
    #     "https://www.tufts.edu"
    # ]

    # Loads the default manager params
    # and NUM_BROWSERS copies of the default browser params
    manager_params, browser_params = TaskManager.load_default_params(NUM_BROWSERS)

    # Update browser configuration (use this for per-browser settings)
    for i in range(NUM_BROWSERS):
        # Record HTTP Requests and Responses
        browser_params[i]["http_instrument"] = True
        # Record cookie changes
        browser_params[i]["cookie_instrument"] = True
        # Record Navigations
        browser_params[i]["navigation_instrument"] = True
        # Record JS Web API calls
        browser_params[i]["js_instrument"] = True
        # Record the callstack of all WebRequests made
        browser_params[i]["callstack_instrument"] = True
        # Record DNS resolution
        browser_params[i]["dns_instrument"] = True

    # Launch only browser 0 headless
    browser_params[0]["display_mode"] = "headless"

    # Update TaskManager configuration (use this for crawl-wide settings)
    manager_params["data_directory"] = "~/Desktop/testing/"
    manager_params["log_directory"] = "~/Desktop/testing/"
    manager_params['output_format'] = 's3'
    manager_params['s3_bucket'] = 'ihavenothingtohide'
    manager_params['s3_directory'] = '2020-2'

    # Instantiates the measurement platform
    # Commands time out by default after 60 seconds
    manager = TaskManager.TaskManager(manager_params, browser_params)

    # Visits the sites
    for site in sites:
        # Parallelize sites over all browsers set above.
        command_sequence = CommandSequence.CommandSequence(
            site,
            reset=True,
            callback=lambda success, val=site: print(
                "CommandSequence {} done".format(val)),
        )
        # Start by visiting the page
        command_sequence.get(sleep=3, timeout=60)
        # Run commands across all browsers (simple parallelization)
        manager.execute_command_sequence(command_sequence)

    # Shuts down the browsers and waits for the data to finish logging
    manager.close()
from automation import TaskManager, CommandSequence
from automation.Errors import CommandExecutionError
import crawl_utils
import time
import os

# The list of sites that we wish to crawl
NUM_BROWSERS = 15
NUM_BATCH = 5000
DATA_DIR = os.path.join(os.path.dirname(__file__), 'data')

manager_params, browser_params = TaskManager.load_default_params(NUM_BROWSERS)

date_prefix = 'XXX'  # Updated by deployment script
prefix = date_prefix + '_dom_chunk'
manager_params['database_name'] = prefix + '.sqlite'
manager_params['data_directory'] = '~/' + prefix
manager_params['log_directory'] = '~/' + prefix

# Read the site list
sites = list()
with open(os.path.join(os.path.dirname(__file__), 'data',
                       'sites_to_recrawl_DOM_chunk.txt')) as f:
    for line in f:
        if line.strip() == '':
            continue
        sites.append(tuple(line.strip().split(',', 2)))
TOTAL_NUM_SITES = len(sites)
from pprint import pprint
import json
import time
import os

from automation import TaskManager

# The list of sites that we wish to crawl
# files = os.listdir(os.path.join(os.path.dirname(__file__), 'browser_settings'))
# NUM_BROWSERS = 0
# for f in files:
#     if '.DS_Store' not in f:
#         NUM_BROWSERS = NUM_BROWSERS + 1

REPO = os.path.abspath(os.path.join(os.path.dirname(__file__), '.'))
DB_DIR = os.path.join(REPO, 'db')
# db_loc = DB_DIR + "/amazon-crawler.sqlite"

product_urls = TaskManager.load_products()
# print product_urls
manager_params, browser_params = TaskManager.load_amazon_params()

# for i in xrange(NUM_BROWSERS):
#     browser_params[i]['disable_flash'] = True

# debug
# browser_params = [browser_params[0]]
# manager = TaskManager.TaskManager(DB_DIR, browser_params, 1)
# manager_params['data_directory'] = '~/OpenWPM/'

manager = TaskManager.TaskManager(manager_params, browser_params, True)
def run_openwpm(sites, data_directory, run_id, data_base_name):
    """ Run the OpenWPM framework for the passed sites and parameters to gather data in the data_base_name db """
    print 'number of passed typo candidates ', len(sites)
    NUM_BROWSERS = 3
    try:
        print data_directory
        print run_id
        # Instantiates the measurement platform
        # Commands time out by default after 60 seconds
        picked_typo_candidates = set([])
        # Visits the sites with all browsers simultaneously, in batches
        for typo_candidate in sites:
            picked_typo_candidates.add("http://" + typo_candidate)
            if len(picked_typo_candidates) % 400 == 399:
                time.sleep(10)
                manager_params, browser_params = TaskManager.load_default_params(NUM_BROWSERS)
                # Update browser configuration (use this for per-browser settings)
                for i in range(NUM_BROWSERS):
                    # Record HTTP Requests and Responses
                    browser_params[i]['http_instrument'] = True
                    # Disable flash for all three browsers
                    browser_params[i]['disable_flash'] = True
                browser_params[0]['headless'] = True  # Launch only browser 0 headless
                manager_params['data_directory'] = data_directory
                manager_params['log_directory'] = data_directory
                manager_params['run_id'] = run_id
                manager_params['database_name'] = data_base_name
                manager = TaskManager.TaskManager(manager_params, browser_params)
                for site in picked_typo_candidates:
                    command_sequence = CommandSequence.CommandSequence(site)
                    # Start by visiting the page
                    command_sequence.get(sleep=0, timeout=30)
                    # command_sequence.dump_profile_cookies(120)
                    # index='**' synchronizes visits between the three browsers
                    manager.execute_command_sequence(command_sequence, index='**')
                # Shuts down the browsers and waits for the data to finish logging
                manager.close()
                picked_typo_candidates = set([])

        # Crawl whatever remains from the final (partial) batch
        manager_params, browser_params = TaskManager.load_default_params(NUM_BROWSERS)
        # Update browser configuration (use this for per-browser settings)
        for i in range(NUM_BROWSERS):
            # Record HTTP Requests and Responses
            browser_params[i]['http_instrument'] = True
            # Disable flash for all three browsers
            browser_params[i]['disable_flash'] = True
        browser_params[0]['headless'] = True  # Launch only browser 0 headless
        manager_params['data_directory'] = data_directory
        manager_params['log_directory'] = data_directory
        manager_params['run_id'] = run_id
        manager_params['database_name'] = data_base_name
        manager = TaskManager.TaskManager(manager_params, browser_params)
        for site in picked_typo_candidates:
            command_sequence = CommandSequence.CommandSequence(site)
            # Start by visiting the page
            command_sequence.get(sleep=0, timeout=30)
            # index='**' synchronizes visits between the three browsers
            manager.execute_command_sequence(command_sequence, index='**')
        # Shuts down the browsers and waits for the data to finish logging
        manager.close()
        picked_typo_candidates = set([])
    except:
        # print ValueError
        pass
from automation import TaskManager
import tempfile
import time
import os
import copy
import json

# The list of sites that we wish to crawl
sites = ['http://www.example.com',
         'https://princeton.edu',
         'https://citp.princeton.edu/']

# Creates a temporary directory, where we will save the crawl DB
db_loc = tempfile.mkdtemp() + '/openwpm_demo.sqlite'

preferences = TaskManager.load_default_params()
browser_params = [copy.deepcopy(preferences) for i in xrange(0, 3)]

# Instantiates the measurement platform
# Launches three (non-headless) Firefox instances which log data using mitmproxy
# Commands time out by default after 60 seconds
manager = TaskManager.TaskManager(db_loc, browser_params, 3)

# Visits the sites with all browsers simultaneously, 5 seconds between visits
for site in sites:
    manager.get(site, index='**')  # ** = synchronized browsers
    time.sleep(5)

# Shuts down the browsers and waits for the data to finish logging
manager.close()
from automation import TaskManager
import tempfile
import time
import os
import copy
import json

# The list of sites that we wish to crawl
sites = ['http://www.example.com',
         'https://princeton.edu',
         'https://citp.princeton.edu/']

# Creates a temporary directory, where we will save the crawl DB
db_loc = tempfile.mkdtemp() + '/openwpm_demo.sqlite'

preferences = TaskManager.load_default_params()
browser_params = [copy.deepcopy(preferences) for i in xrange(0, 3)]

# Instantiates the measurement platform
# Launches three (non-headless) Firefox instances which log data using mitmproxy
# Commands time out by default after 60 seconds
manager = TaskManager.TaskManager(db_loc, browser_params, 3)

# Visits the sites with all browsers simultaneously, 5 seconds between visits
for site in sites:
    manager.get(site, index='**')  # ** = synchronized browsers
    time.sleep(5)

# Shuts down the browsers and waits for the data to finish logging
manager.close()
from automation import TaskManager
import time
import glob
import sys
import os

FB_USERNAME = ''
FB_PASSWORD = ''
USERS = set()
USERS.add('')

if not os.path.exists(os.path.join(os.path.dirname(__file__), '../data')):
    os.mkdir(os.path.join(os.path.dirname(__file__), '../data'))

db_loc = os.path.join(os.path.dirname(__file__), '../data/facebook.sqlite')
browser_params = TaskManager.load_default_params(1)
browser_params[0]['proxy'] = False

# don't double crawl
outdir = glob.glob('../data/fbfriends/*')
crawled = set()
for fname in outdir:
    crawled.add(fname.split('/')[-1][0:-8])
users = USERS.difference(crawled)

print "len of users to crawl: " + str(len(users))
if len(users) == 0:
    print "No users to crawl, exiting..."
    sys.exit(0)