def main():
    """Extract ad links from a saved NYT page and dump each linked page.

    Reads the saved HTML file, writes every ``data-link`` attribute value
    that contains ".html" to ``links.txt`` (one per line), then visits each
    of those links with a single OpenWPM browser and dumps the page source
    under the name ``nyt_ad_<index>``.
    """
    with open('/home/jason/Desktop/NYT/sources/html.html', 'r') as html_file:
        soup = BeautifulSoup(html_file.read(), 'lxml')

    links = []
    with open('/home/jason/Desktop/NYT/sources/links.txt', 'w') as outfile:
        for anchor in soup.find_all('a', attrs={'data-link': True}):
            if "data-link" not in anchor.attrs:
                continue
            target = anchor['data-link']
            if ".html" not in target:
                continue
            outfile.write(target)
            outfile.write("\n")
            links.append(target)

    # Crawl-wide settings: data and logs share one directory.
    manager_params, browser_params = TaskManager.load_default_params(1)
    for setting in ('data_directory', 'log_directory'):
        manager_params[setting] = '~/Desktop/NYT/analysis'

    manager = TaskManager.TaskManager(manager_params, browser_params)
    for position, url in enumerate(links):
        visit = CommandSequence.CommandSequence(url)
        visit.get(sleep=0, timeout=180)
        visit.dump_page_source("nyt_ad_" + str(position), 120)
        manager.execute_command_sequence(visit, index="**")
    manager.close()
def main():
    """Dump the page source of the links returned by ``SQL_Query``.

    Runs ``SQL_Query`` against the crawl database and, for every row whose
    URL (column 1) is NOT of the form
    ``http(s)://www.theatlantic.com/<single-segment>/``, visits the URL with
    a single OpenWPM browser and dumps the page source as ``ads<index>``.
    """
    # Raw string with escaped dots so "." matches only a literal dot.
    pattern = re.compile(r"https?://www\.theatlantic\.com/[A-Za-z0-9-]*/$")
    wpm_db = "/home/jason/Desktop/crawl-data.sqlite"

    # Fetch everything up front and always release the database handle.
    conn = sql.connect(wpm_db)
    try:
        cur = conn.cursor()
        cur.execute(SQL_Query)
        native_ad_links = cur.fetchall()
    finally:
        conn.close()

    # Loads the manager preferences and the default browser dictionaries
    manager_params, browser_params = TaskManager.load_default_params(1)

    # Update TaskManager configuration (use this for crawl-wide settings)
    manager_params['data_directory'] = '~/Desktop/analysis'
    manager_params['log_directory'] = '~/Desktop/analysis'

    manager = TaskManager.TaskManager(manager_params, browser_params)
    try:
        for idx, link in enumerate(native_ad_links):
            if pattern.match(link[1]):
                continue
            print(idx)
            print(link)
            command_sequence = CommandSequence.CommandSequence(link[1])
            command_sequence.get(sleep=0, timeout=180)
            command_sequence.dump_page_source("ads" + str(idx), 120)
            manager.execute_command_sequence(command_sequence, index="**")
    finally:
        # Shut the browsers down even if a visit raised.
        manager.close()
def init_manager(self):
    """Build the TaskManager for this run and store it on ``self.manager``.

    Loads ``self.num_browsers`` default browser configurations, draws a
    fresh set of assignments from ``self.randomized_assignments()`` (also
    appended to ``self.block_assignments``), and applies the per-browser
    settings before instantiating the manager.

    Raises:
        Exception: if constructing the TaskManager raises ``TypeError``.
    """
    manager_params, browser_params = TaskManager.load_default_params(
        self.num_browsers)

    assignments = self.randomized_assignments()
    self.block_assignments.append(assignments)

    # Per-browser settings
    for index in range(self.num_browsers):
        per_browser = browser_params[index]
        per_browser['http_instrument'] = True   # record HTTP traffic
        per_browser['disable_flash'] = False    # keep flash enabled
        per_browser['headless'] = True
        per_browser['control'] = assignments[index]

    # Crawl-wide settings
    manager_params['data_directory'] = self.data_directory
    manager_params['log_directory'] = '~/Desktop/'

    try:
        self.manager = TaskManager.TaskManager(manager_params, browser_params)
    except TypeError:
        raise Exception("Failed to start the manager")
def crawl_data(number_of_browsers = 1, exit_crawl_after = 5, slice_end = 1000000): NUM_BROWSERS = number_of_browsers SITES = ['http://' + x for x in cu.sample_top_sites( location=os.path.expanduser('~/Desktop/'), slices=[(10000, 0, 10000), (10000, 10000, slice_end)])] manager_params, browser_params = TaskManager.load_default_params(NUM_BROWSERS) for i in range(NUM_BROWSERS): browser_params[i]['cookie_instrument'] = True browser_params[i]['js_instrument'] = True browser_params[i]['save_javascript'] = True browser_params[i]['http_instrument'] = True browser_params[i]['headless'] = True browser_params[i]['disable_flash'] = False browser_params[i]['save_documents'] = True browser_params[i]['caching_disabled'] = True manager_params['data_directory'] = '~/Desktop/' manager_params['log_directory'] = '~/Desktop/' count = 0 manager = TaskManager.TaskManager(manager_params, browser_params) for site in SITES[0:exit_crawl_after]: command_sequence = CommandSequence.CommandSequence(site, reset=True) command_sequence.get(sleep=10, timeout=60) command_sequence.scroll_page() command_sequence.recursive_dump_page_source() manager.execute_command_sequence(command_sequence) count += 1 if count % 1000 == 0: print "Total crawled: ", count manager.close()
def main():
    """Dump the page source of every article link returned by ``SQL_Query``.

    Runs ``SQL_Query`` against the NYT crawl database, then visits the URL in
    column 1 of each row with a single OpenWPM browser and dumps the page
    source as ``nyt_articles_<index>``.
    """
    wpm_db = "/home/jason/Desktop/NYT/crawl-data.sqlite"

    # Fetch everything up front and always release the database handle.
    conn = sql.connect(wpm_db)
    try:
        cur = conn.cursor()
        cur.execute(SQL_Query)
        article_links = cur.fetchall()
    finally:
        conn.close()

    # Loads the manager preferences and the default browser dictionaries
    manager_params, browser_params = TaskManager.load_default_params(1)

    # Update TaskManager configuration (use this for crawl-wide settings)
    manager_params['data_directory'] = '~/Desktop/NYT/analysis'
    manager_params['log_directory'] = '~/Desktop/NYT/analysis'

    manager = TaskManager.TaskManager(manager_params, browser_params)
    try:
        for idx, link in enumerate(article_links):
            print(idx)
            print(link)
            command_sequence = CommandSequence.CommandSequence(link[1])
            command_sequence.get(sleep=0, timeout=180)
            command_sequence.dump_page_source("nyt_articles_" + str(idx), 120)
            manager.execute_command_sequence(command_sequence, index="**")
    finally:
        # Shut the browsers down even if a visit raised.
        manager.close()
def extract_via_openwpm(sites):
    """
    Visit each of the provided sites with OpenWPM so that it records the
    browser data for them.

    A single browser is used, with flash left enabled. Data and logs go to
    ``~/Desktop/``.
    """
    print('########## OpenWPM (start) (Englehardt, 2016) ##########')

    # Only one browser is needed for the extraction
    browser_count = 1
    manager_params, browser_params = TaskManager.load_default_params(
        browser_count)

    # Per-browser settings: keep flash enabled on every browser
    for index in range(browser_count):
        browser_params[index]['disable_flash'] = False

    # Crawl-wide settings
    for setting in ('data_directory', 'log_directory'):
        manager_params[setting] = '~/Desktop/'

    # Start the measurement platform and visit each site
    # ('**' = synchronized across browsers)
    manager = TaskManager.TaskManager(manager_params, browser_params)
    for site in sites:
        manager.get(site, index='**')

    # Shut down the browsers and wait for logging to finish
    manager.close()

    print('########## OpenWPM (end) (Englehardt, 2016) ##########')
def dump_crawl(sites,profile_name): #os.system('sudo sh -c "sync; echo 1 > /proc/sys/vm/drop_caches"') # The list of sites that we wish to crawl print sites,profile_name NUM_BROWSERS = 1 #3 # Loads the manager preference and 3 copies of the default browser dictionaries manager_params, browser_params = TaskManager.load_default_params(NUM_BROWSERS) # Update browser configuration (use this for per-browser settings) for i in range(NUM_BROWSERS): # Record HTTP Requests and Responses #browser_params[i]['http_instrument'] = True # Enable flash for all three browsers browser_params[i]['disable_flash'] = True browser_params[i]['headless'] = True # Launch all and not only browser 0 headless browser_params[i]['js_instrument'] = True # browser_params[i]['save_javascript'] = True #browser_params[i]['random_attributes']=True browser_params[i]['cookie_instrument']=True # browser_params[i]['cp_instrument']=True # browser_params[i]['save_all_content']=True if 'load_name' in locals(): browser_params[i]['profile_tar']=load_name browser_params[i]['profile_archive_dir']="/home/ubuntu/personas/"+profile_name # Update TaskManager configuration (use this for crawl-wide settings) manager_params['data_directory'] = '~/OpenWPM/' manager_params['log_directory'] = '~/OpenWPM/' manager_params['database_name']= "persona.sqlite" # Instantiates the measurement platform # Commands time out by default after 60 seconds manager = TaskManager.TaskManager(manager_params, browser_params) # Visits the sites with all browsers simultaneously for i in range(0,len(sites)): print sites[i] site=sites[i] command_sequence = CommandSequence.CommandSequence(site) # Start by visiting the page command_sequence.get(sleep=0, timeout=300) # index='**' synchronizes visits between the three browsers #command_sequence.dump_profile_cookies(120) #command_sequence.dump_profile(dump_folder="~/personas/", close_webdriver=True) manager.execute_command_sequence(command_sequence,(i%NUM_BROWSERS)) time.sleep(2) # 
dump_profile_cookies/dump_flash_cookies closes the current tab. # dump stores history last cookies/sites only stored # os.system('sudo sh -c "sync; echo 1 > /proc/sys/vm/drop_caches"') #command_sequence.dump_profile_cookies(120) #command_sequence.dump_profile(dump_folder="~/personas/"+profile_name, closer_webdriver=True, compress, timeout) # Shuts down the browsers and waits for the data to finish logging manager.close()
def callWPM(NUM_BROWSERS, siteslist):
    """Visit every site in *siteslist* with ``NUM_BROWSERS`` synchronized browsers.

    Args:
        NUM_BROWSERS: number of browser configurations to load and launch.
        siteslist: iterable of URLs; each is visited with a 10 second timeout
            on all browsers at once (``index='**'``).

    Every browser records HTTP traffic, keeps flash enabled and runs
    headless. Data and logs are written to ``../database/requestdata2/``.
    """
    print("Thread-----------thread-------------thread-----")
    manager_params, browser_params = TaskManager.load_default_params(NUM_BROWSERS)

    # Configure every browser, not just browser 0, so that all of the
    # synchronized browsers behave the same way.
    for i in range(NUM_BROWSERS):
        browser_params[i]['http_instrument'] = True
        browser_params[i]['disable_flash'] = False
        browser_params[i]['headless'] = True

    manager_params['data_directory'] = '../database/requestdata2/'
    manager_params['log_directory'] = '../database/requestdata2/'

    manager = TaskManager.TaskManager(manager_params, browser_params)
    try:
        for site in siteslist:
            command_sequence = CommandSequence.CommandSequence(site, reset=True)
            command_sequence.get(sleep=0, timeout=10)
            manager.execute_command_sequence(command_sequence, index='**')
    finally:
        # Shut the browsers down even if a visit raised.
        manager.close()
def run_demo(url):
    """Visit a single URL with one instrumented browser and dump its cookies.

    Args:
        url: the page to visit; it is converted with ``str()`` first.

    HTTP and JS instrumentation are enabled and flash is left on. The
    browser runs headless unless ``platform`` is 'darwin'. Data is written
    to ``feature_extraction/`` and logs to ``~/Desktop/``.
    """
    browser_count = 1
    target = str(url)

    manager_params, browser_params = TaskManager.load_default_params(
        browser_count)

    # Per-browser settings
    for index in range(browser_count):
        params = browser_params[index]
        params['http_instrument'] = True
        params['disable_flash'] = False
        params['js_instrument'] = True
    if platform != 'darwin':
        browser_params[0]['headless'] = True

    # Crawl-wide settings
    manager_params['data_directory'] = 'feature_extraction/'
    manager_params['log_directory'] = '~/Desktop/'

    # Commands time out by default after 60 seconds
    manager = TaskManager.TaskManager(manager_params, browser_params)

    visit = CommandSequence.CommandSequence(target)
    visit.get(sleep=0, timeout=60)
    # dump_profile_cookies/dump_flash_cookies closes the current tab.
    visit.dump_profile_cookies(120)
    # index='**' synchronizes the visit across all browsers
    manager.execute_command_sequence(visit, index='**')

    # Shuts down the browsers and waits for the data to finish logging
    manager.close()
from automation import CommandSequence, TaskManager
import pandas as pd

# Number of browsers to configure
NUM_BROWSERS = 3

# First 100 rows of the site list; only column 1 of the CSV is kept
top100_df = pd.read_csv('top-1m.csv', header=None, nrows=100, usecols=[1])

# Default manager params plus NUM_BROWSERS copies of the default browser params
manager_params, browser_params = TaskManager.load_default_params(NUM_BROWSERS)

# Per-browser settings, identical for every browser
for i in range(NUM_BROWSERS):
    settings = browser_params[i]
    # Instrumentation that is switched on: HTTP, cookies, JS Web API calls
    settings['http_instrument'] = True
    settings['cookie_instrument'] = True
    settings['js_instrument'] = True
    # True for ad blocking mode, False for vanilla mode
    settings['ublock-origin'] = False
    # Instrumentation that is switched off: WebRequest callstacks, navigations
    settings['callstack_instrument'] = False
    settings['navigation_instrument'] = False
    # NOTE(review): the key contains a space ('display mode'); confirm this
    # is the key the browser configuration actually reads.
    settings['display mode'] = 'headful'
import sys
from automation import TaskManager, CommandSequence

# Load the list of sites we wish to crawl, one URL per line
# E.g. ['http://www.example.com', 'http://dataskydd.net']
# The file is closed as soon as it has been read, and blank lines are skipped
# so that they do not turn into visits to an empty URL.
with open('municipalities_final_urls.txt') as url_file:
    sites = [line.rstrip('\n') for line in url_file if line.strip()]

manager_params, browser_params = TaskManager.load_default_params(1)

browser_params[0]['headless'] = True  # Launch browser headless
browser_params[0]['http_instrument'] = True  # Record HTTP Requests and Responses
# Records both JS cookies and HTTP response cookies to javascript_cookies
browser_params[0]['cookie_instrument'] = True

manager_params['data_directory'] = './data/'
manager_params['log_directory'] = './data/'

manager = TaskManager.TaskManager(manager_params, browser_params)

try:
    for site in sites:
        command_sequence = CommandSequence.CommandSequence(site, reset=True)
        command_sequence.browse(num_links=5, sleep=10, timeout=360)
        command_sequence.dump_profile_cookies(120)
        manager.execute_command_sequence(command_sequence, index='**')
finally:
    # Shut the browser down even if a visit raised.
    manager.close()
from automation import TaskManager, CommandSequence

# The list of sites that we wish to crawl
NUM_BROWSERS = 3
sites = ['http://www.example.com',
         'http://www.princeton.edu',
         'http://citp.princeton.edu/']

# Loads the manager preferences and NUM_BROWSERS copies of the default
# browser dictionaries
manager_params, browser_params = TaskManager.load_default_params(NUM_BROWSERS)

# Update browser configuration (use this for per-browser settings)
for i in range(NUM_BROWSERS):
    browser_params[i]['disable_flash'] = False  # Enable flash for all browsers
browser_params[0]['headless'] = True  # Launch only browser 0 headless

# Update TaskManager configuration (use this for crawl-wide settings)
manager_params['data_directory'] = '~/Desktop/'
manager_params['log_directory'] = '~/Desktop/'

# Instantiates the measurement platform
# Commands time out by default after 60 seconds
manager = TaskManager.TaskManager(manager_params, browser_params)

# Visits the sites with all browsers simultaneously
for site in sites:
    command_sequence = CommandSequence.CommandSequence(site)
    command_sequence.get(sleep=0, timeout=60)
    command_sequence.dump_profile_cookies(120)
    manager.execute_command_sequence(command_sequence, index='**')  # ** = synchronized browsers

# Shuts down the browsers and waits for the data to finish logging.
# Without this the launched browsers are never shut down.
manager.close()
#!/usr/bin/python
from automation import CommandSequence, TaskManager


def run_custom_function(**kwargs):
    """Print the title of the page held by the ``driver`` keyword argument."""
    print("Title: %s" % kwargs['driver'].title)


if __name__ == "__main__":
    url_list = ["https://google.com"]

    # One browser, default configuration
    manager_params, browser_params = TaskManager.load_default_params(1)
    manager = TaskManager.TaskManager(manager_params, browser_params)

    for url in url_list:
        sequence = CommandSequence.CommandSequence(url)
        sequence.get(sleep=10, timeout=60)
        # Queue the custom function to run after the page load
        sequence.run_custom_function(run_custom_function)
        manager.execute_command_sequence(sequence)

    manager.close()
def analyze_sites(sites):
    """Crawl *sites* with two fully instrumented browsers, saving to S3.

    Args:
        sites: iterable of URLs to visit, e.g.
            ["https://www.cnn.com", "https://www.tufts.edu"].

    HTTP, cookie, navigation, JS, callstack and DNS instrumentation are
    enabled on every browser; only browser 0 is set to headless. Output
    format is 's3' (bucket 'ihavenothingtohide', directory '2020-2').
    A line is printed when each command sequence reports completion.
    """
    browser_count = 2

    # Default manager params and browser_count copies of the browser params
    manager_params, browser_params = TaskManager.load_default_params(
        browser_count)

    # Per-browser settings: switch on every instrument
    instruments = (
        "http_instrument",
        "cookie_instrument",
        "navigation_instrument",
        "js_instrument",
        "callstack_instrument",
        "dns_instrument",
    )
    for index in range(browser_count):
        for instrument in instruments:
            browser_params[index][instrument] = True

    # Launch only browser 0 headless
    browser_params[0]["display_mode"] = "headless"

    # Crawl-wide settings
    manager_params["data_directory"] = "~/Desktop/testing/"
    manager_params["log_directory"] = "~/Desktop/testing/"
    manager_params['output_format'] = 's3'
    manager_params['s3_bucket'] = 'ihavenothingtohide'
    manager_params['s3_directory'] = '2020-2'

    # Commands time out by default after 60 seconds
    manager = TaskManager.TaskManager(manager_params, browser_params)

    for site in sites:
        # Bind the current site as a default so each callback reports its own
        def _report_done(success, val=site):
            print("CommandSequence {} done".format(val))

        visit = CommandSequence.CommandSequence(
            site,
            reset=True,
            callback=_report_done,
        )
        visit.get(sleep=3, timeout=60)
        # No index given: the manager picks the browser (simple parallelization)
        manager.execute_command_sequence(visit)

    # Shuts down the browsers and waits for the data to finish logging
    manager.close()
import tempfile
import time
import os
import copy
import json

# Sites to crawl
sites = [
    'http://www.example.com',
    'https://princeton.edu',
    'https://citp.princeton.edu/',
]

# The crawl DB lives in a freshly created temporary directory
db_loc = tempfile.mkdtemp() + '/openwpm_demo.sqlite'

# One independent copy of the default preferences per browser (three in total)
preferences = TaskManager.load_default_params()
browser_params = [copy.deepcopy(preferences) for i in range(3)]

# Start the measurement platform with three browsers
# Commands time out by default after 60 seconds
manager = TaskManager.TaskManager(db_loc, browser_params, 3)

# Visit each site on all browsers at once ('**' = synchronized browsers),
# pausing 5 seconds between visits
for site in sites:
    manager.get(site, index='**')
    time.sleep(5)

# Shut down the browsers and wait for the data to finish logging
manager.close()
def run_openwpm(sites, data_directory, run_id, data_base_name): """ run OpenWPM fromework for passed sites and other parameters to gather data in data_base_name db """ print 'number of passed typo candidates ', len(sites) NUM_BROWSERS = 3 try: print data_directory print run_id # Instantiates the measurement platform # Commands time out by default after 60 seconds picked_typo_candidates = set([]) # Visits the sites with all browsers simultaneously for typo_candidate in sites: picked_typo_candidates.add("http://" + typo_candidate) if len(picked_typo_candidates) % 400 == 399: time.sleep(10) manager_params, browser_params = TaskManager.load_default_params( NUM_BROWSERS) # Update browser configuration (use this for per-browser settings) for i in range(NUM_BROWSERS): # Record HTTP Requests and Responses browser_params[i]['http_instrument'] = True # Enable flash for all three browsers browser_params[i]['disable_flash'] = True browser_params[0][ 'headless'] = True # Launch only browser 0 headless manager_params['data_directory'] = data_directory manager_params['log_directory'] = data_directory manager_params['run_id'] = run_id manager_params['database_name'] = data_base_name manager = TaskManager.TaskManager(manager_params, browser_params) for site in picked_typo_candidates: command_sequence = CommandSequence.CommandSequence(site) # Start by visiting the page command_sequence.get(sleep=0, timeout=30) # dump_profile_cookies/dump_flash_cookies closes the current tab. 
#command_sequence.dump_profile_cookies(120) # index='**' synchronizes visits between the three browsers manager.execute_command_sequence(command_sequence, index='**') # Shuts down the browsers and waits for the data to finish logging manager.close() picked_typo_candidates = set([]) manager_params, browser_params = TaskManager.load_default_params( NUM_BROWSERS) # Update browser configuration (use this for per-browser settings) for i in range(NUM_BROWSERS): # Record HTTP Requests and Responses browser_params[i]['http_instrument'] = True # Enable flash for all three browsers browser_params[i]['disable_flash'] = True browser_params[0][ 'headless'] = True # Launch only browser 0 headless manager_params['data_directory'] = data_directory manager_params['log_directory'] = data_directory manager_params['run_id'] = run_id manager_params['database_name'] = data_base_name manager = TaskManager.TaskManager(manager_params, browser_params) for site in picked_typo_candidates: command_sequence = CommandSequence.CommandSequence(site) # Start by visiting the page command_sequence.get(sleep=0, timeout=30) # dump_profile_cookies/dump_flash_cookies closes the current tab. #command_sequence.dump_profile_cookies(120) # index='**' synchronizes visits between the three browsers manager.execute_command_sequence(command_sequence, index='**') # Shuts down the browsers and waits for the data to finish logging manager.close() picked_typo_candidates = set([]) except: #print ValueError pass
from automation import TaskManager
import tempfile
import time
import os
import copy
import json

# Sites to crawl
sites = [
    'http://www.example.com',
    'https://princeton.edu',
    'https://citp.princeton.edu/',
]

# The crawl DB lives in a freshly created temporary directory
db_loc = tempfile.mkdtemp() + '/openwpm_demo.sqlite'

# One independent copy of the default preferences per browser (three in total)
preferences = TaskManager.load_default_params()
browser_params = [copy.deepcopy(preferences) for i in range(3)]

# Start the measurement platform with three browsers
# Commands time out by default after 60 seconds
manager = TaskManager.TaskManager(db_loc, browser_params, 3)

# Visit each site on all browsers at once ('**' = synchronized browsers),
# pausing 5 seconds between visits
for site in sites:
    manager.get(site, index='**')
    time.sleep(5)

# Shuts down the browsers and waits for the data to finish logging