Example #1
def main():
    """Collect article links from a saved HTML page and dump each page's source.

    Reads the saved page, keeps every ``data-link`` attribute value that
    contains ``.html``, writes those values to ``links.txt`` and then visits
    each one with OpenWPM, dumping the page source as ``nyt_ad_<idx>``.
    """
    with open('/home/jason/Desktop/NYT/sources/html.html', 'r') as myfile:
        soup = BeautifulSoup(myfile.read(), 'lxml')

    links = []
    with open('/home/jason/Desktop/NYT/sources/links.txt', 'w') as outfile:
        # attrs={'data-link': True} already restricts the result to anchors
        # carrying the attribute, so no separate membership test is needed.
        for item in soup.find_all('a', attrs={'data-link': True}):
            link = item['data-link']
            if ".html" in link:
                # NOTE(review): the body of this branch was missing in the
                # original. Recording the link in the list and in the file
                # (one per line) is a reconstruction -- confirm the format.
                links.append(link)
                outfile.write(link + "\n")

    # Go and dump the source for each
    manager_params, browser_params = TaskManager.load_default_params(1)

    # Update TaskManager configuration (use this for crawl-wide settings)
    manager_params['data_directory'] = '~/Desktop/NYT/analysis'
    manager_params['log_directory'] = '~/Desktop/NYT/analysis'
    manager = TaskManager.TaskManager(manager_params, browser_params)

    for idx, link in enumerate(links):
        command_sequence = CommandSequence.CommandSequence(link)
        command_sequence.get(sleep=0, timeout=180)
        command_sequence.dump_page_source("nyt_ad_" + str(idx), 120)
        manager.execute_command_sequence(command_sequence, index="**")

    # Shut the browsers down and wait for the data to finish logging
    manager.close()

Example #2
def main():
    pattern = re.compile("https?://www.theatlantic.com/[A-Za-z0-9-]*/$")

    wpm_db = "/home/jason/Desktop/crawl-data.sqlite"
    conn = sql.connect(wpm_db)
    cur = conn.cursor()
    native_ad_links = cur.fetchall()

    # Loads the manager preference and sthe default browser dictionaries
    manager_params, browser_params = TaskManager.load_default_params(1)

    # Update TaskManager configuration (use this for crawl-wide settings)
    manager_params['data_directory'] = '~/Desktop/analysis'
    manager_params['log_directory'] = '~/Desktop/analysis'
    manager = TaskManager.TaskManager(manager_params, browser_params)

    for idx, link in enumerate(native_ad_links):
        if not pattern.match(link[1]):
            print idx
            print link
            command_sequence = CommandSequence.CommandSequence(link[1])
            command_sequence.get(sleep=0, timeout=180)
            command_sequence.dump_page_source("ads" + str(idx), 120)
            manager.execute_command_sequence(command_sequence, index="**")

Example #3
    def init_manager(self):
        """Build the OpenWPM TaskManager for this crawl and store it on ``self.manager``.

        Every browser records HTTP traffic, runs headless with flash enabled,
        and receives its ``control`` value from ``self.randomized_assignments()``.

        Raises:
            Exception: if constructing the TaskManager raises ``TypeError``.
        """
        # Loads the manager preference and self.num_browsers copies of the
        # default browser dictionaries
        manager_params, browser_params = TaskManager.load_default_params(
            self.num_browsers)
        assignments = self.randomized_assignments()

        # Update browser configuration (use this for per-browser settings)
        for i in range(self.num_browsers):
            # Record HTTP Requests and Responses
            browser_params[i]['http_instrument'] = True
            # Enable flash for every browser
            browser_params[i]['disable_flash'] = False
            browser_params[i]['headless'] = True
            browser_params[i]['control'] = assignments[i]

        # Update TaskManager configuration (use this for crawl-wide settings)
        manager_params['data_directory'] = self.data_directory
        manager_params['log_directory'] = '~/Desktop/'

        # Instantiates the measurement platform
        # Commands time out by default after 60 seconds
        try:
            manager = TaskManager.TaskManager(manager_params, browser_params)
        except TypeError:
            raise Exception("Failed to start the manager")
        self.manager = manager
def crawl_data(number_of_browsers = 1, exit_crawl_after = 5, slice_end = 1000000):
    NUM_BROWSERS = number_of_browsers
    SITES = ['http://' + x for x in cu.sample_top_sites(
                                slices=[(10000, 0, 10000), (10000, 10000, slice_end)])]

    manager_params, browser_params = TaskManager.load_default_params(NUM_BROWSERS)

    for i in range(NUM_BROWSERS):
        browser_params[i]['cookie_instrument'] = True
        browser_params[i]['js_instrument'] = True
        browser_params[i]['save_javascript'] = True
        browser_params[i]['http_instrument'] = True
        browser_params[i]['headless'] = True
        browser_params[i]['disable_flash'] = False
        browser_params[i]['save_documents'] = True
        browser_params[i]['caching_disabled'] = True

    manager_params['data_directory'] = '~/Desktop/'
    manager_params['log_directory'] = '~/Desktop/'

    count = 0
    manager = TaskManager.TaskManager(manager_params, browser_params)

    for site in SITES[0:exit_crawl_after]:
        command_sequence = CommandSequence.CommandSequence(site, reset=True)
        command_sequence.get(sleep=10, timeout=60)
        count += 1
        if count % 1000 == 0:
            print "Total crawled: ", count
Example #5
def main():

    wpm_db = "/home/jason/Desktop/NYT/crawl-data.sqlite"
    conn = sql.connect(wpm_db)
    cur = conn.cursor()
    article_links = cur.fetchall()

    # Loads the manager preference and sthe default browser dictionaries
    manager_params, browser_params = TaskManager.load_default_params(1)

    # Update TaskManager configuration (use this for crawl-wide settings)
    manager_params['data_directory'] = '~/Desktop/NYT/analysis'
    manager_params['log_directory'] = '~/Desktop/NYT/analysis'
    manager = TaskManager.TaskManager(manager_params, browser_params)

    for idx, link in enumerate(article_links):
        print idx
        print link
        command_sequence = CommandSequence.CommandSequence(link[1])
        command_sequence.get(sleep=0, timeout=180)
        command_sequence.dump_page_source("nyt_articles_" + str(idx), 120)
        manager.execute_command_sequence(command_sequence, index="**")

Example #6
def extract_via_openwpm(sites):
    Utilise the OpenWPM package to extract the browser parameters for 
    provided sites

    print '########## OpenWPM (start) (Englehardt, 2016) ##########'
    # The number of browsers to use to extract the data
    num_of_browsers = 1

    # Loads the manager preference and 3 copies of the default browser dictionaries
    manager_params, browser_params = TaskManager.load_default_params(

    # Update browser configuration (use this for per-browser settings)
    for i in xrange(num_of_browsers):
            'disable_flash'] = False  # Enable flash for all three browsers
    # browser_params[0]['headless'] = True #Launch only browser 0 headless

    # Update TaskManager configuration (use this for crawl-wide settings)
    manager_params['data_directory'] = '~/Desktop/'
    manager_params['log_directory'] = '~/Desktop/'

    # Instantiates the measurement platform
    # Commands time out by default after 60 seconds
    manager = TaskManager.TaskManager(manager_params, browser_params)

    # Visit the sites
    for site in sites:
        manager.get(site, index='**')  # ** = synchronized browsers

    # Shuts down the browsers and waits for the data to finish logging
    print '########## OpenWPM (end) (Englehardt, 2016) ##########'
Example #7
def dump_crawl(sites,profile_name):
    #os.system('sudo sh -c "sync; echo 1 > /proc/sys/vm/drop_caches"')
    # The list of sites that we wish to crawl
    print sites,profile_name
    NUM_BROWSERS = 1 #3
    # Loads the manager preference and 3 copies of the default browser dictionaries
    manager_params, browser_params = TaskManager.load_default_params(NUM_BROWSERS)

    # Update browser configuration (use this for per-browser settings)
    for i in range(NUM_BROWSERS):
        # Record HTTP Requests and Responses
        #browser_params[i]['http_instrument'] = True
        # Enable flash for all three browsers
        browser_params[i]['disable_flash'] = True
        browser_params[i]['headless'] = True  # Launch all and not only browser 0 headless
        browser_params[i]['js_instrument'] = True
#        browser_params[i]['save_javascript'] = True
     #   browser_params[i]['cp_instrument']=True
#        browser_params[i]['save_all_content']=True
        if 'load_name' in locals():
    # Update TaskManager configuration (use this for crawl-wide settings)
    manager_params['data_directory'] = '~/OpenWPM/'
    manager_params['log_directory'] = '~/OpenWPM/'
    manager_params['database_name']= "persona.sqlite"


    # Instantiates the measurement platform
    # Commands time out by default after 60 seconds
    manager = TaskManager.TaskManager(manager_params, browser_params)

    # Visits the sites with all browsers simultaneously
    for i in range(0,len(sites)):
        print sites[i]
        command_sequence = CommandSequence.CommandSequence(site)
        # Start by visiting the page
        command_sequence.get(sleep=0, timeout=300)
        # index='**' synchronizes visits between the three browsers
    	#command_sequence.dump_profile(dump_folder="~/personas/", close_webdriver=True)
    # dump_profile_cookies/dump_flash_cookies closes the current tab.
    # dump stores history last cookies/sites only stored 
    #    os.system('sudo sh -c "sync; echo 1 > /proc/sys/vm/drop_caches"')
    #command_sequence.dump_profile(dump_folder="~/personas/"+profile_name, closer_webdriver=True, compress, timeout)
	# Shuts down the browsers and waits for the data to finish logging
Example #8
def callWPM(NUM_BROWSERS, siteslist):
    """Visit every URL in ``siteslist`` through OpenWPM.

    Default parameters are loaded for ``NUM_BROWSERS`` browsers, but only
    browser 0 is reconfigured (HTTP instrumentation on, flash enabled,
    headless). Data and logs both go to ``../database/requestdata2/``.
    """
    manager_cfg, browser_cfgs = TaskManager.load_default_params(NUM_BROWSERS)

    # Per-browser settings for the first browser only
    first_browser = browser_cfgs[0]
    first_browser['http_instrument'] = True
    first_browser['disable_flash'] = False
    first_browser['headless'] = True

    # Crawl-wide settings
    manager_cfg['data_directory'] = '../database/requestdata2/'
    manager_cfg['log_directory'] = '../database/requestdata2/'

    crawler = TaskManager.TaskManager(manager_cfg, browser_cfgs)
    for url in siteslist:
        visit = CommandSequence.CommandSequence(url, reset=True)
        visit.get(sleep=0, timeout=10)
        crawler.execute_command_sequence(visit, index='**')
Example #9
def run_demo(url):
    """Visit a single URL with HTTP- and JS-instrumented OpenWPM browsers.

    Args:
        url: the page to visit; it is converted with ``str()`` first.
    """
    sites = [str(url)]

    # Loads the manager preference and NUM_BROWSERS copies of the default
    # browser dictionaries
    manager_params, browser_params = TaskManager.load_default_params(
        NUM_BROWSERS)

    # Update browser configuration (use this for per-browser settings)
    for i in range(NUM_BROWSERS):
        # Record HTTP Requests and Responses
        browser_params[i]['http_instrument'] = True
        # Enable flash for every browser
        browser_params[i]['disable_flash'] = False
        browser_params[i]['js_instrument'] = True
    if platform != 'darwin':
        browser_params[0]['headless'] = True  # Launch only browser 0 headless

    # Update TaskManager configuration (use this for crawl-wide settings)
    manager_params['data_directory'] = 'feature_extraction/'
    manager_params['log_directory'] = '~/Desktop/'

    # Instantiates the measurement platform
    # Commands time out by default after 60 seconds
    manager = TaskManager.TaskManager(manager_params, browser_params)

    # Visits the sites with all browsers simultaneously
    for site in sites:
        command_sequence = CommandSequence.CommandSequence(site)

        # Start by visiting the page
        command_sequence.get(sleep=0, timeout=60)

        # index='**' synchronizes visits between the browsers
        manager.execute_command_sequence(command_sequence, index='**')

    # Shuts down the browsers and waits for the data to finish logging
    manager.close()
Example #10
from automation import CommandSequence, TaskManager
import pandas as pd

# Declare constants
# NOTE(review): NUM_BROWSERS is used below but no definition is visible in
# this snippet -- confirm it is declared before this point.

# Load a pd dataframe using the given csv file
top100_df = pd.read_csv('top-1m.csv', header=None, nrows=100, usecols=[1])

# Load the default manager params and NUM_BROWSER copies of the default browser params
manager_params, browser_params = TaskManager.load_default_params(NUM_BROWSERS)

# Update browser configuration (use this for per-browser settings)
for i in range(NUM_BROWSERS):
    # Record HTTP Requests and Responses
    browser_params[i]['http_instrument'] = True
    # Record cookie changes
    browser_params[i]['cookie_instrument'] = True
    # Record JS Web API calls
    browser_params[i]['js_instrument'] = True

    # Set true for ad blocking mode. Set false for vanilla mode.
    browser_params[i]['ublock-origin'] = False

    # Do not record the callstack of all WebRequests made
    browser_params[i]['callstack_instrument'] = False
    # Do not record Navigations
    browser_params[i]['navigation_instrument'] = False

    # Run with a visible browser window. The original wrote the key as
    # 'display mode' (with a space) and the value as 'headful'.
    # NOTE(review): OpenWPM's documented key is 'display_mode' with values
    # 'native', 'headless' or 'xvfb' -- confirm against the OpenWPM version
    # in use.
    browser_params[i]['display_mode'] = 'native'
import sys
from automation import TaskManager, CommandSequence

# Load the list of sites we wish to crawl, one URL per line
# E.g. ['http://www.example.com', 'http://dataskydd.net']
# The with-block closes the file handle once the list is built.
with open('municipalities_final_urls.txt') as sites_file:
    sites = [line.rstrip('\n') for line in sites_file]

manager_params, browser_params = TaskManager.load_default_params(1)

browser_params[0]['headless'] = True #Launch browser headless
browser_params[0]['http_instrument'] = True # Record HTTP Requests and Responses
browser_params[0]['cookie_instrument'] = True # Records both JS cookies and HTTP response cookies to javascript_cookies

manager_params['data_directory'] = './data/'
manager_params['log_directory'] = './data/'

manager = TaskManager.TaskManager(manager_params, browser_params)

for site in sites:
    command_sequence = CommandSequence.CommandSequence(site, reset=True)
    command_sequence.browse(num_links=5, sleep=10, timeout=360)
    manager.execute_command_sequence(command_sequence, index='**')

# Shut the browser down and wait for the data to finish logging
manager.close()

Example #12
from automation import TaskManager, CommandSequence

# Number of browser instances to launch.
# NOTE(review): not defined in the original snippet; 3 is taken from the
# "3 copies" / "all three browsers" comments below -- confirm.
NUM_BROWSERS = 3

# The list of sites that we wish to crawl
# NOTE(review): the original list literal was cut off after its first entry;
# restore any further sites that were lost.
sites = ['http://www.example.com']

# Loads the manager preference and 3 copies of the default browser dictionaries
manager_params, browser_params = TaskManager.load_default_params(NUM_BROWSERS)

# Update browser configuration (use this for per-browser settings)
for i in range(NUM_BROWSERS):
    browser_params[i]['disable_flash'] = False #Enable flash for all three browsers
browser_params[0]['headless'] = True #Launch only browser 0 headless

# Update TaskManager configuration (use this for crawl-wide settings)
manager_params['data_directory'] = '~/Desktop/'
manager_params['log_directory'] = '~/Desktop/'

# Instantiates the measurement platform
# Commands time out by default after 60 seconds
manager = TaskManager.TaskManager(manager_params, browser_params)

# Visits the sites with all browsers simultaneously
for site in sites:
    command_sequence = CommandSequence.CommandSequence(site)
    command_sequence.get(sleep=0, timeout=60)
    manager.execute_command_sequence(command_sequence, index='**') # ** = synchronized browsers

# Shuts down the browsers and waits for the data to finish logging
manager.close()
Example #13

from automation import CommandSequence, TaskManager

def run_custom_function(**kwargs):
    """Print the title of the page held by the ``driver`` keyword argument.

    Raises:
        KeyError: if no ``driver`` keyword argument is supplied.
    """
    page_title = kwargs['driver'].title
    print("Title: %s" % page_title)

if __name__ == "__main__":
    url_list = ["https://google.com"]

    manager_params, browser_params = TaskManager.load_default_params(1)
    manager = TaskManager.TaskManager(manager_params, browser_params)

    for URL in url_list:
        cs = CommandSequence.CommandSequence(URL)
        cs.get(sleep=10, timeout=60)
        # The original built the sequence but never registered
        # run_custom_function or submitted the sequence, so nothing ran.
        # NOTE(review): confirm the run_custom_function signature against
        # the OpenWPM version in use.
        cs.run_custom_function(run_custom_function)
        manager.execute_command_sequence(cs)

    # Shut the browser down and wait for the data to finish logging
    manager.close()

Example #14
def analyze_sites(sites):
    """Crawl ``sites`` with fully instrumented browsers, storing output on S3.

    Args:
        sites: iterable of URLs to visit, e.g.
            ``["https://www.cnn.com", "https://www.tufts.edu"]``.
    """
    # Loads the default manager params
    # and NUM_BROWSERS copies of the default browser params
    manager_params, browser_params = TaskManager.load_default_params(
        NUM_BROWSERS)

    # Update browser configuration (use this for per-browser settings)
    for i in range(NUM_BROWSERS):
        # Record HTTP Requests and Responses
        browser_params[i]["http_instrument"] = True
        # Record cookie changes
        browser_params[i]["cookie_instrument"] = True
        # Record Navigations
        browser_params[i]["navigation_instrument"] = True
        # Record JS Web API calls
        browser_params[i]["js_instrument"] = True
        # Record the callstack of all WebRequests made
        browser_params[i]["callstack_instrument"] = True
        # Record DNS resolution
        browser_params[i]["dns_instrument"] = True

    # Launch only browser 0 headless
    browser_params[0]["display_mode"] = "headless"

    # Update TaskManager configuration (use this for crawl-wide settings)
    manager_params["data_directory"] = "~/Desktop/testing/"
    manager_params["log_directory"] = "~/Desktop/testing/"

    manager_params['output_format'] = 's3'
    manager_params['s3_bucket'] = 'ihavenothingtohide'
    manager_params['s3_directory'] = '2020-2'

    # Instantiates the measurement platform
    # Commands time out by default after 60 seconds
    manager = TaskManager.TaskManager(manager_params, browser_params)

    # Visits the sites
    for site in sites:

        # Parallelize sites over all number of browsers set above.
        # NOTE(review): the original call was cut off; the positional and
        # reset arguments and the .format(val) tail are reconstructed --
        # confirm. val=site binds the current URL at definition time, so
        # each callback reports its own site.
        command_sequence = CommandSequence.CommandSequence(
            site, reset=True,
            callback=lambda success, val=site: print(
                "CommandSequence {} done".format(val)))

        # Start by visiting the page
        command_sequence.get(sleep=3, timeout=60)

        # Run commands across the browsers (simple parallelization)
        manager.execute_command_sequence(command_sequence)

    # Shuts down the browsers and waits for the data to finish logging
    manager.close()
Example #15
import tempfile
import time
import os
import copy
import json

# The list of sites that we wish to crawl
# NOTE(review): the original list literal was cut off after these entries;
# restore any further sites that were lost.
sites = [
    'http://www.example.com', 'https://princeton.edu',
]

# Creates a temporary directory, where we will save the crawl DB
db_loc = os.path.join(tempfile.mkdtemp(), 'openwpm_demo.sqlite')

preferences = TaskManager.load_default_params()

# One independent copy of the default preferences per browser
browser_params = [copy.deepcopy(preferences) for i in range(3)]

# Instantiates the measurement platform
# Launches three Firefox instances
# Commands time out by default after 60 seconds
manager = TaskManager.TaskManager(db_loc, browser_params, 3)

# Visits the sites with all browsers simultaneously
# NOTE(review): the original comment promised 5 seconds between visits, but
# no delay is present in the loop -- confirm whether one is needed.
for site in sites:
    manager.get(site, index='**')  # ** = synchronized browsers

# Shuts down the browsers and waits for the data to finish logging
manager.close()
Example #16
def run_openwpm(sites, data_directory, run_id, data_base_name):
    run OpenWPM fromework for passed sites and other parameters to gather data in data_base_name db
    print 'number of passed typo candidates ', len(sites)

        print data_directory
        print run_id

        # Instantiates the measurement platform
        # Commands time out by default after 60 seconds
        picked_typo_candidates = set([])
        # Visits the sites with all browsers simultaneously
        for typo_candidate in sites:
            picked_typo_candidates.add("http://" + typo_candidate)
            if len(picked_typo_candidates) % 400 == 399:
                manager_params, browser_params = TaskManager.load_default_params(

                # Update browser configuration (use this for per-browser settings)
                for i in range(NUM_BROWSERS):
                    # Record HTTP Requests and Responses
                    browser_params[i]['http_instrument'] = True
                    # Enable flash for all three browsers
                    browser_params[i]['disable_flash'] = True
                        'headless'] = True  # Launch only browser 0 headless
                manager_params['data_directory'] = data_directory
                manager_params['log_directory'] = data_directory
                manager_params['run_id'] = run_id
                manager_params['database_name'] = data_base_name
                manager = TaskManager.TaskManager(manager_params,
                for site in picked_typo_candidates:
                    command_sequence = CommandSequence.CommandSequence(site)

                    # Start by visiting the page
                    command_sequence.get(sleep=0, timeout=30)

                    # dump_profile_cookies/dump_flash_cookies closes the current tab.

                    # index='**' synchronizes visits between the three browsers

                # Shuts down the browsers and waits for the data to finish logging
                picked_typo_candidates = set([])

        manager_params, browser_params = TaskManager.load_default_params(

        # Update browser configuration (use this for per-browser settings)
        for i in range(NUM_BROWSERS):
            # Record HTTP Requests and Responses
            browser_params[i]['http_instrument'] = True
            # Enable flash for all three browsers
            browser_params[i]['disable_flash'] = True
                'headless'] = True  # Launch only browser 0 headless
        manager_params['data_directory'] = data_directory
        manager_params['log_directory'] = data_directory
        manager_params['run_id'] = run_id
        manager_params['database_name'] = data_base_name
        manager = TaskManager.TaskManager(manager_params, browser_params)
        for site in picked_typo_candidates:
            command_sequence = CommandSequence.CommandSequence(site)

            # Start by visiting the page
            command_sequence.get(sleep=0, timeout=30)

            # dump_profile_cookies/dump_flash_cookies closes the current tab.

            # index='**' synchronizes visits between the three browsers
            manager.execute_command_sequence(command_sequence, index='**')

        # Shuts down the browsers and waits for the data to finish logging
        picked_typo_candidates = set([])
        #print ValueError
Example #17
from automation import TaskManager
import tempfile
import time
import os
import copy
import json

# The list of sites that we wish to crawl
# NOTE(review): the original list literal was cut off after its first entry;
# restore any further sites that were lost.
sites = ['http://www.example.com']

# Creates a temporary directory, where we will save the crawl DB
db_loc = os.path.join(tempfile.mkdtemp(), 'openwpm_demo.sqlite')

preferences = TaskManager.load_default_params()

# One independent copy of the default preferences per browser
browser_params = [copy.deepcopy(preferences) for i in range(3)]

# Instantiates the measurement platform
# Launches three Firefox instances
# Commands time out by default after 60 seconds
manager = TaskManager.TaskManager(db_loc, browser_params, 3)

# Visits the sites with all browsers simultaneously
# NOTE(review): the original comment promised 5 seconds between visits, but
# no delay is present in the loop -- confirm whether one is needed.
for site in sites:
    manager.get(site, index='**') # ** = synchronized browsers

# Shuts down the browsers and waits for the data to finish logging
manager.close()