Example #1
def main():
    pattern = re.compile("https?://www.theatlantic.com/[A-Za-z0-9-]*/$")

    wpm_db = "/home/jason/Desktop/crawl-data.sqlite"
    conn = sql.connect(wpm_db)
    cur = conn.cursor()
    cur.execute(SQL_Query)
    native_ad_links = cur.fetchall()

    # Loads the manager preference and the default browser dictionaries
    manager_params, browser_params = TaskManager.load_default_params(1)

    # Update TaskManager configuration (use this for crawl-wide settings)
    manager_params['data_directory'] = '~/Desktop/analysis'
    manager_params['log_directory'] = '~/Desktop/analysis'
    manager = TaskManager.TaskManager(manager_params, browser_params)

    for idx, link in enumerate(native_ad_links):
        if not pattern.match(link[1]):
            print idx
            print link
            command_sequence = CommandSequence.CommandSequence(link[1])
            command_sequence.get(sleep=0, timeout=180)
            command_sequence.dump_page_source("ads" + str(idx), 120)
            manager.execute_command_sequence(command_sequence, index="**")

    manager.close()
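Example #1 relies on a few names defined elsewhere in its module (the imports and SQL_Query). A minimal sketch of that preamble is shown below, assuming the crawl database follows the usual OpenWPM layout; the query text and column order are hypothetical placeholders, not taken from the original source.

# Hypothetical preamble for Example #1; SQL_Query is a placeholder.
import re
import sqlite3 as sql

from automation import CommandSequence, TaskManager

# Assumed shape: rows whose second column (link[1]) is a URL to revisit.
SQL_Query = "SELECT visit_id, site_url FROM site_visits"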
Example #2
def extract_via_openwpm(sites):
    """
    Utilise the OpenWPM package to extract the browser parameters for 
    provided sites
    """

    print '########## OpenWPM (start) (Englehardt, 2016) ##########'
    # The number of browsers to use to extract the data
    num_of_browsers = 1

    # Loads the manager preference and num_of_browsers copies of the default browser dictionaries
    manager_params, browser_params = TaskManager.load_default_params(
        num_of_browsers)

    # Update browser configuration (use this for per-browser settings)
    for i in xrange(num_of_browsers):
        browser_params[i]['disable_flash'] = False  # Enable flash for all browsers
    # browser_params[0]['headless'] = True #Launch only browser 0 headless

    # Update TaskManager configuration (use this for crawl-wide settings)
    manager_params['data_directory'] = '~/Desktop/'
    manager_params['log_directory'] = '~/Desktop/'

    # Instantiates the measurement platform
    # Commands time out by default after 60 seconds
    manager = TaskManager.TaskManager(manager_params, browser_params)

    # Visit the sites
    for site in sites:
        manager.get(site, index='**')  # ** = synchronized browsers

    # Shuts down the browsers and waits for the data to finish logging
    manager.close()
    print '########## OpenWPM (end) (Englehardt, 2016) ##########'
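A possible invocation of extract_via_openwpm, shown only to illustrate the expected argument (a list of URLs); the sites below are illustrative, not from the original source.

if __name__ == '__main__':
    # Illustrative site list; any URLs reachable by the crawler will do.
    extract_via_openwpm(['http://www.example.com', 'http://www.princeton.edu'])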
Example #3
    def init_manager(self):
        # Loads the manager preference and NUM_BROWSERS copies of the default browser dictionaries
        manager_params, browser_params = TaskManager.load_default_params(
            self.num_browsers)
        assignments = self.randomized_assignments()
        self.block_assignments.append(assignments)

        # Update browser configuration (use this for per-browser settings)
        for i in range(self.num_browsers):
            # Record HTTP Requests and Responses
            browser_params[i]['http_instrument'] = True
            # Enable flash for all three browsers
            browser_params[i]['disable_flash'] = False
            browser_params[i]['headless'] = True
            browser_params[i]['control'] = assignments[i]

        # Update TaskManager configuration (use this for crawl-wide settings)
        manager_params['data_directory'] = self.data_directory
        manager_params['log_directory'] = '~/Desktop/'

        # Instantiates the measurement platform
        # Commands time out by default after 60 seconds
        try:
            manager = TaskManager.TaskManager(manager_params, browser_params)
        except TypeError:
            raise Exception("Failed to start the manager")
        self.manager = manager
def crawl_data(number_of_browsers=1, exit_crawl_after=5, slice_end=1000000):
    NUM_BROWSERS = number_of_browsers
    SITES = ['http://' + x for x in cu.sample_top_sites(
                                location=os.path.expanduser('~/Desktop/'), 
                                slices=[(10000, 0, 10000), (10000, 10000, slice_end)])]

    manager_params, browser_params = TaskManager.load_default_params(NUM_BROWSERS)

    for i in range(NUM_BROWSERS):
        browser_params[i]['cookie_instrument'] = True
        browser_params[i]['js_instrument'] = True
        browser_params[i]['save_javascript'] = True
        browser_params[i]['http_instrument'] = True
        browser_params[i]['headless'] = True
        browser_params[i]['disable_flash'] = False
        browser_params[i]['save_documents'] = True
        browser_params[i]['caching_disabled'] = True

    manager_params['data_directory'] = '~/Desktop/'
    manager_params['log_directory'] = '~/Desktop/'

    count = 0
    manager = TaskManager.TaskManager(manager_params, browser_params)

    for site in SITES[0:exit_crawl_after]:
        command_sequence = CommandSequence.CommandSequence(site, reset=True)
        command_sequence.get(sleep=10, timeout=60)
        command_sequence.scroll_page()
        command_sequence.recursive_dump_page_source()
        manager.execute_command_sequence(command_sequence)
    
        count += 1
        if count % 1000 == 0:
            print "Total crawled: ", count
    manager.close()
Example #5
def main():
    with open('/home/jason/Desktop/NYT/sources/html.html', 'r') as myfile:
        soup = BeautifulSoup(myfile.read(), 'lxml')
        links = []
        with open('/home/jason/Desktop/NYT/sources/links.txt', 'w') as outfile:
            for item in soup.find_all('a', attrs={'data-link': True}):
                if "data-link" in item.attrs:
                    if ".html" in item['data-link']:
                        outfile.write(item['data-link'])
                        outfile.write("\n")
                        links.append(item['data-link'])

    # Go and dump the source for each
    manager_params, browser_params = TaskManager.load_default_params(1)

    # Update TaskManager configuration (use this for crawl-wide settings)
    manager_params['data_directory'] = '~/Desktop/NYT/analysis'
    manager_params['log_directory'] = '~/Desktop/NYT/analysis'
    manager = TaskManager.TaskManager(manager_params, browser_params)

    for idx, link in enumerate(links):
        command_sequence = CommandSequence.CommandSequence(link)
        command_sequence.get(sleep=0, timeout=180)
        command_sequence.dump_page_source("nyt_ad_" + str(idx), 120)
        manager.execute_command_sequence(command_sequence, index="**")

    manager.close()
Example #6
def main():

    wpm_db = "/home/jason/Desktop/NYT/crawl-data.sqlite"
    conn = sql.connect(wpm_db)
    cur = conn.cursor()
    cur.execute(SQL_Query)
    article_links = cur.fetchall()

    # Loads the manager preference and the default browser dictionaries
    manager_params, browser_params = TaskManager.load_default_params(1)

    # Update TaskManager configuration (use this for crawl-wide settings)
    manager_params['data_directory'] = '~/Desktop/NYT/analysis'
    manager_params['log_directory'] = '~/Desktop/NYT/analysis'
    manager = TaskManager.TaskManager(manager_params, browser_params)

    for idx, link in enumerate(article_links):
        print idx
        print link
        command_sequence = CommandSequence.CommandSequence(link[1])
        command_sequence.get(sleep=0, timeout=180)
        command_sequence.dump_page_source("nyt_articles_" + str(idx), 120)
        manager.execute_command_sequence(command_sequence, index="**")

    manager.close()
Example #7
def dump_crawl(sites, profile_name):
    #os.system('sudo sh -c "sync; echo 1 > /proc/sys/vm/drop_caches"')
    # The list of sites that we wish to crawl
    print sites, profile_name
    NUM_BROWSERS = 1 #3
    # Loads the manager preference and NUM_BROWSERS copies of the default browser dictionaries
    manager_params, browser_params = TaskManager.load_default_params(NUM_BROWSERS)

    # Update browser configuration (use this for per-browser settings)
    for i in range(NUM_BROWSERS):
        # Record HTTP Requests and Responses
        #browser_params[i]['http_instrument'] = True
        # Disable flash for all browsers
        browser_params[i]['disable_flash'] = True
        browser_params[i]['headless'] = True  # Launch all browsers headless, not only browser 0
        browser_params[i]['js_instrument'] = True
        #browser_params[i]['save_javascript'] = True
        #browser_params[i]['random_attributes'] = True
        browser_params[i]['cookie_instrument'] = True
        #browser_params[i]['cp_instrument'] = True
        #browser_params[i]['save_all_content'] = True
        if 'load_name' in locals():
            browser_params[i]['profile_tar'] = load_name
        browser_params[i]['profile_archive_dir'] = "/home/ubuntu/personas/" + profile_name
        
    # Update TaskManager configuration (use this for crawl-wide settings)
    manager_params['data_directory'] = '~/OpenWPM/'
    manager_params['log_directory'] = '~/OpenWPM/'
    manager_params['database_name'] = "persona.sqlite"

    # Instantiates the measurement platform
    # Commands time out by default after 60 seconds
    manager = TaskManager.TaskManager(manager_params, browser_params)

    # Visits the sites with all browsers simultaneously
    for i in range(0, len(sites)):
        print sites[i]
        site = sites[i]
        command_sequence = CommandSequence.CommandSequence(site)
        # Start by visiting the page
        command_sequence.get(sleep=0, timeout=300)
        # Round-robin over the available browsers instead of index='**' synchronization
        #command_sequence.dump_profile_cookies(120)
        #command_sequence.dump_profile(dump_folder="~/personas/", close_webdriver=True)
        manager.execute_command_sequence(command_sequence, (i % NUM_BROWSERS))
        time.sleep(2)

    # dump_profile_cookies/dump_flash_cookies closes the current tab.
    # dump stores history; last cookies/sites only stored
    #os.system('sudo sh -c "sync; echo 1 > /proc/sys/vm/drop_caches"')
    #command_sequence.dump_profile_cookies(120)
    #command_sequence.dump_profile(dump_folder="~/personas/"+profile_name, close_webdriver=True, compress, timeout)

    # Shuts down the browsers and waits for the data to finish logging
    manager.close()
Example #8
def callWPM(NUM_BROWSERS, siteslist):
    print("Thread-----------thread-------------thread-----")
    manager_params, browser_params = TaskManager.load_default_params(NUM_BROWSERS)
    browser_params[0]['http_instrument'] = True
    browser_params[0]['disable_flash'] = False
    browser_params[0]['headless'] = True
    manager_params['data_directory'] = '../database/requestdata2/'
    manager_params['log_directory'] = '../database/requestdata2/'
    manager = TaskManager.TaskManager(manager_params, browser_params)
    for site in siteslist:
        command_sequence = CommandSequence.CommandSequence(site, reset=True)
        command_sequence.get(sleep=0, timeout=10)
        manager.execute_command_sequence(command_sequence, index='**')
    manager.close()
Example #9
    def run_search_google_training_by_multiple_commands(self, tmpdir):
        """visit all the training site. each visit is a single command
        """
        # get the size of training sites, and visit one by one using index in their list;
        # this is to avoid the problem of there is error when visit one site could stop whole
        # visiting process (in case of using single CommandSequence);
        # all the browser must have the same number of visting site
        manager_params, browser_params = self.get_config(str(tmpdir))
        manager = TaskManager.TaskManager(manager_params, browser_params)

        #manager_params, browser_params = TaskManager.load_default_params(self.NUM_BROWSERS)
        with open(browser_params[0]['training_keywords']) as _f:
            _sites = [site for site in _f]
        nu_sites = len(_sites)
        cs = CommandSequence.CommandSequence("http://www.example.com")
        #cs2 = CommandSequence.CommandSequence("none") # url is a place holder
        #cs.get(sleep=3)
        #cs2.login_google()
        #manager.execute_command_sequence(cs2, index="**")

        for i in range(0, nu_sites):
            cs.single_search_google_shopping_by_index(i, -1, training=True)
        manager.execute_command_sequence(cs, index="**")
        #manager.get("http://www.google.com")
        time.sleep(5)
        manager.close()
        print("finish....")
Example #10
    def _stateless_crawl(self, sites):
        '''Performs a crawl with sites providing login'''
        manager = TaskManager.TaskManager(self.managerpar, [self.browserpar])
        for site in sites:
            params = self._fetch_params(site)
            commandseq = CommandSequence.CommandSequence(site, reset=True)
            commandseq.get(sleep=self.DEF_SLEEP, timeout=self.DEF_TIMEOUT)
            commandseq.login(logindata=params, timeout=self.DEF_TIMEOUT)
            manager.execute_command_sequence(commandseq, index=None)
        manager.close()
Example #11
def run_demo(url):
    NUM_BROWSERS = 1
    sites = [str(url)]

    # Loads the manager preference and NUM_BROWSERS copies of the default browser dictionaries
    manager_params, browser_params = TaskManager.load_default_params(
        NUM_BROWSERS)

    # Update browser configuration (use this for per-browser settings)
    for i in range(NUM_BROWSERS):
        # Record HTTP Requests and Responses
        browser_params[i]['http_instrument'] = True
        # Enable flash for all browsers
        browser_params[i]['disable_flash'] = False
        browser_params[i]['js_instrument'] = True
    if platform != 'darwin':
        browser_params[0]['headless'] = True  # Launch only browser 0 headless

    # Update TaskManager configuration (use this for crawl-wide settings)
    manager_params['data_directory'] = 'feature_extraction/'
    manager_params['log_directory'] = '~/Desktop/'

    # Instantiates the measurement platform
    # Commands time out by default after 60 seconds
    manager = TaskManager.TaskManager(manager_params, browser_params)

    # Visits the sites with all browsers simultaneously
    for site in sites:
        command_sequence = CommandSequence.CommandSequence(site)

        # Start by visiting the page
        command_sequence.get(sleep=0, timeout=60)

        # dump_profile_cookies/dump_flash_cookies closes the current tab.
        command_sequence.dump_profile_cookies(120)

        # index='**' synchronizes visits between the three browsers
        manager.execute_command_sequence(command_sequence, index='**')

    # Shuts down the browsers and waits for the data to finish logging
    manager.close()
def run_site_crawl(db_path, sites, preferences, dump_location):
    """
    runs the crawl itself
    <db_path> is the absolute path of crawl database
    <preferences> is a dictionary of preferences to initialize the crawler
    """
    manager = TaskManager.TaskManager(db_path, preferences, 1)

    for site in sites:
        manager.get(site)

    if dump_location:
        manager.dump_profile(dump_location, True)

    manager.close()
Example #13
    def browser_training_site(self, tmpdir):
        manager_params, browser_params = self.get_config(str(tmpdir))
        manager = TaskManager.TaskManager(manager_params, browser_params)

        #manager_params, browser_params = TaskManager.load_default_params(self.NUM_BROWSERS)
        with open(browser_params[0]['training_sites']) as _f:
            _sites = [site for site in _f]
        nu_sites = len(_sites)
        cs = CommandSequence.CommandSequence("http://www.example.com")
        #cs.get()
        for i in range(len(_sites)):
            cs.browse_site_by_index(i, 3)
        manager.execute_command_sequence(cs, index="**")
        #manager.get("http://www.google.com")
        time.sleep(10)
        manager.close()
Example #14
    def crawl(self, sites):
        '''Runs a crawl resulting in a dataset for unsupervised tracking detection.
        Sites are expected as a list including protocol, e.g. http://www.hdm-stuttgart.de'''
        self._set_dbname(sites, self.db_prefix, self.bpath, self.CRAWL_TYPE)
        self.browserpar['disable_flash'] = True
        for _ in range(0, self.NUM_USERS):
            manager = TaskManager.TaskManager(self.managerpar,
                                              [self.browserpar])
            for site in sites:
                for _ in range(0, self.NUM_VISITS):
                    command_sequence = CommandSequence.CommandSequence(site)
                    command_sequence.get(sleep=self.DEF_SLEEP,
                                         timeout=self.DEF_TIMEOUT)
                    manager.execute_command_sequence(command_sequence,
                                                     index=None)
            manager.close()
Example #15
    def crawl(self, sites):
        '''Runs a crawl to measure various metrics regarding third-party tracking.
        Sites are expected as a list including protocol, e.g. http://www.hdm-stuttgart.de'''
        self._set_dbname(sites, self.db_prefix, self.bpath, self.CRAWL_TYPE)
        manager = TaskManager.TaskManager(self.managerpar, [self.browserpar])
        for site in sites:
            # we run a stateless crawl (fresh profile for each page)
            command_sequence = CommandSequence.CommandSequence(site,
                                                               reset=True)
            # Start by visiting the page
            command_sequence.get(sleep=self.DEF_SLEEP,
                                 timeout=self.DEF_TIMEOUT)
            # dump_profile_cookies/dump_flash_cookies closes the current tab.
            command_sequence.dump_profile_cookies(self.DEF_COOKIE_TIME)
            command_sequence.dump_flash_cookies(self.DEF_COOKIE_TIME)
            manager.execute_command_sequence(command_sequence, index=None)
        manager.close()
Example #16
    def test(self, tmpdir):
        # Run the test crawl
        manager_params, browser_params = self.get_config(str(tmpdir))
        manager = TaskManager.TaskManager(manager_params, browser_params)

        # Set up a command sequence that visits Google and runs the shopping searches
        cs = CommandSequence.CommandSequence("http://www.google.com")
        cs.get(sleep=3)
        #cs.get()
        #cs.login_google()
        #cs.search_google_shopping()
        #cs.single_search_google_shopping("food", training=False)
        #time.sleep(2)
        #cs.single_search_google_shopping("baby powder", number_of_links, training, ...)
        cs.multiple_search_google_shopping(-1, training=False, sleep_time=2)
        manager.execute_command_sequence(cs, index="**")
        #manager.get("http://www.google.com")
        time.sleep(15)
        manager.close(post_process=False)
        print("finish....")
Example #17
    def _statefull_crawl(self, loginsite, sites):
        '''Performs a crawl by logging into one site and regularly crawling others'''
        manager = TaskManager.TaskManager(self.managerpar, [self.browserpar])
        # login to the given page
        params = self._fetch_params(loginsite)
        commandseq = CommandSequence.CommandSequence(loginsite)
        commandseq.get(sleep=self.DEF_SLEEP, timeout=self.DEF_TIMEOUT)
        commandseq.login(logindata=params, timeout=self.DEF_TIMEOUT)
        manager.execute_command_sequence(commandseq, index=None)
        # proceed to crawl the remaining pages with the logged-in (stateful) profile
        for site in sites:
            command_sequence = CommandSequence.CommandSequence(site)
            # Start by visiting the page
            command_sequence.get(sleep=self.DEF_SLEEP,
                                 timeout=self.DEF_TIMEOUT)
            # dump_profile_cookies/dump_flash_cookies closes the current tab.
            command_sequence.dump_profile_cookies(self.DEF_COOKIE_TIME)
            command_sequence.dump_flash_cookies(self.DEF_COOKIE_TIME)
            manager.execute_command_sequence(command_sequence, index=None)
        manager.close()
Example #18
from automation import CommandSequence, TaskManager
import pandas as pd

# Declare constants
NUM_BROWSERS = 3

# Load a pd dataframe using the given csv file
top100_df = pd.read_csv('top-1m.csv', header=None, nrows=100, usecols=[1])

# Load the default manager params and NUM_BROWSER copies of the default browser params
manager_params, browser_params = TaskManager.load_default_params(NUM_BROWSERS)

# Update browser configuration (use this for per-browser settings)
for i in range(NUM_BROWSERS):
    # Record HTTP Requests and Responses
    browser_params[i]['http_instrument'] = True
    # Record cookie changes
    browser_params[i]['cookie_instrument'] = True
    # Record JS Web API calls
    browser_params[i]['js_instrument'] = True

    # Set true for ad blocking mode. Set false for vanilla mode.
    browser_params[i]['ublock-origin'] = False

    # Do not record the callstack of all WebRequests made
    browser_params[i]['callstack_instrument'] = False
    # Do not record Navigations
    browser_params[i]['navigation_instrument'] = False

    # Set the display mode to headful (a visible browser window)
    browser_params[i]['display_mode'] = 'headful'
import sys
from automation import TaskManager, CommandSequence

# Load the sites we wish to crawl into a list
# E.g. ['http://www.example.com', 'http://dataskydd.net']
sites = [line.rstrip('\n') for line in open('municipalities_final_urls.txt')]

manager_params, browser_params = TaskManager.load_default_params(1)

browser_params[0]['headless'] = True #Launch browser headless
browser_params[0]['http_instrument'] = True # Record HTTP Requests and Responses
browser_params[0]['cookie_instrument'] = True # Records both JS cookies and HTTP response cookies to javascript_cookies

manager_params['data_directory'] = './data/'
manager_params['log_directory'] = './data/'

manager = TaskManager.TaskManager(manager_params, browser_params)

for site in sites:
    command_sequence = CommandSequence.CommandSequence(site, reset=True)
    command_sequence.browse(num_links=5, sleep=10, timeout=360)
    command_sequence.dump_profile_cookies(120)
    manager.execute_command_sequence(command_sequence, index='**')

manager.close()
Example #20
CRAWL_DIRECTORY = os.getenv('CRAWL_DIRECTORY', 'crawl-data')
S3_BUCKET = os.getenv('S3_BUCKET', 'openwpm-crawls')
HTTP_INSTRUMENT = os.getenv('HTTP_INSTRUMENT', '1') == '1'
COOKIE_INSTRUMENT = os.getenv('COOKIE_INSTRUMENT', '1') == '1'
NAVIGATION_INSTRUMENT = os.getenv('NAVIGATION_INSTRUMENT', '1') == '1'
JS_INSTRUMENT = os.getenv('JS_INSTRUMENT', '1') == '1'
JS_INSTRUMENT_MODULES = os.getenv('JS_INSTRUMENT_MODULES', None)
SAVE_CONTENT = os.getenv('SAVE_CONTENT', '')
DWELL_TIME = int(os.getenv('DWELL_TIME', '10'))
TIMEOUT = int(os.getenv('TIMEOUT', '60'))
SENTRY_DSN = os.getenv('SENTRY_DSN', None)
LOGGER_SETTINGS = MPLogger.parse_config_from_env()

# Loads the default manager params
# and NUM_BROWSERS copies of the default browser params
manager_params, browser_params = TaskManager.load_default_params(NUM_BROWSERS)

# Browser configuration
for i in range(NUM_BROWSERS):
    browser_params[i]['http_instrument'] = HTTP_INSTRUMENT
    browser_params[i]['cookie_instrument'] = COOKIE_INSTRUMENT
    browser_params[i]['navigation_instrument'] = NAVIGATION_INSTRUMENT
    browser_params[i]['js_instrument'] = JS_INSTRUMENT
    if JS_INSTRUMENT_MODULES:
        browser_params[i]['js_instrument_modules'] = JS_INSTRUMENT_MODULES
    if SAVE_CONTENT == '1':
        browser_params[i]['save_content'] = True
    elif SAVE_CONTENT == '0':
        browser_params[i]['save_content'] = False
    else:
        browser_params[i]['save_content'] = SAVE_CONTENT
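The fragment above stops after the per-browser configuration. A sketch of how such an environment-driven setup is typically completed, reusing the settings defined above; the S3 output keys mirror Example #24, and the directory layout and site list here are assumptions for illustration only.

# Sketch only: directory layout and site list are assumptions.
manager_params['data_directory'] = '~/' + CRAWL_DIRECTORY
manager_params['log_directory'] = '~/' + CRAWL_DIRECTORY
manager_params['output_format'] = 's3'
manager_params['s3_bucket'] = S3_BUCKET
manager_params['s3_directory'] = CRAWL_DIRECTORY

manager = TaskManager.TaskManager(manager_params, browser_params)

for site in ['http://www.example.com']:
    command_sequence = CommandSequence.CommandSequence(site, reset=True)
    command_sequence.get(sleep=DWELL_TIME, timeout=TIMEOUT)
    manager.execute_command_sequence(command_sequence)

manager.close()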
Example #21
from automation import TaskManager, CommandSequence

# The list of sites that we wish to crawl
NUM_BROWSERS = 3
sites = ['http://www.example.com',
         'http://www.princeton.edu',
         'http://citp.princeton.edu/']

# Loads the manager preference and 3 copies of the default browser dictionaries
manager_params, browser_params = TaskManager.load_default_params(NUM_BROWSERS)

# Update browser configuration (use this for per-browser settings)
for i in xrange(NUM_BROWSERS):
    browser_params[i]['disable_flash'] = False #Enable flash for all three browsers
browser_params[0]['headless'] = True #Launch only browser 0 headless

# Update TaskManager configuration (use this for crawl-wide settings)
manager_params['data_directory'] = '~/Desktop/'
manager_params['log_directory'] = '~/Desktop/'

# Instantiates the measurement platform
# Commands time out by default after 60 seconds
manager = TaskManager.TaskManager(manager_params, browser_params)

# Visits the sites with all browsers simultaneously
for site in sites:
    command_sequence = CommandSequence.CommandSequence(site)
    command_sequence.get(sleep=0, timeout=60)
    command_sequence.dump_profile_cookies(120)
    manager.execute_command_sequence(command_sequence, index='**') # ** = synchronized browsers

# Shuts down the browsers and waits for the data to finish logging
manager.close()
Example #22
#!/usr/bin/python

from automation import CommandSequence, TaskManager


def run_custom_function(**kwargs):
    driver = kwargs['driver']
    url_title = driver.title
    print("Title: %s" % url_title)
    return


if __name__ == "__main__":
    url_list = ["https://google.com"]

    manager_params, browser_params = TaskManager.load_default_params(1)
    manager = TaskManager.TaskManager(manager_params, browser_params)

    for URL in url_list:
        cs = CommandSequence.CommandSequence(URL)
        cs.get(sleep=10, timeout=60)
        cs.run_custom_function(run_custom_function)
        manager.execute_command_sequence(cs)

    manager.close()
Example #23
#manager_params, browser_params = TaskManager.load_default_params(NUM_BROWSERS)
manager_params, browser_params = load_params()


# Update browser configuration (use this for per-browser settings)
for i in xrange(NUM_BROWSERS):
    browser_params[i]['disable_flash'] = False #Enable flash for all three browsers
browser_params[0]['headless'] = False #Run browser 0 with a visible display (not headless)

# Update TaskManager configuration (use this for crawl-wide settings)
manager_params['data_directory'] = '~/Desktop/wpm'
manager_params['log_directory'] = '~/Desktop/wpm'

# Instantiates the measurement platform
# Commands time out by default after 60 seconds
manager = TaskManager.TaskManager(manager_params, browser_params)

import time

# Visits the sites with all browsers simultaneously
for site in sites:
    command_sequence = CommandSequence.CommandSequence(site)

    command_sequence.get(sleep=3, timeout=60)
    manager.execute_command_sequence(command_sequence, index='**') # ** = synchronized browsers
    time.sleep(3)

time.sleep(30)
# Shuts down the browsers and waits for the data to finish logging
manager.close()
Example #24
def analyze_sites(sites):
    # The list of sites that we wish to crawl
    NUM_BROWSERS = 2
    #sites = [
    #    "https://www.cnn.com",
    #    "https://www.tufts.edu"
    #]

    # Loads the default manager params
    # and NUM_BROWSERS copies of the default browser params
    manager_params, browser_params = TaskManager.load_default_params(
        NUM_BROWSERS)

    # Update browser configuration (use this for per-browser settings)
    for i in range(NUM_BROWSERS):
        # Record HTTP Requests and Responses
        browser_params[i]["http_instrument"] = True
        # Record cookie changes
        browser_params[i]["cookie_instrument"] = True
        # Record Navigations
        browser_params[i]["navigation_instrument"] = True
        # Record JS Web API calls
        browser_params[i]["js_instrument"] = True
        # Record the callstack of all WebRequests made
        browser_params[i]["callstack_instrument"] = True
        # Record DNS resolution
        browser_params[i]["dns_instrument"] = True

    # Launch only browser 0 headless
    browser_params[0]["display_mode"] = "headless"

    # Update TaskManager configuration (use this for crawl-wide settings)
    manager_params["data_directory"] = "~/Desktop/testing/"
    manager_params["log_directory"] = "~/Desktop/testing/"

    manager_params['output_format'] = 's3'
    manager_params['s3_bucket'] = 'ihavenothingtohide'
    manager_params['s3_directory'] = '2020-2'

    # Instantiates the measurement platform
    # Commands time out by default after 60 seconds
    manager = TaskManager.TaskManager(manager_params, browser_params)

    # Visits the sites
    for site in sites:

        # Parallelize sites over the number of browsers set above.
        command_sequence = CommandSequence.CommandSequence(
            site,
            reset=True,
            callback=lambda success, val=site: print(
                "CommandSequence {} done".format(val)),
        )

        # Start by visiting the page
        command_sequence.get(sleep=3, timeout=60)

        # Run commands across the browsers (simple parallelization)
        manager.execute_command_sequence(command_sequence)

    # Shuts down the browsers and waits for the data to finish logging
    manager.close()
Example #25
from automation import TaskManager, CommandSequence
from automation.Errors import CommandExecutionError
import crawl_utils
import time
import os

# The list of sites that we wish to crawl
NUM_BROWSERS = 15
NUM_BATCH = 5000
DATA_DIR = os.path.join(os.path.dirname(__file__), 'data')

manager_params, browser_params = TaskManager.load_default_params(NUM_BROWSERS)

date_prefix = 'XXX'  # Updated by deployment script
prefix = date_prefix + '_dom_chunk'
manager_params['database_name'] = prefix + '.sqlite'
manager_params['data_directory'] = '~/' + prefix
manager_params['log_directory'] = '~/' + prefix

# Read the site list
sites = list()
with open(
        os.path.join(os.path.dirname(__file__), 'data',
                     'sites_to_recrawl_DOM_chunk.txt')) as f:
    for line in f:
        if line.strip() == '':
            continue
        sites.append(tuple(line.strip().split(',', 2)))

TOTAL_NUM_SITES = len(sites)
Example #26
from pprint import pprint
import json
import time
import os

from automation import TaskManager
# The list of sites that we wish to crawl
# files = os.listdir(os.path.join(os.path.dirname(__file__) ,'browser_settings'))
# NUM_BROWSERS = 0

# for f in files:
# 	if '.DS_Store' not in f:
# 		NUM_BROWSERS =  NUM_BROWSERS + 1

REPO = os.path.abspath(os.path.join(os.path.dirname(__file__), '.'))
DB_DIR = os.path.join(REPO, 'db')
# db_loc = DB_DIR + "/amazon-crawler.sqlite"

product_urls = TaskManager.load_products()
# print product_urls
manager_params, browser_params = TaskManager.load_amazon_params()

# for i in xrange(NUM_BROWSERS):
#     browser_params[i]['disable_flash'] = True
# for i in xrange(NUM_BROWSERS):
    # browser_params[i]['disable_flash'] = True

#debug
# browser_params  = [browser_params[0]]
# manager = TaskManager.TaskManager(DB_DIR, browser_params, 1)
# manager_params['data_directory'] = '~/OpenWPM/'
manager = TaskManager.TaskManager(manager_params, browser_params, True)

Example #27
def run_openwpm(sites, data_directory, run_id, data_base_name):
    """
    Run the OpenWPM framework for the passed sites and parameters to gather data in the data_base_name DB.
    
    """
    print 'number of passed typo candidates ', len(sites)
    NUM_BROWSERS = 3

    try:
        print data_directory
        print run_id

        # Instantiates the measurement platform
        # Commands time out by default after 60 seconds
        picked_typo_candidates = set([])
        # Visits the sites with all browsers simultaneously
        for typo_candidate in sites:
            picked_typo_candidates.add("http://" + typo_candidate)
            if len(picked_typo_candidates) % 400 == 399:
                time.sleep(10)
                manager_params, browser_params = TaskManager.load_default_params(
                    NUM_BROWSERS)

                # Update browser configuration (use this for per-browser settings)
                for i in range(NUM_BROWSERS):
                    # Record HTTP Requests and Responses
                    browser_params[i]['http_instrument'] = True
                    # Disable flash for all browsers
                    browser_params[i]['disable_flash'] = True
                    # Launch only browser 0 headless
                    browser_params[0]['headless'] = True
                manager_params['data_directory'] = data_directory
                manager_params['log_directory'] = data_directory
                manager_params['run_id'] = run_id
                manager_params['database_name'] = data_base_name
                manager = TaskManager.TaskManager(manager_params,
                                                  browser_params)
                for site in picked_typo_candidates:
                    command_sequence = CommandSequence.CommandSequence(site)

                    # Start by visiting the page
                    command_sequence.get(sleep=0, timeout=30)

                    # dump_profile_cookies/dump_flash_cookies closes the current tab.
                    #command_sequence.dump_profile_cookies(120)

                    # index='**' synchronizes visits between the three browsers
                    manager.execute_command_sequence(command_sequence,
                                                     index='**')

                # Shuts down the browsers and waits for the data to finish logging
                manager.close()
                picked_typo_candidates = set([])

        manager_params, browser_params = TaskManager.load_default_params(
            NUM_BROWSERS)

        # Update browser configuration (use this for per-browser settings)
        for i in range(NUM_BROWSERS):
            # Record HTTP Requests and Responses
            browser_params[i]['http_instrument'] = True
            # Disable flash for all browsers
            browser_params[i]['disable_flash'] = True
            # Launch only browser 0 headless
            browser_params[0]['headless'] = True
        manager_params['data_directory'] = data_directory
        manager_params['log_directory'] = data_directory
        manager_params['run_id'] = run_id
        manager_params['database_name'] = data_base_name
        manager = TaskManager.TaskManager(manager_params, browser_params)
        for site in picked_typo_candidates:
            command_sequence = CommandSequence.CommandSequence(site)

            # Start by visiting the page
            command_sequence.get(sleep=0, timeout=30)

            # dump_profile_cookies/dump_flash_cookies closes the current tab.
            #command_sequence.dump_profile_cookies(120)

            # index='**' synchronizes visits between the three browsers
            manager.execute_command_sequence(command_sequence, index='**')

        # Shuts down the browsers and waits for the data to finish logging
        manager.close()
        picked_typo_candidates = set([])
    except:
        #print ValueError
        pass
Example #28
from automation import TaskManager
import tempfile
import time
import os
import copy
import json

# The list of sites that we wish to crawl
sites = [
    'http://www.example.com', 'https://princeton.edu',
    'https://citp.princeton.edu/'
]

# Creates a temporary directory, where we will save the crawl DB
db_loc = tempfile.mkdtemp() + '/openwpm_demo.sqlite'

preferences = TaskManager.load_default_params()

browser_params = [copy.deepcopy(preferences) for i in xrange(0, 3)]

# Instantiates the measurement platform
# Launches three (non-headless) Firefox instances which log data using mitmproxy
# Commands time out by default after 60 seconds
manager = TaskManager.TaskManager(db_loc, browser_params, 3)

# Visits the sites with all three browsers simultaneously, 5 seconds between visits
for site in sites:
    manager.get(site, index='**')  # ** = synchronized browsers
    time.sleep(5)

# Shuts down the browsers and waits for the data to finish logging
manager.close()
Example #29
from automation import TaskManager
import tempfile
import time
import os
import copy
import json

# The list of sites that we wish to crawl
sites = ['http://www.example.com',
         'https://princeton.edu',
         'https://citp.princeton.edu/']

# Creates a temporary directory, where we will save the crawl DB
db_loc = tempfile.mkdtemp() + '/openwpm_demo.sqlite'

preferences = TaskManager.load_default_params()


browser_params = [copy.deepcopy(preferences) for i in xrange(0, 3)]

# Instantiates the measurement platform
# Launches three (non-headless) Firefox instances which log data using mitmproxy
# Commands time out by default after 60 seconds
manager = TaskManager.TaskManager(db_loc, browser_params, 3)

# Visits the sites with all three browsers simultaneously, 5 seconds between visits
for site in sites:
    manager.get(site, index='**') # ** = synchronized browsers
    time.sleep(5)

# Shuts down the browsers and waits for the data to finish logging
manager.close()
Example #30
from automation import TaskManager
import time
import glob
import sys
import os

FB_USERNAME = ''
FB_PASSWORD = ''

USERS = set()
USERS.add('')

if not os.path.exists(os.path.join(os.path.dirname(__file__), '../data')):
    os.mkdir(os.path.join(os.path.dirname(__file__), '../data'))
db_loc = os.path.join(os.path.dirname(__file__), '../data/facebook.sqlite')

browser_params = TaskManager.load_default_params(1)
browser_params[0]['proxy'] = False

# don't double crawl
outdir = glob.glob('../data/fbfriends/*')
crawled = set()
for fname in outdir:
    crawled.add(fname.split('/')[-1][0:-8])
users = USERS.difference(crawled)

print "len of users to crawl: " + str(len(users))

if len(users) == 0:
    print "No users to crawl, exiting..."
    sys.exit(0)
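The fragment above ends before any pages are visited. A minimal sketch of how the remaining users might be crawled with the old positional TaskManager API used in Examples #28 and #29; the per-user URL pattern and the single-browser crawl loop are assumptions, not part of the original script.

# Sketch only: URL pattern and crawl loop are assumed, not from the source.
manager = TaskManager.TaskManager(db_loc, browser_params, 1)
for user in users:
    # Visit each remaining profile page once.
    manager.get('https://www.facebook.com/' + user)
    time.sleep(5)
manager.close()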