Code Example #1
File: nyt_ads_analysis.py Project: Jasonmk47/OpenWPM
from bs4 import BeautifulSoup

from automation import CommandSequence, TaskManager


def main():
    with open('/home/jason/Desktop/NYT/sources/html.html', 'r') as myfile:
        soup = BeautifulSoup(myfile.read(), 'lxml')
        links = []
        with open('/home/jason/Desktop/NYT/sources/links.txt', 'w') as outfile:
            # find_all with attrs={'data-link': True} already guarantees
            # the attribute exists, so no extra membership check is needed
            for item in soup.find_all('a', attrs={'data-link': True}):
                if ".html" in item['data-link']:
                    outfile.write(item['data-link'] + "\n")
                    links.append(item['data-link'])

    # Go and dump the source for each
    manager_params, browser_params = TaskManager.load_default_params(1)

    # Update TaskManager configuration (use this for crawl-wide settings)
    manager_params['data_directory'] = '~/Desktop/NYT/analysis'
    manager_params['log_directory'] = '~/Desktop/NYT/analysis'
    manager = TaskManager.TaskManager(manager_params, browser_params)

    for idx, link in enumerate(links):
        command_sequence = CommandSequence.CommandSequence(link)
        command_sequence.get(sleep=0, timeout=180)
        command_sequence.dump_page_source("nyt_ad_" + str(idx), 120)
        manager.execute_command_sequence(command_sequence, index="**")

    manager.close()
Code Example #2
import re
import sqlite3 as sql

from automation import CommandSequence, TaskManager


def main():
    pattern = re.compile(r"https?://www\.theatlantic\.com/[A-Za-z0-9-]*/$")

    wpm_db = "/home/jason/Desktop/crawl-data.sqlite"
    conn = sql.connect(wpm_db)
    cur = conn.cursor()
    cur.execute(SQL_Query)
    native_ad_links = cur.fetchall()

    # Loads the manager preferences and the default browser dictionaries
    manager_params, browser_params = TaskManager.load_default_params(1)

    # Update TaskManager configuration (use this for crawl-wide settings)
    manager_params['data_directory'] = '~/Desktop/analysis'
    manager_params['log_directory'] = '~/Desktop/analysis'
    manager = TaskManager.TaskManager(manager_params, browser_params)

    for idx, link in enumerate(native_ad_links):
        if not pattern.match(link[1]):
            print(idx)
            print(link)
            command_sequence = CommandSequence.CommandSequence(link[1])
            command_sequence.get(sleep=0, timeout=180)
            command_sequence.dump_page_source("ads" + str(idx), 120)
            manager.execute_command_sequence(command_sequence, index="**")

    manager.close()
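
The module-level SQL_Query referenced above is defined elsewhere in the original file. As a point of reference, here is a hypothetical definition consistent with how the results are used (link[1] is treated as a URL), assuming OpenWPM's standard site_visits table; the project's actual query may differ:

# Hypothetical stand-in for the SQL_Query used above (not the original).
# Assumes OpenWPM's default site_visits table (visit_id, site_url).
SQL_Query = """
SELECT visit_id, site_url
FROM site_visits
"""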
Code Example #3
    def init_manager(self):
        # Loads the manager preferences and self.num_browsers copies of the default browser dictionaries
        manager_params, browser_params = TaskManager.load_default_params(
            self.num_browsers)
        assignments = self.randomized_assignments()
        self.block_assignments.append(assignments)

        # Update browser configuration (use this for per-browser settings)
        for i in range(self.num_browsers):
            # Record HTTP Requests and Responses
            browser_params[i]['http_instrument'] = True
            # Enable flash for all browsers
            browser_params[i]['disable_flash'] = False
            browser_params[i]['headless'] = True
            browser_params[i]['control'] = assignments[i]

        # Update TaskManager configuration (use this for crawl-wide settings)
        manager_params['data_directory'] = self.data_directory
        manager_params['log_directory'] = '~/Desktop/'

        # Instantiates the measurement platform
        # Commands time out by default after 60 seconds
        try:
            manager = TaskManager.TaskManager(manager_params, browser_params)
        except TypeError:
            raise Exception("Failed to start the manager")
        self.manager = manager
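
The randomized_assignments() method is not shown in this snippet. A minimal sketch of what it might look like, assuming it simply draws a random control/treatment flag for each browser (the body below is an assumption, not the original implementation):

import random

def randomized_assignments(self):
    # Hypothetical: assign each browser to control (True) or
    # treatment (False) uniformly at random
    return [random.random() < 0.5 for _ in range(self.num_browsers)]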
Code Example #4
import os

from automation import CommandSequence, TaskManager


def crawl_data(number_of_browsers=1, exit_crawl_after=5, slice_end=1000000):
    NUM_BROWSERS = number_of_browsers
    # cu is a project-local helper module (not shown in this snippet)
    SITES = ['http://' + x for x in cu.sample_top_sites(
        location=os.path.expanduser('~/Desktop/'),
        slices=[(10000, 0, 10000), (10000, 10000, slice_end)])]

    manager_params, browser_params = TaskManager.load_default_params(NUM_BROWSERS)

    for i in range(NUM_BROWSERS):
        browser_params[i]['cookie_instrument'] = True
        browser_params[i]['js_instrument'] = True
        browser_params[i]['save_javascript'] = True
        browser_params[i]['http_instrument'] = True
        browser_params[i]['headless'] = True
        browser_params[i]['disable_flash'] = False
        browser_params[i]['save_documents'] = True
        browser_params[i]['caching_disabled'] = True

    manager_params['data_directory'] = '~/Desktop/'
    manager_params['log_directory'] = '~/Desktop/'

    count = 0
    manager = TaskManager.TaskManager(manager_params, browser_params)

    for site in SITES[0:exit_crawl_after]:
        command_sequence = CommandSequence.CommandSequence(site, reset=True)
        command_sequence.get(sleep=10, timeout=60)
        command_sequence.scroll_page()
        command_sequence.recursive_dump_page_source()
        manager.execute_command_sequence(command_sequence)
    
        count += 1
        if count % 1000 == 0:
            print("Total crawled:", count)
    manager.close()
Code Example #5
import sqlite3 as sql

from automation import CommandSequence, TaskManager


def main():

    wpm_db = "/home/jason/Desktop/NYT/crawl-data.sqlite"
    conn = sql.connect(wpm_db)
    cur = conn.cursor()
    cur.execute(SQL_Query)
    article_links = cur.fetchall()

    # Loads the manager preferences and the default browser dictionaries
    manager_params, browser_params = TaskManager.load_default_params(1)

    # Update TaskManager configuration (use this for crawl-wide settings)
    manager_params['data_directory'] = '~/Desktop/NYT/analysis'
    manager_params['log_directory'] = '~/Desktop/NYT/analysis'
    manager = TaskManager.TaskManager(manager_params, browser_params)

    for idx, link in enumerate(article_links):
        print(idx)
        print(link)
        command_sequence = CommandSequence.CommandSequence(link[1])
        command_sequence.get(sleep=0, timeout=180)
        command_sequence.dump_page_source("nyt_articles_" + str(idx), 120)
        manager.execute_command_sequence(command_sequence, index="**")

    manager.close()
Code Example #6
File: msc_UseOpenWPM.py Project: waldu/AdPExT
from automation import TaskManager


def extract_via_openwpm(sites):
    """
    Use the OpenWPM package to crawl the provided sites with the default
    browser parameters
    """

    print('########## OpenWPM (start) (Englehardt, 2016) ##########')
    # The number of browsers to use to extract the data
    num_of_browsers = 1

    # Loads the manager preferences and num_of_browsers copies of the default browser dictionaries
    manager_params, browser_params = TaskManager.load_default_params(
        num_of_browsers)

    # Update browser configuration (use this for per-browser settings)
    for i in range(num_of_browsers):
        browser_params[i]['disable_flash'] = False  # Enable flash for all browsers
    # browser_params[0]['headless'] = True  # Launch only browser 0 headless

    # Update TaskManager configuration (use this for crawl-wide settings)
    manager_params['data_directory'] = '~/Desktop/'
    manager_params['log_directory'] = '~/Desktop/'

    # Instantiates the measurement platform
    # Commands time out by default after 60 seconds
    manager = TaskManager.TaskManager(manager_params, browser_params)

    # Visit the sites
    for site in sites:
        manager.get(site, index='**')  # ** = synchronized browsers

    # Shuts down the browsers and waits for the data to finish logging
    manager.close()
    print('########## OpenWPM (end) (Englehardt, 2016) ##########')
Code Example #7
import time

from automation import CommandSequence, TaskManager


def dump_crawl(sites, profile_name):
    # The list of sites that we wish to crawl
    print(sites, profile_name)
    NUM_BROWSERS = 1

    # Loads the manager preferences and NUM_BROWSERS copies of the default
    # browser dictionaries
    manager_params, browser_params = TaskManager.load_default_params(NUM_BROWSERS)

    # Update browser configuration (use this for per-browser settings)
    for i in range(NUM_BROWSERS):
        browser_params[i]['disable_flash'] = True  # Disable Flash for all browsers
        browser_params[i]['headless'] = True  # Launch every browser headless
        browser_params[i]['js_instrument'] = True
        browser_params[i]['cookie_instrument'] = True
        # Further instrumentation that can be toggled on as needed:
        # browser_params[i]['http_instrument'] = True
        # browser_params[i]['save_javascript'] = True
        # browser_params[i]['save_all_content'] = True
        # load_name is never assigned in this function, so the branch below
        # is always skipped; it looks like a leftover hook for resuming from
        # a saved profile tarball
        if 'load_name' in locals():
            browser_params[i]['profile_tar'] = load_name
        browser_params[i]['profile_archive_dir'] = "/home/ubuntu/personas/" + profile_name

    # Update TaskManager configuration (use this for crawl-wide settings)
    manager_params['data_directory'] = '~/OpenWPM/'
    manager_params['log_directory'] = '~/OpenWPM/'
    manager_params['database_name'] = "persona.sqlite"

    # Instantiates the measurement platform
    # Commands time out by default after 60 seconds
    manager = TaskManager.TaskManager(manager_params, browser_params)

    # Visits the sites, rotating over the available browsers
    for i, site in enumerate(sites):
        print(site)
        command_sequence = CommandSequence.CommandSequence(site)
        # Start by visiting the page
        command_sequence.get(sleep=0, timeout=300)
        # Note: dump_profile_cookies/dump_flash_cookies close the current tab
        # command_sequence.dump_profile_cookies(120)
        manager.execute_command_sequence(command_sequence, i % NUM_BROWSERS)
        time.sleep(2)

    # Shuts down the browsers and waits for the data to finish logging
    manager.close()
Code Example #8
from automation import CommandSequence, TaskManager


def callWPM(NUM_BROWSERS, siteslist):
    print("Thread-----------thread-------------thread-----")
    manager_params, browser_params = TaskManager.load_default_params(NUM_BROWSERS)
    # Note: only browser 0 is configured below; with NUM_BROWSERS > 1 the
    # remaining browsers keep the default settings
    browser_params[0]['http_instrument'] = True
    browser_params[0]['disable_flash'] = False
    browser_params[0]['headless'] = True
    manager_params['data_directory'] = '../database/requestdata2/'
    manager_params['log_directory'] = '../database/requestdata2/'
    manager = TaskManager.TaskManager(manager_params, browser_params)
    for site in siteslist:
        command_sequence = CommandSequence.CommandSequence(site, reset=True)
        command_sequence.get(sleep=0, timeout=10)
        manager.execute_command_sequence(command_sequence, index='**')
    manager.close()
Code Example #9
from sys import platform

from automation import CommandSequence, TaskManager


def run_demo(url):
    NUM_BROWSERS = 1
    sites = [str(url)]

    # Loads the manager preferences and NUM_BROWSERS copies of the default browser dictionaries
    manager_params, browser_params = TaskManager.load_default_params(
        NUM_BROWSERS)

    # Update browser configuration (use this for per-browser settings)
    for i in range(NUM_BROWSERS):
        # Record HTTP Requests and Responses
        browser_params[i]['http_instrument'] = True
        # Enable flash for all browsers
        browser_params[i]['disable_flash'] = False
        browser_params[i]['js_instrument'] = True
    if platform != 'darwin':
        browser_params[0]['headless'] = True  # Launch only browser 0 headless

    # Update TaskManager configuration (use this for crawl-wide settings)
    manager_params['data_directory'] = 'feature_extraction/'
    manager_params['log_directory'] = '~/Desktop/'

    # Instantiates the measurement platform
    # Commands time out by default after 60 seconds
    manager = TaskManager.TaskManager(manager_params, browser_params)

    # Visits the sites with all browsers simultaneously
    for site in sites:
        command_sequence = CommandSequence.CommandSequence(site)

        # Start by visiting the page
        command_sequence.get(sleep=0, timeout=60)

        # dump_profile_cookies/dump_flash_cookies closes the current tab.
        command_sequence.dump_profile_cookies(120)

        # index='**' synchronizes visits between the three browsers
        manager.execute_command_sequence(command_sequence, index='**')

    # Shuts down the browsers and waits for the data to finish logging
    manager.close()
Code Example #10
from automation import CommandSequence, TaskManager
import pandas as pd

# Declare constants
NUM_BROWSERS = 3

# Load a pd dataframe using the given csv file
top100_df = pd.read_csv('top-1m.csv', header=None, nrows=100, usecols=[1])

# Load the default manager params and NUM_BROWSERS copies of the default browser params
manager_params, browser_params = TaskManager.load_default_params(NUM_BROWSERS)

# Update browser configuration (use this for per-browser settings)
for i in range(NUM_BROWSERS):
    # Record HTTP Requests and Responses
    browser_params[i]['http_instrument'] = True
    # Record cookie changes
    browser_params[i]['cookie_instrument'] = True
    # Record JS Web API calls
    browser_params[i]['js_instrument'] = True

    # Set true for ad blocking mode. Set false for vanilla mode.
    browser_params[i]['ublock-origin'] = False

    # Do not record the callstack of all WebRequests made
    browser_params[i]['callstack_instrument'] = False
    # Do not record Navigations
    browser_params[i]['navigation_instrument'] = False

    # Run the browser with a visible window ("headful" display mode)
    browser_params[i]['display_mode'] = 'headful'
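
The snippet ends after configuring the browsers, and top100_df is never used. A plausible continuation under the same API, building the site list from the single URL column read above and running a simple crawl (a sketch, not the original file's code):

# Build the site list from column 1 of top-1m.csv (read into top100_df above)
sites = ['http://' + domain for domain in top100_df[1]]

manager = TaskManager.TaskManager(manager_params, browser_params)
for site in sites:
    command_sequence = CommandSequence.CommandSequence(site, reset=True)
    command_sequence.get(sleep=0, timeout=60)
    manager.execute_command_sequence(command_sequence)
manager.close()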
Code Example #11
from automation import TaskManager, CommandSequence

# Load the list of sites we wish to crawl into a list
# E.g. ['http://www.example.com', 'http://dataskydd.net']
with open('municipalities_final_urls.txt') as f:
    sites = [line.rstrip('\n') for line in f]

manager_params, browser_params = TaskManager.load_default_params(1)

browser_params[0]['headless'] = True  # Launch the browser headless
browser_params[0]['http_instrument'] = True  # Record HTTP requests and responses
browser_params[0]['cookie_instrument'] = True  # Record both JS cookies and HTTP response cookies to javascript_cookies

manager_params['data_directory'] = './data/'
manager_params['log_directory'] = './data/'

manager = TaskManager.TaskManager(manager_params, browser_params)

for site in sites:
    command_sequence = CommandSequence.CommandSequence(site, reset=True)
    command_sequence.browse(num_links=5, sleep=10, timeout=360)
    command_sequence.dump_profile_cookies(120)
    manager.execute_command_sequence(command_sequence, index='**')

manager.close()
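
After the crawl finishes, the cookie records land in the SQLite database inside data_directory. A minimal read-back sketch, assuming the default database name crawl-data.sqlite and the javascript_cookies table named in the comment above (the column names are assumptions based on OpenWPM's schema of that era):

import sqlite3

conn = sqlite3.connect('./data/crawl-data.sqlite')
cur = conn.cursor()
# host, name and value are assumed column names in javascript_cookies
cur.execute("SELECT host, name, value FROM javascript_cookies LIMIT 10")
for host, name, value in cur.fetchall():
    print(host, name, value)
conn.close()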
Code Example #12
File: demo.py Project: AntBean/OpenWPM
from automation import TaskManager, CommandSequence

# The list of sites that we wish to crawl
NUM_BROWSERS = 3
sites = ['http://www.example.com',
         'http://www.princeton.edu',
         'http://citp.princeton.edu/']

# Loads the manager preferences and 3 copies of the default browser dictionaries
manager_params, browser_params = TaskManager.load_default_params(NUM_BROWSERS)

# Update browser configuration (use this for per-browser settings)
for i in range(NUM_BROWSERS):
    browser_params[i]['disable_flash'] = False  # Enable flash for all three browsers
browser_params[0]['headless'] = True  # Launch only browser 0 headless

# Update TaskManager configuration (use this for crawl-wide settings)
manager_params['data_directory'] = '~/Desktop/'
manager_params['log_directory'] = '~/Desktop/'

# Instantiates the measurement platform
# Commands time out by default after 60 seconds
manager = TaskManager.TaskManager(manager_params, browser_params)

# Visits the sites with all browsers simultaneously
for site in sites:
    command_sequence = CommandSequence.CommandSequence(site)
    command_sequence.get(sleep=0, timeout=60)
    command_sequence.dump_profile_cookies(120)
    manager.execute_command_sequence(command_sequence, index='**')  # ** = synchronized browsers

# Shuts down the browsers and waits for the data to finish logging
manager.close()
Code Example #13
#!/usr/bin/python

from automation import CommandSequence, TaskManager


def run_custom_function(**kwargs):
    driver = kwargs['driver']
    url_title = driver.title
    print("Title: %s" % url_title)
    return


if __name__ == "__main__":
    url_list = ["https://google.com"]

    manager_params, browser_params = TaskManager.load_default_params(1)
    manager = TaskManager.TaskManager(manager_params, browser_params)

    for URL in url_list:
        cs = CommandSequence.CommandSequence(URL)
        cs.get(sleep=10, timeout=60)
        cs.run_custom_function(run_custom_function)
        manager.execute_command_sequence(cs)

    manager.close()
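
Because run_custom_function hands the live Selenium driver to the callback, any WebDriver call is available inside it. A small hypothetical variant in the same pattern, counting the anchor tags on the page (uses the pre-Selenium-4 find_elements_by_tag_name API to match the era of this code):

def count_links(**kwargs):
    # kwargs['driver'] is the Selenium WebDriver of the visiting browser
    driver = kwargs['driver']
    links = driver.find_elements_by_tag_name('a')
    print("Found %d links on %s" % (len(links), driver.current_url))

Scheduling it works exactly like the original callback: cs.run_custom_function(count_links).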
Code Example #14
File: demo.py Project: Ryan-Sheehan/NothingToHide
from automation import CommandSequence, TaskManager


def analyze_sites(sites):
    # The list of sites that we wish to crawl
    NUM_BROWSERS = 2
    #sites = [
    #    "https://www.cnn.com",
    #    "https://www.tufts.edu"
    #]

    # Loads the default manager params
    # and NUM_BROWSERS copies of the default browser params
    manager_params, browser_params = TaskManager.load_default_params(
        NUM_BROWSERS)

    # Update browser configuration (use this for per-browser settings)
    for i in range(NUM_BROWSERS):
        # Record HTTP Requests and Responses
        browser_params[i]["http_instrument"] = True
        # Record cookie changes
        browser_params[i]["cookie_instrument"] = True
        # Record Navigations
        browser_params[i]["navigation_instrument"] = True
        # Record JS Web API calls
        browser_params[i]["js_instrument"] = True
        # Record the callstack of all WebRequests made
        browser_params[i]["callstack_instrument"] = True
        # Record DNS resolution
        browser_params[i]["dns_instrument"] = True

    # Launch only browser 0 headless
    browser_params[0]["display_mode"] = "headless"

    # Update TaskManager configuration (use this for crawl-wide settings)
    manager_params["data_directory"] = "~/Desktop/testing/"
    manager_params["log_directory"] = "~/Desktop/testing/"

    manager_params['output_format'] = 's3'
    manager_params['s3_bucket'] = 'ihavenothingtohide'
    manager_params['s3_directory'] = '2020-2'

    # Instantiates the measurement platform
    # Commands time out by default after 60 seconds
    manager = TaskManager.TaskManager(manager_params, browser_params)

    # Visits the sites
    for site in sites:

        # Parallelize site visits across the browsers configured above.
        command_sequence = CommandSequence.CommandSequence(
            site,
            reset=True,
            callback=lambda success, val=site: print(
                "CommandSequence {} done".format(val)),
        )

        # Start by visiting the page
        command_sequence.get(sleep=3, timeout=60)

        # Run commands across all browsers (simple parallelization)
        manager.execute_command_sequence(command_sequence)

    # Shuts down the browsers and waits for the data to finish logging
    manager.close()
Code Example #15
File: try.py Project: lightnarcissus/PlayfulMesh
import copy
import tempfile
import time

from automation import TaskManager

# The list of sites that we wish to crawl
sites = [
    'http://www.example.com', 'https://princeton.edu',
    'https://citp.princeton.edu/'
]

# Creates a temporary directory, where we will save the crawl DB
db_loc = tempfile.mkdtemp() + '/openwpm_demo.sqlite'

preferences = TaskManager.load_default_params()

browser_params = [copy.deepcopy(preferences) for _ in range(3)]

# Instantiates the measurement platform
# Launches three (non-headless) Firefox instances which log data using mitmproxy
# Commands time out by default after 60 seconds
manager = TaskManager.TaskManager(db_loc, browser_params, 3)

# Visits the sites with all three browsers simultaneously, 5 seconds between visits
for site in sites:
    manager.get(site, index='**')  # ** = synchronized browsers
    time.sleep(5)

# Shuts down the browsers and waits for the data to finish logging
manager.close()
Code Example #16
import time

from automation import CommandSequence, TaskManager


def run_openwpm(sites, data_directory, run_id, data_base_name):
    """
    Run the OpenWPM framework on the passed sites, gathering data into
    the data_base_name database.
    """
    print('number of passed typo candidates', len(sites))
    NUM_BROWSERS = 3

    try:
        print(data_directory)
        print(run_id)

        # Instantiates the measurement platform
        # Commands time out by default after 60 seconds
        picked_typo_candidates = set([])
        # Visits the sites with all browsers simultaneously
        for typo_candidate in sites:
            picked_typo_candidates.add("http://" + typo_candidate)
            # Crawl in batches: once roughly 400 candidates accumulate,
            # run a full crawl over the batch and reset the set
            if len(picked_typo_candidates) % 400 == 399:
                time.sleep(10)
                manager_params, browser_params = TaskManager.load_default_params(
                    NUM_BROWSERS)

                # Update browser configuration (use this for per-browser settings)
                for i in range(NUM_BROWSERS):
                    # Record HTTP Requests and Responses
                    browser_params[i]['http_instrument'] = True
                    # Disable Flash for all three browsers
                    browser_params[i]['disable_flash'] = True
                    # Launch only browser 0 headless
                    browser_params[0]['headless'] = True
                manager_params['data_directory'] = data_directory
                manager_params['log_directory'] = data_directory
                manager_params['run_id'] = run_id
                manager_params['database_name'] = data_base_name
                manager = TaskManager.TaskManager(manager_params,
                                                  browser_params)
                for site in picked_typo_candidates:
                    command_sequence = CommandSequence.CommandSequence(site)

                    # Start by visiting the page
                    command_sequence.get(sleep=0, timeout=30)

                    # dump_profile_cookies/dump_flash_cookies closes the current tab.
                    #command_sequence.dump_profile_cookies(120)

                    # index='**' synchronizes visits between the three browsers
                    manager.execute_command_sequence(command_sequence,
                                                     index='**')

                # Shuts down the browsers and waits for the data to finish logging
                manager.close()
                picked_typo_candidates = set([])

        manager_params, browser_params = TaskManager.load_default_params(
            NUM_BROWSERS)

        # Update browser configuration (use this for per-browser settings)
        for i in range(NUM_BROWSERS):
            # Record HTTP Requests and Responses
            browser_params[i]['http_instrument'] = True
            # Disable Flash for all three browsers
            browser_params[i]['disable_flash'] = True
            # Launch only browser 0 headless
            browser_params[0]['headless'] = True
        manager_params['data_directory'] = data_directory
        manager_params['log_directory'] = data_directory
        manager_params['run_id'] = run_id
        manager_params['database_name'] = data_base_name
        manager = TaskManager.TaskManager(manager_params, browser_params)
        for site in picked_typo_candidates:
            command_sequence = CommandSequence.CommandSequence(site)

            # Start by visiting the page
            command_sequence.get(sleep=0, timeout=30)

            # dump_profile_cookies/dump_flash_cookies closes the current tab.
            #command_sequence.dump_profile_cookies(120)

            # index='**' synchronizes visits between the three browsers
            manager.execute_command_sequence(command_sequence, index='**')

        # Shuts down the browsers and waits for the data to finish logging
        manager.close()
        picked_typo_candidates = set([])
    except Exception:
        # The original code silently swallows all errors here; consider
        # logging the exception instead
        pass
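
The batch path and the final flush above duplicate the whole configure/crawl/close block. A sketch of the same flow with that block factored into a helper; this is a suggested refactor under the same API, not the original code:

def crawl_batch(batch, data_directory, run_id, data_base_name, num_browsers=3):
    # One configure -> crawl -> close cycle for a batch of URLs
    manager_params, browser_params = TaskManager.load_default_params(num_browsers)
    for i in range(num_browsers):
        browser_params[i]['http_instrument'] = True
        browser_params[i]['disable_flash'] = True
    browser_params[0]['headless'] = True  # Launch only browser 0 headless
    manager_params['data_directory'] = data_directory
    manager_params['log_directory'] = data_directory
    manager_params['run_id'] = run_id
    manager_params['database_name'] = data_base_name

    manager = TaskManager.TaskManager(manager_params, browser_params)
    for site in batch:
        command_sequence = CommandSequence.CommandSequence(site)
        command_sequence.get(sleep=0, timeout=30)
        manager.execute_command_sequence(command_sequence, index='**')
    manager.close()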
Code Example #17
from automation import TaskManager
import copy
import tempfile
import time

# The list of sites that we wish to crawl
sites = ['http://www.example.com',
         'https://princeton.edu',
         'https://citp.princeton.edu/']

# Creates a temporary directory, where we will save the crawl DB
db_loc = tempfile.mkdtemp() + '/openwpm_demo.sqlite'

preferences = TaskManager.load_default_params()


browser_params = [copy.deepcopy(preferences) for _ in range(3)]

# Instantiates the measurement platform
# Launches three (non-headless) Firefox instances which log data using mitmproxy
# Commands time out by default after 60 seconds
manager = TaskManager.TaskManager(db_loc, browser_params, 3)

# Visits the sites with all three browsers simultaneously, 5 seconds between visits
for site in sites:
    manager.get(site, index='**') # ** = synchronized browsers
    time.sleep(5)

# Shuts down the browsers and waits for the data to finish logging
manager.close()