Example 1

for i in range(NUM_BROWSERS):
    browser_params[i]['profile_tar'] = 'Output/Profiles/Facebook_3'

    # Self-written (custom) browser options
    browser_params[i]['scroll_down'] = False
    browser_params[i]['login'] = False
    browser_params[i]['execute_tshark'] = False

# Update TaskManager configuration (use this for crawl-wide settings)
manager_params['data_directory'] = 'Output/Data'
manager_params['log_directory'] = 'Output/Data'
#manager_params['database_name'] = 'output.sqlite'

default_timeout = 60
default_sleep = 5

manager = TaskManager.TaskManager(manager_params, browser_params)

# Visits the sites with all browsers simultaneously
for site in sites_to_crawl:
    # define crawl actions
    command_sequence_get1 = CommandSequence.CommandSequence(site, reset=False)
    command_sequence_get1.get(step=0,
                              sleep=default_sleep,
                              timeout=default_timeout)
    command_sequence_get1.dump_profile_cookies(timeout=default_timeout)
    command_sequence_get1.dump_flash_cookies(timeout=default_timeout)

    manager.execute_command_sequence(command_sequence_get1,
                                     index='**')  # ** = synchronized browsers

# Shuts down the browsers and waits for the data to finish logging
manager.close()
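
After the crawl, the dumped cookies can be inspected in the output database. A minimal sketch, assuming the legacy OpenWPM schema in which dump_profile_cookies() writes to a profile_cookies table inside the data_directory (file, table, and column names may differ across versions):

import sqlite3

# Hypothetical default database path under manager_params['data_directory'].
conn = sqlite3.connect('Output/Data/crawl-data.sqlite')
for host, name in conn.execute(
        'SELECT DISTINCT host, name FROM profile_cookies LIMIT 10'):
    print(host, name)
conn.close()
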
Example 2

import os
import time

from automation import TaskManager

NUM_BROWSERS = 3
sites = [
    'http://www.example.com', 'http://www.princeton.edu',
    'https://citp.princeton.edu/'
]

# Saves a crawl output DB to the Desktop
db_loc = os.path.expanduser('~/Desktop/openwpm_demo.sqlite')

# Loads 3 copies of the default browser preference dictionaries
browser_params = TaskManager.load_default_params(NUM_BROWSERS)

# Enable Flash for all three browsers
for i in range(NUM_BROWSERS):
    browser_params[i]['disable_flash'] = False

# Launch the first browser headless
browser_params[0]['headless'] = True

# Instantiates the measurement platform
# Launches two (non-headless) Firefox instances and one headless instance
# logging data with MITMProxy
# Commands time out by default after 60 seconds
manager = TaskManager.TaskManager(db_loc, browser_params, NUM_BROWSERS)

# Visits the sites with all three browsers simultaneously, 5 seconds between visits
for site in sites:
    manager.get(site, index='**')  # ** = synchronized browsers
    time.sleep(5)

# Shuts down the browsers and waits for the data to finish logging
manager.close()

Example 3

for i in range(NUM_BROWSERS):
    # Record top-level navigation events
    browser_params[i]['navigation_instrument'] = True
    # Launch the browsers headless
    browser_params[i]['display_mode'] = 'headless'

data_dir = os.path.join(os.getcwd(), "temp")
manager_params['data_directory'] = data_dir
manager_params['log_directory'] = data_dir

# Instantiates the measurement platform
# Commands time out by default after 60 seconds
logger_params = {
    "log_level_console": logging.CRITICAL,
}
manager = TaskManager.TaskManager(
    manager_params,
    browser_params,
    logger_kwargs=logger_params,
)

# Define command sequence
command_sequence = CommandSequence.CommandSequence(
    site, reset=True,
    callback=lambda success, val=site: print(
        "CommandSequence {} done".format(val))
)

command_sequence.get(sleep=3, timeout=60)

def search_and_visit(query, n, **kwargs):
    driver = kwargs['driver']
    input_elem = driver.find_element_by_name("q")
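
The listing cuts off mid-function. A hedged sketch of how such a custom command might be completed, assuming the Selenium 3 element API the fragment already uses; the result-link selector and the argument values are illustrative assumptions:

from selenium.webdriver.common.keys import Keys

def search_and_visit(query, n, **kwargs):
    driver = kwargs['driver']
    # Type the query into the search box and submit it.
    input_elem = driver.find_element_by_name("q")
    input_elem.send_keys(query)
    input_elem.send_keys(Keys.ENTER)
    # Collect up to n result links; the CSS selector is a placeholder that
    # depends on the search engine's markup.
    hrefs = [elem.get_attribute('href') for elem in
             driver.find_elements_by_css_selector('a.result')[:n]]
    for href in hrefs:
        driver.get(href)

# Wired into a sequence the same way Example 4 uses run_custom_function:
# command_sequence.run_custom_function(search_and_visit,
#                                      func_args=('openwpm', 3), timeout=120)
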
Example 4
            raise
        except Exception:
            continue
        if href is None:
            continue
        # Resolve relative URLs and keep only same-site (same PS+1) links.
        href = urlparse.urljoin(driver.current_url, href)
        if (not href.startswith('http') or
                du.get_ps_plus_1(href) != top_ps1):
            continue
        # Strip URL fragments so each page is recorded once.
        intra_links.add(urlparse.urldefrag(href)[0])

    with open(out_file, 'w') as f:
        json.dump([top_url, list(intra_links)], f)


manager = TaskManager.TaskManager(manager_params,
                                  browser_params,
                                  process_watchdog=True)
current_index = 0
for i in range(start_index, end_index):
    current_index = i
    if current_index >= TOTAL_NUM_SITES:
        break
    try:
        url = sites[i]
        cs = CommandSequence.CommandSequence('http://' + url, reset=True)
        cs.get(sleep=10, timeout=120)
        cs.run_custom_function(save_links_to_file,
                               func_args=(url, ),
                               timeout=120)
        manager.execute_command_sequence(cs)
        with open(os.path.expanduser('~/.openwpm/current_site_index'),
                  'w') as f:
            # Persist progress so an interrupted crawl can be resumed.
            f.write(str(current_index))
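
A short sketch of how the persisted index might be read back at startup to produce the start_index used in the loop above; the default of 0 for a fresh crawl is an assumption:

index_file = os.path.expanduser('~/.openwpm/current_site_index')
start_index = 0  # assumed default for a fresh crawl
if os.path.exists(index_file):
    with open(index_file) as f:
        # Resume after the last site whose index was written out.
        start_index = int(f.read().strip()) + 1
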
Example 5
manager_params['log_directory'] = '~/Desktop/%s/' % CRAWL_DIRECTORY
manager_params['output_format'] = 's3'
manager_params['s3_bucket'] = S3_BUCKET
manager_params['s3_directory'] = CRAWL_DIRECTORY

# Allow the use of localstack's mock s3 service
S3_ENDPOINT = os.getenv('S3_ENDPOINT')
if S3_ENDPOINT:
    boto3.DEFAULT_SESSION = LocalS3Session(endpoint_url=S3_ENDPOINT)
    manager_params['s3_bucket'] = local_s3_bucket(boto3.resource('s3'),
                                                  name=S3_BUCKET)
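
# The LocalS3Session and local_s3_bucket helpers used above are not defined in
# this fragment (OpenWPM provides similar helpers in its test utilities). A
# minimal sketch under assumed signatures -- the exact API is an assumption:
import boto3

class LocalS3Session(boto3.session.Session):
    """A boto3 session whose resources default to a mock S3 endpoint."""
    def __init__(self, endpoint_url, region_name='us-east-1'):
        self.endpoint_url = endpoint_url
        super().__init__(aws_access_key_id='mock',
                         aws_secret_access_key='mock',
                         region_name=region_name)

    def resource(self, service_name, **kwargs):
        # Route every resource call to the mock endpoint.
        kwargs.setdefault('endpoint_url', self.endpoint_url)
        return super().resource(service_name, **kwargs)

def local_s3_bucket(s3_resource, name='local-crawl-bucket'):
    """Create the bucket on the mock service and return its name."""
    s3_resource.create_bucket(Bucket=name)
    return name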

# Instantiates the measurement platform
# Commands time out by default after 60 seconds
manager = TaskManager.TaskManager(manager_params,
                                  browser_params,
                                  logger_kwargs=LOGGER_SETTINGS)

# At this point, Sentry should already be initialized
if SENTRY_DSN:
    # Add crawler.py-specific context
    with sentry_sdk.configure_scope() as scope:
        # tags generate breakdown charts and search filters
        scope.set_tag('CRAWL_DIRECTORY', CRAWL_DIRECTORY)
        scope.set_tag('S3_BUCKET', S3_BUCKET)
        scope.set_tag('HTTP_INSTRUMENT', HTTP_INSTRUMENT)
        scope.set_tag('COOKIE_INSTRUMENT', COOKIE_INSTRUMENT)
        scope.set_tag('NAVIGATION_INSTRUMENT', NAVIGATION_INSTRUMENT)
        scope.set_tag('JS_INSTRUMENT', JS_INSTRUMENT)
        scope.set_tag('JS_INSTRUMENT_MODULES', JS_INSTRUMENT_MODULES)
        scope.set_tag('SAVE_CONTENT', SAVE_CONTENT)
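
The scope configuration above assumes sentry_sdk was initialized earlier in the script. With the standard sentry_sdk API that would look roughly like this sketch, with SENTRY_DSN read from the environment as in the S3 block:

import os
import sentry_sdk

SENTRY_DSN = os.getenv('SENTRY_DSN')
if SENTRY_DSN:
    # Must run before sentry_sdk.configure_scope() is used.
    sentry_sdk.init(dsn=SENTRY_DSN)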