Example #1
def main():

    wpm_db = "/home/jason/Desktop/NYT/crawl-data.sqlite"
    conn = sql.connect(wpm_db)
    cur = conn.cursor()
    cur.execute(SQL_Query)
    article_links = cur.fetchall()

    # Loads the manager preferences and the default browser dictionaries
    manager_params, browser_params = TaskManager.load_default_params(1)

    # Update TaskManager configuration (use this for crawl-wide settings)
    manager_params['data_directory'] = '~/Desktop/NYT/analysis'
    manager_params['log_directory'] = '~/Desktop/NYT/analysis'
    manager = TaskManager.TaskManager(manager_params, browser_params)

    for idx, link in enumerate(article_links):
        print(idx)
        print(link)
        command_sequence = CommandSequence.CommandSequence(link[1])
        command_sequence.get(sleep=0, timeout=180)
        command_sequence.dump_page_source("nyt_articles_" + str(idx), 120)
        manager.execute_command_sequence(command_sequence, index="**")

    manager.close()
Example #2
def main():
    with open('/home/jason/Desktop/NYT/sources/html.html', 'r') as myfile:
        soup = BeautifulSoup(myfile.read(), 'lxml')
        links = []
        with open('/home/jason/Desktop/NYT/sources/links.txt', 'w') as outfile:
            for item in soup.find_all('a', attrs={'data-link': True}):
                if "data-link" in item.attrs:
                    if ".html" in item['data-link']:
                        outfile.write(item['data-link'])
                        outfile.write("\n")
                        links.append(item['data-link'])

    # Visit each link and dump its page source
    manager_params, browser_params = TaskManager.load_default_params(1)

    # Update TaskManager configuration (use this for crawl-wide settings)
    manager_params['data_directory'] = '~/Desktop/NYT/analysis'
    manager_params['log_directory'] = '~/Desktop/NYT/analysis'
    manager = TaskManager.TaskManager(manager_params, browser_params)

    for idx, link in enumerate(links):
        command_sequence = CommandSequence.CommandSequence(link)
        command_sequence.get(sleep=0, timeout=180)
        command_sequence.dump_page_source("nyt_ad_" + str(idx), 120)
        manager.execute_command_sequence(command_sequence, index="**")

    manager.close()
Example #3
def main():
    pattern = re.compile(r"https?://www\.theatlantic\.com/[A-Za-z0-9-]*/$")

    wpm_db = "/home/jason/Desktop/crawl-data.sqlite"
    conn = sql.connect(wpm_db)
    cur = conn.cursor()
    cur.execute(SQL_Query)
    native_ad_links = cur.fetchall()

    # Loads the manager preferences and the default browser dictionaries
    manager_params, browser_params = TaskManager.load_default_params(1)

    # Update TaskManager configuration (use this for crawl-wide settings)
    manager_params['data_directory'] = '~/Desktop/analysis'
    manager_params['log_directory'] = '~/Desktop/analysis'
    manager = TaskManager.TaskManager(manager_params, browser_params)

    for idx, link in enumerate(native_ad_links):
        if not pattern.match(link[1]):
            print(idx)
            print(link)
            command_sequence = CommandSequence.CommandSequence(link[1])
            command_sequence.get(sleep=0, timeout=180)
            command_sequence.dump_page_source("ads" + str(idx), 120)
            manager.execute_command_sequence(command_sequence, index="**")

    manager.close()
Example #4
def crawl_data(number_of_browsers=1, exit_crawl_after=5, slice_end=1000000):
    NUM_BROWSERS = number_of_browsers
    SITES = ['http://' + x for x in cu.sample_top_sites(
                                location=os.path.expanduser('~/Desktop/'), 
                                slices=[(10000, 0, 10000), (10000, 10000, slice_end)])]

    manager_params, browser_params = TaskManager.load_default_params(NUM_BROWSERS)

    for i in range(NUM_BROWSERS):
        browser_params[i]['cookie_instrument'] = True
        browser_params[i]['js_instrument'] = True
        browser_params[i]['save_javascript'] = True
        browser_params[i]['http_instrument'] = True
        browser_params[i]['headless'] = True
        browser_params[i]['disable_flash'] = False
        browser_params[i]['save_documents'] = True
        browser_params[i]['caching_disabled'] = True

    manager_params['data_directory'] = '~/Desktop/'
    manager_params['log_directory'] = '~/Desktop/'

    count = 0
    manager = TaskManager.TaskManager(manager_params, browser_params)

    for site in SITES[0:exit_crawl_after]:
        command_sequence = CommandSequence.CommandSequence(site, reset=True)
        command_sequence.get(sleep=10, timeout=60)
        command_sequence.scroll_page()
        command_sequence.recursive_dump_page_source()
        manager.execute_command_sequence(command_sequence)
    
        count += 1
        if count % 1000 == 0:
            print("Total crawled:", count)
    manager.close()
Example #5
    def run_search_google_training_by_multiple_commands(self, tmpdir):
        """Visit all the training sites; each visit is issued as a single command."""
        # Get the number of training sites and visit them one by one, by index into their list;
        # this avoids the problem where an error while visiting one site could stop the whole
        # visiting process (as it could with a single CommandSequence);
        # all browsers must visit the same number of sites.
        manager_params, browser_params = self.get_config(str(tmpdir))
        manager = TaskManager.TaskManager(manager_params, browser_params)

        #manager_params, browser_params = TaskManager.load_default_params(self.NUM_BROWSERS)
        with open(browser_params[0]['training_keywords']) as _f:
            _sites = [site for site in _f]
        nu_sites = len(_sites)
        cs = CommandSequence.CommandSequence("http://www.example.com")
        #cs2 = CommandSequence.CommandSequence("none") # url is a place holder
        #cs.get(sleep=3)
        #cs2.login_google()
        #manager.execute_command_sequence(cs2, index="**")

        for i in range(0, nu_sites):
            cs.single_search_google_shopping_by_index(i, -1, training=True)
        manager.execute_command_sequence(cs, index="**")
        #manager.get("http://www.google.com")
        time.sleep(5)
        manager.close()
        print("finish....")
Example #6
def crawl_site(site, manager, user_data):
    command_sequence = CommandSequence.CommandSequence(site, reset=True)
    command_sequence.fill_forms(user_data=user_data,
                                num_links=3,
                                timeout=120,
                                page_timeout=8,
                                debug=True)
    manager.execute_command_sequence(command_sequence)
Example #7
 def _stateless_crawl(self, sites):
     '''Performs a crawl with sites providing login'''
     manager = TaskManager.TaskManager(self.managerpar, [self.browserpar])
     for site in sites:
         params = self._fetch_params(site)
         commandseq = CommandSequence.CommandSequence(site, reset=True)
         commandseq.get(sleep=self.DEF_SLEEP, timeout=self.DEF_TIMEOUT)
         commandseq.login(logindata=params, timeout=self.DEF_TIMEOUT)
         manager.execute_command_sequence(commandseq, index=None)
     manager.close()
Example #8
def dump_crawl(sites,profile_name):
    #os.system('sudo sh -c "sync; echo 1 > /proc/sys/vm/drop_caches"')
    # The list of sites that we wish to crawl
    print(sites, profile_name)
    NUM_BROWSERS = 1 #3
    # Loads the manager preferences and NUM_BROWSERS copies of the default browser dictionaries
    manager_params, browser_params = TaskManager.load_default_params(NUM_BROWSERS)

    # Update browser configuration (use this for per-browser settings)
    for i in range(NUM_BROWSERS):
        # Record HTTP Requests and Responses
        #browser_params[i]['http_instrument'] = True
        # Disable Flash for all browsers
        browser_params[i]['disable_flash'] = True
        browser_params[i]['headless'] = True  # Launch all browsers headless, not only browser 0
        browser_params[i]['js_instrument'] = True
#        browser_params[i]['save_javascript'] = True
        #browser_params[i]['random_attributes']=True
        browser_params[i]['cookie_instrument']=True
     #   browser_params[i]['cp_instrument']=True
#        browser_params[i]['save_all_content']=True
        if 'load_name' in locals():
            browser_params[i]['profile_tar']=load_name
        browser_params[i]['profile_archive_dir']="/home/ubuntu/personas/"+profile_name
        
    # Update TaskManager configuration (use this for crawl-wide settings)
    manager_params['data_directory'] = '~/OpenWPM/'
    manager_params['log_directory'] = '~/OpenWPM/'
    manager_params['database_name']= "persona.sqlite"

   

    # Instantiates the measurement platform
    # Commands time out by default after 60 seconds
    manager = TaskManager.TaskManager(manager_params, browser_params)

    # Visits the sites, assigning each to one browser (round-robin over NUM_BROWSERS)
    for i in range(0, len(sites)):
        print(sites[i])
        site = sites[i]
        command_sequence = CommandSequence.CommandSequence(site)
        # Start by visiting the page
        command_sequence.get(sleep=0, timeout=300)
        #command_sequence.dump_profile_cookies(120)
        #command_sequence.dump_profile(dump_folder="~/personas/", close_webdriver=True)
        # Execute on browser i % NUM_BROWSERS
        manager.execute_command_sequence(command_sequence, index=(i % NUM_BROWSERS))
        time.sleep(2)
    # dump_profile_cookies/dump_flash_cookies closes the current tab.
    # dump stores history; last cookies/sites only stored
    #    os.system('sudo sh -c "sync; echo 1 > /proc/sys/vm/drop_caches"')
    #command_sequence.dump_profile_cookies(120)
    #command_sequence.dump_profile(dump_folder="~/personas/"+profile_name, close_webdriver=True)
    # Shuts down the browsers and waits for the data to finish logging
    manager.close()
Example #9
 def _statefull_crawl(self, loginsite, sites):
     '''Performs crawl by logging into one site and regularly crawling others'''
     manager = TaskManager.TaskManager(self.managerpar, [self.browserpar])
     # login to given page
     params = self._fetch_params(loginsite)
     commandseq = CommandSequence.CommandSequence(loginsite)
     commandseq.get(sleep=self.DEF_SLEEP, timeout=self.DEF_TIMEOUT)
     commandseq.login(logindata=params, timeout=self.DEF_TIMEOUT)
     manager.execute_command_sequence(commandseq, index=None)
     # proceed to crawl pages
     for site in sites:
         # we run a stateless crawl (fresh profile for each page)
         command_sequence = CommandSequence.CommandSequence(site)
         # Start by visiting the page
         command_sequence.get(sleep=self.DEF_SLEEP,
                              timeout=self.DEF_TIMEOUT)
         # dump_profile_cookies/dump_flash_cookies closes the current tab.
         command_sequence.dump_profile_cookies(self.DEF_COOKIE_TIME)
         command_sequence.dump_flash_cookies(self.DEF_COOKIE_TIME)
         manager.execute_command_sequence(command_sequence, index=None)
     manager.close()
Example #10
def callWPM(NUM_BROWSERS, siteslist):
    print("Thread-----------thread-------------thread-----")
    manager_params, browser_params = TaskManager.load_default_params(NUM_BROWSERS)
    browser_params[0]['http_instrument'] = True
    browser_params[0]['disable_flash'] = False
    browser_params[0]['headless'] = True
    manager_params['data_directory'] = '../database/requestdata2/'
    manager_params['log_directory'] = '../database/requestdata2/'
    manager = TaskManager.TaskManager(manager_params, browser_params)
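    # Visit each site with a fresh profile (reset=True); index='**' runs the visit on every browser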
    for site in siteslist:
        command_sequence = CommandSequence.CommandSequence(site, reset=True)
        command_sequence.get(sleep=0, timeout=10)
        manager.execute_command_sequence(command_sequence, index='**')
    manager.close()
Example #11
 def run_once(self):
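     # Build one CommandSequence per stage; 'experimental' stages run only on the experimental browser group, all others on every browser ('**')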
     for stage in self.stages:
         command_sequence = CommandSequence.CommandSequence(stage.site)
         if isinstance(stage.actions, list):
             for action in stage.actions:
                 action(command_sequence)
         else:
             stage.actions(command_sequence)
         if stage.group == 'experimental':
             self.manager.execute_command_sequence(command_sequence,
                                                   index='experimental')
         else:
             self.manager.execute_command_sequence(command_sequence,
                                                   index='**')
     self.manager.close()
Example #12
    def browser_training_site(self, tmpdir):
        manager_params, browser_params = self.get_config(str(tmpdir))
        manager = TaskManager.TaskManager(manager_params, browser_params)

        #manager_params, browser_params = TaskManager.load_default_params(self.NUM_BROWSERS)
        with open(browser_params[0]['training_sites']) as _f:
            _sites = [site for site in _f]
        nu_sites = len(_sites)
        cs = CommandSequence.CommandSequence("http://www.example.com")
        #cs.get()
        for i in range(len(_sites)):
            cs.browse_site_by_index(i, 3)
        manager.execute_command_sequence(cs, index="**")
        #manager.get("http://www.google.com")
        time.sleep(10)
        manager.close()
Example #13
 def crawl(self, sites):
     '''Runs crawl resulting in dataset for unsupervised tracking detection
     Sites are expected as list including protocol, e.g. http://www.hdm-stuttgart.de'''
     self._set_dbname(sites, self.db_prefix, self.bpath, self.CRAWL_TYPE)
     self.browserpar['disable_flash'] = True
     for _ in range(0, self.NUM_USERS):
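          # One TaskManager (and browser) per simulated user; that user's repeated visits share browser state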
         manager = TaskManager.TaskManager(self.managerpar,
                                           [self.browserpar])
         for site in sites:
             for _ in range(0, self.NUM_VISITS):
                 command_sequence = CommandSequence.CommandSequence(site)
                 command_sequence.get(sleep=self.DEF_SLEEP,
                                      timeout=self.DEF_TIMEOUT)
                 manager.execute_command_sequence(command_sequence,
                                                  index=None)
         manager.close()
Example #14
 def crawl(self, sites):
     '''Runs a crawl to measure various metrics regarding third-party tracking.
        Sites are expected as list including protocol, e.g. http://www.hdm-stuttgart.de'''
     self._set_dbname(sites, self.db_prefix, self.bpath, self.CRAWL_TYPE)
     manager = TaskManager.TaskManager(self.managerpar, [self.browserpar])
     for site in sites:
         # we run a stateless crawl (fresh profile for each page)
         command_sequence = CommandSequence.CommandSequence(site,
                                                            reset=True)
         # Start by visiting the page
         command_sequence.get(sleep=self.DEF_SLEEP,
                              timeout=self.DEF_TIMEOUT)
         # dump_profile_cookies/dump_flash_cookies closes the current tab.
         command_sequence.dump_profile_cookies(self.DEF_COOKIE_TIME)
         command_sequence.dump_flash_cookies(self.DEF_COOKIE_TIME)
         manager.execute_command_sequence(command_sequence, index=None)
     manager.close()
Example #15
    def test(self, tmpdir):
        # Run the test crawl
        manager_params, browser_params = self.get_config(str(tmpdir))
        manager = TaskManager.TaskManager(manager_params, browser_params)

        # Set up two sequential get commands to two URLS
        cs = CommandSequence.CommandSequence("http://www.google.com")
        cs.get(sleep=3)
        #cs.get()
        #cs.login_google()
        #cs.search_google_shopping()
        #cs.single_search_google_shopping("food",training=False)
        #time.sleep(2)
        #cs.single_search_google_shopping("baby powder", number of link, trainig...)
        cs.multiple_search_google_shopping(-1, training=False, sleep_time=2)
        manager.execute_command_sequence(cs, index="**")
        #manager.get("http://www.google.com")
        time.sleep(15)
        manager.close(post_process=False)
        print("finish....")
Example #16
def run_demo(url):
    NUM_BROWSERS = 1
    sites = [str(url)]

    # Loads the manager preferences and NUM_BROWSERS copies of the default browser dictionaries
    manager_params, browser_params = TaskManager.load_default_params(
        NUM_BROWSERS)

    # Update browser configuration (use this for per-browser settings)
    for i in range(NUM_BROWSERS):
        # Record HTTP Requests and Responses
        browser_params[i]['http_instrument'] = True
        # Enable Flash for all browsers
        browser_params[i]['disable_flash'] = False
        browser_params[i]['js_instrument'] = True
    if platform != 'darwin':
        browser_params[0]['headless'] = True  # Launch only browser 0 headless

    # Update TaskManager configuration (use this for crawl-wide settings)
    manager_params['data_directory'] = 'feature_extraction/'
    manager_params['log_directory'] = '~/Desktop/'

    # Instantiates the measurement platform
    # Commands time out by default after 60 seconds
    manager = TaskManager.TaskManager(manager_params, browser_params)

    # Visits the sites with all browsers simultaneously
    for site in sites:
        command_sequence = CommandSequence.CommandSequence(site)

        # Start by visiting the page
        command_sequence.get(sleep=0, timeout=60)

        # dump_profile_cookies/dump_flash_cookies closes the current tab.
        command_sequence.dump_profile_cookies(120)

        # index='**' synchronizes visits across all browsers
        manager.execute_command_sequence(command_sequence, index='**')

    # Shuts down the browsers and waits for the data to finish logging
    manager.close()
Example #17
db_path = '/Users/sanjanaaithal/Desktop/Vanilla'

# Update TaskManager configuration (use this for crawl-wide settings)
manager_params['data_directory'] = db_path
manager_params['log_directory'] = db_path

# Instantiates the measurement platform
# Commands time out by default after 60 seconds
manager = TaskManager.TaskManager(manager_params, browser_params)

# Visit the sites
for numpy_object in top100_df.to_numpy():
    # Turn numpy object into a site string
    site = 'http://' + str(numpy_object)[2:-2]

    # Parallelize sites over all number of browsers set above.
    command_sequence = CommandSequence.CommandSequence(
        site,
        reset=True,
        callback=lambda success, val=site: print("CommandSequence {} done".
                                                 format(val)))
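    # val=site binds the current URL when the lambda is defined, so the completion callback logs the right site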

    # Start by visiting the page
    command_sequence.get(sleep=3, timeout=60)

    # Run commands across the three browsers (simple parallelization)
    manager.execute_command_sequence(command_sequence)

# Shuts down the browsers and waits for the data to finish logging
manager.close()
Example #18
    browser_params[i]['disable_flash'] = True  # Disable flash for all browsers
    browser_params[i]['js_instrument'] = True  # Enable JS instrumentation
    browser_params[i]['save_javascript'] = True  # save JS files
    browser_params[i]['headless'] = True  # headless
    browser_params[i]['trigger_sensor_events'] = True  # fake sensor events
    browser_params[i]['mobile_platform'] = "android"  # or "iphone"

# Update TaskManager configuration (use this for crawl-wide settings)
manager_params['data_directory'] = '~/openwpm_mobile_100k/'
manager_params['log_directory'] = '~/openwpm_mobile_100k/'

# Instantiates the measurement platform
# Commands time out by default after 60 seconds
manager = TaskManager.TaskManager(manager_params, browser_params)

# Visits the sites with all browsers simultaneously
for rank, site in enumerate(sites, 1):
    url = "http://%s" % site
    command_sequence = CommandSequence.CommandSequence(url, reset=True)

    # Start by visiting the page
    command_sequence.get(sleep=10, timeout=60)
    # command_sequence.save_screenshot('%d_%s_screenshot' % (rank, site))
    # dump_profile_cookies/dump_flash_cookies closes the current tab.
    command_sequence.dump_profile_cookies(120)

    manager.execute_command_sequence(command_sequence)

# Shuts down the browsers and waits for the data to finish logging
manager.close()
Example #19
    # Send a sentry error message (temporarily - to easily be able
    # to compare error frequencies to crawl worker instance count)
    sentry_sdk.capture_message("Crawl worker started")

# Connect to job queue
job_queue = rediswq.RedisWQ(name=REDIS_QUEUE_NAME, host=REDIS_HOST)
manager.logger.info("Worker with sessionID: %s" % job_queue.sessionID())
manager.logger.info("Initial queue state: empty=%s" % job_queue.empty())

# Crawl sites specified in job queue until empty
while not job_queue.empty():
    job = job_queue.lease(lease_secs=120, block=True, timeout=5)
    if job is None:
        manager.logger.info("Waiting for work")
        time.sleep(5)
    else:
        site_rank, site = job.decode("utf-8").split(',')
        if "://" not in site:
            site = "http://" + site
        manager.logger.info("Visiting %s..." % site)
        command_sequence = CommandSequence.CommandSequence(site, reset=True)
        command_sequence.get(sleep=DWELL_TIME, timeout=TIMEOUT)
        manager.execute_command_sequence(command_sequence)
        job_queue.complete(job)

manager.logger.info("Job queue finished, exiting.")
manager.close()

if SENTRY_DSN:
    sentry_sdk.capture_message("Crawl worker finished")
Example #20
manager.logger.info("Initial queue state: empty=%s" % job_queue.empty())

# Crawl sites specified in job queue until empty
while not job_queue.empty():
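    # Handle any expired leases before requesting the next job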
    job_queue.check_expired_leases()
    job = job_queue.lease(lease_secs=TIMEOUT + DWELL_TIME + 30,
                          block=True,
                          timeout=5)

    if job is None:
        manager.logger.info("Waiting for work")
        time.sleep(5)
        continue

    retry_number = job_queue.get_retry_number(job)
    site_rank, site = job.decode("utf-8").split(',')
    if "://" not in site:
        site = "http://" + site
    manager.logger.info("Visiting %s..." % site)
    command_sequence = CommandSequence.CommandSequence(
        site, blocking=True, reset=True, retry_number=retry_number)
    command_sequence.get(sleep=DWELL_TIME, timeout=TIMEOUT)
    manager.execute_command_sequence(command_sequence)
    job_queue.complete(job)

manager.logger.info("Job queue finished, exiting.")
manager.close()

if SENTRY_DSN:
    sentry_sdk.capture_message("Crawl worker finished")
Example #21
#!/usr/bin/python

from automation import CommandSequence, TaskManager


def run_custom_function(**kwargs):
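    # The Selenium WebDriver for the visiting browser is passed in via kwargs['driver']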
    driver = kwargs['driver']
    url_title = driver.title
    print("Title: %s" % url_title)
    return


if __name__ == "__main__":
    url_list = ["https://google.com"]

    manager_params, browser_params = TaskManager.load_default_params(1)
    manager = TaskManager.TaskManager(manager_params, browser_params)

    for URL in url_list:
        cs = CommandSequence.CommandSequence(URL)
        cs.get(sleep=10, timeout=60)
        cs.run_custom_function(run_custom_function)
        manager.execute_command_sequence(cs)

    manager.close()
Example #22
from automation import TaskManager, CommandSequence

# Variables for what site
site = 'http://www.tbrandstudio.com/our-work/'

# Loads the manager preferences and the default browser dictionaries
manager_params, browser_params = TaskManager.load_default_params(1)

# Update TaskManager configuration (use this for crawl-wide settings)
manager_params['data_directory'] = '~/Desktop/NYT'
manager_params['log_directory'] = '~/Desktop/NYT'

# Instantiates the measurement platform
# Commands time out by default after 60 seconds
manager = TaskManager.TaskManager(manager_params, browser_params)

command_sequence = CommandSequence.CommandSequence(site)

# Start by visiting the page
command_sequence.get(sleep=0, timeout=180)
#command_sequence.scroll_bottom(timeout=180)
command_sequence.dump_page_source("nyt_ads", 120)

manager.execute_command_sequence(command_sequence, index=None) 

# Shuts down the browsers and waits for the data to finish logging
manager.close()
Example #23
    sites = category_sites[category]
    if category in set(['Recreation', 'Computers', 'Shopping']):
        continue
    f.write('{0}, {1}'.format(i, category))
    manager_params['data_directory'] = './{0}_data'.format(category)
    manager_params['database_name'] = 'crawl-data.sqlite'
    manager_params['log_file'] = 'openwpm.log'
    browser_params[0]['profile_archive_dir'] = './{0}_profile'.format(category)
    browser_params[0]['profile_tar'] = None
    manager = TaskManager.TaskManager(manager_params, browser_params)
    for site in sites:
        try:
            site = site.lower()
            if not site.startswith('http'):
                site = 'http://' + site
            command_sequence = CommandSequence.CommandSequence(site,
                                                               reset=False)
            # Start by visiting the page
            command_sequence.get(sleep=0, timeout=60)
            command_sequence.dump_page_source(site.split('/')[2], timeout=60)
            # dump_profile_cookies/dump_flash_cookies closes the current tab.
            command_sequence.dump_profile_cookies(120)
            command_sequence.dump_profile(category)
            # execute on the ith browser
            manager.execute_command_sequence(command_sequence, index=0)
        except Exception:
            print('failed on site: ' + site)
    manager.close()
    i += 1

manager.close()
Example #24
manager_params, browser_params = TaskManager.load_default_params(NUM_BROWSERS)

prefix = 'test_browse'
manager_params['database_name'] = prefix + '.sqlite'
manager_params['data_directory'] = '~/Desktop/' + prefix
manager_params['log_directory'] = '~/Desktop/' + prefix

# Read the site list
sites = crawl_utils.get_sampled_sites(
    location=manager_params['data_directory'],
    include_rank=True,
    slices=[(100, 0, 100)])

for i in range(NUM_BROWSERS):
    browser_params[i]['js_instrument'] = True
    browser_params[i]['cookie_instrument'] = True
    browser_params[i]['http_instrument'] = True
    browser_params[i]['save_javascript'] = True
    browser_params[i]['record_js_errors'] = True

manager = TaskManager.TaskManager(manager_params, browser_params)
current_index = 0
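# Browse each sampled site: load the page, then follow up to 5 links (cs.browse), using a fresh profile per site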
for i in range(len(sites)):
    cs = CommandSequence.CommandSequence(sites[i][0],
                                         site_rank=sites[i][1],
                                         reset=True)
    cs.browse(num_links=5, sleep=5, timeout=120)
    manager.execute_command_sequence(cs)
manager.close()
Example #25
File: scan.py  Project: bkrumnow/BDScanner
# Update TaskManager configuration (use this for crawl-wide settings)
manager_params['data_directory'] = './Results/'
manager_params['log_directory'] = './Results/'

# Instantiates the measurement platform
manager = TaskManager.TaskManager(manager_params, browser_params)

fileReader = csv.reader(open('detection/alexa/top-1m.csv'), delimiter=',')

urls = []
for (index, url) in fileReader:
    urls.append(url)
del fileReader

for i in range(0, 1, 1):  #len(urls),1):
    url = urls[i]

    print("Command creation %s %s" % (i, url))
    #second parameter will clear the profile (reset)
    command_sequence = CommandSequence.CommandSequence('http://' + url, True)
    # Start by visiting the page
    command_sequence.get(sleep=3, timeout=120)
    #command_sequence.save_screenshot('EndPrint', 1000)
    command_sequence.detect_webbot_detection(timeout=360)
    # index='**' synchronizes visits between the three browsers
    manager.execute_command_sequence(command_sequence, index=None)
    del command_sequence

# Shuts down the browsers and waits for the data to finish logging
manager.close()
Example #26
manager = TaskManager.TaskManager(manager_params,
                                  browser_params,
                                  process_watchdog=True)
current_index = 0
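# Crawl this batch, saving the index of the last completed site so the crawl can resume; a CommandExecutionError requests a reboot and ends the batch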
for i in range(start_index, end_index):
    current_index = i
    if current_index >= TOTAL_NUM_SITES:
        break
    try:
        try:
            first_party, rank, url = sites[i]
        except ValueError:
            continue
        cs = CommandSequence.CommandSequence(url,
                                             site_rank=rank,
                                             first_party=first_party,
                                             reset=True)
        cs.get(sleep=10, timeout=120)
        manager.execute_command_sequence(cs)
        with open(os.path.expanduser('~/.openwpm/current_site_index'),
                  'w') as f:
            f.write(str(i))
    except CommandExecutionError:
        with open(os.path.expanduser('~/.openwpm/reboot'), 'w') as f:
            f.write(str(1))
        break

print("CLOSING TaskManager after batch")
manager.close()

crawl_utils.clear_tmp_folder()
Example #27
def run_openwpm(sites, data_directory, run_id, data_base_name):
    """
    Run the OpenWPM framework for the passed sites and parameters, gathering data into the data_base_name database.
    
    """
    print('number of passed typo candidates', len(sites))
    NUM_BROWSERS = 3

    try:
        print(data_directory)
        print(run_id)

        # Instantiates the measurement platform
        # Commands time out by default after 60 seconds
        picked_typo_candidates = set([])
        # Visits the sites with all browsers simultaneously
        for typo_candidate in sites:
            picked_typo_candidates.add("http://" + typo_candidate)
            if len(picked_typo_candidates) % 400 == 399:
                time.sleep(10)
                manager_params, browser_params = TaskManager.load_default_params(
                    NUM_BROWSERS)

                # Update browser configuration (use this for per-browser settings)
                for i in range(NUM_BROWSERS):
                    # Record HTTP Requests and Responses
                    browser_params[i]['http_instrument'] = True
                    # Disable Flash for all browsers
                    browser_params[i]['disable_flash'] = True
                    browser_params[0][
                        'headless'] = True  # Launch only browser 0 headless
                manager_params['data_directory'] = data_directory
                manager_params['log_directory'] = data_directory
                manager_params['run_id'] = run_id
                manager_params['database_name'] = data_base_name
                manager = TaskManager.TaskManager(manager_params,
                                                  browser_params)
                for site in picked_typo_candidates:
                    command_sequence = CommandSequence.CommandSequence(site)

                    # Start by visiting the page
                    command_sequence.get(sleep=0, timeout=30)

                    # dump_profile_cookies/dump_flash_cookies closes the current tab.
                    #command_sequence.dump_profile_cookies(120)

                    # index='**' synchronizes visits between the three browsers
                    manager.execute_command_sequence(command_sequence,
                                                     index='**')

                # Shuts down the browsers and waits for the data to finish logging
                manager.close()
                picked_typo_candidates = set([])

        manager_params, browser_params = TaskManager.load_default_params(
            NUM_BROWSERS)

        # Update browser configuration (use this for per-browser settings)
        for i in range(NUM_BROWSERS):
            # Record HTTP Requests and Responses
            browser_params[i]['http_instrument'] = True
            # Disable Flash for all browsers
            browser_params[i]['disable_flash'] = True
            browser_params[0][
                'headless'] = True  # Launch only browser 0 headless
        manager_params['data_directory'] = data_directory
        manager_params['log_directory'] = data_directory
        manager_params['run_id'] = run_id
        manager_params['database_name'] = data_base_name
        manager = TaskManager.TaskManager(manager_params, browser_params)
        for site in picked_typo_candidates:
            command_sequence = CommandSequence.CommandSequence(site)

            # Start by visiting the page
            command_sequence.get(sleep=0, timeout=30)

            # dump_profile_cookies/dump_flash_cookies closes the current tab.
            #command_sequence.dump_profile_cookies(120)

            # index='**' synchronizes visits between the three browsers
            manager.execute_command_sequence(command_sequence, index='**')

        # Shuts down the browsers and waits for the data to finish logging
        manager.close()
        picked_typo_candidates = set([])
    except Exception:
        #print ValueError
        pass
Example #28
                                         EXTENDED_LEASE_TIME):
                manager.logger.error("Unsaved job: %s timed out", unsaved_job)

    job = job_queue.lease(
        lease_secs=TIMEOUT + DWELL_TIME + 30, block=True, timeout=5
    )
    if job is None:
        manager.logger.info("Waiting for work")
        time.sleep(5)
        continue

    unsaved_jobs.append(job)
    retry_number = job_queue.get_retry_number(job)
    site_rank, site = job.decode("utf-8").split(',')
    if "://" not in site:
        site = "http://" + site
    manager.logger.info("Visiting %s..." % site)
    callback = get_job_completion_callback(
        manager.logger, unsaved_jobs_lock, job_queue, job)
    command_sequence = CommandSequence.CommandSequence(
        site, blocking=True, reset=True, retry_number=retry_number,
        callback=callback, site_rank=site_rank
    )
    command_sequence.get(sleep=DWELL_TIME, timeout=TIMEOUT)
    manager.execute_command_sequence(command_sequence)
manager.logger.info("Job queue finished, exiting.")
manager.close()

if SENTRY_DSN:
    sentry_sdk.capture_message("Crawl worker finished")
Example #29
    browser_params[i]['execute_tshark'] = False

# Update TaskManager configuration (use this for crawl-wide settings)
manager_params['data_directory'] = '/home/OpenWPM/Output/Data'
manager_params['log_directory'] = '/home/OpenWPM/Output/Data'
#manager_params['database_name'] = 'output.sqlite'

default_timeout = 60
default_sleep = 5

manager = TaskManager.TaskManager(manager_params, browser_params)

# Visits the sites with all browsers simultaneously
for site in sites_to_crawl:
    # define crawl actions
    command_sequence_get1 = CommandSequence.CommandSequence(site, reset=False)
    command_sequence_get1.get(step=0,
                              sleep=default_sleep,
                              timeout=default_timeout)
    command_sequence_get1.dump_profile_cookies(timeout=default_timeout)
    command_sequence_get1.dump_flash_cookies(timeout=default_timeout)

    command_sequence_get2 = CommandSequence.CommandSequence(site + "-sub1",
                                                            reset=False)
    command_sequence_get2.get(step=1,
                              sleep=default_sleep,
                              timeout=default_timeout)
    command_sequence_get2.dump_profile_cookies(timeout=default_timeout)
    command_sequence_get2.dump_flash_cookies(timeout=default_timeout)

    command_sequence_get3 = CommandSequence.CommandSequence(site + "-sub2",
Example #30
        start_index = int(f.read()) + 1
    end_index = start_index + NUM_BATCH
else:
    start_index = 0
    end_index = NUM_BATCH + 1

# Start crawling
manager = TaskManager.TaskManager(manager_params, browser_params)
current_index = 0
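# Crawl this batch, saving the index of the last completed site; a CommandExecutionError writes a stop marker and ends the batch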
for i in range(start_index, end_index):
    current_index = i
    if current_index >= TOTAL_NUM_SITES:
        break
    try:
        command_sequence = CommandSequence.CommandSequence(sites[i][1],
                                                           sites[i][0],
                                                           reset=True)
        command_sequence.get(sleep=10, timeout=60)
        manager.execute_command_sequence(command_sequence)
        with open(os.path.expanduser('~/.openwpm/current_site_index'),
                  'w') as f:
            f.write(str(i))
    except CommandExecutionError:
        with open(os.path.expanduser('~/.openwpm/stop'), 'w') as f:
            f.write(str(1))
        break

# Shut down and clean up after batch
manager.close()
cu.clear_tmp_folder()