def start_capture(self, pcap_path=None, pcap_filter=""):
        """Start capture. Configure sniffer if arguments are given."""
        if cm.running_in_CI:
            wl_log.debug("CI run: will not run dumpcap")
            return False
        if pcap_filter:
            self.set_capture_filter(pcap_filter)

        if pcap_path:
            self.set_pcap_path(pcap_path)
        prefix = ""
        if cm.running_in_CI:
            prefix = "sudo "  # run as sudo in Travis CI since we cannot setcap
        command = '{}dumpcap -P -a duration:{} -a filesize:{} -i eth0 -s 0 -f \'{}\' -w {}'\
            .format(prefix, cm.SOFT_VISIT_TIMEOUT, cm.MAX_DUMP_SIZE,
                    self.pcap_filter, self.pcap_file)
        wl_log.info(command)
        self.p0 = subprocess.Popen(command, stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE, shell=True)
        timeout = DUMPCAP_START_TIMEOUT  # in seconds
        while timeout > 0 and not self.is_dumpcap_running():
            time.sleep(0.1)
            timeout -= 0.1
        if timeout < 0:
            raise cm.DumpcapTimeoutError()
        else:
            wl_log.debug("dumpcap started in %s seconds" %
                         (DUMPCAP_START_TIMEOUT - timeout))

        self.is_recording = True
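
A minimal usage sketch of the capture API above; the module name sniffer, the pcap path, and the BPF filter string are assumptions and not part of the original code.

# Hypothetical driver code: start a capture around a visit and always stop it.
import time
from sniffer import Sniffer  # assumed module/class location

sniffer = Sniffer()
sniffer.start_capture(pcap_path="/tmp/visit_0.pcap",
                      pcap_filter="tcp and not tcp port 22")
try:
    time.sleep(5)  # stand-in for the actual page visit
finally:
    if sniffer.is_recording:
        sniffer.stop_capture()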
Example #2
def extract_links(br):
    """Extract FP related links from the current page."""
    links_to_visit_text = list(
        ut.flatten([
            br.find_elements_by_partial_link_text(linktext)
            for linktext in LINK_LABELS
        ]))
    links_to_visit_url = list(
        ut.flatten([
            br.find_elements_by_xpath('//a[contains(@href,"%s")]' % linkurl)
            for linkurl in LINK_URLS
        ]))
    links_to_visit = [
        link for link in links_to_visit_text + links_to_visit_url if link
    ]

    if len(links_to_visit) < NO_OF_LINKS_TO_CLICK:
        # if we cannot find enough links by href and link text,
        # fall back to all elements with an onclick event handler
        links_to_visit += extract_onclick_elements(br)
    wl_log.info('%s links were found on %s' %
                (len(links_to_visit), br.current_url))

    return links_to_visit
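
LINK_LABELS, LINK_URLS and NO_OF_LINKS_TO_CLICK are defined elsewhere; the values below are purely illustrative placeholders showing the expected shapes.

# Illustrative placeholders only; the real constants live in the crawler's config.
LINK_LABELS = ("Contact", "About", "Privacy")   # partial link texts to match
LINK_URLS = ("contact", "about", "privacy")     # substrings to match in href
NO_OF_LINKS_TO_CLICK = 3                        # below this count, fall back to onclick elements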
Example #3
def crawl_sites(url_tuples, crawler_type, num_crawl_urls=0, max_parallel_procs=MAX_PARALLEL_PROCESSES):
    if crawler_type == 'lazy':                    
        agent_cfg = AGENT_CFG_PHANTOM_MOD_HOME_PAGE
        agent = HeadlessAgent()
    elif crawler_type == 'clicker':
        agent_cfg = AGENT_CFG_PHANTOM_MOD_CLICKER
        agent = HeadlessAgent()
    elif crawler_type == 'chrome_lazy':
        agent_cfg = AGENT_CFG_CHROME_LAZY
        agent = ChromeAgent()
    elif crawler_type == 'chrome_clicker':
        agent_cfg = AGENT_CFG_CHROME_CLICKER
        agent = ChromeAgent()
    elif crawler_type == 'dnt': # TODO scripts should take DNT as a parameter 
        agent_cfg = AGENT_CFG_DNT_PHANTOM_LAZY
        agent = HeadlessAgent()    
        
    agent.setOptions(agent_cfg)
    cr_job = CrawlJob(agent)
    
    job_cfg = {
              'desc': "Crawl for browser fingerprint detection", 
              'max_parallel_procs': max_parallel_procs,
              'urls':  [],
              'url_tuples':  url_tuples,
              'num_crawl_urls': num_crawl_urls
              }
    
    cr_job.setOptions(job_cfg)
    wl_log.info('Will crawl with agent config: %s and job config: %s' %(agent_cfg, job_cfg))
    run_crawl(cr_job)
    return cr_job.crawl_id
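
A hedged usage sketch; the (index, url) tuple layout matches what crawl_worker below unpacks, and the URLs are placeholders.

# Hypothetical call site; URLs are placeholders.
url_tuples = [(1, 'example.com'), (2, 'example.org')]
crawl_id = crawl_sites(url_tuples, crawler_type='lazy',
                       num_crawl_urls=len(url_tuples))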
Example #4
def crawl_worker(agent_cfg, url_tuple):
    """Crawl given url. Will work in parallel. Cannot be class method."""
    MAX_SLEEP_BEFORE_JOB = 10 # prevent starting all parallel processes at the same instant
    sleep(random() * MAX_SLEEP_BEFORE_JOB) # sleep for a while
    
    try:
        idx, url = url_tuple
        idx = str(idx)
        
        stdout_log =  os.path.join(agent_cfg['job_dir'], fu.get_out_filename_from_url(url, str(idx), '.txt'))
       
        if not url[:5] in ('data:', 'http:', 'https', 'file:'):
            url = 'http://' + url
        
        proxy_opt = mitm.init_mitmproxy(stdout_log[:-4], agent_cfg['timeout'], agent_cfg['mitm_proxy_logs']) if agent_cfg['use_mitm_proxy'] else ""
        
        if not 'chrome_clicker' in agent_cfg['type']:
            cmd = get_visit_cmd(agent_cfg, proxy_opt, stdout_log, url)
            wl_log.info('>> %s (%s) %s' % (url, idx, cmd))
            status, output = ut.run_cmd(cmd) # Run the command
            if status and status != ERR_CMD_TIMEDOUT:
                wl_log.critical('Error while visiting %s(%s) w/ command: %s: (%s) %s' % (url, idx, cmd, status, output))
            else:
                wl_log.info(' >> ok %s (%s)' % (url, idx))
            
        else:
            cr.crawl_url(agent_cfg['type'], url, proxy_opt)
            
        sleep(2) # this will make sure mitmdump is timed out before we start to process the network dump
        if agent_cfg['post_visit_func']: # this pluggable function will parse the logs and do whatever we want
            agent_cfg['post_visit_func'](stdout_log, crawl_id=agent_cfg['crawl_id'])
            
    except Exception as exc:
        wl_log.critical('Exception in worker function %s %s' % (url_tuple, exc))
    def launch_tor_service(self, logfile='/dev/null'):
        """Launch Tor service and return the process."""
        self.log_file = logfile
        self.tmp_tor_data_dir = ut.clone_dir_with_timestap(
            cm.get_tor_data_path(self.tbb_version))

        self.torrc_dict.update({
            'DataDirectory': self.tmp_tor_data_dir,
            'Log': ['INFO file %s' % logfile]
        })

        wl_log.debug("Tor config: %s" % self.torrc_dict)
        try:
            self.tor_process = stem.process.launch_tor_with_config(
                config=self.torrc_dict,
                init_msg_handler=self.tor_log_handler,
                tor_cmd=cm.get_tor_bin_path(self.tbb_version),
                timeout=270)
            self.controller = Controller.from_port()
            self.controller.authenticate()
            return self.tor_process

        except stem.SocketError as exc:
            wl_log.critical("Unable to connect to tor on port %s: %s" %
                            (cm.SOCKS_PORT, exc))
            sys.exit(1)
        except:
            # most of the time this is due to another instance of
            # tor running on the system
            wl_log.critical("Error launching Tor", exc_info=True)
            sys.exit(1)

        wl_log.info("Tor running at port {0} & controller port {1}.".format(
            cm.SOCKS_PORT, cm.CONTROLLER_PORT))
        return self.tor_process
    def quit(self):
        """
        Overrides the base class method cleaning the timestamped profile.

        """
        self.is_running = False
        try:
            wl_log.info("Quit: Removing profile dir")
            shutil.rmtree(self.prof_dir_path)
            super(TorBrowserDriver, self).quit()
        except CannotSendRequest:
            wl_log.error("CannotSendRequest while quitting TorBrowserDriver",
                         exc_info=False)
            # following is copied from webdriver.firefox.webdriver.quit() which
            # was interrupted due to an unhandled CannotSendRequest exception.

            # kill the browser
            self.binary.kill()
            # remove the profile folder
            try:
                shutil.rmtree(self.profile.path)
                if self.profile.tempfolder is not None:
                    shutil.rmtree(self.profile.tempfolder)
            except Exception as e:
                print(str(e))
        except Exception:
            wl_log.error("Exception while quitting TorBrowserDriver",
                         exc_info=True)
    def start_capture(self, pcap_path=None, pcap_filter=""):
        """Start capture. Configure sniffer if arguments are given."""
        if pcap_filter:
            self.set_capture_filter(pcap_filter)
        if pcap_path:
            self.set_pcap_path(pcap_path)
        prefix = ""
        # changed eth0 to the local test interface (WLAN); TODO: move the interface name to a config file
        command = '{}dumpcap -P -a duration:{} -a filesize:{} -i {} -s 0 -f \"{}\" -w {}'\
            .format(prefix, cm.SOFT_VISIT_TIMEOUT, cm.MAX_DUMP_SIZE,
                    self.netif, self.pcap_filter, self.pcap_file)
        wl_log.info(command)
        self.p0 = subprocess.Popen(command,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE,
                                   shell=True)
        timeout = DUMPCAP_START_TIMEOUT  # in seconds
        while timeout > 0 and not self.is_dumpcap_running():
            time.sleep(0.1)
            timeout -= 0.1
        if timeout < 0:
            raise DumpcapTimeoutError()
        else:
            wl_log.debug("dumpcap started in %s seconds" %
                         (DUMPCAP_START_TIMEOUT - timeout))

        self.is_recording = True
    def __do_instance(self):
        for self.job.visit in xrange(self.job.visits):
            ut.create_dir(self.job.path)
            wl_log.info("*** Visit #%s to %s ***", self.job.visit, self.job.url)
            with self.driver.launch():
                try:
                    self.driver.set_page_load_timeout(cm.SOFT_VISIT_TIMEOUT)
                except WebDriverException as seto_exc:
                    wl_log.error("Setting soft timeout %s", seto_exc)
                self.__do_visit()

                if self.screenshots:
                    try:
                        self.driver.get_screenshot_as_file(self.job.png_file)
                    except WebDriverException:
                        wl_log.error("Cannot get screenshot.")

                if self.har_export:
                    try:
                        jscript = "return HAR.triggerExport().then(harLog => {return harLog;});"
                        har_string = self.driver.execute_script(jscript)
                        with open(self.job.har_file, 'w') as fd:
                            json.dump(har_string, fd)
                    except WebDriverException:
                        wl_log.error("Cannot export HAR.")

            sleep(float(self.job.config['pause_between_visits']))
            self.post_visit()
    def start_capture(self, pcap_path=None, pcap_filter=""):
        """Start capture. Configure sniffer if arguments are given."""
        if cm.running_in_CI:
            wl_log.debug("CI run: will not run dumpcap")
            return False
        if pcap_filter:
            self.set_capture_filter(pcap_filter)

        if pcap_path:
            self.set_pcap_path(pcap_path)
        prefix = ""
        if cm.running_in_CI:
            prefix = "sudo "  # run as sudo in Travis CI since we cannot setcap
        command = '{}dumpcap -a duration:{} -a filesize:{} -i any -s 0 -f \'{}\' -w {}'\
            .format(prefix, cm.SOFT_VISIT_TIMEOUT, cm.MAX_DUMP_SIZE,
                    self.pcap_filter, self.pcap_file)
        wl_log.info(command)
        self.p0 = subprocess.Popen(command,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE,
                                   shell=True)
        timeout = DUMPCAP_START_TIMEOUT  # in seconds
        while timeout > 0 and not self.is_dumpcap_running():
            time.sleep(0.1)
            timeout -= 0.1
        if timeout < 0:
            raise cm.DumpcapTimeoutError()
        else:
            wl_log.debug("dumpcap started in %s seconds" %
                         (DUMPCAP_START_TIMEOUT - timeout))

        self.is_recording = True
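
is_dumpcap_running is referenced by the polling loops above but not shown; one way to implement the check, assuming psutil is available (a sketch, not the original helper), is to look for a dumpcap child of the shell spawned by Popen.

import psutil  # assumption: psutil is installed

def is_dumpcap_running(self):
    """Return True if a dumpcap child of the spawned shell is alive (sketch)."""
    try:
        shell = psutil.Process(self.p0.pid)
        return any('dumpcap' in child.name()
                   for child in shell.children(recursive=True))
    except psutil.NoSuchProcess:
        return False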
Example #10
def crawl_sites(url_tuples, crawler_type, num_crawl_urls=0, max_parallel_procs=MAX_PARALLEL_PROCESSES):
    if crawler_type == 'lazy':                    
        agent_cfg = AGENT_CFG_PHANTOM_MOD_HOME_PAGE
        agent = HeadlessAgent()
    elif crawler_type == 'clicker':
        agent_cfg = AGENT_CFG_PHANTOM_MOD_CLICKER
        agent = HeadlessAgent()
    elif crawler_type == 'chrome_lazy':
        agent_cfg = AGENT_CFG_CHROME_LAZY
        agent = ChromeAgent()
    elif crawler_type == 'chrome_clicker':
        agent_cfg = AGENT_CFG_CHROME_CLICKER
        agent = ChromeAgent()
    elif crawler_type == 'dnt': # TODO scripts should take DNT as a parameter 
        agent_cfg = AGENT_CFG_DNT_PHANTOM_LAZY
        agent = HeadlessAgent()    
        
    agent.setOptions(agent_cfg)
    cr_job = CrawlJob(agent)
    
    job_cfg = {
              'desc': "Crawl for browser fingerprint detection", 
              'max_parallel_procs': max_parallel_procs,
              'urls':  [],
              'url_tuples':  url_tuples,
              'num_crawl_urls': num_crawl_urls
              }
    
    cr_job.setOptions(job_cfg)
    wl_log.info('Will crawl with agent config: %s and job config: %s' %(agent_cfg, job_cfg))
    run_crawl(cr_job)
    return cr_job.crawl_id
Example #11
def crawl_worker(agent_cfg, url_tuple):
    """Crawl given url. Will work in parallel. Cannot be class method."""
    MAX_SLEEP_BEFORE_JOB = 10 # prevent starting all parallel processes at the same instant
    sleep(random() * MAX_SLEEP_BEFORE_JOB) # sleep for a while
    
    try:
        idx, url = url_tuple
        idx = str(idx)
        
        stdout_log =  os.path.join(agent_cfg['job_dir'], fu.get_out_filename_from_url(url, str(idx), '.txt'))
       
        if not url[:5] in ('data:', 'http:', 'https', 'file:'):
            url = 'http://' + url
        
        proxy_opt = mitm.init_mitmproxy(stdout_log[:-4], agent_cfg['timeout'], agent_cfg['mitm_proxy_logs']) if agent_cfg['use_mitm_proxy'] else ""
        
        if not 'chrome_clicker' in agent_cfg['type']:
            cmd = get_visit_cmd(agent_cfg, proxy_opt, stdout_log, url)
            wl_log.info('>> %s (%s) %s' % (url, idx, cmd))
            status, output = ut.run_cmd(cmd) # Run the command
            if status and status != ERR_CMD_TIMEDOUT:
                wl_log.critical('Error while visiting %s(%s) w/ command: %s: (%s) %s' % (url, idx, cmd, status, output))
            else:
                wl_log.info(' >> ok %s (%s)' % (url, idx))
            
        else:
            cr.crawl_url(agent_cfg['type'], url, proxy_opt)
            
        sleep(2) # this will make sure mitmdump is timed out before we start to process the network dump
        if agent_cfg['post_visit_func']: # this pluggable function will parse the logs and do whatever we want
            agent_cfg['post_visit_func'](stdout_log, crawl_id=agent_cfg['crawl_id'])
            
    except Exception as exc:
        wl_log.critical('Exception in worker function %s %s' % (url_tuple, exc))
Example #12
def close_index_html(index_file):
    # wl_log.info('Will close %s' % index_file)
    # TODO: add a check to avoid closing a file twice
    if not os.path.isfile(index_file):
        fu.write_to_file(index_file, '') # create an empty file
        
    index_src = fu.read_file(index_file) 
    if index_src.startswith('<html'):
        wl_log.info('Index file %s  already closed' % index_file)
        return
    
    scripts_src = """<script type="text/javascript" language="javascript" src="http://homes.esat.kuleuven.be/~gacar/jscss/jquery-1.9.1.min.js"></script>
    
    <style type="text/css" title="currentStyle">
        @import "../../js/css/demo_page.css";
        @import "../../js/css/demo_table.css";
    </style>
    <script type="text/javascript" language="javascript" src="http://homes.esat.kuleuven.be/~gacar/jscss/jquery.dataTables.min.js"></script>
    <script type="text/javascript" charset="utf-8">
        $(document).ready(function() {
            $('#results').dataTable( {
            "aaSorting": [[ 2, "desc" ]]
            } );
        } );
    </script>"""
        
        
    html_str = "<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8' />\
            <meta http-equiv='Content-Type' content='text/html; charset=utf-8' />" + scripts_src + "</head>\n<body><table id ='results'>\
            \n<thead><tr><th>Rank</th><th>Domain</th><th>Fonts</th><th>OffsetWidth</th><th>OffsetHeight</th><th>FP found</th></tr></thead>" +  index_src + '</table></body></html>' 
    
    fu.write_to_file(index_file, html_str)
 def save_checkpoint(self):
     fname = join(cm.CRAWL_DIR, "job.chkpt")
     if isfile(fname):
         remove(fname)
     with open(fname, "w") as f:
         pickle.dump(self.job, f)
     wl_log.info("New checkpoint at %s" % fname)
Example #14
def generate_index_file(path):
    table_str = '<table><th>Rank</th><th>Domain</th><th># fonts requested</th>'
    fonts_dict = {}
    i = 0
    for json_file in fu.gen_find_files("*.json", path):
        i = i + 1
        wl_log.info("%s - %s" % (i, json_file))
        domaInfo = load_domainfo_from_json_file(json_file)
        if domaInfo.num_font_loads > FONT_LOAD_THRESHOLD or domaInfo.fp_detected:
            fonts_dict[domaInfo.log_filename] = domaInfo.num_font_loads

    sorted_font_dict = sorted(fonts_dict.iteritems(),
                              key=operator.itemgetter(1),
                              reverse=True)

    for filename, num_font_loaded in sorted_font_dict:
        #if num_font_loaded > FONT_LOAD_THRESHOLD:
        rank, domain = get_rank_domain_from_filename(filename)
        output_filename = os.path.basename(filename)[:-4] + ".html"
        table_str += '<tr><td>'+  rank + '</td><td><a href="' + output_filename + '">' + domain \
                + '</a></td><td>' + str(num_font_loaded) +  '</td></tr>'

    table_str += '</table>'

    html_str = "<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8' />\
            <meta http-equiv='Content-Type' content='text/html; charset=utf-8' /> </head><body>" + table_str + "</body></html>"
    index_filename = os.path.join(path, "index.html")
    fu.write_to_file(index_filename, html_str.encode('utf-8'))
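
get_rank_domain_from_filename is used above but not shown; based on the 'rank-domain' file naming visible elsewhere in these examples, a plausible sketch is:

import os

def get_rank_domain_from_filename(filename):
    """Split a 'rank-domain.ext' basename into (rank, domain) strings (sketch)."""
    base = os.path.splitext(os.path.basename(filename))[0]
    rank, _, domain = base.partition('-')
    return rank, domain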
Example #15
def generate_index_file(path):
    table_str = '<table><th>Rank</th><th>Domain</th><th># fonts requested</th>'
    fonts_dict = {}
    i = 0
    for json_file in fu.gen_find_files("*.json", path):
        i = i + 1
        wl_log.info("%s - %s" % (i, json_file))
        domaInfo = load_domainfo_from_json_file(json_file)
        if domaInfo.num_font_loads > FONT_LOAD_THRESHOLD or domaInfo.fp_detected:
            fonts_dict[domaInfo.log_filename] = domaInfo.num_font_loads
            
    sorted_font_dict = sorted(fonts_dict.iteritems(), key=operator.itemgetter(1), reverse=True)
    
    for filename, num_font_loaded in sorted_font_dict:
        #if num_font_loaded > FONT_LOAD_THRESHOLD:
        rank,domain = get_rank_domain_from_filename(filename)
        output_filename = os.path.basename(filename)[:-4] + ".html"
        table_str += '<tr><td>'+  rank + '</td><td><a href="' + output_filename + '">' + domain \
                + '</a></td><td>' + str(num_font_loaded) +  '</td></tr>' 
        
    table_str += '</table>'
    
    html_str = "<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8' />\
            <meta http-equiv='Content-Type' content='text/html; charset=utf-8' /> </head><body>" + table_str + "</body></html>"
    index_filename = os.path.join(path, "index.html")
    fu.write_to_file(index_filename, html_str.encode('utf-8'))
Example #16
def close_index_html(index_file):
    # wl_log.info('Will close %s' % index_file)
    # TODO: add a check to avoid closing a file twice
    if not os.path.isfile(index_file):
        fu.write_to_file(index_file, '')  # create an empty file

    index_src = fu.read_file(index_file)
    if index_src.startswith('<html'):
        wl_log.info('Index file %s  already closed' % index_file)
        return

    scripts_src = """<script type="text/javascript" language="javascript" src="http://homes.esat.kuleuven.be/~gacar/jscss/jquery-1.9.1.min.js"></script>
    
    <style type="text/css" title="currentStyle">
        @import "../../js/css/demo_page.css";
        @import "../../js/css/demo_table.css";
    </style>
    <script type="text/javascript" language="javascript" src="http://homes.esat.kuleuven.be/~gacar/jscss/jquery.dataTables.min.js"></script>
    <script type="text/javascript" charset="utf-8">
        $(document).ready(function() {
            $('#results').dataTable( {
            "aaSorting": [[ 2, "desc" ]]
            } );
        } );
    </script>"""

    html_str = "<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8' />\
            <meta http-equiv='Content-Type' content='text/html; charset=utf-8' />" + scripts_src + "</head>\n<body><table id ='results'>\
            \n<thead><tr><th>Rank</th><th>Domain</th><th>Fonts</th><th>OffsetWidth</th><th>OffsetHeight</th><th>FP found</th></tr></thead>" + index_src + '</table></body></html>'

    fu.write_to_file(index_file, html_str)
Example #17
    def cleanup_visit(self):
        """Kill sniffer and Tor browser if they're running."""
        wl_log.info("Cleaning up visit.")
        wl_log.info("Cancelling timeout")
        ut.cancel_timeout()

        if self.sniffer and self.sniffer.is_recording:
            wl_log.info("Stopping sniffer...")
            self.sniffer.stop_capture()

        # remove non-tor traffic
        self.filter_guards_from_pcap()

        if self.tb_driver and self.tb_driver.is_running:
            # shutil.rmtree(self.tb_driver.prof_dir_path)
            wl_log.info("Quitting selenium driver...")
            self.tb_driver.quit()

        # close all open streams to prevent pollution
        self.tor_controller.close_all_streams()
        if self.xvfb and not cm.running_in_CI:
            wl_log.info("Stopping display...")
            self.vdisplay.stop()

        # after closing driver and stoping sniffer, we run postcrawl
        self.post_crawl()
Example #18
    def __init__(self, batch_num, site_num, instance_num, page_url, base_dir, tor_controller, bg_site=None,
                 experiment=cm.EXP_TYPE_WANG_AND_GOLDBERG, xvfb=False, capture_screen=True):
        self.batch_num = batch_num
        self.site_num = site_num
        self.instance_num = instance_num
        self.page_url = page_url
        self.bg_site = bg_site
        self.experiment = experiment
        self.base_dir = base_dir
        self.visit_dir = None
        self.visit_log_dir = None
        self.tbb_version = cm.RECOMMENDED_TBB_VERSION
        self.capture_screen = capture_screen
        self.tor_controller = tor_controller
        self.xvfb = xvfb
        self.init_visit_dir()
        self.pcap_path = os.path.join(
            self.visit_dir, "{}.pcap".format(self.get_instance_name()))

        if self.xvfb and not cm.running_in_CI:
            wl_log.info("Starting XVFBm %sX%s" % (cm.XVFB_W, cm.XVFB_H))
            self.vdisplay = Xvfb(width=cm.XVFB_W, height=cm.XVFB_H)
            self.vdisplay.start()

        # Create new instance of TorBrowser driver
        TorBrowserDriver.add_exception(self.page_url)
        self.tb_driver = TorBrowserDriver(tbb_path=cm.TBB_PATH,
                                          tbb_logfile_path=join(self.visit_dir, "logs", "firefox.log"))
        self.sniffer = Sniffer()  # sniffer to capture the network traffic
 def stop_crawl(self, pack_results=True):
     """ Cleans up crawl and kills tor process in case it's running."""
     wl_log.info("Stopping crawl...")
     if self.visit:
         self.visit.cleanup_visit()
     self.tor_controller.kill_tor_proc()
     if pack_results:
         ut.pack_crawl_data(self.crawl_dir)
 def stop_crawl(self, pack_results=True):
     """ Cleans up crawl and kills tor process in case it's running."""
     wl_log.info("Stopping crawl...")
     if self.visit:
         self.visit.cleanup_visit()
     self.tor_controller.kill_tor_proc()
     if pack_results:
         ut.pack_crawl_data(self.crawl_dir)
Example #21
def import_gpg_key(key_fp):
    """Import GPG key with the given fingerprint."""
    wl_log.info("Will import the GPG key %s" % key_fp)
    # https://www.torproject.org/docs/verifying-signatures.html.en
    ret_code = subprocess.Popen(['/usr/bin/gpg', '--keyserver',
                                 'x-hkp://pool.sks-keyservers.net',
                                 '--recv-keys', key_fp]).wait()
    return True if ret_code == 0 else False
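
Once the key is imported, a detached signature can be checked with gpg --verify; a sketch in the same style (the function name and paths are hypothetical).

import subprocess

def verify_gpg_sig(sig_path, signed_file_path):
    """Return True if gpg accepts the detached signature (sketch)."""
    ret_code = subprocess.Popen(['/usr/bin/gpg', '--verify',
                                 sig_path, signed_file_path]).wait()
    return ret_code == 0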
Example #22
def import_gpg_key(key_fp):
    """Import GPG key with the given fingerprint."""
    wl_log.info("Will import the GPG key %s" % key_fp)
    # https://www.torproject.org/docs/verifying-signatures.html.en
    ret_code = subprocess.Popen(['/usr/bin/gpg', '--keyserver',
                                 'x-hkp://pool.sks-keyservers.net',
                                 '--recv-keys', key_fp]).wait()
    return True if ret_code == 0 else False
 def kill_tor_proc(self):
     """Kill Tor process."""
     if self.tor_process:
         wl_log.info("Killing tor process")
         self.tor_process.kill()
     if self.tmp_tor_data_dir and os.path.isdir(self.tmp_tor_data_dir):
         wl_log.info("Removing tmp tor data dir")
         shutil.rmtree(self.tmp_tor_data_dir)
Example #24
def get_occurence_vector_from_swf(swf_filename, out_dir=''):
    cum_pattern = [0]*len(FP_ACTIONSCRIPT_STR_LIST)
    for src_file in gen_decompile_swf(swf_filename, out_dir):
        vector = fu.file_occurence_vector(src_file, FP_ACTIONSCRIPT_STR_LIST)
        cum_pattern = [x+y for (x, y) in zip(cum_pattern, vector)]
    
    wl_log.info("Cum Vector for %s %s" % (swf_filename[len(out_dir):], human_readable_occ_vector(cum_pattern)))
    return cum_pattern
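
fu.file_occurence_vector is not included in this snippet; one plausible implementation (a sketch counting raw substring occurrences) is:

def file_occurence_vector(src_file, needles):
    """Count how many times each needle string occurs in the file (sketch)."""
    with open(src_file) as f:
        content = f.read()
    return [content.count(needle) for needle in needles]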
Example #25
def store_swfs(msg, crawl_id, dir_path='/tmp', prefix='?'):
    
    referer = msg.request.headers['Referer'][0] if msg.request.headers['Referer'] else ""
    
    if msg.response and msg.response.content:
        print msg.request.get_url()
        if (msg.response.content[:3] in SWF_MAGIC_NUMBERS): # too broad a check, but the decompiler will discard non-SWFs
            
            swf_hash = ut.hash_text(msg.response.content)
            swf_url = msg.request.get_url()
            
            db_conn = dbu.mysql_init_db()
            db_cursor = db_conn.cursor(dbu.mdb.cursors.DictCursor)
            rows = swu.get_swf_obj_from_db('hash', swf_hash, db_cursor)
            
            if not rows:
                swf_filename = os.path.join(dir_path, "%s-%s" % (prefix, msg.request.path.split('/')[-1]))
                swf_filename = swf_filename[:MAX_FILENAME_LEN]
                if not swf_filename.endswith('.swf'):
                    swf_filename += '.swf'
                    
                wl_log.info("SWF saved %s referrer: %s" % (os.path.basename(swf_filename), referer))
                
                fu.write_to_file(swf_filename, msg.response.content)
                vector = swu.get_occurence_vector_from_swf(swf_filename, os.path.join(dir_path, prefix))
                duplicate_swf = 0
            else:
                wl_log.info("A swf with same hash exists in DB: %s %s" % (swf_hash, swf_url))
                vector = swu.str_to_vector(rows[0]['occ_vector'])
                swf_filename = rows[0]['local_path']
                duplicate_swf = 1
            
            rank, domain = prefix.rsplit('/')[-1].split('-', 1)
            swf_info = swu.SwfInfo()
            
            swf_info.rank = rank # this might be fake
            swf_info.domain = domain
            swf_info.local_path = swf_filename
            swf_info.occ_vector = vector
            swf_info.hash = swf_hash
            swf_info.url = swf_url
            swf_info.referer = referer        
            swf_info.duplicate = duplicate_swf # !!! set for repeated SWFs (ones we have seen before)
            swf_info.feat_vector = []
            swf_info.page_url = ''
            swf_info.occ_string = ' '.join(swu.human_readable_occ_vector(vector))
            swf_info.crawl_id = crawl_id
            
            swu.add_swf_to_db(swf_info, db_conn)
            db_conn.commit()
            db_cursor.close()
            db_conn.close()
            
            
        elif '.swf' in msg.request.path:
            wl_log.warning(".swf in path but content seems non-swf %s %s" % (msg.request.path, msg.response.content[:100]))
        else:
            pass
 def _new_identity(self):
     wl_log.info("Creating a new identity...")
     try:
         ActionChains(self.driver).send_keys(Keys.CONTROL + Keys.SHIFT +
                                             'U').perform()
     except WebDriverException:
         pass
     except Exception:
         wl_log.exception("Exception while creating new identity.")
Example #27
 def post_visit(self):
     guard_ips = set([ip for ip in self.controller.get_all_guard_ips()])
     wl_log.debug("Found %s guards in the consensus.", len(guard_ips))
     wl_log.info("Filtering packets without a guard IP.")
     try:
         ut.filter_pcap(self.job.pcap_file, guard_ips)
     except Exception as e:
         wl_log.error("ERROR: filtering pcap file: %s.", e)
         wl_log.error("Check pcap: %s", self.job.pcap_file)
Example #28
def init_mitmproxy(basename, timeout, logging):
    try:
        port, pid = run_mitmdump(basename, timeout+1, logging) # runs a mitmdump process with the timeout+1 sec
    except:
        wl_log.critical('Exception initializing mitmdump')
    else:
        wl_log.info('mitmdump will listen on port %s, pid %s' % (port, pid))

    return "127.0.0.1:%s " % port if port and pid else ""
 def post_visit(self):
     guard_ips = set([ip for ip in self.controller.get_all_guard_ips()])
     wl_log.debug("Found %s guards in the consensus.", len(guard_ips))
     wl_log.info("Filtering packets without a guard IP.")
     try:
         ut.filter_pcap(self.job.pcap_file, guard_ips)
     except Exception as e:
         wl_log.error("ERROR: filtering pcap file: %s.", e)
         wl_log.error("Check pcap: %s", self.job.pcap_file)
 def filter_packets_without_guard_ip(self):
     guard_ips = set([ip for ip in self.controller.get_all_guard_ips()])
     wl_log.info("Found %s guards in the consensus.", len(guard_ips))
     wl_log.info("Filtering packets without a guard IP.")
     try:
         ut.filter_tshark(self.job.tshark_file, guard_ips)
     except Exception as e:
         wl_log.error("ERROR: filtering tshark log: %s.", e)
         wl_log.error("Check tshark log: %s", self.job.thsark_file)
 def stop_capture(self):
     ut.kill_all_children(self.p0.pid)  # self.p0.pid is the shell pid
     os.kill(self.p0.pid, signal.SIGINT)
     #self.p0.kill()
     if os.path.isfile(self.logpath):
         wl_log.info('Onionperf killed. Capture size: %s Bytes %s' %
                     (os.path.getsize(self.logpath), self.logpath))
     else:
         wl_log.warning('Onionperf killed but cannot find capture file: %s'
                        % self.logpath)
Example #32
def get_occurence_vector_from_swf(swf_filename, out_dir=''):
    cum_pattern = [0] * len(FP_ACTIONSCRIPT_STR_LIST)
    for src_file in gen_decompile_swf(swf_filename, out_dir):
        vector = fu.file_occurence_vector(src_file, FP_ACTIONSCRIPT_STR_LIST)
        cum_pattern = [x + y for (x, y) in zip(cum_pattern, vector)]

    wl_log.info(
        "Cum Vector for %s %s" %
        (swf_filename[len(out_dir):], human_readable_occ_vector(cum_pattern)))
    return cum_pattern
 def stop_capture(self):
     """Kill the dumpcap process."""
     ut.kill_all_children(self.p0.pid)  # self.p0.pid is the shell pid
     self.p0.kill()
     self.is_recording = False
     if os.path.isfile(self.pcap_file):
         wl_log.info('Dumpcap killed. Capture size: %s Bytes %s' %
                     (os.path.getsize(self.pcap_file), self.pcap_file))
     else:
         wl_log.warning('Dumpcap killed but cannot find capture file: %s'
                        % self.pcap_file)
Example #34
def init_mitmproxy(basename, timeout, logging):
    try:
        port, pid = run_mitmdump(
            basename, timeout + 1,
            logging)  # runs a mitmdump process with the timeout+1 sec
    except:
        wl_log.critical('Exception initializing mitmdump')
    else:
        wl_log.info('mitmdump will listen on port %s, pid %s' % (port, pid))

    return "127.0.0.1:%s " % port if port and pid else ""
 def stop_capture(self):
     """Kill the dumpcap process."""
     ut.kill_all_children(self.p0.pid)  # self.p0.pid is the shell pid
     self.p0.kill()
     self.is_recording = False
     if os.path.isfile(self.pcap_file):
         wl_log.info('Dumpcap killed. Capture size: %s Bytes %s' %
                     (os.path.getsize(self.pcap_file), self.pcap_file))
     else:
         wl_log.warning('Dumpcap killed but cannot find capture file: %s' %
                        self.pcap_file)
Example #36
def write_decomp_log(swf_path, ffdec_status, ffdec_output, flare_status, flare_output):
    TIME_OUT_EXIT_CODE = -1 # return code for linux timeout command
    
    log_str = 'Decompile: %s || ' % os.path.basename(swf_path)
    log_str += 'ffdec: %d' % ffdec_status
    if ffdec_status:
        log_str += ' timeout' if ffdec_status == TIME_OUT_EXIT_CODE else ' error'
    else:
        log_str += ' OK'
        
    wl_log.info(log_str)
Example #37
def extract_links(br):
    """Extract FP related links from the current page."""
    links_to_visit_text = list(ut.flatten([br.find_elements_by_partial_link_text(linktext) for linktext in LINK_LABELS]))
    links_to_visit_url = list(ut.flatten([br.find_elements_by_xpath('//a[contains(@href,"%s")]' % linkurl) for linkurl in LINK_URLS]))
    links_to_visit = [link for link in links_to_visit_text + links_to_visit_url if link]
    
    if len(links_to_visit) < NO_OF_LINKS_TO_CLICK: # if we cannot find links by href and link texts
        links_to_visit += extract_onclick_elements(br)  # we search for all elements with onclick event handler
    wl_log.info('%s links were found on %s' % (len(links_to_visit), br.current_url))
    
    return links_to_visit
Example #38
def write_decomp_log(swf_path, ffdec_status, ffdec_output, flare_status,
                     flare_output):
    TIME_OUT_EXIT_CODE = -1  # return code for linux timeout command

    log_str = 'Decompile: %s || ' % os.path.basename(swf_path)
    log_str += 'ffdec: %d' % ffdec_status
    if ffdec_status:
        log_str += ' timeout' if ffdec_status == TIME_OUT_EXIT_CODE else ' error'
    else:
        log_str += ' OK'

    wl_log.info(log_str)
Example #39
 def take_screenshot(self):
     try:
         out_png = os.path.join(self.visit_dir, 'screenshot.png')
         wl_log.info("Taking screenshot of %s to %s" % (self.page_url,
                                                        out_png))
         self.tb_driver.get_screenshot_as_file(out_png)
         if cm.running_in_CI:
             wl_log.debug("Screenshot data:image/png;base64,%s"
                          % self.tb_driver.get_screenshot_as_base64())
     except:
         wl_log.info("Exception while taking screenshot of: %s"
                     % self.page_url)
 def _mark_failed(self):
     url = self.job.url
     if url in self.job.batch_failed and not self.job.batch_failed[url]:
         self.job.batch_failed[url] = True
         wl_log.info(
             "Visit to %s in %s different batches: won't visit the url again"
             % (url, MAX_BATCH_FAILED))
     else:
         self.job.batch_failed[url] = False
         wl_log.info(
             "Visit to %s failed %s times within this batch, will skip for this batch"
             % (url, MAX_FAILED))
Example #41
 def take_screenshot(self):
     try:
         out_png = os.path.join(self.visit_dir, 'screenshot.png')
         wl_log.info("Taking screenshot of %s to %s" %
                     (self.page_url, out_png))
         self.tb_driver.get_screenshot_as_file(out_png)
         if cm.running_in_CI:
             wl_log.debug("Screenshot data:image/png;base64,%s" %
                          self.tb_driver.get_screenshot_as_base64())
     except:
         wl_log.info("Exception while taking screenshot of: %s" %
                     self.page_url)
Example #42
def download_tbb_tarball(tbb_ver, dl_dir=""):
    tbb_url = get_url_by_tbb_ver(tbb_ver)
    base_dir = dl_dir if dl_dir else cm.TBB_BASE_DIR
    tarball_path = os.path.join(base_dir, get_tbb_filename(tbb_ver))
    if not os.path.isfile(tarball_path):
        wl_log.info("Will download %s to %s" % (tbb_url, tarball_path))
        ut.download_file(tbb_url, tarball_path)
        ut.extract_tbb_tarball(tarball_path)
    if verify_tbb_tarball(tbb_ver, tarball_path, tbb_url):
        return tarball_path
    # we cannot verify the integrity of the downloaded tarball
    raise cm.TBBTarballVerificationError("Cannot verify the integrity of %s"
                                         % tarball_path)
Example #43
def download_tbb_tarball(tbb_ver, dl_dir=""):
    tbb_url = get_url_by_tbb_ver(tbb_ver)
    base_dir = dl_dir if dl_dir else cm.TBB_BASE_DIR
    tarball_path = os.path.join(base_dir, get_tbb_filename(tbb_ver))
    if not os.path.isfile(tarball_path):
        wl_log.info("Will download %s to %s" % (tbb_url, tarball_path))
        ut.download_file(tbb_url, tarball_path)
        ut.extract_tbb_tarball(tarball_path)
    if verify_tbb_tarball(tbb_ver, tarball_path, tbb_url):
        return tarball_path
    # we cannot verify the integrity of the downloaded tarball
    raise cm.TBBTarballVerificationError("Cannot verify the integrity of %s"
                                         % tarball_path)
Example #44
def get_and_sleep(br, page_url):
    """Load page and sleep for a while."""
    try:
        start_time = time()
        br.get(page_url)
        elapsed_time = time() - start_time
        wl_log.info("Page %s loaded in %s" % (page_url, elapsed_time))
    except Exception as exc:
        wl_log.info('Error loading page %s %s' % (page_url, exc))
        br.quit()
    else:
        #wl_log.info('Will sleep after reload %s' % page_url)
        br.execute_script("window.onbeforeunload = function(e){};")
        sleep(SLEEP_AFTER_PAGE_LOAD)
Example #45
def parse_mitm_dump(basename, worker, crawl_id):
    dumpfile = basename + '.dmp'
    wl_log.info("Will parse mitm dump %s for crawl: %s" % (dumpfile, crawl_id))
    requests = []
    responses = []
    if os.path.isfile(dumpfile):
        fr = flow.FlowReader(open(dumpfile))
        try:
            for msg in fr.stream():
                requests.append(msg.request.get_url())
                # responses.append(msg.response.get_url())
                worker(
                    msg, crawl_id
                )  # this worker func should take care of db insertion, logging etc.
        except flow.FlowReadError as exc:
            pass
            #wl_log.critical("Error reading mitm dump %s" % exc)
    else:
        wl_log.critical("Cannot find mitm dump %s" % dumpfile)

    doma_info = lp.DomainInfo()
    doma_info.requests = requests
    doma_info.responses = responses
    doma_info.crawl_id = crawl_id
    doma_info.url = ""
    doma_info.fc_dbg_font_loads = []
    doma_info.fp_detected = lp.get_fp_from_reqs(requests)
    doma_info.log_complete = 1
    print os.path.basename(dumpfile[:-4]).split('-')[0]
    doma_info.rank = int(
        os.path.basename(dumpfile).split('-')[0]) if '-' in dumpfile else 0
    db_conn = dbu.mysql_init_db()
    site_info_id = dbu.add_site_info_to_db(doma_info, db_conn)

    # parse
    log_file = basename + '.txt'
    if not os.path.isfile(log_file):
        log_file = basename + '.' + MITM_LOG_EXTENSION

    insert_js_fun = functools.partial(lp.insert_js_info_to_db,
                                      site_info_id=site_info_id,
                                      db_conn=db_conn)
    lp.parse_crawl_log(log_file, insert_js_fun,
                       crawl_id)  # parse log, insert js info to db

    db_conn.commit()
    db_conn.close()
    wl_log.info("Parsed %s OK" % (dumpfile))
    if REMOVE_DMP_FILES:
        os.remove(dumpfile)
Example #46
def click_crawler(br, page_url):
    num_clicked_links = 0
    get_and_sleep(br, page_url)
    wl_log.info("Click crawler will click on %s" % page_url)
    for label in LINK_LABELS:
        num_clicked_links += click_to_xpath_selector(br, page_url, '//a[contains(text(), "'+ label + '")]')
        #num_clicked_links += click_by_link_text(br, page_url, label)
    
    for url_string in LINK_URLS:
        num_clicked_links += click_to_xpath_selector(br, page_url, '//a[contains(@href,"' + url_string + '")]')
    
    wl_log.info("Clicked %s links, will click %s more" % (num_clicked_links, MAX_LINKS_TO_CLICK - num_clicked_links))
    for i in xrange(1, MAX_LINKS_TO_CLICK - num_clicked_links):
        click_to_xpath_selector(br, page_url, "(//a|//*[@onclick])[position()=%s]" % i)
 def _do_instance(self):
     with Onionperf(self.job.onionperf_log):
         with TShark(device=self.device,
                     path=self.job.pcap_file,
                     filter=cm.DEFAULT_FILTER):
             sleep(1)  # make sure sniffer is running
             with ut.timeout(cm.HARD_VISIT_TIMEOUT):
                 wl_log.info("Visiting: %s" % self.job.url)
                 self.driver.get(self._get_url())
                 self.page_source = self.driver.page_source.encode(
                     'utf-8').strip().lower()
                 self.save_page_source()
                 self.check_conn_error()
                 self.check_captcha()
                 sleep(float(self.job.config['pause_in_site']))  # TODO
Example #48
def get_and_sleep(br, page_url):
    """Load page and sleep for a while."""
    try:                 
        start_time = time()
        br.get(page_url)
        elapsed_time = time() - start_time
        wl_log.info("Page %s loaded in %s" % (page_url, elapsed_time))                 
    except Exception as exc:
        wl_log.info('Error loading page %s %s' % (page_url, exc))
        br.quit()
    else:
        #wl_log.info('Will sleep after reload %s' % page_url)
        br.execute_script("window.onbeforeunload = function(e){};")
        sleep(SLEEP_AFTER_PAGE_LOAD)        
            
 def stop_capture(self):
     """Kill the dumpcap process."""
     ut.kill_all_children(self.p0.pid)  # self.p0.pid is the shell pid
     self.p0.kill()
     self.is_recording = False
     captcha_filepath = ut.capture_filepath_to_captcha(self.pcap_file)
     if os.path.isfile(self.pcap_file):
         wl_log.info('Sniffer killed. Capture size: %s Bytes %s' %
                     (os.path.getsize(self.pcap_file), self.pcap_file))
     elif os.path.isfile(captcha_filepath):
         wl_log.info(
             'Sniffer killed, file renamed to captcha_*. Capture size: %s Bytes %s'
             % (os.path.getsize(captcha_filepath), captcha_filepath))
     else:
         wl_log.warning(
             'Sniffer killed but cannot find capture file: %s or %s' %
             (self.pcap_file, captcha_filepath))
Example #50
def click_to_xpath_selector(br, page_url, selector):
    #wl_log.info('Will find els by selector %s on %s' % (selector, page_url))
    els = br.find_elements_by_xpath(selector)
    for el in els:
        if is_clickable(el, page_url):
            href = el.get_attribute('href') or "?"
            try:
                el.click()
            except Exception as es:
                wl_log.warning('Exception while clicking: href: %s %s %s' % (href, es, page_url))                
            else:
                wl_log.info('Clicked!: href: %s %s %s' % (href, selector, page_url))
                sleep(WAIT_AFTER_CLICK)
                get_and_sleep(br, page_url)
                return 1
    #wl_log.debug('No clickable element found for: %s %s' % (selector, page_url))
    return 0 # we couldn't find any element to click
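
is_clickable is referenced above but not shown; a plausible check using standard Selenium element attributes (a sketch; the same-host rule is an assumption) is:

from urlparse import urlparse  # urllib.parse on Python 3

def is_clickable(el, page_url):
    """Heuristic: element is visible, enabled and stays on the same host (sketch)."""
    try:
        if not (el.is_displayed() and el.is_enabled()):
            return False
        href = el.get_attribute('href')
        if not href:
            return True  # e.g. elements with only an onclick handler
        return urlparse(href).hostname == urlparse(page_url).hostname
    except Exception:
        return False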
Example #51
 def __do_instance(self):
     for self.job.visit in xrange(self.job.visits):
         ut.create_dir(self.job.path)
         wl_log.info("*** Visit #%s to %s ***", self.job.visit, self.job.url)
         with self.driver.launch():
             try:
                 self.driver.set_page_load_timeout(cm.SOFT_VISIT_TIMEOUT)
             except WebDriverException as seto_exc:
                 wl_log.error("Setting soft timeout %s", seto_exc)
             self.__do_visit()
             if self.screenshots:
                 try:
                     self.driver.get_screenshot_as_file(self.job.png_file)
                 except WebDriverException:
                     wl_log.error("Cannot get screenshot.")
         sleep(float(self.job.config['pause_between_visits']))
         self.post_visit()
Example #52
def click_crawler(br, page_url):
    num_clicked_links = 0
    get_and_sleep(br, page_url)
    wl_log.info("Click crawler will click on %s" % page_url)
    for label in LINK_LABELS:
        num_clicked_links += click_to_xpath_selector(
            br, page_url, '//a[contains(text(), "' + label + '")]')
        #num_clicked_links += click_by_link_text(br, page_url, label)

    for url_string in LINK_URLS:
        num_clicked_links += click_to_xpath_selector(
            br, page_url, '//a[contains(@href,"' + url_string + '")]')

    wl_log.info("Clicked %s links, will click %s more" %
                (num_clicked_links, MAX_LINKS_TO_CLICK - num_clicked_links))
    for i in xrange(1, MAX_LINKS_TO_CLICK - num_clicked_links):
        click_to_xpath_selector(br, page_url,
                                "(//a|//*[@onclick])[position()=%s]" % i)
Example #53
def parse_mitm_dump(basename, worker, crawl_id):
    dumpfile = basename +'.dmp'
    wl_log.info("Will parse mitm dump %s for crawl: %s" % (dumpfile, crawl_id))
    requests = []
    responses = []
    if os.path.isfile(dumpfile):
        fr = flow.FlowReader(open(dumpfile))
        try: 
            for msg in fr.stream():
                requests.append(msg.request.get_url())
                # responses.append(msg.response.get_url())
                worker(msg, crawl_id) # this worker func should take care of db insertion, logging etc.
        except flow.FlowReadError as _:
            pass
            #wl_log.critical("Error reading mitm dump %s" % exc)
    else:
        wl_log.critical("Cannot find mitm dump %s" % dumpfile)
    
    doma_info = lp.DomainInfo()
    doma_info.requests = requests
    doma_info.responses = responses
    doma_info.crawl_id = crawl_id
    doma_info.url = ""
    doma_info.fc_dbg_font_loads = []
    doma_info.fp_detected = lp.get_fp_from_reqs(requests)
    doma_info.log_complete = 1
    print os.path.basename(dumpfile[:-4]).split('-')[0]
    doma_info.rank = int(os.path.basename(dumpfile).split('-')[0]) if '-' in dumpfile else 0
    db_conn = dbu.mysql_init_db()
    site_info_id = dbu.add_site_info_to_db(doma_info, db_conn)
    
    # parse 
    log_file = basename + '.txt'
    if not os.path.isfile(log_file):
        log_file = basename + '.' + MITM_LOG_EXTENSION
        
    insert_js_fun = functools.partial(lp.insert_js_info_to_db, site_info_id=site_info_id, db_conn=db_conn)
    lp.parse_crawl_log(log_file, insert_js_fun, crawl_id) # parse log, insert js info to db

    db_conn.commit()
    db_conn.close()
    wl_log.info("Parsed %s OK" % (dumpfile))
    if REMOVE_DMP_FILES:
        os.remove(dumpfile)
Example #54
def run_mitmdump(basename, timeout, logging=False):
    """Run mitmdump as a subprocess in the background with a timeout."""
    port = get_free_port()
    if not port: # we cannot get a free port
        return None, None
    
    dump_file = "%s.dmp" % basename
    cmd_re_dir = '' # for redirecting stderr to stdout and teeing
    quiet_option = '-q' # mitmdump option to be quiet - no log

    if logging:
        mitm_log_file = "%s.%s" % (basename, MITM_LOG_EXTENSION)
        cmd_re_dir = ' 2>&1 |tee %s' % mitm_log_file # redirect all output to the log file
        quiet_option = '' # we do want the log output when logging is enabled

    cmd = 'timeout %s mitmdump %s -z --anticache -p %s -w %s %s' % (timeout, quiet_option, port, dump_file, cmd_re_dir)
    # -z: Try to convince servers to send us uncompressed data. mitmdump -h | grep "\-z" for info
    
    wl_log.info('mitmdump cmd %s' % cmd)
    subp = subprocess.Popen(cmd, shell=True) # shell=True - must be careful
    return port, subp.pid
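
get_free_port is referenced above but not shown; a common way to implement it (a sketch, not necessarily the original helper) is to bind to port 0 and let the OS pick a free port.

import socket

def get_free_port():
    """Ask the OS for a free TCP port; return None on failure (sketch)."""
    try:
        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        s.bind(('127.0.0.1', 0))
        port = s.getsockname()[1]
        s.close()
        return port
    except socket.error:
        return None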
Example #55
def crawl_url(crawler_type, page_url, proxy_opt):
    
    if 'clicker' in crawler_type:
        worker = click_crawler
    else:
        worker = lazy_crawler
    
    br = init_browser('chrome', ['--allow-running-insecure-content', '--ignore-certificate-errors', '--disk-cache-size=0', \
                                         '--enable-logging', '--v=1', "--proxy-server=%s" % proxy_opt])
            
    if not page_url.startswith('http') and not page_url.startswith('file:'): 
        page_url = 'http://' + page_url
        
    wl_log.info('***Will crawl  %s***' % page_url)
    
    try:
        ut.timeout(CRAWLER_CLICKER_VISIT_TIMEOUT)
        worker(br, page_url) # run the worker function
    except ut.TimeExceededError as texc:
        wl_log.critical('***CRAWLER_CLICKER_VISIT_TIMEOUT at %s (%s)' % (page_url, texc))
    finally:    
        br.quit()
Example #56
 def crawl(self, job):
     """Crawls a set of urls in batches."""
     self.job = job
     wl_log.info("Starting new crawl")
     wl_log.info(pformat(self.job))
     for self.job.batch in xrange(self.job.batches):
         wl_log.info("**** Starting batch %s ***" % self.job.batch)
         self.__do_batch()
         sleep(float(self.job.config['pause_between_batches']))
Example #57
    def get_multitab(self):
        """Open two tab, use one to load a background site and the other to
        load the real site."""
        PAUSE_BETWEEN_TAB_OPENINGS = 0.5
        ut.timeout(cm.HARD_VISIT_TIMEOUT)  # set timeout to kill running procs
        # load a blank page - a page is needed to send keys to the browser
        self.tb_driver.get(BAREBONE_HOME_PAGE)
        self.sniffer.start_capture(self.pcap_path,
                                   'tcp and not host %s and not tcp port 22 and not tcp port 20'
                                   % LOCALHOST_IP)

        time.sleep(cm.PAUSE_BETWEEN_INSTANCES)
        try:
            self.tb_driver.set_page_load_timeout(cm.SOFT_VISIT_TIMEOUT)
        except:
            wl_log.info("Exception setting a timeout {}".format(self.page_url))

        wl_log.info("Crawling URL: {} with {} in the background".
                    format(self.page_url, self.bg_site))

        body = self.tb_driver.find_element_by_tag_name("body")
        body.send_keys(Keys.CONTROL + 't')  # open a new tab
        # now that the focus is on the address bar, load the background
        # site by "typing" it to the address bar and "pressing" ENTER (\n)
        # simulated by send_keys function
        body.send_keys('%s\n' % self.bg_site)

        # the delay between the loading of background and real sites
        time.sleep(PAUSE_BETWEEN_TAB_OPENINGS)

        body = self.tb_driver.find_element_by_tag_name("body")
        body.send_keys(Keys.CONTROL + 't')  # open a new tab

        t1 = time.time()
        self.tb_driver.get(self.page_url)  # load the real site in the 2nd tab

        page_load_time = time.time() - t1
        wl_log.info("{} loaded in {} sec"
                    .format(self.page_url, page_load_time))
        time.sleep(cm.WAIT_IN_SITE)
        if self.capture_screen:
            self.take_screenshot()
        self.cleanup_visit()
Example #58
    def get_wang_and_goldberg(self):
        """Visit the site according to Wang and Goldberg (WPES'13) settings."""
        ut.timeout(cm.HARD_VISIT_TIMEOUT)  # set timeout to stop the visit
        self.sniffer.start_capture(self.pcap_path,
                                   'tcp and not host %s and not tcp port 22 and not tcp port 20'
                                   % LOCALHOST_IP)
        time.sleep(cm.PAUSE_BETWEEN_INSTANCES)
        try:
            self.tb_driver.set_page_load_timeout(cm.SOFT_VISIT_TIMEOUT)
        except:
            wl_log.info("Exception setting a timeout {}".format(self.page_url))

        wl_log.info("Crawling URL: {}".format(self.page_url))

        t1 = time.time()
        self.tb_driver.get(self.page_url)
        page_load_time = time.time() - t1
        wl_log.info("{} loaded in {} sec"
                    .format(self.page_url, page_load_time))
        time.sleep(cm.WAIT_IN_SITE)
        if self.capture_screen:
            self.take_screenshot()
        self.cleanup_visit()
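
ut.timeout and ut.cancel_timeout enforce the hard visit limit in these examples; a minimal sketch of such a helper built on signal.alarm (an assumption about the original utility module) looks like:

import signal

class TimeExceededError(Exception):
    pass

def raise_signal(signum, frame):
    raise TimeExceededError("Timed out!")

def timeout(duration):
    """Raise TimeExceededError in this process after `duration` seconds (sketch)."""
    signal.signal(signal.SIGALRM, raise_signal)
    signal.alarm(duration)

def cancel_timeout():
    """Disable the pending alarm (sketch)."""
    signal.alarm(0)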
Example #59
def parse_crawl_log(filename, dump_fun=None, crawl_id=0, url=""):
    """Populate domain info object by parsing crawl log file of a site.
    Call dump function to output dump log.
    
    Logs parsed by this function are generated either by setting the env. variable FC_DEBUG to 1 or by the (modified) browser.
    See the fontconfig library for details.
    
    """
    origins_to_fonts = {}  # will keep origin to loaded fonts mapping

    domaInfo = DomainInfo()

    file_content = fu.read_file(filename)
    wl_log.info("Parsing log for %s %s" % (url, filename))

    fonts_by_fc_debug = re.findall(
        r"Sort Pattern.*$\W+family: \"([^\"]*)", file_content, re.MULTILINE
    )  # match family field of font request (not the matched one)
    domaInfo.num_offsetWidth_calls = len(re.findall(r"Element::offsetWidth", file_content))  # offset width attempts
    domaInfo.num_offsetHeight_calls = len(re.findall(r"Element::offsetHeight", file_content))  # offset height attempts
    # TODO add getBoundingClientRect

    font_and_urls = re.findall(
        r"CSSFontSelector::getFontData:? (.*) ([^\s]*)", file_content
    )  # output from modified browser
    # print 'font_and_urls', font_and_urls

    font_face_pairs = re.findall(r"CSSFontFace::getFontData (.*)->(.*)", file_content)  # output from modified browser
    # print 'font_and_urls', font_and_urls
    domaInfo.log_complete = int(bool(re.findall(r"Finished all steps!", file_content)))  # output from modified browser
    # print 'domaInfo.log_complete', domaInfo.log_complete
    js_log_prefix = ">>>FPLOG"
    fpd_logs = re.findall(r"%s.*" % js_log_prefix, file_content)  # output from modified browser
    domaInfo.fpd_logs = [call[len(js_log_prefix) + 1 :] for call in set(fpd_logs)]

    for font_name, font_url in font_and_urls:
        if font_url.startswith("http") and len(font_name) > 1 and not font_name[:5] in ("data:", "http:", "https"):
            # font_name = font_name.rsplit(' ', 1)[0] if font_name.endswith(' onURL:') else font_name # TODO: unify chrome source code to log as Phantom do. then remove this line
            font_name = font_name.lower().strip()
            #             origin = pub_suffix.get_public_suffix(font_url)\
            origin = font_url
            if origin in origins_to_fonts:
                origins_to_fonts[origin].add(font_name)
                # print 'added', font_name, 'to', origin, origins_to_fonts[origin]
            else:
                origins_to_fonts[origin] = set([font_name])

    for font, face in font_face_pairs:
        font = font.lower().strip()
        face = face.lower().strip()
        # replace all occurrences of this font-family name with the face
        for fonts_by_origin in origins_to_fonts.itervalues():
            try:
                fonts_by_origin.remove(font)
            except:  # we cannot find this font in this origin's list
                pass
            else:
                fonts_by_origin.add(face)
                # print 'removed', font, 'added', face

    for origin, fonts in origins_to_fonts.iteritems():
        domaInfo.fonts_by_origins[origin] = list(fonts)
        domaInfo.fonts_loaded += domaInfo.fonts_by_origins[origin]

    domaInfo.fc_dbg_font_loads = list(
        set([font.lower() for font in fonts_by_fc_debug if not font[:5] in ("data:", "http:", "https")])
    )  # filter out the data urls and web fonts

    domaInfo.fonts_loaded = list(
        set([font.lower() for font in domaInfo.fonts_loaded if not font[:5] in ("data:", "http:", "https")])
    )  # filter out the data urls and web fonts

    requests = re.findall(r"^requested: (http.*)", file_content, re.MULTILINE)
    if not requests and filename.endswith(MITM_LOG_EXTENSION):
        requests = re.findall(r"(http.*)", file_content, re.MULTILINE)
    responses = ""
    # populate domain info obj

    domaInfo.num_font_loads = len(domaInfo.fonts_loaded)
    domaInfo.requests = list(set(requests))
    domaInfo.responses = list(set(responses))
    domaInfo.fp_detected = get_fp_from_reqs(requests)
    domaInfo.url = url
    # !!! rank may not be right; it is only correct for a top-Alexa crawl
    domaInfo.rank = get_rank_domain_from_filename(filename)[0]
    domaInfo.log_filename = filename
    domaInfo.crawl_id = crawl_id

    # Read canvas events and print them to log in canvas
    urls_read_from_canvas = Set()
    urls_wrote_to_canvas = Set()

    canvas_log = os.path.join(cm.BASE_FP_LOGS_FOLDER, str(crawl_id) + "canvas.log")
    read = wrote = False
    for read_event in cm.CANVAS_READ_EVENTS:
        if read_event in file_content:
            read = True
            break
    for write_event in cm.CANVAS_WRITE_EVENTS:
        if write_event in file_content:
            wrote = True
            break

    if read and wrote:
        wl_log.info("Found both canvas read and write events in log %s, registering in : %s" % (filename, canvas_log))
        with open(canvas_log, "a+") as f:
            f.write(" ".join([str(domaInfo.rank), domaInfo.url]) + "\n")

    if dump_fun:  # call dump function
        try:
            dump_fun(domaInfo)
        except KeyboardInterrupt:
            raise
        except Exception as exc:
            wl_log.exception("Exception while dumping %s: %s" % (domaInfo.url, exc))
Example #60
def parse_crawl_logs(path, no_of_procs=16):
    files = fu.gen_find_files("*.txt", path)
    log_worker = partial(parse_crawl_log, dump_fun=dump_json_and_html)
    parallelize.run_in_parallel(files, log_worker, no_of_procs)
    wl_log.info("Worker processes are finished, will generate index")