def quit(self):
        """
        Overrides the base class method cleaning the timestamped profile.

        """
        self.is_running = False
        try:
            wl_log.info("Quit: Removing profile dir")
            shutil.rmtree(self.prof_dir_path)
            super(TorBrowserDriver, self).quit()
        except CannotSendRequest:
            wl_log.error("CannotSendRequest while quitting TorBrowserDriver",
                         exc_info=False)
            # following is copied from webdriver.firefox.webdriver.quit() which
            # was interrupted due to an unhandled CannotSendRequest exception.

            # kill the browser
            self.binary.kill()
            # remove the profile folder
            try:
                shutil.rmtree(self.profile.path)
                if self.profile.tempfolder is not None:
                    shutil.rmtree(self.profile.tempfolder)
            except Exception as e:
                print(str(e))
        except Exception:
            wl_log.error("Exception while quitting TorBrowserDriver",
                         exc_info=True)
Example #2
def clone_dir_with_timestap(orig_dir_path):
    """Copy a folder into the same directory and append a timestamp."""
    new_dir = create_dir(append_timestamp(orig_dir_path))
    try:
        du.copy_tree(orig_dir_path, new_dir)
    except Exception as e:
        wl_log.error("Error while cloning the dir with timestamp: " + str(e))
    return new_dir
 def get_screenshot_if_enabled(self):
     if self.screenshots:
         try:
             with ut.timeout(5):
                 self.driver.get_screenshot_as_file(self.job.png_file)
         except Exception:
             wl_log.error("Cannot get screenshot.")
Example #5
def main():
    # Parse arguments
    args = parse_arguments()

    # Read URLs
    url_list = read_list_urls(args)

    # Get torrc matching experiment type
    torrc_dict = cm.TORRC_BY_TYPE[args.experiment]

    # Instantiate crawler
    crawler = Crawler(url_list, torrc_dict,
                      output=args.output,
                      experiment=args.experiment,
                      xvfb=args.xvfb,
                      capture_screen=True)

    # Run the crawl
    try:
        crawler.crawl(args.batches, args.instances,
                      start_line=args.start_line - 1)
    except KeyboardInterrupt:
        wl_log.warning("Keyboard interrupt! Quitting...")
    except Exception as e:
        wl_log.error("Exception: \n%s" % (traceback.format_exc()))
    finally:
        crawler.stop_crawl()
 def filter_packets_without_guard_ip(self):
     guard_ips = set([ip for ip in self.controller.get_all_guard_ips()])
     wl_log.info("Found %s guards in the consensus.", len(guard_ips))
     wl_log.info("Filtering packets without a guard IP.")
     try:
         ut.filter_tshark(self.job.tshark_file, guard_ips)
     except Exception as e:
         wl_log.error("ERROR: filtering tshark log: %s.", e)
         wl_log.error("Check tshark log: %s", self.job.thsark_file)
Example #7
 def post_visit(self):
     guard_ips = set([ip for ip in self.controller.get_all_guard_ips()])
     wl_log.debug("Found %s guards in the consensus.", len(guard_ips))
     wl_log.info("Filtering packets without a guard IP.")
     try:
         ut.filter_pcap(self.job.pcap_file, guard_ips)
     except Exception as e:
         wl_log.error("ERROR: filtering pcap file: %s.", e)
         wl_log.error("Check pcap: %s", self.job.pcap_file)
 def init_tbb_profile(self, version):
     profile_directory = cm.get_tbb_profile_path(version)
     self.prof_dir_path = clone_dir_with_timestap(profile_directory)
     if self.capture_screen and self.page_url:
         self.add_canvas_permission()
     try:
         tbb_profile = webdriver.FirefoxProfile(self.prof_dir_path)
     except Exception:
         wl_log.error("Error creating the TB profile", exc_info=True)
     else:
         return tbb_profile
Example #10
 def __do_visit(self):
     with Sniffer(path=self.job.pcap_file, filter=cm.DEFAULT_FILTER):
         sleep(1)  # make sure dumpcap is running
         try:
             with ut.timeout(cm.HARD_VISIT_TIMEOUT):
                 self.driver.get(self.job.url)
                 sleep(float(self.job.config['pause_in_site']))
         except (cm.HardTimeoutException, TimeoutException):
             wl_log.error("Visit to %s has timed out!", self.job.url)
         except Exception as exc:
             wl_log.error("Unknown exception: %s", exc)
Example #12
def crawl_urls(br_type, urls, fn=lambda x: x):
    for url in urls:
        try:
            br = init_browser(br_type)
        except Exception as e:
            wl_log.critical("Error initializing browser: %s" % e)
        else:
            try:
                crawl_url(br, url, fn)
            except Exception as e:
                wl_log.error("Error crawling %s: %s" % (url, e))
            br.quit()
def parse_url_list(file_path, start, stop):
    """Return list of urls from a file."""
    url_list = []
    try:
        with open(file_path) as f:
            file_contents = f.read()
            url_list = file_contents.splitlines()
            url_list = url_list[start - 1:stop]
    except Exception as e:
        wl_log.error("ERROR: while parsing URL list: {} \n{}".format(e, traceback.format_exc()))
        sys.exit(1)
    return url_list
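A hypothetical usage, assuming a plain-text file with one URL per line (the filename is made up):

url_list = parse_url_list('urls.txt', start=1, stop=100)  # first 100 URLs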
Example #15
 def __do_instance(self):
     for self.job.visit in xrange(self.job.visits):
         ut.create_dir(self.job.path)
         wl_log.info("*** Visit #%s to %s ***", self.job.visit, self.job.url)
         with self.driver.launch():
             try:
                 self.driver.set_page_load_timeout(cm.SOFT_VISIT_TIMEOUT)
             except WebDriverException as seto_exc:
                 wl_log.error("Setting soft timeout %s", seto_exc)
             self.__do_visit()
             if self.screenshots:
                 try:
                     self.driver.get_screenshot_as_file(self.job.png_file)
                 except WebDriverException:
                     wl_log.error("Cannot get screenshot.")
         sleep(float(self.job.config['pause_between_visits']))
         self.post_visit()
Example #16
def extract_tbb_tarball(archive_path):
    arch_dir = os.path.dirname(archive_path)
    extracted_dir = os.path.join(arch_dir, "tor-browser_en-US")
    tar_cmd = "tar xvf %s -C %s" % (archive_path, arch_dir)
    status, txt = commands.getstatusoutput(tar_cmd)
    if status or not os.path.isdir(extracted_dir):
        wl_log.error("Error extracting TBB tarball %s: (%s: %s)"
                     % (tar_cmd, status, txt))
        return False
    dest_dir = archive_path.split(".tar")[0]
    mv_cmd = "mv %s %s" % (extracted_dir, dest_dir)
    status, txt = commands.getstatusoutput(mv_cmd)
    if status or not os.path.isdir(dest_dir):
        wl_log.error("Error moving extracted TBB with the command %s: (%s: %s)"
                     % (mv_cmd, status, txt))
        return False
    return True
Example #18
 def __do_instance(self):
     for self.job.visit in range(self.job.visits):
         ut.create_dir(self.job.path)
         wl_log.info("*** Visit #%s to %s ***", self.job.visit,
                     self.job.url)
         # BrowserWrapper actually constructs the driver object here.
         # __enter__ launches a new browser and __exit__ quits it.
         # Issue: after each driver.quit(), the newly launched driver reuses
         # the previously created temporary profile.
         with self.driver.launch():
             try:
                 self.driver.set_page_load_timeout(cm.SOFT_VISIT_TIMEOUT)
             except WebDriverException as seto_exc:
                 wl_log.error("Setting soft timeout %s", seto_exc)
             self.__do_visit()
             if self.screenshots:
                 try:
                     self.driver.get_screenshot_as_file(self.job.png_file)
                 except WebDriverException:
                     wl_log.error("Cannot get screenshot.")
         sleep(float(self.job.config['pause_between_visits']))
         self.post_visit()
Example #19
 def filter_guards_from_pcap(self):
     guard_ips = set([ip for ip in self.tor_controller.get_all_guard_ips()])
     wl_log.debug("Found %s guards in the concensus.", len(guard_ips))
     orig_pcap = self.pcap_path + ".original"
     copyfile(self.pcap_path, orig_pcap)
     try:
         preader = PcapReader(orig_pcap)
         pcap_filtered = []
         for p in preader:
             if IP not in p:
                 pcap_filtered.append(p)
                 continue
             ip = p.payload
             if ip.dst in guard_ips or ip.src in guard_ips:
                 pcap_filtered.append(p)
         wrpcap(self.pcap_path, pcap_filtered)
     except Exception as e:
         wl_log.error("ERROR: filtering pcap file: %s. Check old pcap: %s",
                      e, orig_pcap)
     else:
         os.remove(orig_pcap)
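The example above depends on scapy for reading and writing the capture; the imports it needs would be along these lines:

import os
from shutil import copyfile
from scapy.all import IP, PcapReader, wrpcap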
    def __do_instance(self):
        for self.job.visit in xrange(self.job.visits):
            ut.create_dir(self.job.path)
            wl_log.info("*** Visit #%s to %s ***", self.job.visit, self.job.url)
            with self.driver.launch():
                try:
                    self.driver.set_page_load_timeout(cm.SOFT_VISIT_TIMEOUT)
                except WebDriverException as seto_exc:
                    wl_log.error("Setting soft timeout %s", seto_exc)
                self.__do_visit()

                if self.screenshots:
                    try:
                        self.driver.get_screenshot_as_file(self.job.png_file)
                    except WebDriverException:
                        wl_log.error("Cannot get screenshot.")

                if self.har_export:
                    try:
                        jscript = "return HAR.triggerExport().then(harLog => {return harLog;});"
                        har_string = self.driver.execute_script(jscript)
                        with open(self.job.har_file, 'w') as fd:
                            json.dump(har_string, fd)
                    except WebDriverException:
                        wl_log.error("Cannot export HAR.")

            sleep(float(self.job.config['pause_between_visits']))
            self.post_visit()
Example #21
def die(last_words='Unknown problem, quitting!'):
    wl_log.error(last_words)
    sys.exit(1)
def run():
    # Parse arguments
    args, config = parse_arguments()

    # build dirs
    build_crawl_dirs()

    # Read URLs
    if isfile(args.urls):
        url_list = parse_url_list(args.urls, args.start, args.stop)
    else:
        try:
            url_list = args.urls.split(',')
        except Exception as e:
            wl_log.error("ERROR: expects a string with comma-separated list "
                         "of URLs of a path to file")
    host_list = [urlparse(url).hostname for url in url_list]

    # Configure logger
    add_log_file_handler(wl_log, cm.CRAWL_LOG_FILENAME)

    # Configure controller
    torrc_config = ut.get_dict_subconfig(config, args.config, "torrc")
    controller = TorController(tbb_path=args.tbb_path,
                               tor_binary_path=args.tor_binary_path,
                               tor_data_path=args.tor_data_path,
                               torrc_dict=torrc_config,
                               pollute=False)

    # Configure browser
    ffprefs = ut.get_dict_subconfig(config, args.config, "ffpref")
    ffprefs = ut.set_dict_value_types(ffprefs)
    print(ffprefs)
    addons_path = [abspath(args.addons_dir)] if args.addons_dir else []
    driver_config = {'tbb_path': cm.TBB_DIR,
                     'tor_cfg': USE_RUNNING_TOR,
                     'pref_dict': ffprefs,
                     'extensions': addons_path,
                     'socks_port': int(torrc_config['socksport']),
                     'control_port': int(torrc_config['controlport']),
                     'canvas_allowed_hosts': host_list
                     }

    # Instantiate crawler
    crawl_type = getattr(crawler_mod, "Crawler" + args.type)
    crawler = crawl_type(controller,
                         driver_config=driver_config,
                         device=args.device,
                         screenshots=args.screenshots)

    # Configure crawl
    if args.recover_file is not None:
        if isfile(args.recover_file):
            with open(args.recover_file) as fchkpt:
                job = pickle.load(fchkpt)
                wl_log.info("Job recovered: %s" % str(job))
        else:
            wl_log.error("Checkpoint file %s does not exist" % args.recover_file)
            sys.exit(1)
    else:
        # parse job configuration
        job_config = ut.get_dict_subconfig(config, args.config, "job")

        # get chunk of urls to crawl
        chunk = int(job_config.get('chunk', 0))
        chunks = int(job_config.get('chunks', 1))
        range_chunk = len(url_list) / chunks
        if chunk == chunks - 1: # last chunk takes remaining urls
            url_list_chunk = url_list[chunk * range_chunk:]
        else:
            url_list_chunk = url_list[chunk * range_chunk:(chunk + 1) * range_chunk]
        job = crawler_mod.CrawlJob(job_config, url_list_chunk)

    # Run display
    xvfb_display = setup_virtual_display(args.virtual_display)

    # Run the crawl
    chdir(cm.CRAWL_DIR)
    try:
        crawler.crawl(job)
    except KeyboardInterrupt:
        wl_log.warning("Keyboard interrupt! Quitting...")
        sys.exit(-1)
    except Exception as e:
        wl_log.error("ERROR: unknown exception while crawling: %s" % e)
    finally:
        #driver.quit()
        #controller.quit()
        # Post crawl
        post_crawl()

        # Close display
        ut.stop_xvfb(xvfb_display)

    # die
    wl_log.info("[tbcrawler] the crawl has finished.")
    sys.exit(0)
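run() leans on two configuration helpers from ut that are not shown, get_dict_subconfig and set_dict_value_types. A possible sketch, assuming config is a ConfigParser instance and that option names look like "torrc socksport" (prefix, space, option name); both the option naming scheme and the type conversion rules are assumptions:

def get_dict_subconfig(config, section, prefix):
    """Return the options of `section` whose names start with `prefix`,
    keyed by the part of the name after the prefix."""
    return {option.split()[1]: config.get(section, option)
            for option in config.options(section)
            if option.startswith(prefix)}

def set_dict_value_types(d):
    """Best-effort conversion of string values to int or bool."""
    converted = {}
    for key, value in d.items():
        if value.isdigit():
            converted[key] = int(value)
        elif value.lower() in ('true', 'false'):
            converted[key] = (value.lower() == 'true')
        else:
            converted[key] = value
    return converted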
Example #24
    def __init__(self,
                 tbb_binary_path=None,
                 tbb_profile_dir=None,
                 tbb_logfile_path=None,
                 tbb_version=cm.TBB_DEFAULT_VERSION,
                 page_url='',
                 capture_screen=True):
        self.is_running = False
        self.tbb_version = tbb_version
        self.export_lib_path()
        # Initialize Tor Browser's profile
        self.page_url = page_url
        self.capture_screen = capture_screen
        self.profile = self.init_tbb_profile(tbb_version)
        # set homepage to a blank tab
        self.profile.set_preference('browser.startup.page', '0')
        self.profile.set_preference('browser.startup.homepage', 'about:newtab')

        # configure Firefox to use Tor SOCKS proxy
        self.profile.set_preference('network.proxy.type', 1)
        self.profile.set_preference('network.proxy.socks', '127.0.0.1')
        self.profile.set_preference('network.proxy.socks_port', cm.SOCKS_PORT)
        if cm.DISABLE_RANDOMIZEDPIPELINENING:
            self.profile.set_preference(
                'network.http.pipelining.max-optimistic-requests', 5000)
            self.profile.set_preference('network.http.pipelining.maxrequests',
                                        15000)
            self.profile.set_preference('network.http.pipelining', False)
        self.profile.set_preference('extensions.torlauncher.prompt_at_startup',
                                    0)

        # Disable cache - Wang & Goldberg's setting
        self.profile.set_preference('network.http.use-cache', False)
        self.profile.set_preference('webdriver.load.strategy', 'conservative')

        # prevent Tor Browser from running its own Tor process
        self.profile.set_preference('extensions.torlauncher.start_tor', False)
        self.profile.set_preference(
            'extensions.torbutton.versioncheck_enabled', False)
        self.profile.set_preference('permissions.memory_only', False)
        self.profile.update_preferences()

        # Initialize Tor Browser's binary
        self.binary = self.get_tbb_binary(tbb_version=self.tbb_version,
                                          logfile=tbb_logfile_path)

        # Initialize capabilities
        self.capabilities = DesiredCapabilities.FIREFOX
        self.capabilities.update({
            'handlesAlerts': True,
            'databaseEnabled': True,
            'javascriptEnabled': True,
            'browserConnectionEnabled': True
        })

        try:
            super(TorBrowserDriver,
                  self).__init__(firefox_profile=self.profile,
                                 firefox_binary=self.binary,
                                 capabilities=self.capabilities)
            self.is_running = True
        except WebDriverException as error:
            wl_log.error(
                'WebDriverException while connecting to Webdriver %s' % error)
        except socket.error as skterr:
            wl_log.error('Error connecting to Webdriver', exc_info=True)
            wl_log.error(skterr)
        except Exception as e:
            wl_log.error('Error connecting to Webdriver: %s' % e,
                         exc_info=True)
    def __init__(self,
                 tbb_binary_path=None,
                 tbb_profile_dir=None,
                 tbb_logfile_path=None,
                 tbb_version=cm.TBB_DEFAULT_VERSION,
                 page_url="",
                 capture_screen=True):
        #self.session_id = None
        self.is_running = False
        self.tbb_version = tbb_version
        self.export_lib_path()
        # Initialize Tor Browser's profile
        self.page_url = page_url
        self.capture_screen = capture_screen
        self.profile = self.init_tbb_profile(tbb_version)
        # set homepage to a blank tab
        self.profile.set_preference('browser.startup.page', "0")
        self.profile.set_preference('browser.startup.homepage', 'about:newtab')

        # configure Firefox to use Tor SOCKS proxy
        self.profile.set_preference('network.proxy.type', 1)
        self.profile.set_preference('network.proxy.socks', '127.0.0.1')
        self.profile.set_preference('network.proxy.socks_port', cm.SOCKS_PORT)
        if cm.DISABLE_RANDOMIZEDPIPELINENING:
            self.profile.set_preference(
                'network.http.pipelining.max-optimistic-requests', 5000)
            self.profile.set_preference('network.http.pipelining.maxrequests',
                                        15000)
            self.profile.set_preference('network.http.pipelining', False)

        self.profile.set_preference('extensions.torlauncher.prompt_at_startup',
                                    0)

        # Disable cache - Wang & Goldberg's setting
        self.profile.set_preference('network.http.use-cache', False)

        # http://www.w3.org/TR/webdriver/#page-load-strategies-1
        # wait for all frames to load and make sure there's no
        # outstanding http requests (except AJAX)
        # https://code.google.com/p/selenium/wiki/DesiredCapabilities
        self.profile.set_preference('webdriver.load.strategy', 'conservative')
        # Note that W3C doesn't mention "conservative", this may change in the
        # upcoming versions of the Firefox Webdriver
        # https://w3c.github.io/webdriver/webdriver-spec.html#the-page-load-strategy

        # prevent Tor Browser from running its own Tor process
        self.profile.set_preference('extensions.torlauncher.start_tor', False)
        self.profile.set_preference(
            'extensions.torbutton.versioncheck_enabled', False)
        self.profile.set_preference('permissions.memory_only', False)
        self.profile.update_preferences()
        # Initialize Tor Browser's binary
        self.binary = self.get_tbb_binary(tbb_version=self.tbb_version,
                                          logfile=tbb_logfile_path)

        # Initialize capabilities
        self.capabilities = DesiredCapabilities.PHANTOMJS
        self.capabilities.update({
            'handlesAlerts': True,
            'databaseEnabled': True,
            'browserConnectionEnabled': True,
            'javascriptEnabled': True
        })
        service_args = [
            '--proxy=127.0.0.1:%s' % (cm.SOCKS_PORT),
            '--proxy-type=socks5',
        ]

        try:
            super(TorBrowserDriver, self)\
                .__init__(executable_path="/usr/bin/phantomjs",
                          desired_capabilities=self.capabilities, service_args=service_args)
            self.is_running = True
        except WebDriverException as error:
            wl_log.error(
                "WebDriverException while connecting to Webdriver %s" % error)
        except socket.error as skterr:
            wl_log.error("Error connecting to Webdriver", exc_info=True)
            wl_log.error(skterr.message)
        except Exception as e:
            wl_log.error("Error connecting to Webdriver: %s" % e,
                         exc_info=True)
Example #26
    if verbose:
        wl_log.setLevel(logging.DEBUG)
    else:
        wl_log.setLevel(logging.INFO)

    # Validate the given arguments
    # Read urls
    url_list = np.loadtxt(url_list_path, delimiter='\n', dtype=str)
    url_list = url_list.tolist()
    url_list = url_list[start_line - 1:stop_line]
    torrc_dict = cm.TORRC_DEFAULT

    if not tbb_version:
        tbb_version = cm.TBB_DEFAULT_VERSION
    elif tbb_version not in cm.TBB_KNOWN_VERSIONS:
        ut.die('Version of Tor browser is not recognized.')

    crawler = Crawler(torrc_dict, url_list, tbb_version, xvfb, capture_screen)
    wl_log.info('Command line parameters: %s' % sys.argv)

    # Run the crawl
    try:
        crawler.crawl(no_of_batches,
                      no_of_instances,
                      start_line=start_line - 1)
    except KeyboardInterrupt:
        wl_log.warning('Keyboard interrupt! Quitting...')
    except Exception as e:
        wl_log.error('Exception: \n%s' % (traceback.format_exc()))
    finally:
        crawler.stop_crawl()
 def set_page_load_timeout(self):
     try:
         self.driver.set_page_load_timeout(cm.SOFT_VISIT_TIMEOUT)
     except WebDriverException as seto_exc:
         wl_log.error("Setting soft timeout %s", seto_exc)