    def add_canvas_permission(self):
        """Create a permission db (permissions.sqlite) and add an
        exception for canvas image extraction.

        Otherwise screenshots taken by Selenium will be just blank
        images due to the canvas fingerprinting defense in TBB."""

        perm_db = sqlite3.connect(
            os.path.join(self.prof_dir_path, "permissions.sqlite"))
        cursor = perm_db.cursor()
        # http://mxr.mozilla.org/mozilla-esr31/source/build/automation.py.in
        cursor.execute("PRAGMA user_version=3")
        cursor.execute("""CREATE TABLE IF NOT EXISTS moz_hosts (
          id INTEGER PRIMARY KEY,
          host TEXT,
          type TEXT,
          permission INTEGER,
          expireType INTEGER,
          expireTime INTEGER,
          appId INTEGER,
          isInBrowserElement INTEGER)""")

        domain = get_tld(self.page_url)
        wl_log.debug("Adding canvas/extractData permission for %s" % domain)
        qry = """INSERT INTO 'moz_hosts'
        VALUES(NULL,?,'canvas/extractData',1,0,0,0,0);"""
        cursor.execute(qry, (domain,))
        perm_db.commit()
        cursor.close()
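A quick way to confirm that the permission row actually landed in the profile's permission store is to read it back from permissions.sqlite. This is a minimal sketch, not part of the crawler; the profile path is a hypothetical placeholder:

import os
import sqlite3

profile_dir = "/path/to/tbb/profile"  # hypothetical profile directory
db = sqlite3.connect(os.path.join(profile_dir, "permissions.sqlite"))
for host, perm_type, perm in db.execute(
        "SELECT host, type, permission FROM moz_hosts"):
    print("%s %s %s" % (host, perm_type, perm))
db.close()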
    def launch_tor_service(self, logfile='/dev/null'):
        """Launch Tor service and return the process."""
        self.log_file = logfile
        self.tmp_tor_data_dir = ut.clone_dir_with_timestap(
            cm.get_tor_data_path(self.tbb_version))

        self.torrc_dict.update({
            'DataDirectory': self.tmp_tor_data_dir,
            'Log': ['INFO file %s' % logfile]
        })

        wl_log.debug("Tor config: %s" % self.torrc_dict)
        try:
            self.tor_process = stem.process.launch_tor_with_config(
                config=self.torrc_dict,
                init_msg_handler=self.tor_log_handler,
                tor_cmd=cm.get_tor_bin_path(self.tbb_version),
                timeout=270)
            self.controller = Controller.from_port()
            self.controller.authenticate()

        except stem.SocketError as exc:
            wl_log.critical("Unable to connect to tor on port %s: %s" %
                            (cm.SOCKS_PORT, exc))
            sys.exit(1)
        except:
            # most of the time this is due to another instance of
            # tor running on the system
            wl_log.critical("Error launching Tor", exc_info=True)
            sys.exit(1)

        wl_log.info("Tor running at port {0} & controller port {1}.".format(
            cm.SOCKS_PORT, cm.CONTROLLER_PORT))
        return self.tor_process
    def start_capture(self, pcap_path=None, pcap_filter=""):
        """Start capture. Configure sniffer if arguments are given."""
        if cm.running_in_CI:
            wl_log.debug("CI run: will not run dumpcap")
            return False
        if pcap_filter:
            self.set_capture_filter(pcap_filter)

        if pcap_path:
            self.set_pcap_path(pcap_path)
        prefix = ""
        if cm.running_in_CI:
            prefix = "sudo "  # run as sudo in Travis CI since we cannot setcap
        command = '{}dumpcap -P -a duration:{} -a filesize:{} -i eth0 -s 0 -f \'{}\' -w {}'\
            .format(prefix, cm.SOFT_VISIT_TIMEOUT, cm.MAX_DUMP_SIZE,
                    self.pcap_filter, self.pcap_file)
        wl_log.info(command)
        self.p0 = subprocess.Popen(command, stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE, shell=True)
        timeout = DUMPCAP_START_TIMEOUT  # in seconds
        while timeout > 0 and not self.is_dumpcap_running():
            time.sleep(0.1)
            timeout -= 0.1
        if timeout < 0:
            raise cm.DumpcapTimeoutError()
        else:
            wl_log.debug("dumpcap started in %s seconds" %
                         (DUMPCAP_START_TIMEOUT - timeout))

        self.is_recording = True
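The is_dumpcap_running check polled above is not included in this snippet. One plausible implementation, sketched here purely as an assumption, is to verify that the dumpcap child is still alive and its capture file has appeared on disk:

    def is_dumpcap_running(self):
        # Hypothetical helper (not the project's actual implementation):
        # dumpcap counts as running once the child process is alive and the
        # pcap file exists. Relies on `os` being imported and on the p0 and
        # pcap_file attributes set in start_capture above.
        return self.p0.poll() is None and os.path.isfile(self.pcap_file)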
Example #4
0
    def start_capture(self, pcap_path=None, pcap_filter=""):
        """Start capture. Configure sniffer if arguments are given."""
        if pcap_filter:
            self.set_capture_filter(pcap_filter)
        if pcap_path:
            self.set_pcap_path(pcap_path)
        prefix = ""
        # Change eth0 to the local test interface (WLAN); move this into a config file.
        command = '{}dumpcap -P -a duration:{} -a filesize:{} -i {} -s 0 -f \"{}\" -w {}'\
            .format(prefix, cm.SOFT_VISIT_TIMEOUT, cm.MAX_DUMP_SIZE,
                    self.netif, self.pcap_filter, self.pcap_file)
        wl_log.info(command)
        self.p0 = subprocess.Popen(command,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE,
                                   shell=True)
        timeout = DUMPCAP_START_TIMEOUT  # in seconds
        while timeout > 0 and not self.is_dumpcap_running():
            time.sleep(0.1)
            timeout -= 0.1
        if timeout < 0:
            raise DumpcapTimeoutError()
        else:
            wl_log.debug("dumpcap started in %s seconds" %
                         (DUMPCAP_START_TIMEOUT - timeout))

        self.is_recording = True
    def start_capture(self, pcap_path=None, pcap_filter=""):
        """Start capture. Configure sniffer if arguments are given."""
        if cm.running_in_CI:
            wl_log.debug("CI run: will not run dumpcap")
            return False
        if pcap_filter:
            self.set_capture_filter(pcap_filter)

        if pcap_path:
            self.set_pcap_path(pcap_path)
        prefix = ""
        if cm.running_in_CI:
            prefix = "sudo "  # run as sudo in Travis CI since we cannot setcap
        command = '{}dumpcap -a duration:{} -a filesize:{} -i any -s 0 -f \'{}\' -w {}'\
            .format(prefix, cm.SOFT_VISIT_TIMEOUT, cm.MAX_DUMP_SIZE,
                    self.pcap_filter, self.pcap_file)
        wl_log.info(command)
        self.p0 = subprocess.Popen(command,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE,
                                   shell=True)
        timeout = DUMPCAP_START_TIMEOUT  # in seconds
        while timeout > 0 and not self.is_dumpcap_running():
            time.sleep(0.1)
            timeout -= 0.1
        if timeout < 0:
            raise cm.DumpcapTimeoutError()
        else:
            wl_log.debug("dumpcap started in %s seconds" %
                         (DUMPCAP_START_TIMEOUT - timeout))

        self.is_recording = True
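There is no matching teardown in these snippets. A minimal stop_capture sketch that terminates the dumpcap child and clears the recording flag might look like this; the method name and behavior are assumptions, only self.p0 and self.is_recording come from the code above:

    def stop_capture(self):
        # Hypothetical counterpart to start_capture: stop dumpcap (if it is
        # still alive) and mark the sniffer as idle.
        if getattr(self, "p0", None) is not None and self.p0.poll() is None:
            self.p0.terminate()
            self.p0.wait()
        self.is_recording = False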
def parse_arguments():
    # Read configuration file
    config = ConfigParser.RawConfigParser()
    config.read(cm.CONFIG_FILE)

    # Parse arguments
    parser = argparse.ArgumentParser(description='Crawl a list of URLs in multiple batches.')

    # List of urls to be crawled
    parser.add_argument('-u', '--url-file', required=True,
                        help='Path to the file that contains the list of URLs to crawl.',
                        default=cm.LOCALIZED_DATASET)
    parser.add_argument('-t', '--type',
                        choices=cm.CRAWLER_TYPES,
                        help="Crawler type to use for this crawl.",
                        default='Base')
    parser.add_argument('-o', '--output',
                        help='Directory to dump the results (default=./results).',
                        default=cm.CRAWL_DIR)
    parser.add_argument('-c', '--config',
                        help="Crawler tor driver and controller configurations.",
                        choices=config.sections(),
                        default="default")
    parser.add_argument('-b', '--tbb-path',
                        help="Path to the Tor Browser Bundle directory.",
                        default=cm.TBB_DIR)
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='increase output verbosity',
                        default=False)

    # Crawler features
    parser.add_argument('-x', '--virtual-display',
                        help='Dimensions of the virtual display, e.g. 1200x800',
                        default='')
    parser.add_argument('-s', '--screenshots', action='store_true',
                        help='Capture page screenshots',
                        default=False)

    # Limit crawl
    parser.add_argument('--start', type=int,
                        help='Select URLs from this line number (default: 1).',
                        default=1)
    parser.add_argument('--stop', type=int,
                        help='Stop selecting URLs after this line number (default: EOF).',
                        default=maxsize)

    # Parse arguments
    args = parser.parse_args()

    # Set verbose level
    wl_log.setLevel(DEBUG if args.verbose else INFO)
    del args.verbose

    # Change results dir if output
    cm.CRAWL_DIR = args.output
    del args.output

    wl_log.debug("Command line parameters: %s" % argv)
    return args, config
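cm.CONFIG_FILE is expected to be an INI-style file whose section names become the valid --config choices. A minimal sketch written with the same RawConfigParser API; the option names are assumptions, not the crawler's real schema:

import ConfigParser  # Python 2, as in the snippets above

config = ConfigParser.RawConfigParser()
config.add_section('default')
# Hypothetical options; the real section contents depend on the crawler.
config.set('default', 'virtual_display', '')
config.set('default', 'screenshots', 'False')
with open('config.ini', 'wb') as conf_file:
    config.write(conf_file)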
Example #7
    def post_visit(self):
        guard_ips = set([ip for ip in self.controller.get_all_guard_ips()])
        wl_log.debug("Found %s guards in the consensus.", len(guard_ips))
        wl_log.info("Filtering packets without a guard IP.")
        try:
            ut.filter_pcap(self.job.pcap_file, guard_ips)
        except Exception as e:
            wl_log.error("ERROR: filtering pcap file: %s.", e)
            wl_log.error("Check pcap: %s", self.job.pcap_file)
Example #9
def parse_arguments():
    # Parse arguments
    parser = argparse.ArgumentParser(description='Crawl a list of URLs in multiple batches.')

    # List of urls to be crawled
    parser.add_argument('-u', '--url-list', required=True,
                        help='Path to the file that contains the list of URLs to crawl.')
    parser.add_argument('-o', '--output',
                        help='Directory to dump the results (default=./results).',
                        default=cm.RESULTS_DIR)
    parser.add_argument('-b', '--tbb-path',
                        help="Path to the Tor Browser Bundle directory.",
                        default=cm.TBB_PATH)
    parser.add_argument("-e", "--experiment", choices=cm.EXP_TYPES,
                        help="Specifies the crawling methodology.",
                        default=cm.EXP_TYPE_WANG_AND_GOLDBERG)
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='increase output verbosity',
                        default=False)

    # For understanding batch and instance parameters please refer
    # to Wang and Goldberg's WPES'13 paper, Section 4.1.4
    parser.add_argument('--batches', type=int,
                        help='Number of batches in the crawl (default: %s)' % cm.NUM_BATCHES,
                        default=cm.NUM_BATCHES)
    parser.add_argument('--instances', type=int,
                        help='Number of instances to crawl for each web page (default: %s)' % cm.NUM_INSTANCES,
                        default=cm.NUM_INSTANCES)

    # Crawler features
    parser.add_argument('-x', '--xvfb', action='store_true',
                        help='Use XVFB (for headless testing)',
                        default=False)
    parser.add_argument('-c', '--capture-screen', action='store_true',
                        help='Capture page screenshots',
                        default=False)

    # Limit crawl
    parser.add_argument('--start', type=int,
                        help='Start crawling URLs from this line number: (default: 1).',
                        default=1)
    parser.add_argument('--stop', type=int,
                        help='Stop crawling URLs after this line number: (default: EOF).',
                        default=maxsize)

    # Parse arguments
    args = parser.parse_args()

    # Set verbose level
    wl_log.setLevel(DEBUG if args.verbose else INFO)
    del args.verbose

    wl_log.debug("Command line parameters: %s" % argv)

    return args
Example #10
    def take_screenshot(self):
        try:
            out_png = os.path.join(self.visit_dir, 'screenshot.png')
            wl_log.info("Taking screenshot of %s to %s" %
                        (self.page_url, out_png))
            self.tb_driver.get_screenshot_as_file(out_png)
            if cm.running_in_CI:
                wl_log.debug("Screenshot data:image/png;base64,%s" %
                             self.tb_driver.get_screenshot_as_base64())
        except:
            wl_log.info("Exception while taking screenshot of: %s" %
                        self.page_url)
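In CI runs the screenshot is only available as the base64 string written to the log. A small sketch of recovering the PNG from such a log line; the variable and output path are hypothetical:

import base64

b64_data = "..."  # paste the payload logged after "data:image/png;base64,"
with open('screenshot.png', 'wb') as out:
    out.write(base64.b64decode(b64_data))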
Example #11
    def take_screenshot(self):
        try:
            out_png = os.path.join(self.visit_dir, 'screenshot.png')
            wl_log.info("Taking screenshot of %s to %s" % (self.page_url,
                                                           out_png))
            self.tb_driver.get_screenshot_as_file(out_png)
            if cm.running_in_CI:
                wl_log.debug("Screenshot data:image/png;base64,%s"
                             % self.tb_driver.get_screenshot_as_base64())
        except:
            wl_log.info("Exception while taking screenshot of: %s"
                        % self.page_url)
Example #12
def pack_crawl_data(crawl_dir):
    """Compress the crawl dir into a tar archive."""
    if not os.path.isdir(crawl_dir):
        wl_log.critical("Cannot find the crawl dir: %s" % crawl_dir)
        return False
    if crawl_dir.endswith(os.path.sep):
        crawl_dir = crawl_dir[:-1]
    crawl_name = os.path.basename(crawl_dir)
    containing_dir = os.path.dirname(crawl_dir)
    os.chdir(containing_dir)
    arc_path = "%s.tar.gz" % crawl_name
    tar_cmd = "tar czvf %s %s" % (arc_path, crawl_name)
    wl_log.debug("Packing the crawl dir with cmd: %s" % tar_cmd)
    status, txt = commands.getstatusoutput(tar_cmd)
    if status or is_targz_archive_corrupt(arc_path):
        wl_log.critical("Tar command failed or archive is corrupt:\
                         %s \nSt: %s txt: %s" % (tar_cmd, status, txt))
        return False
    else:
        return True
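is_targz_archive_corrupt is referenced above but not shown. One plausible check, sketched here as an assumption rather than the project's actual helper, is to let tar list the archive and treat a non-zero exit status as corruption:

import os
import subprocess

def is_targz_archive_corrupt(arc_path):
    # Hypothetical helper: `tar tzf` walks the whole archive and exits with a
    # non-zero status if the gzip stream or the tar structure is broken.
    with open(os.devnull, 'w') as devnull:
        status = subprocess.call(['tar', 'tzf', arc_path],
                                 stdout=devnull, stderr=devnull)
    return status != 0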
Example #14
    def filter_guards_from_pcap(self):
        guard_ips = set([ip for ip in self.tor_controller.get_all_guard_ips()])
        wl_log.debug("Found %s guards in the consensus.", len(guard_ips))
        orig_pcap = self.pcap_path + ".original"
        copyfile(self.pcap_path, orig_pcap)
        try:
            preader = PcapReader(orig_pcap)
            pcap_filtered = []
            for p in preader:
                if IP not in p:
                    pcap_filtered.append(p)
                    continue
                ip = p.payload
                if ip.dst in guard_ips or ip.src in guard_ips:
                    pcap_filtered.append(p)
            wrpcap(self.pcap_path, pcap_filtered)
        except Exception as e:
            wl_log.error("ERROR: filtering pcap file: %s. Check old pcap: %s",
                         e, orig_pcap)
        else:
            os.remove(orig_pcap)
    def start_capture(self, device='', pcap_path=None, pcap_filter=""):
        """Start capture. Configure sniffer if arguments are given."""
        if pcap_filter:
            self.set_capture_filter(pcap_filter)
        if pcap_path:
            self.set_pcap_path(pcap_path)
        if device:
            self.device = device
        prefix = ""
        command = (
            "{}tshark -nn -T fields -E separator=, -e frame.time_epoch"
            " -e ip.src -e ip.dst -e tcp.srcport -e tcp.dstport"
            " -e ip.proto -e ip.len -e ip.hdr_len -e tcp.hdr_len -e data.len"
            " -e tcp.flags -e tcp.seq -e tcp.ack"
            " -e tcp.window_size_value -e _ws.expert.message "
            " -a duration:{} -a filesize:{} -s 0 -i {} -f \'{}\'"
            " -w {} > {}".format(prefix, cm.SOFT_VISIT_TIMEOUT,
                                 cm.MAX_DUMP_SIZE, self.device,
                                 self.pcap_filter, self.pcap_file,
                                 '%s.tshark' % self.pcap_file[:-5]))
        wl_log.info(command)
        self.p0 = subprocess.Popen(command,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE,
                                   shell=True)
        timeout = SNIFFER_START_TIMEOUT  # in seconds
        while timeout > 0 and not self.is_running():
            time.sleep(0.1)
            timeout -= 0.1
        if timeout < 0:
            raise SnifferTimeoutError()
        else:
            wl_log.debug("tshark started in %s seconds" %
                         (SNIFFER_START_TIMEOUT - timeout))

        self.is_recording = True
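The redirected stdout file contains one comma-separated record per packet, with columns in the order of the -e flags above. A hedged post-processing sketch; the file name is a hypothetical placeholder:

import csv

fields = ['frame.time_epoch', 'ip.src', 'ip.dst', 'tcp.srcport', 'tcp.dstport',
          'ip.proto', 'ip.len', 'ip.hdr_len', 'tcp.hdr_len', 'data.len',
          'tcp.flags', 'tcp.seq', 'tcp.ack', 'tcp.window_size_value',
          '_ws.expert.message']
with open('visit.tshark') as fh:  # hypothetical path; see the stdout redirect above
    for row in csv.reader(fh):
        record = dict(zip(fields, row))
        # e.g. record['ip.src'], record['tcp.dstport'], ...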
    def close_all_streams(self):
        """Close all streams of a controller."""
        wl_log.debug("Closing all streams")
        try:
            ut.timeout(cm.STREAM_CLOSE_TIMEOUT)
            for stream in self.controller.get_streams():
                wl_log.debug(
                    "Closing stream %s %s %s " %
                    (stream.id, stream.purpose, stream.target_address))
                self.controller.close_stream(stream.id)  # MISC reason
        except ut.TimeExceededError:
            wl_log.critical("Closing streams timed out!")
        except:
            wl_log.debug("Exception closing stream")
        finally:
            ut.cancel_timeout()
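ut.timeout and ut.cancel_timeout are not part of these snippets. A minimal SIGALRM-based sketch of the pattern they appear to implement; the exception name follows the snippet above, everything else is an assumption (Unix only, whole-second resolution):

import signal

class TimeExceededError(Exception):
    pass

def _raise_time_exceeded(signum, frame):
    raise TimeExceededError("Timed out")

def timeout(duration):
    # Arm a SIGALRM that fires after `duration` seconds.
    signal.signal(signal.SIGALRM, _raise_time_exceeded)
    signal.alarm(duration)

def cancel_timeout():
    # Disarm any pending alarm.
    signal.alarm(0)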
Example #17
def parse_arguments():
    # Read configuration file
    config = ConfigParser.RawConfigParser()
    config.read(cm.CONFIG_FILE)

    # Parse arguments
    parser = argparse.ArgumentParser(
        description='Crawl a list of URLs in multiple batches.')

    # List of urls to be crawled
    parser.add_argument(
        '-u',
        '--url-file',
        required=True,
        help='Path to the file that contains the list of URLs to crawl.',
        default=cm.LOCALIZED_DATASET)
    parser.add_argument('-t',
                        '--type',
                        choices=cm.CRAWLER_TYPES,
                        help="Crawler type to use for this crawl.",
                        default='Base')
    parser.add_argument(
        '-o',
        '--output',
        help='Directory to dump the results (default=./results).',
        default=cm.CRAWL_DIR)
    parser.add_argument(
        '-c',
        '--config',
        help="Crawler tor driver and controller configurations.",
        choices=config.sections(),
        default="default")
    parser.add_argument('-b',
                        '--tbb-path',
                        help="Path to the Tor Browser Bundle directory.",
                        default=cm.TBB_DIR)
    parser.add_argument('-v',
                        '--verbose',
                        action='store_true',
                        help='increase output verbosity',
                        default=False)

    # Crawler features
    parser.add_argument('-x',
                        '--virtual-display',
                        help='Dimensions of the virtual display, e.g. 1200x800',
                        default='')
    parser.add_argument('-s',
                        '--screenshots',
                        action='store_true',
                        help='Capture page screenshots',
                        default=False)

    # Limit crawl
    parser.add_argument(
        '--start',
        type=int,
        help='Select URLs from this line number (default: 1).',
        default=1)
    parser.add_argument(
        '--stop',
        type=int,
        help='Stop selecting URLs after this line number (default: EOF).',
        default=maxsize)

    # Parse arguments
    args = parser.parse_args()

    # Set verbose level
    wl_log.setLevel(DEBUG if args.verbose else INFO)
    del args.verbose

    # Change results dir if output
    cm.CRAWL_DIR = args.output
    del args.output

    wl_log.debug("Command line parameters: %s" % argv)
    return args, config
def parse_arguments():
    # Read configuration file
    config = ConfigParser.RawConfigParser()
    config.read(cm.CONFIG_FILE)

    # Parse arguments
    parser = argparse.ArgumentParser(description='Crawl a list of URLs in multiple batches.')

    # List of urls to be crawled
    parser.add_argument('-u', '--urls', required=True,
                        help='Path to the file that contains the list of URLs to crawl,'
                             ' or a comma-separated list of URLs.',
                        default=cm.LOCALIZED_DATASET)
    parser.add_argument('-t', '--type',
                        choices=cm.CRAWLER_TYPES,
                        help="Crawler type to use for this crawl.",
                        default='Base')
    parser.add_argument('-o', '--output',
                        help='Directory to dump the results (default=./results).',
                        default=cm.CRAWL_DIR)
    parser.add_argument('-i', '--crawl-id',
                        help='String used as crawl ID (default=DATE).',
                        default=None)
    parser.add_argument('-e', '--addons_dir',
                        help='Directory with the add-ons to be installed (default=None).',
                        default=None)
    parser.add_argument('-c', '--config',
                        help="Crawler tor driver and controller configurations.",
                        choices=config.sections(),
                        default="default")
    parser.add_argument('-b', '--tbb-path',
                        help="Path to the Tor Browser Bundle directory.",
                        default=cm.TBB_DIR)
    parser.add_argument('-f', '--tor-binary-path',
                        help="Path to the Tor binary.")
    parser.add_argument('-g', '--tor-data-path',
                        help="Path to the Tor data directory.")
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='increase output verbosity',
                        default=False)
    parser.add_argument('-r', '--recover-file',
                        help="File with checkpoint to recover from.",
                        default=None)

    # Crawler features
    parser.add_argument('-x', '--virtual-display',
                        help='Dimensions of the virtual display, e.g. 1200x800',
                        default='')
    parser.add_argument('-s', '--screenshots', action='store_true',
                        help='Capture page screenshots',
                        default=False)
    parser.add_argument('-d', '--device',
                        help='Interface to sniff the network traffic',
                        choices=cm.IFACES,
                        default='eth0')

    # Limit crawl
    parser.add_argument('--start', type=int,
                        help='Select URLs from this line number (default: 1).',
                        default=1)
    parser.add_argument('--stop', type=int,
                        help='Stop selecting URLs after this line number (default: EOF).',
                        default=maxsize)

    # Parse arguments
    args = parser.parse_args()

    # Set verbose level
    wl_log.setLevel(DEBUG if args.verbose else INFO)
    del args.verbose

    # Set crawl ID
    if args.crawl_id:
        cm.set_crawl_id(args.crawl_id)
    del args.crawl_id

    # Change results dir if output
    cm.CRAWL_DIR = abspath(args.output)
    cm.LOGS_DIR = join(cm.CRAWL_DIR, 'logs')
    cm.CRAWL_LOG_FILENAME = join(cm.LOGS_DIR, 'crawl.log')
    cm.TOR_LOG_FILENAME = join(cm.LOGS_DIR, 'tor.log')

    if args.recover_file is not None:
        if isfile(cm.CRAWL_LOG_FILENAME):
            move(cm.CRAWL_LOG_FILENAME, cm.CRAWL_LOG_FILENAME + '.' + cm.CRAWL_ID)
        if isfile(cm.TOR_LOG_FILENAME):
            move(cm.TOR_LOG_FILENAME, cm.TOR_LOG_FILENAME + '.' + cm.CRAWL_ID)

    del args.output

    # Set local IP
    addresses = ifaddresses(args.device)
    ips = addresses.setdefault(AF_INET, [{'addr': 'No IP'}])
    cm.LOCAL_IP = ips[0]['addr']

    wl_log.debug("Command line parameters: %s" % argv)
    return args, config
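A brief usage sketch of how the parsed arguments and the chosen config section might be consumed together; the option lookup is an assumption, only parse_arguments and the args/config names come from the code above:

if __name__ == '__main__':
    args, config = parse_arguments()
    # Hypothetical: turn the selected INI section into a dict of overrides.
    overrides = dict(config.items(args.config))
    wl_log.info("Crawling %s with config section %r" % (args.urls, args.config))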