def run():
    """Set up a crawl from the parsed arguments, run it, then exit.

    Steps, in order: parse arguments, build the crawl directories, read
    the URL list, attach the crawl log file, create the Tor controller
    and the Tor Browser driver, build the crawler and its job, start an
    Xvfb display, change into the crawl directory and run the crawl.

    ``post_crawl()`` and ``stop_xvfb()`` are always called once the crawl
    attempt ends. The process exits with status 0 after a completed
    crawl and with -1 when interrupted from the keyboard.
    """
    args, config = parse_arguments()

    # Directory layout first, then the [start, stop] slice of the URL file.
    build_crawl_dirs(args.url_file)
    url_list = parse_url_list(args.url_file, args.start, args.stop)

    # Send crawl logging to the default crawl log file as well.
    add_log_file_handler(wl_log, cm.DEFAULT_CRAWL_LOG)

    # Tor controller, built from the "torrc" section of the chosen config.
    tor_settings = ut.get_dict_subconfig(config, args.config, "torrc")
    controller = TorController(cm.TBB_DIR,
                               torrc_dict=tor_settings,
                               pollute=False)

    # Browser driver, built from the "ffpref" section; it uses the SOCKS
    # port listed in the torrc settings.
    browser_prefs = ut.get_dict_subconfig(config, args.config, "ffpref")
    browser = TorBrowserWrapper(cm.TBB_DIR,
                                tbb_logfile_path=cm.DEFAULT_FF_LOG,
                                tor_cfg=USE_RUNNING_TOR,
                                pref_dict=browser_prefs,
                                socks_port=int(tor_settings['socksport']))

    crawler = crawler_mod.Crawler(browser, controller, args.screenshots,
                                  args.device)

    # The job pairs the "job" section of the config with the URLs to visit.
    job_settings = ut.get_dict_subconfig(config, args.config, "job")
    job = crawler_mod.CrawlJob(job_settings, url_list)

    # Virtual display size: the first 'x'-separated field is taken as the
    # height and the second as the width; defaults apply when unset.
    if args.virtual_display:
        dimensions = args.virtual_display.split('x')
        height = int(dimensions[0])
        width = int(dimensions[1])
    else:
        height = cm.DEFAULT_XVFB_WIN_H
        width = cm.DEFAULT_XVFB_WIN_W
    display = start_xvfb(width, height)

    # The crawl itself runs from inside the crawl directory.
    chdir(cm.CRAWL_DIR)
    try:
        crawler.crawl(job)
    except KeyboardInterrupt:
        wl_log.warning("Keyboard interrupt! Quitting...")
        sys.exit(-1)
    finally:
        # Runs on success, on interrupt and on any other error.
        post_crawl()
        stop_xvfb(display)

    sys.exit(0)
Beispiel #2
0
 def stop_capture(self):
     """Kill the capture (dumpcap) process and log what was captured.

     Kills the children of ``self.p0`` and then ``self.p0`` itself, and
     clears ``self.is_recording``. Logs the size of ``self.pcap_file``
     when it exists; otherwise logs warnings pointing at ``self.log``.
     """
     # self.p0 is the shell, so its pid identifies the children to kill.
     shell_pid = self.p0.pid
     ut.kill_all_children(shell_pid)
     self.p0.kill()
     self.is_recording = False

     capture_path = self.pcap_file
     if not os.path.isfile(capture_path):
         wl_log.warning('Capture killed but cannot find capture file: %s' %
                        capture_path)
         wl_log.warning('Check %s for error information!' % self.log)
         return

     capture_size = os.path.getsize(capture_path)
     wl_log.info('Capture killed. Traffic size: %s Bytes %s' %
                 (capture_size, capture_path))
 def _do_batch(self):
     """
     Visit every URL of the job once inside a single controller launch.

     As noted by the original author: Tor must be (re)started per batch
     to get a different circuit, and when the controller is configured
     not to pollute the profile each restart forces a new entry guard.

     URLs longer than cm.MAX_FNAME_LENGTH are logged and skipped; every
     visited URL is followed by the configured pause.
     """
     with self.controller.launch():
         for site_index in xrange(len(self.job.urls)):
             # self.job.url is read right after the index is stored;
             # presumably it resolves to the URL at that index.
             self.job.site = site_index
             if len(self.job.url) <= cm.MAX_FNAME_LENGTH:
                 self._do_instance()
                 sleep(float(self.job.config['pause_between_videos']))
             else:
                 wl_log.warning("URL is too long: %s" % self.job.url)