Exemple #1
0
 def __init__(self, stats_db=None, status_callback=None,
              options=warcprox.Options()):
     """
     Initialize both base classes explicitly.

     `PooledMitmProxy.__init__` receives `options`; then
     `SingleThreadedWarcProxy.__init__` receives `stats_db`,
     `status_callback` and `options`.
     """
     # the pooled mitm proxy base is initialized before the warc proxy base
     warcprox.mitmproxy.PooledMitmProxy.__init__(self, options)
     SingleThreadedWarcProxy.__init__(
         self, stats_db, status_callback, options)
Exemple #2
0
 def __init__(self, options=warcprox.Options()):
     """
     Build a Rethinker from `options.rethinkdb_dedup_url`.

     The url is parsed with `doublethink.parse_rethinkdb_url`; its hosts and
     database configure `self.rr`, its table name is kept in `self.table`,
     and `_ensure_db_table()` is called afterwards.
     """
     url_parts = doublethink.parse_rethinkdb_url(
         options.rethinkdb_dedup_url)
     self.rr = doublethink.Rethinker(
         servers=url_parts.hosts, db=url_parts.database)
     self.table = url_parts.table
     self._ensure_db_table()
     # note: options are stored only after _ensure_db_table() has run
     self.options = options
Exemple #3
0
 def __init__(self, ca=None, playback_index_db=None,
              options=warcprox.Options()):
     """
     Start the underlying HTTPServer (bound and activated) with
     `PlaybackProxyHandler` as the request handler.

     The listen address is `options.address` (default 'localhost') and
     `options.playback_port` (default 8001 when it is None).
     """
     host = options.address or 'localhost'
     if options.playback_port is not None:
         port = options.playback_port
     else:
         port = 8001
     http_server.HTTPServer.__init__(
         self, (host, port), PlaybackProxyHandler, bind_and_activate=True)

     self.ca = ca
     self.playback_index_db = playback_index_db
     self.warcs_dir = options.directory
     self.options = options
Exemple #4
0
    def __init__(self, options=None):
        """
        Create warcprox controller based on `options`.

        Args:
            options: a `warcprox.Options`. When omitted, a fresh instance is
                created for this controller. If `writer_threads` is not set
                on it, it is set to 1 on this same object, so a
                caller-supplied `options` is updated in place.
        """
        if options is None:
            # Build the default per call. A `warcprox.Options()` default in
            # the signature is evaluated once, at definition time, so the
            # `writer_threads` assignment below would be written to an
            # instance shared by every default-constructed controller.
            options = warcprox.Options()
        self.options = options

        self.proxy_thread = None
        self.playback_proxy_thread = None
        self._last_rss = None
        self.stop = threading.Event()
        self._start_stop_lock = threading.Lock()

        self.stats_processor = Factory.stats_processor(self.options)

        self.proxy = warcprox.warcproxy.WarcProxy(self.stats_processor,
                                                  self.postfetch_status,
                                                  options)
        self.playback_proxy = Factory.playback_proxy(self.proxy.ca,
                                                     self.options)

        # https://github.com/internetarchive/warcprox/wiki/benchmarking-number-of-threads
        if not self.options.writer_threads:
            self.options.writer_threads = 1

        self.build_postfetch_chain(self.proxy.recorded_url_q)

        self.service_registry = Factory.service_registry(options)
Exemple #5
0
 def __init__(self, cdx_dedup, options=warcprox.Options()):
     """
     Initialize the batch postfetch processor and dedupable mixin bases,
     then create a thread pool with
     `options.cdxserver_dedup_max_threads` workers and an empty batch.

     `cdx_dedup` is stored as-is on the instance.
     """
     warcprox.BaseBatchPostfetchProcessor.__init__(self, options)
     DedupableMixin.__init__(self, options)

     worker_count = options.cdxserver_dedup_max_threads
     self.pool = futures.ThreadPoolExecutor(max_workers=worker_count)
     self.cdx_dedup = cdx_dedup
     self.batch = set()
Exemple #6
0
    def __init__(self, options=None):
        """
        Create warcprox controller based on `options`.

        Args:
            options: a `warcprox.Options`. When omitted, a fresh instance is
                created for this controller. If `writer_threads` is not set
                on it, it is set on this same object to
                int(sqrt(proxy.max_threads)), so a caller-supplied `options`
                is updated in place.
        """
        if options is None:
            # Build the default per call. A `warcprox.Options()` default in
            # the signature is evaluated once, at definition time, so the
            # `writer_threads` assignment below would be written to an
            # instance shared by every default-constructed controller.
            options = warcprox.Options()
        self.options = options

        self.proxy_thread = None
        self.playback_proxy_thread = None
        self._last_rss = None
        self.stop = threading.Event()
        self._start_stop_lock = threading.Lock()

        self.stats_processor = Factory.stats_processor(self.options)

        self.proxy = warcprox.warcproxy.WarcProxy(
                self.stats_processor, self.postfetch_status, options)
        self.playback_proxy = Factory.playback_proxy(
            self.proxy.ca, self.options)

        # default number of warc writer threads = sqrt(proxy.max_threads)
        # pulled out of thin air because it strikes me as reasonable
        # 1=>1 2=>1 5=>2 10=>3 50=>7 100=>10 200=>14 500=>22 1000=>32 2000=>45
        if not self.options.writer_threads:
            self.options.writer_threads = int(self.proxy.max_threads ** 0.5)

        self.build_postfetch_chain(self.proxy.recorded_url_q)

        self.service_registry = Factory.service_registry(options)
 def __init__(self, options=warcprox.Options()):
     """
     Initialize the standard postfetch processor base, then create the warc
     writer pool and the per-instance filters.

     `method_filter` becomes a set of upper-cased method names (empty when
     the option is unset); `blackout_period` defaults to 0.
     """
     warcprox.BaseStandardPostfetchProcessor.__init__(self, options=options)
     self.writer_pool = warcprox.writer.WarcWriterPool(options)

     # read via self.options, which is not assigned in this method
     # (presumably set by the base initializer above)
     configured_methods = self.options.method_filter or []
     self.method_filter = {name.upper() for name in configured_methods}

     self.blackout_period = options.blackout_period or 0
     self.close_prefix_reqs = queue.Queue()
Exemple #8
0
 def _warcprox_opts(self, args):
     '''
     Build the warcprox options object for brozzler-easy.

     Takes `args` as produced by the argument parser built by
     _build_arg_parser and returns a `warcprox.Options` suitable to pass to
     warcprox.main.init_controller. Some values are copied from `args`
     (some under a different name); the rest are fixed settings.
     '''
     opts = warcprox.Options()

     # listen on localhost and let the OS choose an available port; the
     # port is discovered later using sock.getsockname()[1]
     opts.address = 'localhost'
     opts.port = 0

     # certificate and warc locations come from the parsed arguments
     opts.cacert = args.cacert
     opts.certs_dir = args.certs_dir
     opts.directory = args.warcs_dir

     # warc writing settings
     opts.gzip = True
     opts.prefix = 'brozzler'
     opts.size = 1000 * 1000 * 1000
     opts.rollover_idle_time = 3 * 60
     opts.digest_algorithm = 'sha1'
     opts.base32 = True

     # explicitly left unset
     opts.stats_db_file = None
     opts.playback_port = None
     opts.playback_index_db_file = None

     captures_url = 'rethinkdb://%s/%s/captures' % (
         args.rethinkdb_servers, args.rethinkdb_db)
     opts.rethinkdb_big_table_url = captures_url

     opts.queue_size = 500
     opts.max_threads = None
     opts.profile = False
     opts.onion_tor_socks_proxy = args.onion_tor_socks_proxy
     return opts
Exemple #9
0
    def __init__(self, options=warcprox.Options()):
        """
        Set up warc writer state from `options` and make sure the warc
        destination directory exists.

        Unset options fall back to these defaults: rollover_size 1000000000,
        rollover_idle_time None, gzip False, digest_algorithm 'sha1',
        directory './warcs', prefix 'warcprox'.

        Raises:
            OSError: if the destination directory does not exist and cannot
                be created.
        """
        self.rollover_size = options.rollover_size or 1000000000
        self.rollover_idle_time = options.rollover_idle_time or None
        self._last_activity = time.time()

        self.gzip = options.gzip or False
        digest_algorithm = options.digest_algorithm or 'sha1'
        base32 = options.base32
        self.record_builder = warcprox.warc.WarcRecordBuilder(
            digest_algorithm=digest_algorithm, base32=base32)

        # warc path and filename stuff
        self.directory = options.directory or './warcs'
        self.prefix = options.prefix or 'warcprox'

        self._f = None
        self._fpath = None
        self._f_finalname = None
        self._serial = 0

        # 8 distinct characters drawn from digits and lowercase letters
        self._randomtoken = "".join(random.Random().sample(
            string.digits + string.ascii_lowercase, 8))

        if not os.path.exists(self.directory):
            self.logger.info(
                "warc destination directory {} doesn't exist, creating it".
                format(self.directory))
            try:
                # makedirs also creates missing parent directories, which
                # a plain mkdir would fail on
                os.makedirs(self.directory)
            except OSError:
                # something else may have created the directory between
                # the existence check and here; only a directory that is
                # still missing is an error
                if not os.path.isdir(self.directory):
                    raise
def init_proxy(args):
    """
    Build a SingleThreadedWarcProxy from parsed command line `args`.

    A certificate authority is created from `args.cacert` and
    `args.certs_dir`, named after the local hostname (name truncated to 64
    characters). All attributes of `args` are passed on as warcprox
    options, and recorded urls go to a `FakeQueue`.
    """
    hostname = socket.gethostname()
    ca_name = 'Warcprox CA on {}'.format(hostname)[:64]
    ca = certauth.certauth.CertificateAuthority(
        args.cacert, args.certs_dir, ca_name=ca_name)

    options = warcprox.Options(**vars(args))
    return warcprox.warcproxy.SingleThreadedWarcProxy(
        ca, recorded_url_q=FakeQueue(), options=options)
Exemple #11
0
    def __init__(self, options=warcprox.Options()):
        """
        Initialize the pooled mixin with `options.max_threads` and pick the
        request-thread implementation.

        When `options.profile` is set, requests are handled by
        `_profile_process_request_thread`; otherwise by
        `_process_request_thread`.
        """
        PooledMixIn.__init__(self, options.max_threads)
        # a cProfile.Profile is created on first lookup of each key
        self.profilers = collections.defaultdict(cProfile.Profile)

        self.process_request_thread = (
            self._profile_process_request_thread if options.profile
            else self._process_request_thread)
Exemple #12
0
 def __init__(self, options=warcprox.Options()):
     """
     Initialize the standard postfetch processor base, then create the warc
     writer pool, the method filter and a thread pool.

     `method_filter` becomes a set of upper-cased method names (empty when
     the option is unset). The thread pool has `options.writer_threads`
     workers, or 1 when that option is unset.
     """
     warcprox.BaseStandardPostfetchProcessor.__init__(self, options=options)
     self.writer_pool = warcprox.writer.WarcWriterPool(options)

     # read via self.options, which is not assigned in this method
     # (presumably set by the base initializer above)
     configured_methods = self.options.method_filter or []
     self.method_filter = {name.upper() for name in configured_methods}

     worker_count = options.writer_threads or 1
     self.pool = futures.ThreadPoolExecutor(max_workers=worker_count)
     self.batch = set()
Exemple #13
0
    def __init__(self, max_threads, options=warcprox.Options()):
        """
        Initialize the pooled mixin with `max_threads` and pick the
        request-thread implementation.

        When `options.profile` is set, requests are handled by
        `_profile_process_request_thread`; otherwise by
        `_process_request_thread`.
        """
        PooledMixIn.__init__(self, max_threads)
        self.profilers = {}

        self.process_request_thread = (
            self._profile_process_request_thread if options.profile
            else self._process_request_thread)
Exemple #14
0
    def __init__(self, options=warcprox.Options()):
        """
        Initialize the stats processor base and build a Rethinker from
        `options.rethinkdb_stats_url`.

        The parsed table name is kept in `self.table`; `self.replicas` is
        the number of rethinker servers, capped at 3.
        """
        StatsProcessor.__init__(self, options)

        url_parts = doublethink.parse_rethinkdb_url(
            options.rethinkdb_stats_url)
        self.rr = doublethink.Rethinker(
            servers=url_parts.hosts, db=url_parts.database)
        self.table = url_parts.table

        server_count = len(self.rr.servers)
        self.replicas = min(3, server_count)
Exemple #15
0
 def __init__(self, ca=None, recorded_url_q=None, stats_db=None,
              options=warcprox.Options()):
     """
     Initialize both base classes explicitly.

     `PooledMitmProxy.__init__` receives `options.max_threads`, which is
     logged first when it is set; `SingleThreadedWarcProxy.__init__`
     receives `ca`, `recorded_url_q`, `stats_db` and `options`.
     """
     max_threads = options.max_threads
     if max_threads:
         self.logger.info(
             "max_threads=%s set by command line option", max_threads)

     warcprox.mitmproxy.PooledMitmProxy.__init__(self, max_threads)
     SingleThreadedWarcProxy.__init__(
         self, ca, recorded_url_q, stats_db, options)
Exemple #16
0
 def _writer(self, recorded_url):
     """
     Return the warc writer to use for `recorded_url`.

     Without a "warc-prefix" entry in `recorded_url.warcprox_meta` this is
     `self.default_warc_writer`. Otherwise it is the writer stored in
     `self.warc_writers` under that prefix, created on first use from a
     copy of `self.options` with `prefix` overridden.
     """
     default_writer = self.default_warc_writer
     meta = recorded_url.warcprox_meta
     if not meta or "warc-prefix" not in meta:
         return default_writer

     # work on a new Options object so self.options keeps its own prefix
     prefixed_options = warcprox.Options(**vars(self.options))
     prefixed_options.prefix = meta["warc-prefix"]
     if prefixed_options.prefix not in self.warc_writers:
         self.warc_writers[prefixed_options.prefix] = WarcWriter(
             prefixed_options)
     return self.warc_writers[prefixed_options.prefix]
Exemple #17
0
    def __init__(self, options=warcprox.Options()):
        """
        Initialize the pooled mixin (with `options.max_threads`) and the
        MitmProxy base, then pick the request-thread implementation.

        When `options.profile` is set, requests are handled by
        `_profile_process_request_thread`; otherwise by
        `_process_request_thread`.
        """
        PooledMixIn.__init__(self, options.max_threads)
        MitmProxy.__init__(self)

        # a cProfile.Profile is created on first lookup of each key
        self.profilers = collections.defaultdict(cProfile.Profile)
        self.shutting_down = False

        self.process_request_thread = (
            self._profile_process_request_thread if options.profile
            else self._process_request_thread)
Exemple #18
0
 def __init__(self, r, table="dedup", shards=None, replicas=None,
              options=warcprox.Options()):
     """
     Store the rethinker `r` and table settings, then call
     `_ensure_db_table()`.

     `shards` defaults to the number of servers of `r`; `replicas`
     defaults to that number capped at 3.
     """
     self.r = r
     self.table = table

     if shards:
         self.shards = shards
     else:
         self.shards = len(r.servers)

     if replicas:
         self.replicas = replicas
     else:
         self.replicas = min(3, len(r.servers))

     self._ensure_db_table()
     # note: options are stored only after _ensure_db_table() has run
     self.options = options
Exemple #19
0
 def __init__(self, cdx_url="https://web.archive.org/cdx/search",
              maxsize=200, options=warcprox.Options()):
     """Initialize cdx server connection pool and related parameters.

     The pool uses a low timeout value and no retries, to avoid blocking
     warcprox operation by a slow CDX server.
     """
     self.options = options
     self.cdx_url = cdx_url
     self.http_pool = urllib3.PoolManager(
         maxsize=maxsize, retries=0, timeout=2.0)

     # self.cookies is assigned here only when the option is set
     configured_cookies = options.cdxserver_dedup_cookies
     if configured_cookies:
         self.cookies = configured_cookies
Exemple #20
0
    def __init__(self, stats_db=None, status_callback=None,
                 options=warcprox.Options()):
        """
        Record the start time, initialize the single-threaded mitm proxy
        base with `WarcProxyHandler`, and set up per-instance state.

        `recorded_url_q` is a queue bounded by `options.queue_size`, or by
        1000 when that option is unset.
        """
        # taken before the base initializer runs
        self.start_time = doublethink.utcnow()

        warcprox.mitmproxy.SingleThreadedMitmProxy.__init__(
            self, WarcProxyHandler, options)

        self.stats_db = stats_db
        self.status_callback = status_callback

        queue_limit = options.queue_size or 1000
        self.recorded_url_q = queue.Queue(maxsize=queue_limit)
        self.running_stats = warcprox.stats.RunningStats()
Exemple #21
0
    def __init__(self, options=warcprox.Options()):
        """
        Initialize the pooled mixin with `options.max_threads` (logged
        first when it is set) and pick the request-thread implementation.

        When `options.profile` is set, requests are handled by
        `_profile_process_request_thread`; otherwise by
        `_process_request_thread`.
        """
        max_threads = options.max_threads
        if max_threads:
            self.logger.info(
                'max_threads=%s set by command line option', max_threads)

        PooledMixIn.__init__(self, max_threads)
        # a cProfile.Profile is created on first lookup of each key
        self.profilers = collections.defaultdict(cProfile.Profile)

        self.process_request_thread = (
            self._profile_process_request_thread if options.profile
            else self._process_request_thread)
Exemple #22
0
    def __init__(self, rethinker, table="stats", shards=None, replicas=None,
                 options=warcprox.Options()):
        """
        Store the rethinker and table settings, call `_ensure_db_table()`,
        then set up the batching state.

        `shards` defaults to 1; `replicas` defaults to the number of
        rethinker servers, capped at 3.
        """
        self.r = rethinker
        self.table = table

        # 1 shard by default because it's probably a small table
        if shards:
            self.shards = shards
        else:
            self.shards = 1

        if replicas:
            self.replicas = replicas
        else:
            self.replicas = min(3, len(self.r.servers))

        self._ensure_db_table()
        # note: options are stored only after _ensure_db_table() has run
        self.options = options

        self._stop = threading.Event()
        self._batch_lock = threading.RLock()
        # the batch starts out as an empty dict, assigned under the lock
        with self._batch_lock:
            self._batch = {}
        self._timer = None
Exemple #23
0
 def __init__(self, cdx_url="https://web.archive.org/cdx/search",
              maxsize=400, options=warcprox.Options()):
     """Initialize cdx server connection pool and related parameters.

     The pool uses a low timeout value and no retries, to avoid blocking
     warcprox operation by a slow CDX server. The pool is given
     User-Agent and Accept-Encoding headers, plus a Cookie header when
     `options.cdxserver_dedup_cookies` is set.
     """
     self.options = options
     self.cdx_url = cdx_url

     request_headers = {
         'User-Agent': 'warcprox',
         'Accept-Encoding': 'gzip, deflate',
     }
     configured_cookies = options.cdxserver_dedup_cookies
     if configured_cookies:
         request_headers['Cookie'] = configured_cookies

     self.http_pool = urllib3.PoolManager(
         maxsize=maxsize, retries=0, timeout=2.0, headers=request_headers)
Exemple #24
0
    def __init__(self, options=warcprox.Options()):
        """
        Import `trough.client`, initialize the dedupable mixin and create
        the trough client for `options.rethinkdb_trough_db_url`.

        If the import fails, the error is logged at critical level together
        with an install hint, and the process exits with status 1.
        """
        try:
            import trough.client
        except ImportError as err:
            logging.critical(
                '%s: %s\n\nYou might need to run "pip install '
                'warcprox[trough]".', type(err).__name__, err)
            sys.exit(1)

        DedupableMixin.__init__(self, options)
        self.options = options

        trough_url = options.rethinkdb_trough_db_url
        self._trough_cli = trough.client.TroughClient(
            trough_url, promotion_interval=60*60)
Exemple #25
0
    def __init__(self, options=warcprox.Options()):
        """
        Initialize the standard postfetch processor base, then create the
        warc writer pool, the method filter and the worker pool.

        `method_filter` becomes a set of upper-cased method names (empty
        when the option is unset). The worker pool has
        `options.writer_threads` workers (1 when unset) and queues up to
        ten times that many tasks.
        """
        warcprox.BaseStandardPostfetchProcessor.__init__(self, options=options)
        self.writer_pool = warcprox.writer.WarcWriterPool(options)

        # read via self.options, which is not assigned in this method
        # (presumably set by the base initializer above)
        configured_methods = self.options.method_filter or []
        self.method_filter = {name.upper() for name in configured_methods}

        self.thread_local = threading.local()
        self.thread_profilers = {}

        # set max_queued small, because self.inq is already handling
        # queueing for us; but give it a little breathing room to make sure
        # it can keep worker threads busy
        worker_count = options.writer_threads or 1
        self.pool = warcprox.ThreadPoolExecutor(
            max_workers=worker_count,
            max_queued=10 * (options.writer_threads or 1))
        self.batch = set()
Exemple #26
0
 def __init__(self, options=warcprox.Options(), randomtoken='0'):
     """
     Set up warc file state from `options`; no file is open yet.

     Unset options fall back to these defaults: gzip False, prefix
     'warcprox', rollover_size 1000000000, rollover_idle_time None,
     directory './warcs', and the filename template
     '{prefix}-{timestamp17}-{randomtoken}-{serialno}'. The open suffix is
     '.open' unless `options.no_warc_open_suffix` is set.
     """
     # nothing is open initially
     self.f = None
     self.path = None
     self.finalname = None

     self.gzip = options.gzip or False
     self.prefix = options.prefix or 'warcprox'
     if options.no_warc_open_suffix:
         self.open_suffix = ''
     else:
         self.open_suffix = '.open'
     self.randomtoken = randomtoken

     self.rollover_size = options.rollover_size or 1000000000
     self.rollover_idle_time = options.rollover_idle_time or None
     self.directory = options.directory or './warcs'

     default_template = '{prefix}-{timestamp17}-{randomtoken}-{serialno}'
     self.filename_template = options.warc_filename or default_template
     self.last_activity = time.time()
Exemple #27
0
    def __init__(self, options=warcprox.Options()):
        """
        Build a Rethinker from `options.rethinkdb_big_table_url`, call
        `_ensure_db_table()`, then set up the batching state.

        The url is parsed with `doublethink.parse_rethinkdb_url`; its hosts
        and database configure `self.rr` and its table name is kept in
        `self.table`.
        """
        url_parts = doublethink.parse_rethinkdb_url(
            options.rethinkdb_big_table_url)
        self.rr = doublethink.Rethinker(
            servers=url_parts.hosts, db=url_parts.database)
        self.table = url_parts.table
        self.options = options
        self._ensure_db_table()

        self._stop = threading.Event()
        self._batch_lock = threading.RLock()
        # the batch starts out as an empty list, assigned under the lock
        with self._batch_lock:
            self._batch = []
        self._timer = None
Exemple #28
0
    def __init__(self, r, table="captures", shards=None, replicas=None,
                 options=warcprox.Options()):
        """
        Store the rethinker `r` and table settings, call
        `_ensure_db_table()`, then set up the batching state.

        `shards` defaults to the number of servers of `r`; `replicas`
        defaults to that number capped at 3.
        """
        self.r = r
        self.table = table

        if shards:
            self.shards = shards
        else:
            self.shards = len(r.servers)

        if replicas:
            self.replicas = replicas
        else:
            self.replicas = min(3, len(r.servers))

        self.options = options
        self._ensure_db_table()

        self._stop = threading.Event()
        self._batch_lock = threading.RLock()
        # the batch starts out as an empty list, assigned under the lock
        with self._batch_lock:
            self._batch = []
        self._timer = None
Exemple #29
0
    def __init__(self, dbm_file='./warcprox-stats.db',
                 options=warcprox.Options()):
        """
        Open the stats database at `dbm_file` with flag 'c'.

        The dbm implementation is the first one that imports successfully
        out of `dbm.gnu`, `gdbm` and `anydbm`. Whether the file already
        existed is logged before it is opened.
        """
        try:
            import dbm.gnu as dbm_gnu
        except ImportError:
            try:
                import gdbm as dbm_gnu
            except ImportError:
                import anydbm as dbm_gnu

        if os.path.exists(dbm_file):
            message = 'opening existing stats database {}'
        else:
            message = 'creating new stats database {}'
        self.logger.info(message.format(dbm_file))

        self.db = dbm_gnu.open(dbm_file, 'c')
        self.options = options
Exemple #30
0
 def __init__(self, ca=None, playback_index_db=None,
              options=warcprox.Options()):
     """
     Start the underlying HTTPServer (bound and activated) with
     `PlaybackProxyHandler` as the request handler.

     The listen address is `options.address` (default 'localhost') and
     `options.playback_port` (default 8001 when it is None). Also creates
     a TTL cache (maxsize 1024, ttl 60) for bad hostname/port entries and
     a reentrant lock alongside it.
     """
     host = options.address or 'localhost'
     if options.playback_port is not None:
         port = options.playback_port
     else:
         port = 8001
     http_server.HTTPServer.__init__(
         self, (host, port), PlaybackProxyHandler, bind_and_activate=True)

     self.ca = ca
     self.playback_index_db = playback_index_db
     self.warcs_dir = options.directory
     self.options = options

     self.bad_hostnames_ports = TTLCache(maxsize=1024, ttl=60)
     self.bad_hostnames_ports_lock = threading.RLock()