def __init__(self, stats_db=None, status_callback=None, options=warcprox.Options()):
    """
    Pooled (multithreaded) warc proxy.

    Chains up to both parents: the mitmproxy pool for threading and the
    single-threaded warc proxy for the core proxying behavior.
    """
    warcprox.mitmproxy.PooledMitmProxy.__init__(self, options)
    SingleThreadedWarcProxy.__init__(self, stats_db, status_callback, options)
def __init__(self, options=warcprox.Options()):
    """
    Deduplication database backed by a rethinkdb table.

    Connection details (hosts, database, table) are parsed from
    `options.rethinkdb_dedup_url`.
    """
    parsed = doublethink.parse_rethinkdb_url(options.rethinkdb_dedup_url)
    self.rr = doublethink.Rethinker(servers=parsed.hosts, db=parsed.database)
    self.table = parsed.table
    self._ensure_db_table()
    # NOTE(review): self.options is assigned after _ensure_db_table();
    # presumably _ensure_db_table does not read self.options -- confirm
    # before reordering
    self.options = options
def __init__(self, ca=None, playback_index_db=None, options=warcprox.Options()):
    """
    HTTP server that plays back captured urls out of warc files.

    Binds immediately to the address/port taken from `options`.
    """
    listen_host = options.address or 'localhost'
    if options.playback_port is not None:
        listen_port = options.playback_port
    else:
        listen_port = 8001
    http_server.HTTPServer.__init__(
            self, (listen_host, listen_port), PlaybackProxyHandler,
            bind_and_activate=True)
    self.ca = ca
    self.playback_index_db = playback_index_db
    self.warcs_dir = options.directory
    self.options = options
def __init__(self, options=warcprox.Options()): """ Create warcprox controller based on `options`. """ self.options = options self.proxy_thread = None self.playback_proxy_thread = None self._last_rss = None self.stop = threading.Event() self._start_stop_lock = threading.Lock() self.stats_processor = Factory.stats_processor(self.options) self.proxy = warcprox.warcproxy.WarcProxy(self.stats_processor, self.postfetch_status, options) self.playback_proxy = Factory.playback_proxy(self.proxy.ca, self.options) # https://github.com/internetarchive/warcprox/wiki/benchmarking-number-of-threads if not self.options.writer_threads: self.options.writer_threads = 1 self.build_postfetch_chain(self.proxy.recorded_url_q) self.service_registry = Factory.service_registry(options)
def __init__(self, cdx_dedup, options=warcprox.Options()):
    """
    Batch postfetch processor that checks urls for duplicates against a
    cdx server, fanning lookups out to a thread pool.
    """
    warcprox.BaseBatchPostfetchProcessor.__init__(self, options)
    DedupableMixin.__init__(self, options)
    self.cdx_dedup = cdx_dedup
    self.batch = set()
    worker_count = options.cdxserver_dedup_max_threads
    self.pool = futures.ThreadPoolExecutor(max_workers=worker_count)
def __init__(self, options=warcprox.Options()): """ Create warcprox controller based on `options`. """ self.options = options self.proxy_thread = None self.playback_proxy_thread = None self._last_rss = None self.stop = threading.Event() self._start_stop_lock = threading.Lock() self.stats_processor = Factory.stats_processor(self.options) self.proxy = warcprox.warcproxy.WarcProxy( self.stats_processor, self.postfetch_status, options) self.playback_proxy = Factory.playback_proxy( self.proxy.ca, self.options) # default number of warc writer threads = sqrt(proxy.max_threads) # pulled out of thin air because it strikes me as reasonable # 1=>1 2=>1 5=>2 10=>3 50=>7 100=>10 200=>14 500=>22 1000=>32 2000=>45 if not self.options.writer_threads: self.options.writer_threads = int(self.proxy.max_threads ** 0.5) self.build_postfetch_chain(self.proxy.recorded_url_q) self.service_registry = Factory.service_registry(options)
def __init__(self, options=warcprox.Options()):
    """
    Postfetch processor that writes recorded urls to warc files.
    """
    warcprox.BaseStandardPostfetchProcessor.__init__(self, options=options)
    self.writer_pool = warcprox.writer.WarcWriterPool(options)
    # uppercase so membership tests are case-insensitive
    self.method_filter = {m.upper() for m in self.options.method_filter or []}
    self.close_prefix_reqs = queue.Queue()
    self.blackout_period = options.blackout_period or 0
def _warcprox_opts(self, args):
    '''
    Build a warcprox arguments object, suitable for passing to
    warcprox.main.init_controller, from `args` as produced by the parser
    from _build_arg_parser. Some values are copied straight over, some
    are renamed, and some are filled in with defaults appropriate for
    brozzler-easy.
    '''
    opts = warcprox.Options()
    opts.address = 'localhost'
    # let the OS choose an available port; discover it later using
    # sock.getsockname()[1]
    opts.port = 0
    opts.cacert = args.cacert
    opts.certs_dir = args.certs_dir
    opts.directory = args.warcs_dir
    opts.gzip = True
    opts.prefix = 'brozzler'
    opts.size = 1000 * 1000 * 1000
    opts.rollover_idle_time = 3 * 60
    opts.digest_algorithm = 'sha1'
    opts.base32 = True
    opts.stats_db_file = None
    opts.playback_port = None
    opts.playback_index_db_file = None
    opts.rethinkdb_big_table_url = (
            'rethinkdb://%s/%s/captures' % (
                args.rethinkdb_servers, args.rethinkdb_db))
    opts.queue_size = 500
    opts.max_threads = None
    opts.profile = False
    opts.onion_tor_socks_proxy = args.onion_tor_socks_proxy
    return opts
def __init__(self, options=warcprox.Options()):
    """
    Warc writer; manages naming and rollover of the warc files it writes.

    Rollover size/idle time, gzip, digest, prefix and destination
    directory come from `options`, with built-in fallbacks.
    """
    self.rollover_size = options.rollover_size or 1000000000
    self.rollover_idle_time = options.rollover_idle_time or None
    self._last_activity = time.time()
    self.gzip = options.gzip or False
    digest_algorithm = options.digest_algorithm or 'sha1'
    base32 = options.base32
    self.record_builder = warcprox.warc.WarcRecordBuilder(
            digest_algorithm=digest_algorithm, base32=base32)

    # warc path and filename stuff
    self.directory = options.directory or './warcs'
    self.prefix = options.prefix or 'warcprox'

    self._f = None
    self._fpath = None
    self._f_finalname = None
    self._serial = 0

    # random token included in filenames to avoid collisions
    self._randomtoken = "".join(random.Random().sample(
            string.digits + string.ascii_lowercase, 8))

    if not os.path.exists(self.directory):
        self.logger.info(
                "warc destination directory {} doesn't exist, creating it".
                format(self.directory))
    # makedirs(exist_ok=True) instead of mkdir: avoids crashing if another
    # process creates the directory between the exists() check and this
    # call (TOCTOU), and also handles a missing parent directory
    os.makedirs(self.directory, exist_ok=True)
def init_proxy(args):
    """
    Build a single-threaded warc proxy configured from `args`.
    """
    # certificate authority name includes the hostname, capped at 64 chars
    authority_name = 'Warcprox CA on {}'.format(socket.gethostname())[:64]
    authority = certauth.certauth.CertificateAuthority(
            args.cacert, args.certs_dir, ca_name=authority_name)
    opts = warcprox.Options(**vars(args))
    return warcprox.warcproxy.SingleThreadedWarcProxy(
            authority, recorded_url_q=FakeQueue(), options=opts)
def __init__(self, options=warcprox.Options()):
    """
    Thread-pooled mitm proxy; optionally profiles request handling with
    cProfile when `options.profile` is set.
    """
    PooledMixIn.__init__(self, options.max_threads)
    # profiler instances, created lazily on first access
    self.profilers = collections.defaultdict(cProfile.Profile)
    self.process_request_thread = (
            self._profile_process_request_thread if options.profile
            else self._process_request_thread)
def __init__(self, options=warcprox.Options()):
    """
    Postfetch processor that writes recorded urls to warc files using a
    pool of writer threads.
    """
    warcprox.BaseStandardPostfetchProcessor.__init__(self, options=options)
    self.writer_pool = warcprox.writer.WarcWriterPool(options)
    # uppercase so membership tests are case-insensitive
    self.method_filter = {m.upper() for m in self.options.method_filter or []}
    self.batch = set()
    worker_count = options.writer_threads or 1
    self.pool = futures.ThreadPoolExecutor(max_workers=worker_count)
def __init__(self, max_threads, options=warcprox.Options()):
    """
    Thread-pooled mitm proxy; optionally profiles request handling when
    `options.profile` is set.
    """
    PooledMixIn.__init__(self, max_threads)
    self.profilers = {}
    self.process_request_thread = (
            self._profile_process_request_thread if options.profile
            else self._process_request_thread)
def __init__(self, options=warcprox.Options()):
    """
    Stats processor backed by a rethinkdb table, configured from
    `options.rethinkdb_stats_url`.
    """
    StatsProcessor.__init__(self, options)
    url_info = doublethink.parse_rethinkdb_url(options.rethinkdb_stats_url)
    self.rr = doublethink.Rethinker(
            servers=url_info.hosts, db=url_info.database)
    self.table = url_info.table
    # replicate up to 3 times, bounded by the number of servers available
    self.replicas = min(3, len(self.rr.servers))
def __init__(
        self, ca=None, recorded_url_q=None, stats_db=None,
        options=warcprox.Options()):
    """
    Pooled (multithreaded) warc proxy.
    """
    if options.max_threads:
        # honor an explicit thread count from the command line
        self.logger.info(
                "max_threads=%s set by command line option",
                options.max_threads)
    warcprox.mitmproxy.PooledMitmProxy.__init__(self, options.max_threads)
    SingleThreadedWarcProxy.__init__(
            self, ca, recorded_url_q, stats_db, options)
def _writer(self, recorded_url):
    """
    Return the warc writer appropriate for `recorded_url`.

    If the url's warcprox_meta specifies a "warc-prefix", lazily creates
    and caches a dedicated writer for that prefix; otherwise returns the
    default writer.
    """
    w = self.default_warc_writer
    if recorded_url.warcprox_meta and "warc-prefix" in recorded_url.warcprox_meta:
        # clone the proxy-wide options, overriding only the warc prefix
        options = warcprox.Options(**vars(self.options))
        options.prefix = recorded_url.warcprox_meta["warc-prefix"]
        # idiomatic `not in` (was `not options.prefix in ...`)
        if options.prefix not in self.warc_writers:
            self.warc_writers[options.prefix] = WarcWriter(options)
        w = self.warc_writers[options.prefix]
    return w
def __init__(self, options=warcprox.Options()):
    """
    Thread-pooled mitm proxy with a shutdown flag; optionally profiles
    request handling when `options.profile` is set.
    """
    PooledMixIn.__init__(self, options.max_threads)
    MitmProxy.__init__(self)
    # profiler instances, created lazily on first access
    self.profilers = collections.defaultdict(cProfile.Profile)
    self.shutting_down = False
    self.process_request_thread = (
            self._profile_process_request_thread if options.profile
            else self._process_request_thread)
def __init__(self, r, table="dedup", shards=None, replicas=None,
             options=warcprox.Options()):
    """
    Deduplication database backed by a rethinkdb table.
    """
    self.r = r
    self.table = table
    # sharding/replication defaults are derived from the server list
    self.shards = shards if shards else len(r.servers)
    self.replicas = replicas if replicas else min(3, len(r.servers))
    self._ensure_db_table()
    self.options = options
def __init__(self, cdx_url="https://web.archive.org/cdx/search",
             maxsize=200, options=warcprox.Options()):
    """Initialize cdx server connection pool and related parameters.
    Use low timeout value and no retries to avoid blocking warcprox
    operation by a slow CDX server.
    """
    self.cdx_url = cdx_url
    self.options = options
    self.http_pool = urllib3.PoolManager(
            maxsize=maxsize, retries=0, timeout=2.0)
    # bug fix: always define self.cookies; the original only assigned it
    # when the option was truthy, leaving the attribute missing otherwise
    # (a latent AttributeError for anything that reads self.cookies)
    self.cookies = options.cdxserver_dedup_cookies or None
def __init__(
        self, stats_db=None, status_callback=None,
        options=warcprox.Options()):
    """
    Single-threaded warc proxy.
    """
    self.start_time = doublethink.utcnow()
    warcprox.mitmproxy.SingleThreadedMitmProxy.__init__(
            self, WarcProxyHandler, options)
    self.stats_db = stats_db
    self.status_callback = status_callback
    self.running_stats = warcprox.stats.RunningStats()
    # bounded so the queue of recorded urls cannot grow without limit
    q_size = options.queue_size or 1000
    self.recorded_url_q = queue.Queue(maxsize=q_size)
def __init__(self, options=warcprox.Options()):
    """
    Thread-pooled mitm proxy; optionally profiles request handling when
    `options.profile` is set.
    """
    if options.max_threads:
        self.logger.info(
                'max_threads=%s set by command line option',
                options.max_threads)
    PooledMixIn.__init__(self, options.max_threads)
    # profiler instances, created lazily on first access
    self.profilers = collections.defaultdict(cProfile.Profile)
    self.process_request_thread = (
            self._profile_process_request_thread if options.profile
            else self._process_request_thread)
def __init__(self, rethinker, table="stats", shards=None, replicas=None,
             options=warcprox.Options()):
    """
    Stats database backed by a rethinkdb table.
    """
    self.r = rethinker
    self.table = table
    self.shards = shards or 1  # 1 shard by default because it's probably a small table
    self.replicas = replicas or min(3, len(self.r.servers))
    self._ensure_db_table()
    self.options = options
    self._stop = threading.Event()
    self._batch_lock = threading.RLock()
    with self._batch_lock:
        self._batch = {}
    # NOTE(review): presumably a flush timer set up later elsewhere --
    # it is only initialized here, not started
    self._timer = None
def __init__(self, cdx_url="https://web.archive.org/cdx/search",
             maxsize=400, options=warcprox.Options()):
    """
    Set up the cdx server connection pool and related parameters.

    The pool uses a short timeout and no retries so that a slow cdx
    server cannot stall warcprox operation.
    """
    self.cdx_url = cdx_url
    self.options = options
    default_headers = {
            'User-Agent': 'warcprox',
            'Accept-Encoding': 'gzip, deflate'}
    if options.cdxserver_dedup_cookies:
        default_headers['Cookie'] = options.cdxserver_dedup_cookies
    self.http_pool = urllib3.PoolManager(
            maxsize=maxsize, retries=0, timeout=2.0,
            headers=default_headers)
def __init__(self, options=warcprox.Options()):
    """
    Deduplication database backed by trough.

    Exits the process with a helpful message if the optional trough
    dependency is not installed.
    """
    try:
        # imported lazily so the dependency is only required when this
        # dedup backend is actually selected
        import trough.client
    except ImportError as e:
        logging.critical(
                '%s: %s\n\nYou might need to run "pip install '
                'warcprox[trough]".', type(e).__name__, e)
        sys.exit(1)
    DedupableMixin.__init__(self, options)
    self.options = options
    # 60*60 = one hour promotion interval
    self._trough_cli = trough.client.TroughClient(
            options.rethinkdb_trough_db_url, promotion_interval=60*60)
def __init__(self, options=warcprox.Options()):
    """
    Postfetch processor that writes recorded urls to warc files on a
    pool of writer threads.
    """
    warcprox.BaseStandardPostfetchProcessor.__init__(self, options=options)
    self.writer_pool = warcprox.writer.WarcWriterPool(options)
    # uppercase so membership tests are case-insensitive
    self.method_filter = {m.upper() for m in self.options.method_filter or []}
    self.thread_local = threading.local()
    self.thread_profilers = {}
    self.batch = set()
    # set max_queued small, because self.inq is already handling queueing
    # for us; but give it a little breathing room to make sure it can keep
    # worker threads busy
    writers = options.writer_threads or 1
    self.pool = warcprox.ThreadPoolExecutor(
            max_workers=writers, max_queued=10 * writers)
def __init__(self, options=warcprox.Options(), randomtoken='0'):
    """
    Writes one warc file at a time, tracking rollover size/idle limits
    and the filename template from `options`.
    """
    # no file is open until the first write
    self.f = None
    self.path = None
    self.finalname = None

    self.randomtoken = randomtoken
    self.gzip = options.gzip or False
    self.prefix = options.prefix or 'warcprox'
    self.directory = options.directory or './warcs'
    self.rollover_size = options.rollover_size or 1000000000
    self.rollover_idle_time = options.rollover_idle_time or None
    self.filename_template = options.warc_filename or \
            '{prefix}-{timestamp17}-{randomtoken}-{serialno}'

    # files get a '.open' suffix unless disabled by options.no_warc_open_suffix
    if options.no_warc_open_suffix:
        self.open_suffix = ''
    else:
        self.open_suffix = '.open'

    self.last_activity = time.time()
def __init__(self, options=warcprox.Options()):
    """
    Captures "big table" backed by rethinkdb, configured from
    `options.rethinkdb_big_table_url`.
    """
    parsed = doublethink.parse_rethinkdb_url(
            options.rethinkdb_big_table_url)
    self.rr = doublethink.Rethinker(servers=parsed.hosts, db=parsed.database)
    self.table = parsed.table
    self.options = options
    self._ensure_db_table()
    self._stop = threading.Event()
    self._batch_lock = threading.RLock()
    with self._batch_lock:
        self._batch = []
    # NOTE(review): presumably a flush timer set up later elsewhere --
    # it is only initialized here, not started
    self._timer = None
def __init__(
        self, r, table="captures", shards=None, replicas=None,
        options=warcprox.Options()):
    """
    Captures table in rethinkdb.
    """
    self.r = r
    self.table = table
    # sharding/replication defaults are derived from the cluster size
    self.shards = shards if shards else len(r.servers)
    self.replicas = replicas if replicas else min(3, len(r.servers))
    self.options = options
    self._ensure_db_table()

    self._stop = threading.Event()
    self._timer = None
    self._batch_lock = threading.RLock()
    with self._batch_lock:
        self._batch = []
def __init__(self, dbm_file='./warcprox-stats.db', options=warcprox.Options()):
    """
    Stats database backed by a gdbm file.

    Falls back through gdbm implementations for python 2/3 compatibility.
    """
    try:
        import dbm.gnu as dbm_gnu
    except ImportError:
        try:
            import gdbm as dbm_gnu
        except ImportError:
            import anydbm as dbm_gnu
    # lazy %-style logging args instead of eager str.format (formatting is
    # skipped when the log level is disabled); output is unchanged
    if os.path.exists(dbm_file):
        self.logger.info('opening existing stats database %s', dbm_file)
    else:
        self.logger.info('creating new stats database %s', dbm_file)
    # 'c' = open for read/write, creating the file if necessary
    self.db = dbm_gnu.open(dbm_file, 'c')
    self.options = options
def __init__(self, ca=None, playback_index_db=None, options=warcprox.Options()):
    """
    HTTP server that plays back captured urls out of warc files.

    Binds immediately to the address/port taken from `options`, and keeps
    a short-lived cache of bad hostname:port pairs.
    """
    listen_host = options.address or 'localhost'
    if options.playback_port is not None:
        listen_port = options.playback_port
    else:
        listen_port = 8001
    http_server.HTTPServer.__init__(
            self, (listen_host, listen_port), PlaybackProxyHandler,
            bind_and_activate=True)
    self.ca = ca
    self.playback_index_db = playback_index_db
    self.warcs_dir = options.directory
    self.options = options
    # entries expire after 60 seconds
    self.bad_hostnames_ports = TTLCache(maxsize=1024, ttl=60)
    self.bad_hostnames_ports_lock = threading.RLock()