def __init__(self, rethinkdb_trough_db_url, promotion_interval=None):
    '''
    TroughClient constructor

    Args:
        rethinkdb_trough_db_url: url with schema rethinkdb:// pointing to
            trough configuration database
        promotion_interval: if specified, `TroughClient` will spawn a
            thread that "promotes" (pushes to hdfs) "dirty" trough segments
            (segments that have received writes) periodically, sleeping for
            `promotion_interval` seconds between cycles (default None)
    '''
    parsed = doublethink.parse_rethinkdb_url(rethinkdb_trough_db_url)
    self.rr = doublethink.Rethinker(
            servers=parsed.hosts, db=parsed.database)
    self.svcreg = doublethink.ServiceRegistry(self.rr)
    self._write_url_cache = {}
    self._read_url_cache = {}
    self._dirty_segments = set()
    self._dirty_segments_lock = threading.RLock()

    self.promotion_interval = promotion_interval
    self._promoter_thread = None
    if promotion_interval:
        self._promoter_thread = threading.Thread(
                target=self._promotrix, name='TroughClient-promoter')
        # Thread.setDaemon() is deprecated (since py3.10); set the
        # attribute directly instead
        self._promoter_thread.daemon = True
        self._promoter_thread.start()
def __init__(self, options=warcprox.Options()):
    '''
    Rethinkdb-backed dedup db; resolves connection info from
    `options.rethinkdb_dedup_url` and ensures the backing table exists.
    '''
    conf = doublethink.parse_rethinkdb_url(options.rethinkdb_dedup_url)
    self.rr = doublethink.Rethinker(servers=conf.hosts, db=conf.database)
    self.table = conf.table
    self._ensure_db_table()
    self.options = options
def __init__(self, options=warcprox.Options()):
    # pull host/db/table out of the configured rethinkdb dedup url
    parsed_url = doublethink.parse_rethinkdb_url(options.rethinkdb_dedup_url)
    self.rr = doublethink.Rethinker(
            servers=parsed_url.hosts, db=parsed_url.database)
    self.table = parsed_url.table
    self._ensure_db_table()
    self.options = options
def __init__(self, rethinkdb_trough_db_url, promotion_interval=None):
    '''
    TroughClient constructor

    Args:
        rethinkdb_trough_db_url: url with schema rethinkdb:// pointing to
            trough configuration database
        promotion_interval: if specified, `TroughClient` will spawn a
            thread that "promotes" (pushes to hdfs) "dirty" trough segments
            (segments that have received writes) periodically, sleeping for
            `promotion_interval` seconds between cycles (default None)
    '''
    parsed = doublethink.parse_rethinkdb_url(rethinkdb_trough_db_url)
    self.rr = doublethink.Rethinker(
            servers=parsed.hosts, db=parsed.database)
    self.svcreg = doublethink.ServiceRegistry(self.rr)
    self._write_url_cache = {}
    self._read_url_cache = {}
    self._dirty_segments = set()
    self._dirty_segments_lock = threading.RLock()

    self.promotion_interval = promotion_interval
    self._promoter_thread = None
    if promotion_interval:
        self._promoter_thread = threading.Thread(
                target=self._promotrix, name='TroughClient-promoter')
        # prefer the `daemon` attribute over deprecated setDaemon()
        self._promoter_thread.daemon = True
        self._promoter_thread.start()
def service_registry(options):
    '''
    Returns a doublethink.ServiceRegistry built from
    `options.rethinkdb_services_url`, or None if that option is unset.
    '''
    if not options.rethinkdb_services_url:
        return None
    parsed = doublethink.parse_rethinkdb_url(options.rethinkdb_services_url)
    rr = doublethink.Rethinker(servers=parsed.hosts, db=parsed.database)
    return doublethink.ServiceRegistry(rr, table=parsed.table)
def __init__(self, options=warcprox.Options()):
    # delegate generic setup to the base stats processor
    StatsProcessor.__init__(self, options)
    u = doublethink.parse_rethinkdb_url(options.rethinkdb_stats_url)
    self.rr = doublethink.Rethinker(servers=u.hosts, db=u.database)
    self.table = u.table
    # cap replication at 3, or fewer if the cluster is smaller
    self.replicas = min(3, len(self.rr.servers))
def __init__(self, options=warcprox.Options()):
    '''
    Rethinkdb-backed stats processor; connection info comes from
    `options.rethinkdb_stats_url`.
    '''
    StatsProcessor.__init__(self, options)
    conf = doublethink.parse_rethinkdb_url(options.rethinkdb_stats_url)
    self.rr = doublethink.Rethinker(
            servers=conf.hosts, db=conf.database)
    self.table = conf.table
    self.replicas = min(3, len(self.rr.servers))
def __init__(self, options=warcprox.Options()):
    '''
    Connects to rethinkdb per `options.rethinkdb_stats_url` and prepares
    the batching state used to aggregate stats updates.
    '''
    u = doublethink.parse_rethinkdb_url(options.rethinkdb_stats_url)
    self.rr = doublethink.Rethinker(servers=u.hosts, db=u.database)
    self.table = u.table
    # replicate at most 3 ways, limited by how many servers we have
    self.replicas = min(3, len(self.rr.servers))
    self._ensure_db_table()
    self.options = options

    self._stop = threading.Event()
    self._batch_lock = threading.RLock()
    with self._batch_lock:
        self._batch = {}
    self._timer = None
def __init__(self, options=warcprox.Options()):
    '''
    Sets up the rethinkdb connection and the in-memory batching state
    used to buffer writes to the big captures table.
    '''
    loc = doublethink.parse_rethinkdb_url(options.rethinkdb_big_table_url)
    self.rr = doublethink.Rethinker(servers=loc.hosts, db=loc.database)
    self.table = loc.table
    self.options = options
    self._ensure_db_table()

    self._stop = threading.Event()
    self._timer = None
    self._batch_lock = threading.RLock()
    with self._batch_lock:
        self._batch = []
def ensure_rethinkdb_tables(argv=None):
    '''
    Creates rethinkdb tables if they don't already exist. Warcprox normally
    creates the tables it needs on demand at startup, but if multiple
    instances are starting up at the same time, you can end up with
    duplicate broken tables. So it's a good idea to use this utility at an
    early step when spinning up a cluster.
    '''
    argv = argv or sys.argv
    arg_parser = argparse.ArgumentParser(
            prog=os.path.basename(argv[0]),
            formatter_class=BetterArgumentDefaultsHelpFormatter)
    arg_parser.add_argument(
            '--rethinkdb-stats-url', dest='rethinkdb_stats_url',
            help=('rethinkdb stats table url, e.g. rethinkdb://db0.foo.org,'
                'db1.foo.org:38015/my_warcprox_db/my_stats_table'))
    # at most one dedup backend may be configured
    group = arg_parser.add_mutually_exclusive_group()
    group.add_argument(
            '--rethinkdb-dedup-url', dest='rethinkdb_dedup_url',
            help=('rethinkdb dedup url, e.g. rethinkdb://db0.foo.org,'
                'db1.foo.org:38015/my_warcprox_db/my_dedup_table'))
    group.add_argument(
            '--rethinkdb-big-table-url', dest='rethinkdb_big_table_url',
            help=('rethinkdb big table url (table will be populated with '
                'various capture information and is suitable for use as '
                'index for playback), e.g. rethinkdb://db0.foo.org,'
                'db1.foo.org:38015/my_warcprox_db/captures'))
    group.add_argument(
            '--rethinkdb-trough-db-url', dest='rethinkdb_trough_db_url',
            help=('🐷 url pointing to trough configuration rethinkdb database, '
                'e.g. rethinkdb://db0.foo.org,db1.foo.org:38015'
                '/trough_configuration'))
    arg_parser.add_argument(
            '--rethinkdb-services-url', dest='rethinkdb_services_url',
            help=('rethinkdb service registry table url; if provided, warcprox '
                'will create and heartbeat entry for itself'))
    arg_parser.add_argument(
            '-q', '--quiet', dest='log_level', action='store_const',
            default=logging.INFO, const=logging.WARN)
    arg_parser.add_argument(
            '-v', '--verbose', dest='log_level', action='store_const',
            default=logging.INFO, const=logging.DEBUG)
    args = arg_parser.parse_args(args=argv[1:])

    logging.basicConfig(
            stream=sys.stdout, level=args.log_level,
            format=('%(asctime)s %(levelname)s %(name)s.%(funcName)s'
                '(%(filename)s:%(lineno)d) %(message)s'))
    options = warcprox.Options(**vars(args))

    did_something = False
    if args.rethinkdb_services_url:
        parsed = doublethink.parse_rethinkdb_url(
                options.rethinkdb_services_url)
        rr = doublethink.Rethinker(servers=parsed.hosts, db=parsed.database)
        # NOTE(review): presumably instantiating ServiceRegistry creates the
        # table if missing — confirm against doublethink
        svcreg = doublethink.ServiceRegistry(rr, table=parsed.table)
        did_something = True
    if args.rethinkdb_stats_url:
        stats_db = warcprox.stats.RethinkStatsProcessor(options=options)
        stats_db._ensure_db_table()
        did_something = True
    if args.rethinkdb_dedup_url:
        dedup_db = warcprox.dedup.RethinkDedupDb(options=options)
        did_something = True
    if args.rethinkdb_big_table_url:
        dedup_db = warcprox.bigtable.RethinkCapturesDedup(options=options)
        did_something = True
    if args.rethinkdb_trough_db_url:
        dedup_db = warcprox.dedup.TroughDedupDb(options)
        # logging.warn() is a deprecated alias; use logging.warning()
        logging.warning(
                'trough is responsible for creating most of the rethinkdb '
                'tables that it uses')
        did_something = True
    if not did_something:
        logging.error('nothing to do, no --rethinkdb-* options supplied')
def ensure_rethinkdb_tables(argv=None):
    '''
    Creates rethinkdb tables if they don't already exist. Warcprox normally
    creates the tables it needs on demand at startup, but if multiple
    instances are starting up at the same time, you can end up with
    duplicate broken tables. So it's a good idea to use this utility at an
    early step when spinning up a cluster.
    '''
    argv = argv or sys.argv
    arg_parser = argparse.ArgumentParser(
            prog=os.path.basename(argv[0]),
            formatter_class=BetterArgumentDefaultsHelpFormatter)
    arg_parser.add_argument(
            '--rethinkdb-stats-url', dest='rethinkdb_stats_url', help=(
                'rethinkdb stats table url, e.g. rethinkdb://db0.foo.org,'
                'db1.foo.org:38015/my_warcprox_db/my_stats_table'))
    # the dedup-related options are mutually exclusive: at most one backend
    group = arg_parser.add_mutually_exclusive_group()
    group.add_argument(
            '--rethinkdb-dedup-url', dest='rethinkdb_dedup_url', help=(
                'rethinkdb dedup url, e.g. rethinkdb://db0.foo.org,'
                'db1.foo.org:38015/my_warcprox_db/my_dedup_table'))
    group.add_argument(
            '--rethinkdb-big-table-url', dest='rethinkdb_big_table_url', help=(
                'rethinkdb big table url (table will be populated with '
                'various capture information and is suitable for use as '
                'index for playback), e.g. rethinkdb://db0.foo.org,'
                'db1.foo.org:38015/my_warcprox_db/captures'))
    group.add_argument(
            '--rethinkdb-trough-db-url', dest='rethinkdb_trough_db_url', help=(
                '🐷 url pointing to trough configuration rethinkdb database, '
                'e.g. rethinkdb://db0.foo.org,db1.foo.org:38015'
                '/trough_configuration'))
    arg_parser.add_argument(
            '--rethinkdb-services-url', dest='rethinkdb_services_url', help=(
                'rethinkdb service registry table url; if provided, warcprox '
                'will create and heartbeat entry for itself'))
    # -q/-v both store into log_level; default INFO
    arg_parser.add_argument(
            '-q', '--quiet', dest='log_level', action='store_const',
            default=logging.INFO, const=logging.WARN)
    arg_parser.add_argument(
            '-v', '--verbose', dest='log_level', action='store_const',
            default=logging.INFO, const=logging.DEBUG)
    args = arg_parser.parse_args(args=argv[1:])

    logging.basicConfig(
            stream=sys.stdout, level=args.log_level, format=(
                '%(asctime)s %(levelname)s %(name)s.%(funcName)s'
                '(%(filename)s:%(lineno)d) %(message)s'))
    options = warcprox.Options(**vars(args))

    # track whether any --rethinkdb-* option was actually supplied
    did_something = False
    if args.rethinkdb_services_url:
        parsed = doublethink.parse_rethinkdb_url(
                options.rethinkdb_services_url)
        rr = doublethink.Rethinker(servers=parsed.hosts, db=parsed.database)
        # NOTE(review): presumably constructing ServiceRegistry creates the
        # table if missing — confirm against doublethink
        svcreg = doublethink.ServiceRegistry(rr, table=parsed.table)
        did_something = True
    if args.rethinkdb_stats_url:
        stats_db = warcprox.stats.RethinkStatsProcessor(options=options)
        stats_db._ensure_db_table()
        did_something = True
    if args.rethinkdb_dedup_url:
        # RethinkDedupDb.__init__ calls _ensure_db_table itself
        dedup_db = warcprox.dedup.RethinkDedupDb(options=options)
        did_something = True
    if args.rethinkdb_big_table_url:
        dedup_db = warcprox.bigtable.RethinkCapturesDedup(options=options)
        did_something = True
    if args.rethinkdb_trough_db_url:
        dedup_db = warcprox.dedup.TroughDedupDb(options)
        logging.warning(
                'trough is responsible for creating most of the rethinkdb '
                'tables that it uses')
        did_something = True
    if not did_something:
        logging.error('nothing to do, no --rethinkdb-* options supplied')
def init_controller(args):
    '''
    Creates a warcprox.controller.WarcproxController configured according
    to the supplied arguments (normally the result of parse_args(sys.argv)).
    '''
    options = warcprox.Options(**vars(args))

    # fail fast if the configured digest algorithm is unavailable
    try:
        hashlib.new(args.digest_algorithm)
    except Exception as e:
        logging.fatal(e)
        exit(1)

    listeners = []

    # select exactly one dedup backend based on which option was supplied;
    # '', None and /dev/null for --dedup-db-file mean "dedup disabled"
    if args.rethinkdb_dedup_url:
        dedup_db = warcprox.dedup.RethinkDedupDb(options=options)
    elif args.rethinkdb_big_table_url:
        dedup_db = warcprox.bigtable.RethinkCapturesDedup(options=options)
    elif args.rethinkdb_trough_db_url:
        dedup_db = warcprox.dedup.TroughDedupDb(options)
    elif args.cdxserver_dedup:
        dedup_db = warcprox.dedup.CdxServerDedup(cdx_url=args.cdxserver_dedup)
    elif args.dedup_db_file in (None, '', '/dev/null'):
        logging.info('deduplication disabled')
        dedup_db = None
    else:
        dedup_db = warcprox.dedup.DedupDb(args.dedup_db_file, options=options)
    if dedup_db:
        listeners.append(dedup_db)

    # stats backend follows the same pattern as dedup above
    if args.rethinkdb_stats_url:
        stats_db = warcprox.stats.RethinkStatsDb(options=options)
        listeners.append(stats_db)
    elif args.stats_db_file in (None, '', '/dev/null'):
        logging.info('statistics tracking disabled')
        stats_db = None
    else:
        stats_db = warcprox.stats.StatsDb(args.stats_db_file, options=options)
        listeners.append(stats_db)

    recorded_url_q = warcprox.TimestampedQueue(maxsize=args.queue_size)

    # CA common name truncated to 64 chars
    ca_name = 'Warcprox CA on {}'.format(socket.gethostname())[:64]
    ca = certauth.certauth.CertificateAuthority(
            args.cacert, args.certs_dir, ca_name=ca_name)

    proxy = warcprox.warcproxy.WarcProxy(
            ca=ca, recorded_url_q=recorded_url_q, stats_db=stats_db,
            options=options)

    if args.playback_port is not None:
        playback_index_db = warcprox.playback.PlaybackIndexDb(
                args.playback_index_db_file, options=options)
        playback_proxy = warcprox.playback.PlaybackProxy(
                ca=ca, playback_index_db=playback_index_db, options=options)
        listeners.append(playback_index_db)
    else:
        playback_index_db = None
        playback_proxy = None

    if args.crawl_log_dir:
        listeners.append(warcprox.crawl_log.CrawlLogger(
            args.crawl_log_dir, options=options))

    # load plugin listener classes by dotted name; any failure is fatal
    for qualname in args.plugins or []:
        try:
            (module_name, class_name) = qualname.rsplit('.', 1)
            module_ = importlib.import_module(module_name)
            class_ = getattr(module_, class_name)
            listener = class_()
            listener.notify  # make sure it has this method
            listeners.append(listener)
        except Exception as e:
            logging.fatal('problem with plugin class %r: %s', qualname, e)
            sys.exit(1)

    writer_pool = warcprox.writer.WarcWriterPool(options=options)
    # number of warc writer threads = sqrt(proxy.max_threads)
    # I came up with this out of thin air because it strikes me as reasonable
    # 1=>1 2=>1 5=>2 10=>3 50=>7 100=>10 200=>14 500=>22 1000=>32 2000=>45
    num_writer_threads = args.writer_threads or int(proxy.max_threads ** 0.5)
    logging.debug('initializing %d warc writer threads', num_writer_threads)
    warc_writer_threads = [
        warcprox.writerthread.WarcWriterThread(
            name='WarcWriterThread%03d' % i, recorded_url_q=recorded_url_q,
            writer_pool=writer_pool, dedup_db=dedup_db, listeners=listeners,
            options=options)
        for i in range(num_writer_threads)]

    if args.rethinkdb_services_url:
        parsed = doublethink.parse_rethinkdb_url(
                options.rethinkdb_services_url)
        rr = doublethink.Rethinker(servers=parsed.hosts, db=parsed.database)
        svcreg = doublethink.ServiceRegistry(rr, table=parsed.table)
    else:
        svcreg = None

    controller = warcprox.controller.WarcproxController(
            proxy, warc_writer_threads, playback_proxy,
            service_registry=svcreg, options=options)

    return controller
def init_controller(args):
    '''
    Creates a warcprox.controller.WarcproxController configured according
    to the supplied arguments (normally the result of parse_args(sys.argv)).
    '''
    options = warcprox.Options(**vars(args))

    # fail fast if the configured digest algorithm is unavailable
    try:
        hashlib.new(args.digest_algorithm)
    except Exception as e:
        logging.fatal(e)
        exit(1)

    listeners = []

    # select exactly one dedup backend based on which option was supplied;
    # '', None and /dev/null for --dedup-db-file mean "dedup disabled"
    if args.rethinkdb_dedup_url:
        dedup_db = warcprox.dedup.RethinkDedupDb(options=options)
    elif args.rethinkdb_big_table_url:
        dedup_db = warcprox.bigtable.RethinkCapturesDedup(options=options)
    elif args.rethinkdb_trough_db_url:
        dedup_db = warcprox.dedup.TroughDedupDb(options)
    elif args.cdxserver_dedup:
        dedup_db = warcprox.dedup.CdxServerDedup(cdx_url=args.cdxserver_dedup)
    elif args.dedup_db_file in (None, '', '/dev/null'):
        logging.info('deduplication disabled')
        dedup_db = None
    else:
        dedup_db = warcprox.dedup.DedupDb(args.dedup_db_file, options=options)
    if dedup_db:
        listeners.append(dedup_db)

    # stats backend follows the same pattern as dedup above
    if args.rethinkdb_stats_url:
        stats_db = warcprox.stats.RethinkStatsDb(options=options)
        listeners.append(stats_db)
    elif args.stats_db_file in (None, '', '/dev/null'):
        logging.info('statistics tracking disabled')
        stats_db = None
    else:
        stats_db = warcprox.stats.StatsDb(args.stats_db_file, options=options)
        listeners.append(stats_db)

    recorded_url_q = warcprox.TimestampedQueue(maxsize=args.queue_size)

    # CA common name truncated to 64 chars
    ca_name = 'Warcprox CA on {}'.format(socket.gethostname())[:64]
    ca = certauth.certauth.CertificateAuthority(
            args.cacert, args.certs_dir, ca_name=ca_name)

    proxy = warcprox.warcproxy.WarcProxy(
            ca=ca, recorded_url_q=recorded_url_q, stats_db=stats_db,
            options=options)

    if args.playback_port is not None:
        playback_index_db = warcprox.playback.PlaybackIndexDb(
                args.playback_index_db_file, options=options)
        playback_proxy = warcprox.playback.PlaybackProxy(
                ca=ca, playback_index_db=playback_index_db, options=options)
        listeners.append(playback_index_db)
    else:
        playback_index_db = None
        playback_proxy = None

    if args.crawl_log_dir:
        listeners.append(
            warcprox.crawl_log.CrawlLogger(args.crawl_log_dir, options=options))

    # load plugin listener classes by dotted name; any failure is fatal
    for qualname in args.plugins or []:
        try:
            (module_name, class_name) = qualname.rsplit('.', 1)
            module_ = importlib.import_module(module_name)
            class_ = getattr(module_, class_name)
            listener = class_()
            listener.notify  # make sure it has this method
            listeners.append(listener)
        except Exception as e:
            logging.fatal('problem with plugin class %r: %s', qualname, e)
            sys.exit(1)

    writer_pool = warcprox.writer.WarcWriterPool(options=options)
    # number of warc writer threads = sqrt(proxy.max_threads)
    # I came up with this out of thin air because it strikes me as reasonable
    # 1=>1 2=>1 5=>2 10=>3 50=>7 100=>10 200=>14 500=>22 1000=>32 2000=>45
    num_writer_threads = args.writer_threads or int(proxy.max_threads**0.5)
    logging.debug('initializing %d warc writer threads', num_writer_threads)
    warc_writer_threads = [
        warcprox.writerthread.WarcWriterThread(
            name='WarcWriterThread%03d' % i,
            recorded_url_q=recorded_url_q, writer_pool=writer_pool,
            dedup_db=dedup_db, listeners=listeners, options=options)
        for i in range(num_writer_threads)
    ]

    if args.rethinkdb_services_url:
        parsed = doublethink.parse_rethinkdb_url(
                options.rethinkdb_services_url)
        rr = doublethink.Rethinker(servers=parsed.hosts, db=parsed.database)
        svcreg = doublethink.ServiceRegistry(rr, table=parsed.table)
    else:
        svcreg = None

    controller = warcprox.controller.WarcproxController(
            proxy, warc_writer_threads, playback_proxy,
            service_registry=svcreg, options=options)

    return controller