from argparse import ArgumentParser

from frontera.settings import Settings


def main():
    parser = ArgumentParser(description="Crawl frontier worker.")
    parser.add_argument('--config', type=str,
                        help='Settings module name, should be accessible by import.')
    parser.add_argument('--hostname', type=str,
                        help='Hostname or IP address to bind. Default is 127.0.0.1.')
    parser.add_argument('--log-level', '-L', type=str, default='INFO',
                        help='Log level, e.g. DEBUG, INFO, WARN, ERROR, FATAL. Default is INFO.')
    parser.add_argument('--port', type=int,
                        help='Base port number; the server will bind to 6 ports starting from base. Default is 5550.')
    args = parser.parse_args()

    settings = Settings(module=args.config)
    # Command-line options take precedence over the settings module.
    hostname = args.hostname if args.hostname else settings.get("ZMQ_HOSTNAME")
    port = args.port if args.port else settings.get("ZMQ_BASE_PORT")
    server = Server(hostname, port)
    server.logger.setLevel(args.log_level)
    server.start()
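# Illustrative only: a minimal sketch of a settings module the --config option
# above could point to. This assumes Settings(module=...) imports the named
# module and serves its uppercase attributes through get(); the key names
# mirror the lookups in main(), while the values and the module name
# "broker_settings" are hypothetical.

# broker_settings.py
ZMQ_HOSTNAME = '127.0.0.1'  # interface the broker binds to
ZMQ_BASE_PORT = 5550        # first of the 6 consecutive ports the server binds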
def __init__(self):
    settings = Settings()
    settings.set('SPIDER_FEED_PARTITIONS', 1)
    settings.set('QUEUE_HOSTNAME_PARTITIONING', True)
    self.mb = MessageBus(settings)
    sl = self.mb.spider_log()

    # strategy worker (sw): consumes the spider log, produces scoring updates
    self.sw_sl_c = sl.consumer(partition_id=0, type='sw')
    us = self.mb.scoring_log()
    self.sw_us_p = us.producer()
    sleep(0.1)  # give the message bus sockets time to connect

    # db worker: consumes spider log and scoring log, produces the spider feed
    self.db_sl_c = sl.consumer(partition_id=None, type='db')
    self.db_us_c = us.consumer()
    sf = self.mb.spider_feed()
    self.db_sf_p = sf.producer()
    sleep(0.1)

    # spider: produces the spider log, consumes its spider feed partition
    self.sp_sl_p = sl.producer()
    self.sp_sf_c = sf.consumer(0)
    sleep(0.1)
def __init__(self, manager):
    self._manager = manager
    settings = Settings(attributes=manager.settings.attributes)
    # Instantiate the message bus class named by the MESSAGE_BUS setting.
    messagebus = load_object(settings.get('MESSAGE_BUS'))
    self.mb = messagebus(settings)
    store_content = settings.get('STORE_CONTENT')
    self._encoder = Encoder(manager.request_model, send_body=store_content)
    self._decoder = Decoder(manager.request_model, manager.response_model)
    # The spider writes to the spider log and reads its partition of the feed.
    self.spider_log_producer = self.mb.spider_log().producer()
    spider_feed = self.mb.spider_feed()
    self.partition_id = settings.get('SPIDER_PARTITION_ID')
    self.consumer = spider_feed.consumer(partition_id=self.partition_id)
    self._get_timeout = float(settings.get('KAFKA_GET_TIMEOUT', 5.0))
    # Wrap request fetching in a buffer that holds back requests for
    # overused hosts.
    self._buffer = OverusedBuffer(self._get_next_requests,
                                  manager.logger.manager.debug)
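# Illustrative only: settings keys read by the initializer above. The key
# names come straight from the get() calls; the values are examples, and the
# MESSAGE_BUS path assumes a ZeroMQ bus implementation is being used.
MESSAGE_BUS = 'frontera.contrib.messagebus.zeromq.MessageBus'
STORE_CONTENT = False       # whether request/response bodies are serialized
SPIDER_PARTITION_ID = 0     # which spider feed partition this spider consumes
KAFKA_GET_TIMEOUT = 5.0     # seconds to wait when polling the feed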
def test_override():
    s = Settings()
    assert s.get("SPIDER_FEED_PARTITIONS") == 2
def test_frontera():
    s = Settings()
    assert s.get("TEST_MODE") is not None
    assert s.get("MAX_REQUESTS") is not None
        # Tail of the batch-generation method: record batch stats for reporting.
        self.stats.setdefault('batches_after_start', 0)
        self.stats['batches_after_start'] += 1
        self.stats['last_batch_generated'] = asctime()
        return count


if __name__ == '__main__':
    parser = ArgumentParser(description="Frontera DB worker.")
    parser.add_argument('--no-batches', action='store_true',
                        help='Disables periodical generation of new batches.')
    parser.add_argument('--no-incoming', action='store_true',
                        help='Disables periodical incoming topic consumption.')
    parser.add_argument('--config', type=str, required=True,
                        help='Settings module name, should be accessible by import.')
    parser.add_argument('--log-level', '-L', type=str, default='INFO',
                        help='Log level, e.g. DEBUG, INFO, WARN, ERROR, FATAL.')
    parser.add_argument('--port', type=int,
                        help='JSON-RPC service port to listen on.')
    args = parser.parse_args()

    logger.setLevel(args.log_level)
    logger.addHandler(CONSOLE)
    settings = Settings(module=args.config)
    if args.port:
        settings.set("JSONRPC_PORT", [args.port])
    worker = FrontierWorker(settings, args.no_batches, args.no_incoming)
    server = WorkerJsonRpcService(worker, settings)
    server.start_listening()
    worker.run()
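# Example invocation of the DB worker above (illustrative; the script name and
# settings module are hypothetical):
#   $ python db_worker.py --config mycrawler.settings --log-level DEBUG --port 6023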
            # Emit an update-score event for the request's fingerprint.
            encoded = self._encoder.encode_update_score(
                request.meta['fingerprint'], score, request.url, False)
            return [encoded]
        return []


if __name__ == '__main__':
    parser = ArgumentParser(description="Crawl frontier scoring worker.")
    parser.add_argument('--config', type=str, required=True,
                        help='Settings module name, should be accessible by import.')
    parser.add_argument('--log-level', '-L', type=str, default='INFO',
                        help='Log level, e.g. DEBUG, INFO, WARN, ERROR, FATAL.')
    parser.add_argument('--strategy', type=str, required=True,
                        help='Crawling strategy module name.')
    args = parser.parse_args()

    logger.setLevel(args.log_level)
    settings = Settings(module=args.config)
    strategy_module = import_module(args.strategy)
    worker = ScoringWorker(settings, strategy_module)
    worker.run()
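# Example invocation of the scoring worker above (illustrative; the script
# name and module paths are hypothetical):
#   $ python strategy_worker.py --config mycrawler.settings --strategy mycrawler.strategy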
def test_instance_attrs():
    s = Settings(attributes={"XYZ": "hey"})
    assert s.attributes["XYZ"] == "hey"