class FakeAsyncConfig(object):
    """Fake class for the configuration of service addresses."""
    core_services = {
        ServiceCoord("Service", 0): Address("0.0.0.0", 0),
        ServiceCoord("Service", 1): Address("0.0.0.1", 1),
    }
    other_services = {}
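# A hedged sketch of how a test case might install FakeAsyncConfig so
# that get_service_address (exercised in test_success below) resolves
# against the fake addresses. The `cms.conf.async_config` attribute is
# an assumption about the patching point, which is not shown in this
# snippet.
import cms.conf

class TestGetServiceAddressSketch(object):
    def setUp(self):
        self._real_async_config = cms.conf.async_config  # assumed attribute
        cms.conf.async_config = FakeAsyncConfig()

    def tearDown(self):
        cms.conf.async_config = self._real_async_config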
def __init__(self, shard, contest_id=None):
    super(EvaluationService, self).__init__(shard)

    self.contest_id = contest_id

    # This lock is used to avoid inserting in the queue (which
    # itself is already thread-safe) an operation which is already
    # being processed. Such an operation might be in one of the
    # following states:
    # 1. in the queue;
    # 2. extracted from the queue by the executor, but not yet
    #    dispatched to a worker;
    # 3. being processed by a worker ("in the worker pool");
    # 4. being processed by action_finished, but with the results
    #    not yet written to the database;
    # 5. with results written in the database.
    #
    # The methods enqueuing operations already check that the
    # operation is not in state 5, and enqueue() checks that it is
    # not in the first three states.
    #
    # Therefore, the lock guarantees that the methods adding
    # operations to the queue (_missing_operations,
    # invalidate_submission, enqueue) are not executed
    # concurrently with action_finished, to avoid picking up
    # operations in state 4.
    self.post_finish_lock = gevent.lock.RLock()

    self.queue_service = self.connect_to(
        ServiceCoord("QueueService", 0))
    self.scoring_service = self.connect_to(
        ServiceCoord("ScoringService", 0))
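# A minimal, self-contained sketch (not ES's actual methods) of the
# discipline the comment above describes: every producer that adds
# operations and the consumer that finalizes results serialize on the
# same RLock, so an operation in state 4 (results being written) can
# never be observed and re-enqueued by a producer.
import gevent.lock

class LockDisciplineSketch(object):
    def __init__(self):
        self.post_finish_lock = gevent.lock.RLock()
        self.queue = []          # stand-in for the real operation queue
        self.finished = set()    # stand-in for "results in the database"

    def enqueue(self, operation):
        with self.post_finish_lock:
            # Under the lock, an operation is either still pending
            # (states 1-3, visible in the queue) or fully finished
            # (state 5); state 4 cannot be observed here.
            if operation not in self.queue and operation not in self.finished:
                self.queue.append(operation)

    def action_finished(self, operation):
        with self.post_finish_lock:
            # "Write to the database" while holding the lock, so that
            # enqueue() never races with a half-recorded result.
            self.finished.add(operation)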
def __init__(self, shard, contest):
    parameters = {
        "login_url": "/",
        "template_path": pkg_resources.resource_filename(
            "cms.server.contest", "templates"),
        "static_files": [("cms.server", "static"),
                         ("cms.server.contest", "static")],
        "cookie_secret": base64.b64encode(config.secret_key),
        "debug": config.tornado_debug,
        "is_proxy_used": config.is_proxy_used,
    }

    try:
        listen_address = config.contest_listen_address[shard]
        listen_port = config.contest_listen_port[shard]
    except IndexError:
        raise ConfigError("Wrong shard number for %s, or missing "
                          "address/port configuration. Please check "
                          "contest_listen_address and "
                          "contest_listen_port in cms.conf." % __name__)

    super(ContestWebServer, self).__init__(
        listen_port,
        HANDLERS,
        parameters,
        shard=shard,
        listen_address=listen_address)

    self.contest = contest

    # This is a dictionary (indexed by username) of pending
    # notifications. Things like "Yay, your submission went
    # through.", not things like "Your question has been replied
    # to", which are handled by the db. Each username points to a
    # list of tuples (timestamp, subject, text).
    self.notifications = {}

    # Retrieve the available translations.
    self.langs = {
        lang_code: wrap_translations_for_tornado(trans)
        for lang_code, trans in get_translations().iteritems()
    }

    self.file_cacher = FileCacher(self)
    self.evaluation_service = self.connect_to(
        ServiceCoord("EvaluationService", 0))
    self.scoring_service = self.connect_to(
        ServiceCoord("ScoringService", 0))

    ranking_enabled = len(config.rankings) > 0
    self.proxy_service = self.connect_to(
        ServiceCoord("ProxyService", 0),
        must_be_present=ranking_enabled)

    printing_enabled = config.printer is not None
    self.printing_service = self.connect_to(
        ServiceCoord("PrintingService", 0),
        must_be_present=printing_enabled)
def get(self, shard=None, contest_id=None):
    if contest_id is not None:
        self.contest = self.safe_get_item(Contest, contest_id)
        contest_address = "/%s" % contest_id
    else:
        contest_address = ""

    if shard is None:
        shard = "all"

    self.r_params = self.render_params()
    self.r_params["resource_shards"] = \
        get_service_shards("ResourceService")
    self.r_params["resource_addresses"] = {}
    if shard == "all":
        for i in range(self.r_params["resource_shards"]):
            self.r_params["resource_addresses"][i] = get_service_address(
                ServiceCoord("ResourceService", i)).ip
    else:
        shard = int(shard)
        try:
            address = get_service_address(
                ServiceCoord("ResourceService", shard))
        except KeyError:
            self.redirect("/resourceslist%s" % contest_address)
            return
        self.r_params["resource_addresses"][shard] = address.ip

    self.render("resources.html", **self.r_params)
def test_success(self):
    """Test success cases."""
    self.assertEqual(
        get_service_address(ServiceCoord("Service", 0)),
        Address("0.0.0.0", 0))
    self.assertEqual(
        get_service_address(ServiceCoord("Service", 1)),
        Address("0.0.0.1", 1))
def test_is_service_proc(self):
    """Several tests for identifying the command line of a service."""
    service = ServiceCoord("Worker", 0)
    good_command_lines = [
        "/usr/bin/python2 cmsWorker 0",
        "/usr/bin/python2 cmsWorker",
        "python2 cmsWorker 0 -c 1",
        "python2 cmsWorker -c 1",
        "python2 cmsWorker -c 1 0",
        "/usr/bin/env python2 cmsWorker 0",
        "/usr/bin/env python2 cmsWorker",
        "/usr/bin/env python2 cmsWorker 0 -c 1",
        "/usr/bin/env python2 cmsWorker -c 1",
        "/usr/bin/env python2 cmsWorker -c 1 0",
        sys.executable + " cmsWorker",
        sys.executable + " cmsWorker 0",
        sys.executable + " cmsWorker 0 -c 1",
        sys.executable + " cmsWorker -c 1",
        sys.executable + " cmsWorker -c 1 0",
    ]
    bad_command_lines = [
        "ps",
        "less cmsWorker 0",
        "less /usr/bin/python2 cmsWorker 0",
        "/usr/bin/python2 cmsWorker 1",
        "/usr/bin/python2 cmsAdminWebServer 0",
    ]

    for cmdline in good_command_lines:
        self.assertTrue(
            ResourceService._is_service_proc(service, cmdline.split(" ")),
            cmdline)
    for cmdline in bad_command_lines:
        self.assertFalse(
            ResourceService._is_service_proc(service, cmdline.split(" ")),
            cmdline)

    # Test that we do not pick the wrong shard.
    service = ServiceCoord("Worker", 1)
    cmdline = sys.executable + " cmsWorker"
    self.assertFalse(
        ResourceService._is_service_proc(service, cmdline.split(" ")),
        cmdline)

    # Test that an empty command line does not cause problems.
    self.assertFalse(ResourceService._is_service_proc(service, []),
                     "Empty command line.")

    # Simulate a service not running on the same machine.
    service = ServiceCoord("FakeWorker", 0)
    cmdline = sys.executable + " cmsFakeWorker 0"
    self.assertFalse(
        ResourceService._is_service_proc(service, cmdline.split(" ")),
        cmdline)
def __init__(self, shard):
    parameters = {
        "ui_modules": views,
        "login_url": "/login",
        "template_path": pkg_resources.resource_filename(
            "cms.server.admin", "templates"),
        "static_files": [("cms.server", "static"),
                         ("cms.server.admin", "static")],
        "cookie_secret": base64.b64encode(config.secret_key),
        "debug": config.tornado_debug,
        "auth_middleware": AWSAuthMiddleware,
        "rpc_enabled": True,
        "rpc_auth": self.is_rpc_authorized,
        "xsrf_cookies": True,
    }
    super(AdminWebServer, self).__init__(
        config.admin_listen_port,
        HANDLERS,
        parameters,
        shard=shard,
        listen_address=config.admin_listen_address)

    # A list of pending notifications.
    self.notifications = []

    self.file_cacher = FileCacher(self)
    self.evaluation_service = self.connect_to(
        ServiceCoord("EvaluationService", 0))
    self.scoring_service = self.connect_to(
        ServiceCoord("ScoringService", 0))

    ranking_enabled = len(config.rankings) > 0
    self.proxy_service = self.connect_to(
        ServiceCoord("ProxyService", 0),
        must_be_present=ranking_enabled)

    self.resource_services = []
    for i in range(get_service_shards("ResourceService")):
        self.resource_services.append(
            self.connect_to(ServiceCoord("ResourceService", i)))
    self.logservice = self.connect_to(ServiceCoord("LogService", 0))
def __init__(self, shard, contest_id=None): """If contest_id is not None, we assume the user wants the autorestart feature. """ logger.initialize(ServiceCoord("ResourceService", shard)) Service.__init__(self, shard, custom_logger=logger) self.contest_id = contest_id # _local_store is a dictionary indexed by time in int(epoch) self._local_store = [] # Floating point epoch using for precise measurement of percents self._last_saved_time = time.time() # Starting point for cpu times self._prev_cpu_times = self._get_cpu_times() # Sorted list of ServiceCoord running in the same machine self._local_services = self._find_local_services() # Dict service with bool to mark if we will restart them. self._will_restart = dict( (service, None if self.contest_id is None else True) for service in self._local_services) # Found process associate to the ServiceCoord. self._procs = dict((service, None) for service in self._local_services) # Previous cpu time for each service. self._services_prev_cpu_times = dict( (service, (0.0, 0.0)) for service in self._local_services) # Start finding processes and their cputimes. self._store_resources(store=False) self.add_timeout(self._store_resources, None, 5) if self.contest_id is not None: self._launched_processes = set([]) self.add_timeout(self._restart_services, None, 5, immediately=True)
def wsgi_app(self, environ, start_response):
    """Execute this instance as a WSGI application.

    See the PEP for the meaning of parameters. The separation of
    __call__ and wsgi_app eases the insertion of middlewares.

    """
    urls = self._url_map.bind_to_environ(environ)
    try:
        endpoint, args = urls.match()
    except HTTPException as exc:
        # HTTPExceptions are valid WSGI applications themselves,
        # but they (and the Responses below) must be *called* with
        # environ and start_response to produce a WSGI body.
        return exc(environ, start_response)

    assert endpoint == "rpc"

    remote_service = ServiceCoord(args['service'], args['shard'])

    if remote_service not in self._service.remote_services:
        return NotFound()(environ, start_response)

    if self._auth is not None and not self._auth(
            args['service'], args['shard'], args['method']):
        return Forbidden()(environ, start_response)

    request = Request(environ)
    request.encoding_errors = "strict"

    # TODO Check content_encoding and content_md5.

    if request.mimetype != "application/json":
        return UnsupportedMediaType()(environ, start_response)

    if request.accept_mimetypes.quality("application/json") <= 0:
        return NotAcceptable()(environ, start_response)

    try:
        data = json.load(request.stream)
    except ValueError:
        return BadRequest()(environ, start_response)

    if not self._service.remote_services[remote_service].connected:
        return ServiceUnavailable()(environ, start_response)

    result = self._service.remote_services[remote_service].execute_rpc(
        args['method'], data)

    result.wait(timeout=60)

    response = Response()
    response.status_code = 200
    response.mimetype = "application/json"
    response.data = json.dumps({
        "data": result.value,
        "error": None if result.successful() else "%s" % result.exception
    })

    return response(environ, start_response)
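# A hedged usage sketch for the WSGI app above, driven through
# werkzeug's test Client. The "/Foo/0/echo" path is an assumption
# about how self._url_map routes to the "rpc" endpoint (service name,
# shard, method); the JSON body carries the remote method's keyword
# arguments. `middleware` stands for an instance of the (unshown)
# class this wsgi_app method belongs to.
import json
from werkzeug.test import Client
from werkzeug.wrappers import Response

client = Client(middleware, Response)
http_response = client.post(
    "/Foo/0/echo",
    data=json.dumps({"value": 42}),
    content_type="application/json",
    headers=[("Accept", "application/json")])
print(json.loads(http_response.data))  # e.g. {"data": 42, "error": None}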
def kill_service(self, service):
    """Kill the service (the autorestart feature, if enabled, will
    then bring it back up). Note that even after this method
    succeeds, get_resource could still report the service as
    running until we call _store_resources again.

    service (string): format: name,shard.

    """
    logger.info("Killing %s as asked.", service)
    try:
        idx = service.rindex(",")
    except ValueError:
        logger.error("Unable to decode service string.")
        return
    name = service[:idx]
    try:
        shard = int(service[idx + 1:])
    except ValueError:
        logger.error("Unable to decode service shard.")
        return

    remote_service = self.connect_to(ServiceCoord(name, shard))
    i = 3
    while not remote_service.connected and i > 0:
        gevent.sleep(0.1)
        i -= 1
    result = remote_service.quit(reason="Asked by ResourceService")
    return result.get()
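# Hedged usage note: kill_service packs name and shard into a single
# "name,shard" string, split at the last comma (hence rindex above).
# A quick self-contained demonstration of that parsing:
service = "Worker,0"
idx = service.rindex(",")
name, shard = service[:idx], int(service[idx + 1:])
assert (name, shard) == ("Worker", 0)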
def test_find_fails(self):
    service = ServiceCoord("EvaluationService", 0)
    for c in self.w0_cmdlines:
        with patch.object(ProcessMatcher, '_get_all_processes') as f:
            f.return_value = TestProcessMatcher._get_all_processes_ret(
                self.bad_cmdlines + [(c, "good")] + self.bad_cmdlines)
            self.assertIsNone(self.pm.find(service))
def setUp(self): self.pm = ProcessMatcher() self.w0_cmdlines = [ "/usr/bin/python2 cmsWorker 0", "/usr/bin/python2 cmsWorker", "python2 cmsWorker 0 -c 1", "python2 cmsWorker -c 1", "python2 cmsWorker -c 1 0", "/usr/bin/env python2 cmsWorker 0", "/usr/bin/env python2 cmsWorker", "/usr/bin/env python2 cmsWorker 0 -c 1", "/usr/bin/env python2 cmsWorker -c 1", "/usr/bin/env python2 cmsWorker -c 1 0", sys.executable + " cmsWorker", sys.executable + " cmsWorker 0", sys.executable + " cmsWorker 0 -c 1", sys.executable + " cmsWorker -c 1", sys.executable + " cmsWorker -c 1 0", ] self.bad_cmdlines = [ "ps", "less cmsWorker 0", "less /usr/bin/python2 cmsWorker 0", "/usr/bin/python2 cmsWorker 1", "/usr/bin/python2 cmsAdminWebServer 0", ] self.w0 = ServiceCoord("Worker", 0)
def __init__(self, shard=0, listen_on_address=None):
    signal.signal(signal.SIGINT, lambda unused_x, unused_y: self.exit())
    self.name = self.__class__.__name__
    self.shard = shard
    self._my_coord = ServiceCoord(self.name, self.shard)

    # Dictionary of (to be) connected RemoteServiceClients.
    self.remote_services = {}

    self.initialize_logging()

    # We set up the address we listen on, for services that want
    # to connect with us.
    try:
        address = get_service_address(self._my_coord)
    except KeyError:
        raise ConfigError("Unable to find address for service %r. "
                          "Is it specified in core_services "
                          "in cms.conf?" % (self._my_coord,))
    logger.info("--- %s %s %s", self.name, listen_on_address, address)

    if listen_on_address is not None:
        self.rpc_server = StreamServer(
            Address(listen_on_address, address.port),
            self._connection_handler)
    else:
        self.rpc_server = StreamServer(address,
                                       self._connection_handler)
    self.backdoor = None
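# For context, a hedged sketch of the core_services mapping that
# get_service_address consults; the exact cms.conf schema (one
# ["host", port] pair per shard) is an assumption inferred from the
# error message above and from FakeAsyncConfig at the top.
example_core_services = {
    "LogService": [["localhost", 29000]],
    "Worker": [["localhost", 26000], ["localhost", 26001]],
}
# Under this assumed schema, ServiceCoord("Worker", 1) would resolve
# to Address("localhost", 26001), and a missing entry raises the
# KeyError that becomes the ConfigError above.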
def __init__(self, evaluation_service):
    """Create the single executor for ES.

    The executor just delegates work to the worker pool.

    """
    super(EvaluationExecutor, self).__init__(True)

    self.evaluation_service = evaluation_service
    self.pool = WorkerPool(self.evaluation_service)

    # List of QueueItem (ESOperation) we have extracted from the
    # queue but have not yet finished executing.
    self._currently_executing = []

    # Lock used to guard the currently executing operations.
    self._current_execution_lock = gevent.lock.RLock()

    # Whether execute() needs to drop the currently executing
    # operation.
    self._drop_current = False

    for i in range(get_service_shards("Worker")):
        worker = ServiceCoord("Worker", i)
        self.pool.add_worker(worker)
def toggle_autorestart(self, service):
    """If the service is scheduled for autorestart, disable it,
    otherwise enable it.

    service (string): format: name,shard.

    return (bool/None): current status of will_restart.

    """
    if not self.autorestart:
        return None

    # Decode name,shard.
    try:
        idx = service.rindex(",")
    except ValueError:
        logger.error("Unable to decode service string.")
        return None
    name = service[:idx]

    # ProxyService requires contest_id.
    if self.contest_id is None and name == "ProxyService":
        return None

    try:
        shard = int(service[idx + 1:])
    except ValueError:
        logger.error("Unable to decode service shard.")
        return None

    service = ServiceCoord(name, shard)
    self._will_restart[service] = not self._will_restart[service]
    logger.info("Will-restart flag for %s,%s is now %s.",
                service.name, service.shard,
                self._will_restart[service])

    return self._will_restart[service]
def test_background_connect(self, socket_mock):
    # Patch the connect method of sockets so that it blocks until
    # we set done_event (we will do so at the end of the test).
    connect_mock = socket_mock.return_value.connect
    done_event = gevent.event.Event()
    connect_mock.side_effect = lambda _: done_event.wait()

    # Connect to the RPC server in non-blocking mode and make sure
    # that we indeed don't block (i.e., don't take more than 0.001s).
    with gevent.Timeout(0.001) as timeout:
        try:
            client = self.get_client(ServiceCoord("Foo", 0), block=False)
        except gevent.Timeout as t:
            if t is not timeout:
                raise
            self.fail("Connecting blocks")

    # As socket.connect() never returned, the RPC client cannot
    # have connected.
    self.assertFalse(client.connected)

    # Unblock the socket's connect method and make sure it actually
    # got called (otherwise this whole test is pointless). Also,
    # yield to other greenlets so that they can be awoken after the
    # event triggered.
    done_event.set()
    gevent.sleep()
    connect_mock.assert_called_once_with(Address(self.host, self.port))
def setUp(self):
    self.pm = ProcessMatcher()
    path = os.path.join(ResourceService.BIN_PATH, "cms")
    self.w0_cmdlines = [
        "/usr/bin/python3 %sWorker 0" % path,
        "/usr/bin/python3 %sWorker" % path,
        "python3 %sWorker 0 -c 1" % path,
        "python3 %sWorker -c 1" % path,
        "python3 %sWorker -c 1 0" % path,
        "/usr/bin/env python3 %sWorker 0" % path,
        "/usr/bin/env python3 %sWorker" % path,
        "/usr/bin/env python3 %sWorker 0 -c 1" % path,
        "/usr/bin/env python3 %sWorker -c 1" % path,
        "/usr/bin/env python3 %sWorker -c 1 0" % path,
        sys.executable + " %sWorker" % path,
        sys.executable + " %sWorker 0" % path,
        sys.executable + " %sWorker 0 -c 1" % path,
        sys.executable + " %sWorker -c 1" % path,
        sys.executable + " %sWorker -c 1 0" % path,
    ]
    self.bad_cmdlines = [
        "ps",
        "less %sWorker 0" % path,
        "less /usr/bin/python3 %sWorker 0" % path,
        "/usr/bin/python3 %sWorker 1" % path,
        "/usr/bin/python3 %sAdminWebServer 0" % path,
    ]
    self.w0 = ServiceCoord("Worker", 0)
def echo_callback(self, data, error=None):
    """Callback for check."""
    current = time.time()
    logger.debug("Checker.echo_callback")
    if error is not None:
        return
    try:
        service, time_ = data.split()
        time_ = float(time_)
        name, shard = service.split(",")
        shard = int(shard)
        service = ServiceCoord(name, shard)
        if service not in self.waiting_for or current - time_ > 10:
            logger.warning("Got late reply (%5.3lf s) from %s.",
                           current - time_, service)
        else:
            if time_ - self.waiting_for[service] > 0.001:
                logger.warning("Someone cheated on the timestamp?!")
            logger.info("Got reply (%5.3lf s) from %s.",
                        current - time_, service)
            del self.waiting_for[service]
    except (KeyError, ValueError):
        # The parsing above raises ValueError on a malformed
        # payload; catch it alongside KeyError.
        logger.error("Echo answer malformed.")
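# A hedged sketch of the payload format echo_callback above expects:
# "name,shard epoch". The sender side (not shown here) presumably
# records the ping time in self.waiting_for and echoes a string like
# the one built below; the exact sender code is an assumption.
import time

coord = ServiceCoord("Worker", 0)
payload = "%s,%d %f" % (coord.name, coord.shard, time.time())
# e.g. "Worker,0 1500000000.123456"; echo_callback splits this back
# into the ServiceCoord and the send time, and logs a warning if the
# reply is late (more than 10 seconds).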
def get(self, service, shard, method):
    # TODO: still lacking configurable arguments - some of these
    # should be GET arguments.
    rid = self.get_argument("__rid")
    arguments = self.request.arguments
    del arguments["__rid"]

    # Tornado gives, for every key, a list of arguments; we need
    # only one.
    arguments = dict((k, decode_json(arguments[k][0]))
                     for k in arguments)

    service = ServiceCoord(service, int(shard))
    authorized = self.application.service.authorized_rpc(
        service, method, arguments)
    if not authorized:
        self.write({'status': 'not authorized'})
        return

    if service not in self.application.service.remote_services or \
            not self.application.service.remote_services[service].connected:
        self.write({'status': 'unconnected'})
        return

    self.application.service.__responses[rid] = "wait"
    getattr(self.application.service.remote_services[service], method)(
        callback=WebService._default_callback,
        plus=rid,
        **arguments)
    self.write({'status': 'wait'})
def test_reusability(self):
    client = self.get_client(ServiceCoord("Foo", 0))
    on_connect_handler = Mock()
    client.add_on_connect_handler(on_connect_handler)
    on_disconnect_handler = Mock()
    client.add_on_disconnect_handler(on_disconnect_handler)

    for i in range(10):
        self.assertTrue(client.connected)

        result = client.echo(value=42)
        result.wait()
        self.assertTrue(result.successful())
        self.assertEqual(result.value, 42)

        self.assertTrue(client.connected)
        self.sleep()
        client.disconnect()
        self.assertFalse(client.connected)
        self.sleep()
        client.connect()
        client._connection_event.wait()
        self.assertTrue(client.connected)
        self.sleep()

    self.assertEqual(on_connect_handler.call_count, 10)
    self.assertEqual(on_disconnect_handler.call_count, 10)
def maybe_send_notification(submission_id):
    """Non-blocking attempt to notify a running ES of the submission."""
    rs = RemoteServiceClient(ServiceCoord("EvaluationService", 0),
                             auto_retry=10)
    rs.connect()
    result = rs.new_submission(submission_id=submission_id)
    rs.disconnect()
    print("ID:{}".format(submission_id))
def test_autoreconnect1(self):
    client = self.get_client(ServiceCoord("Foo", 0), 0.002)
    self.sleep()
    self.assertTrue(client.connected)
    self.disconnect_servers()
    self.sleep()
    self.assertTrue(client.connected,
                    "Autoreconnect didn't kick in after server "
                    "disconnected")
def test_double_disconnect_client(self):
    # Check that asking a non-connected client to disconnect is
    # harmless (i.e., disconnection is idempotent).
    client = self.get_client(ServiceCoord("Foo", 0))
    client.disconnect()
    self.sleep()
    client.disconnect()
    self.sleep()
def __init__(self, shard): logger.initialize(ServiceCoord("Worker", shard)) Service.__init__(self, shard, custom_logger=logger) self.file_cacher = FileCacher(self) self.task_type = None self.work_lock = threading.Lock() self.session = None
def test_autoreconnect3(self):
    client = self.get_client(ServiceCoord("Foo", 0), 0.002)
    self.sleep()
    self.assertTrue(client.connected)
    self.disconnect_clients()
    self.sleep()
    self.assertFalse(client.connected,
                     "Autoreconnect still active after explicit "
                     "disconnection")
def __init__(self, shard, contest_id): logger.initialize(ServiceCoord("ScoringService", shard)) Service.__init__(self, shard, custom_logger=logger) self.contest_id = contest_id self.scorers = {} self._initialize_scorers() # If for some reason (SS switched off for a while, or broken # connection with ES), submissions have been left without # score, this is the set where you want to pur their ids. Note # that sets != {} if and only if there is an alive timeout for # the method "score_old_submission". self.submission_ids_to_score = set([]) self.submission_ids_to_token = set([]) self.scoring_old_submission = False # We need to load every submission at start, but we don't want # to invalidate every score so that we can simply load the # score-less submissions. So we keep a set of submissions that # we analyzed (for scoring and for tokens). self.submission_ids_scored = set() self.submission_ids_tokened = set() # Initialize ranking web servers we need to send data to. self.rankings = [] for i in xrange(len(config.rankings_address)): address = config.rankings_address[i] username = config.rankings_username[i] password = config.rankings_password[i] self.rankings.append(( address[0], # HTTP / HTTPS "%s:%d" % tuple(address[1:]), get_authorization(username, password))) self.initialize_queue = set() self.submission_queue = dict() self.subchange_queue = dict() self.operation_queue_lock = threading.Lock() for ranking in self.rankings: self.initialize_queue.add(ranking) self.log_bridge = LogBridge() thread = threading.Thread(target=self.dispath_operations_thread, args=(self.log_bridge, )) thread.daemon = True thread.start() self.add_timeout(self.search_jobs_not_done, None, ScoringService.JOBS_NOT_DONE_CHECK_TIME, immediately=True) self.add_timeout(self.forward_logs, None, ScoringService.FORWARD_LOG_TIME, immediately=True)
def test_double_disconnect_server(self):
    # Check that asking a non-connected server to disconnect is
    # harmless (i.e., disconnection is idempotent).
    self.get_client(ServiceCoord("Foo", 0))
    self.sleep()
    self.servers[0].disconnect()
    self.sleep()
    self.servers[0].disconnect()
    self.sleep()
def __init__(self, args):
    Service.__init__(self, shard=args.shard)
    self.address = config.get("core", "listen_address")
    self.port = int(config.get("core", "listen_port")) + args.shard
    self.file_cacher = FileCacher(self)
    self.evaluation_service = self.connect_to(
        ServiceCoord('EvaluationService', 0))
    self.wsgi_app = APIHandler(self)
def get(self, contest_id=None):
    if contest_id is not None:
        self.contest = self.safe_get_item(Contest, contest_id)

    self.r_params = self.render_params()
    self.r_params["resource_addresses"] = {}
    services = get_service_shards("ResourceService")
    for i in range(services):
        self.r_params["resource_addresses"][i] = get_service_address(
            ServiceCoord("ResourceService", i)).ip
    self.render("resourceslist.html", **self.r_params)
def __init__(self, shard, contest_id=None):
    super(QueueService, self).__init__(shard)

    self.contest_id = contest_id

    # This lock is used to avoid inserting in the queue (which
    # itself is already thread-safe) an operation which is already
    # being processed. Such an operation might be in one of the
    # following states:
    # 1. in the queue;
    # 2. extracted from the queue by the executor, but not yet
    #    dispatched to a worker;
    # 3. being processed by a worker ("in the worker pool");
    # 4. being processed by action_finished, but with the results
    #    not yet written to the database;
    # 5. with results written in the database.
    #
    # The methods enqueuing operations already check that the
    # operation is not in state 5, and enqueue() checks that it is
    # not in the first three states.
    #
    # Therefore, the lock guarantees that the methods adding
    # operations to the queue (_missing_operations,
    # invalidate_submission, enqueue) are not executed
    # concurrently with action_finished, to avoid picking up
    # operations in state 4.
    self.post_finish_lock = gevent.lock.RLock()

    # Data structure holding pending results.
    self.pending = PendingResults()
    # Never-ending greenlet consuming results by sending them to ES.
    gevent.spawn(self.process_results)

    self.evaluation_services = [
        self.connect_to(ServiceCoord("EvaluationService", i))
        for i in range(get_service_shards("EvaluationService"))
    ]

    # How many calls to the sweeper will be ignored. Used to
    # temporarily disable it in case of invalidate.
    self.avoid_next_sweepers = 0

    self.add_executor(EvaluationExecutor(self))
    self.start_sweeper(1200.0)

    self.add_timeout(
        self.check_workers_timeout, None,
        QueueService.WORKER_TIMEOUT_CHECK_TIME.total_seconds(),
        immediately=False)
    self.add_timeout(
        self.check_workers_connection, None,
        QueueService.WORKER_CONNECTION_CHECK_TIME.total_seconds(),
        immediately=False)