def poll_controllers(self):
    """Print the stats reported by the controller of every known worker.

    For each ``Worker`` a Thrift controller client is requested for the
    worker's LAN DNS name. When a client is obtained its ``stats()`` result
    is printed next to the client; otherwise a "could not connect" line is
    printed for that worker.
    """
    for worker in Worker.objects.all():
        controller = rpc.getThriftControllerClient(worker.lan_dns)
        if not controller:
            print("could not connect to controller on %s" % worker)
            continue
        stats = controller.stats()
        # Same output as the two-item print statement: both values, space separated.
        print("%s %s" % (controller, stats))
        # TODO update worker stats.
def poll_controllers(self):
    """Query each worker's controller for its stats and print them.

    Iterates over all ``Worker`` rows, asks ``rpc`` for a Thrift controller
    client bound to the worker's ``lan_dns`` and prints either the client
    together with its ``stats()`` result, or a message saying the controller
    could not be reached.
    """
    for current in Worker.objects.all():
        client = rpc.getThriftControllerClient(current.lan_dns)
        if client:
            report = client.stats()
            # Client and stats on one line, separated by a single space.
            print("%s %s" % (client, report))
            # TODO update worker stats.
        else:
            print("could not connect to controller on %s" % current)
def update_status(self, instance_name):
    """Advance the named worker through its start-up states.

    The first ``Worker`` whose ``instance_name`` matches is loaded and
    handled according to its current ``status``:

    * ``created`` -- EC2 is asked for the instance. If it is ``'running'``
      the worker becomes ``initializing``, its LAN/WAN DNS names are stored,
      it is saved and ``mail.report_new_worker`` is called; processing then
      falls through to the ``initializing`` step. Otherwise
      ``WorkerManager.WORKER_NOT_READY`` is returned.
    * ``initializing`` -- ``self.update_worker`` is called. A truthy result
      moves the worker to ``updating`` and returns ``WORKER_UPDATING``;
      a falsy one returns ``WORKER_INITIALIZING``.
    * ``updating`` -- the controller is probed with
      ``get_worker_load_stats``. On success the worker becomes
      ``controllable`` and ``WORKER_CONTROLLABLE`` is returned; on any
      exception the error is logged and ``WORKER_UPDATING`` is returned.

    Any other status leaves the worker untouched and returns ``None``.
    Raises ``IndexError`` when no worker matches ``instance_name``.
    """
    worker = Worker.objects.filter(instance_name=instance_name)[0]

    if worker.status == Worker.States.created:
        ec2 = self.ec2_connection()
        instance = ec2.get_all_instances([instance_name])[0].instances[0]
        if instance.state != 'running':
            logger.debug('Worker %s is still reporting as %s',
                         worker.instance_name, instance.state)
            return WorkerManager.WORKER_NOT_READY

        logger.info('Worker %s is now initializing: %s',
                    worker.instance_name, instance.public_dns_name)
        worker.status = Worker.States.initializing
        worker.lan_dns = instance.private_dns_name
        worker.wan_dns = instance.public_dns_name
        worker.save()
        mail.report_new_worker(worker)
        # No return here: a freshly initializing worker is processed below.

    if worker.status == Worker.States.initializing:
        logger.debug('Trying to update controller on %s', worker.instance_name)
        if not self.update_worker(worker.lan_dns):
            return WorkerManager.WORKER_INITIALIZING
        worker.status = Worker.States.updating
        worker.save()
        logger.info('Worker %s is now updating', worker.instance_name)
        return WorkerManager.WORKER_UPDATING

    if worker.status == Worker.States.updating:
        logger.debug('Checking if controller is up on %s', worker.instance_name)
        try:
            client = rpc.getThriftControllerClient(worker.lan_dns)
            client.get_worker_load_stats()
            worker.status = Worker.States.controllable
            worker.save()
            logger.info('Worker %s is now controllable', worker.instance_name)
            return WorkerManager.WORKER_CONTROLLABLE
        except Exception as err:
            # A transport that is simply not open yet is logged at info level;
            # anything else is logged with its traceback.
            not_open = (isinstance(err, TTransport.TTransportException)
                        and err.type == TTransport.TTransportException.NOT_OPEN)
            if not_open:
                logger.info('Controller on worker %s not responding yet.',
                            worker.lan_dns)
            else:
                logger.exception('Unexpected exception while checking worker %s',
                                 worker.lan_dns)
        return WorkerManager.WORKER_UPDATING
def _delete_deploy(self, deploy):
    """Kill the engine behind ``deploy`` (best effort) and delete the deploy.

    If the deploy has a ``base_port``, the controller on the deploy's worker
    is asked to kill the engine for the deploy's index code and port. A
    failure there is logged and does not prevent the deletion.

    After the deploy row is deleted, its index is deleted too when the index
    is flagged ``deleted`` and has no deploys left.
    """
    logger.debug('Deleting deploy: %r', deploy)

    if deploy.base_port:
        try:
            controller = rpc.getThriftControllerClient(deploy.worker.lan_dns)
            controller.kill_engine(deploy.index.code, deploy.base_port)
        except Exception:
            # Best effort only. Catch Exception rather than everything so that
            # KeyboardInterrupt / SystemExit are not swallowed here.
            logger.exception('Failed when attempting to kill the IndexEngine for the deploy %s', deploy)

    # Keep a reference to the index: it is still needed after the deploy is gone.
    index = deploy.index
    deploy.delete()

    if index.deleted and index.deploys.count() == 0:
        index.delete()
def _handle_created(self, deploy):
    """Start the index engine for a deploy that is in the created state.

    If the deploy's worker is not ready, this logs the fact and returns
    ``DeployManager.WORKER_NOT_READY_YET``.

    Otherwise a JSON start configuration is assembled (scoring functions,
    log-based storage pointing at the 'recovery' service, the index's own
    configuration, a free base port, the index code, optional analyzer
    config and the deploy's effective xmx) and sent to the worker's
    controller via ``start_engine``.

    Returns ``DeployManager.INDEX_INITIALIZING`` when the controller reports
    the engine as started, after storing the chosen port and (unless the
    index has been deleted meanwhile) moving the deploy to ``initializing``.
    Returns ``None`` when the start failed, so it can be retried later.
    """
    if not deploy.worker.is_ready():
        # base_port is only stored after a successful start (see below), so it
        # may still be None here; '%s' keeps the log record formattable.
        logger.info('Waiting to initialize index "%s" (%s) on %s:%s. The worker is not ready yet',
                    deploy.index.name, deploy.index.code,
                    deploy.worker.instance_name, deploy.base_port)
        return DeployManager.WORKER_NOT_READY_YET

    controller = rpc.getThriftControllerClient(deploy.worker.lan_dns)

    json_config = {}
    json_config['functions'] = deploy.index.get_functions_dict()

    # there should be exactly one recovery service
    recovery_service = Service.objects.get(name='recovery')

    # log based storage
    json_config['log_based_storage'] = True
    json_config['log_server_host'] = recovery_service.host
    json_config['log_server_port'] = recovery_service.port

    # Index-level configuration is merged first; the keys set afterwards
    # (base_port, index_code, analyzer_config, xmx) take precedence over it.
    json_config.update(deploy.index.configuration.get_data())

    proposed_port = self._get_free_port(deploy)
    json_config['base_port'] = proposed_port
    json_config['index_code'] = deploy.index.code

    analyzer_config = deploy.index.get_json_for_analyzer()
    if analyzer_config:
        json_config['analyzer_config'] = analyzer_config

    logger.info('Initializing index "%s" (%s) on %s:%d',
                deploy.index.name, deploy.index.code,
                deploy.worker.instance_name, proposed_port)

    # override xmx with the one defined for this deploy
    json_config['xmx'] = deploy.effective_xmx

    logger.debug("deploy: %r\n----\nindex: %r\n----\nstart args: %r",
                 deploy, deploy.index, json_config)

    started_ok = controller.start_engine(json.dumps(json_config))
    if started_ok:
        # The port is always recorded once the engine has started.
        qs = Deploy.objects.filter(id=deploy.id)
        qs.update(base_port=proposed_port)
        # The status only advances while the index is not flagged as deleted.
        qs = Deploy.objects.filter(id=deploy.id, index__deleted=False)
        qs.update(status=Deploy.States.initializing)
        return DeployManager.INDEX_INITIALIZING
    else:
        logger.warn('Deploy failed starting. Will try again in next round.')
        return
def update_worker(self, dns):
    """Tell the controller at ``dns`` to update itself from this host and restart.

    The controller is asked to ``update_worker`` using this machine's
    resolved host name. When it answers with return code 0 a controller
    restart is requested.

    Returns:
        True when the update returned 0 and the restart was requested;
        False when the update returned a non-zero code or any exception was
        raised. Callers such as ``update_status`` branch on the truthiness
        of this result, so success must be reported explicitly.
    """
    try:
        controller = rpc.getThriftControllerClient(dns)
        host = socket.gethostbyname_ex(socket.gethostname())[0]
        retcode = controller.update_worker(host)
        if retcode != 0:
            logger.error('Worker %s update failed with return code %s', dns, retcode)
            return False
        try:
            logger.debug('Worker %s updated. Restarting...', dns)
            controller.restart_controller()
            logger.warn("Restart controller didn't throw an exception. Did it restart?")
        except TTransport.TTransportException:
            # restart will always fail: a transport error here is the expected outcome
            pass
        return True
    except Exception as e:
        if isinstance(e, TTransport.TTransportException) and e.type == TTransport.TTransportException.NOT_OPEN:
            logger.info('Controller on worker %s not responding yet.', dns)
        else:
            logger.exception('Unexpected exception while updating worker %s', dns)
        return False
def update_status(self, instance_name):
    """Move the worker called ``instance_name`` forward in its life cycle.

    Loads the first matching ``Worker`` (``IndexError`` if there is none)
    and acts on its ``status``:

    * ``created``: looks the instance up in EC2. A ``'running'`` instance
      turns the worker into ``initializing`` (DNS names stored, worker
      saved, ``mail.report_new_worker`` called) and handling continues with
      the next step; any other instance state returns
      ``WorkerManager.WORKER_NOT_READY``.
    * ``initializing``: runs ``self.update_worker``. Truthy result -> worker
      becomes ``updating`` and ``WORKER_UPDATING`` is returned; falsy result
      -> ``WORKER_INITIALIZING`` is returned.
    * ``updating``: calls ``get_worker_load_stats`` on the controller.
      Success -> worker becomes ``controllable`` and ``WORKER_CONTROLLABLE``
      is returned; any exception is logged and ``WORKER_UPDATING`` is
      returned.

    Returns ``None`` for a worker in any other status.
    """
    worker = Worker.objects.filter(instance_name=instance_name)[0]

    if worker.status == Worker.States.created:
        connection = self.ec2_connection()
        found = connection.get_all_instances([instance_name])
        ec2_instance = found[0].instances[0]
        is_running = ec2_instance.state == 'running'
        if not is_running:
            logger.debug('Worker %s is still reporting as %s', worker.instance_name, ec2_instance.state)
            return WorkerManager.WORKER_NOT_READY
        logger.info('Worker %s is now initializing: %s', worker.instance_name, ec2_instance.public_dns_name)
        worker.status = Worker.States.initializing
        worker.lan_dns = ec2_instance.private_dns_name
        worker.wan_dns = ec2_instance.public_dns_name
        worker.save()
        mail.report_new_worker(worker)
        # Deliberately continue: the worker is now initializing.

    if worker.status == Worker.States.initializing:
        logger.debug('Trying to update controller on %s', worker.instance_name)
        updated = self.update_worker(worker.lan_dns)
        if not updated:
            return WorkerManager.WORKER_INITIALIZING
        worker.status = Worker.States.updating
        worker.save()
        logger.info('Worker %s is now updating', worker.instance_name)
        return WorkerManager.WORKER_UPDATING

    if worker.status == Worker.States.updating:
        logger.debug('Checking if controller is up on %s', worker.instance_name)
        try:
            controller = rpc.getThriftControllerClient(worker.lan_dns)
            controller.get_worker_load_stats()
            worker.status = Worker.States.controllable
            worker.save()
            logger.info('Worker %s is now controllable', worker.instance_name)
            return WorkerManager.WORKER_CONTROLLABLE
        except Exception as exc:
            # Only a "transport not open" error counts as the controller
            # still coming up; everything else gets a full traceback.
            if isinstance(exc, TTransport.TTransportException) and exc.type == TTransport.TTransportException.NOT_OPEN:
                logger.info('Controller on worker %s not responding yet.', worker.lan_dns)
            else:
                logger.exception('Unexpected exception while checking worker %s', worker.lan_dns)
            return WorkerManager.WORKER_UPDATING
def update_worker(self, dns):
    """Have the controller on ``dns`` update from this host, then restart it.

    This machine's host name is resolved and passed to the controller's
    ``update_worker`` call. A return code of 0 triggers a
    ``restart_controller`` request.

    Returns:
        True if the controller reported return code 0 and the restart was
        requested; False if it reported a non-zero code or an exception
        occurred. The result is explicit because callers test its
        truthiness to decide whether the worker can move on.
    """
    try:
        controller = rpc.getThriftControllerClient(dns)
        host = socket.gethostbyname_ex(socket.gethostname())[0]
        retcode = controller.update_worker(host)
        if retcode == 0:
            try:
                logger.debug('Worker %s updated. Restarting...', dns)
                controller.restart_controller()
                logger.warn(
                    "Restart controller didn't throw an exception. Did it restart?"
                )
            except TTransport.TTransportException:
                # restart will always fail: the transport error is expected
                pass
            return True
        logger.error('Worker %s update failed with return code %s', dns, retcode)
        return False
    except Exception as e:
        transport_not_open = (
            isinstance(e, TTransport.TTransportException)
            and e.type == TTransport.TTransportException.NOT_OPEN
        )
        if transport_not_open:
            logger.info('Controller on worker %s not responding yet.', dns)
        else:
            logger.exception(
                'Unexpected exception while updating worker %s', dns)
        return False
#
# This script is used from the upgrade_frontend.sh
# to issue commands to every worker so they can update
# their nebu installations from this frontend and
# restart their controllers.
#
# author: santip
#
from nebu.models import Worker
import rpc
import socket
from thrift.transport import TTransport

# The frontend's host name is the same for every worker: resolve it once
# instead of on every loop iteration.
host = socket.gethostbyname_ex(socket.gethostname())[0]

for w in Worker.objects.all():
    print('Upgrading worker %d at %s' % (w.id, w.wan_dns))
    dns = w.lan_dns
    controller = rpc.getThriftControllerClient(dns)
    retcode = controller.update_worker(host)
    if retcode == 0:
        try:
            print('Worker %s updated. Restarting...' % dns)
            controller.restart_controller()
            print("Restart controller didn't throw an exception. Did it restart?")
        except TTransport.TTransportException:
            # restart will always fail
            pass
    else:
        # Do not let a failed update pass silently.
        print('Worker %s update failed with return code %s' % (dns, retcode))

print('Done')