def execute_with_retry(ops, retry_cnt=6, interval=30):
    """Run ``execute`` on ``ops``, retrying when it raises ``PsError``.

    Args:
        ops: A single op, or a list/tuple of ops. A list or tuple is copied
            into a new list; anything else is wrapped in a one-element list.
        retry_cnt: Maximum number of attempts. If it is less than or equal
            to zero, nothing is executed and ``None`` is returned.
        interval: Seconds to sleep after a failed attempt before the client
            is restarted and the next attempt is made.

    Returns:
        Whatever ``execute(ops)`` returns on the first successful attempt.

    Raises:
        PsError: Re-raised from the last attempt once ``retry_cnt`` attempts
            have all failed.
    """
    ops = list(ops) if isinstance(ops, (list, tuple)) else [ops]
    failures = 0
    while failures < retry_cnt:
        try:
            return execute(ops)
        except PsError:
            failures += 1
            if failures == retry_cnt:
                # Bare ``raise`` keeps the original traceback (``raise e``
                # would discard it on Python 2).
                raise
            print('execute fail retry cnt[%d]' % failures)
            time.sleep(interval)
            # The result of restart_client() is not checked here; the next
            # attempt simply runs regardless of whether the restart worked.
            restart_client()
def start_model_server(self, model_server):
    """Start the model server that matches this task, then service it forever.

    If ``model_server.name()`` differs from ``self._task_name`` the call
    returns immediately and does nothing. Otherwise it:

    1. Looks the server up again via ``get_model_server_by_name`` and wires
       it to a freshly initialised ``ModelServerAdapter``.
    2. Inside ``model_scope("ams_gear_forward")``, connects to the client and
       executes the variable registers (when not ``None``) and the global
       initializers, retrying every 10 seconds on ``PsError``.
    3. Calls ``run_server()``.
    4. Loops forever: restarts the client until ``restart_client()`` returns
       a truthy value, then calls ``execute_loop_wait()``. A ``PsError`` is
       printed and the loop starts over after 10 seconds.

    Once step 4 is reached this method does not return normally.

    NOTE(review): the block nesting below was reconstructed from a
    whitespace-collapsed source; confirm that ``run_server()`` and the
    service loop belong outside the ``model_scope`` block.

    Args:
        model_server: Candidate model server; only used when its ``name()``
            equals ``self._task_name``.
    """
    if model_server.name() != self._task_name:
        return

    # Rebind to the server registered under this task's name.
    model_server = get_model_server_by_name(self._task_name)
    model_server_adapter = ModelServerAdapter(
        self._zk_addr,
        self._model_server.index(self._task_name) + 1,
        self._task_index,
        model_server.forward_cache,
        model_server.backward_cache,
        model_server.dtype())
    model_server_adapter.init()
    model_server.init_server(model_server_adapter)

    with model_scope("ams_gear_forward"):
        # Keep retrying until connection and initialisation both succeed.
        while True:
            try:
                connect_to_client(self._zk_addr, '')
                if variable_registers() is not None:
                    execute(variable_registers())
                execute(global_initializers())
                break
            except PsError:
                traceback.print_exc()
                time.sleep(10)

    model_server.run_server()

    # Service loop: never exits unless a non-PsError exception propagates.
    while True:
        try:
            while True:
                # Parenthesised form prints the same text on Python 2 and 3.
                print("RESTARTING CLIENT")
                if restart_client():
                    break
                time.sleep(10)
            execute_loop_wait()
        except PsError:
            traceback.print_exc()
            time.sleep(10)
def _restart_client(self, retry_cnt=3, interval=10):
    """Restart the client, retrying until it succeeds or attempts run out.

    Args:
        retry_cnt: Maximum number of calls to ``restart_client()``.
        interval: Seconds to sleep after each failed attempt.

    Raises:
        InternalError: If none of the ``retry_cnt`` attempts returned a
            truthy value (including when ``retry_cnt`` is zero or negative,
            in which case no attempt is made).
    """
    attempts = 0
    while attempts < retry_cnt:
        if restart_client():
            return
        attempts += 1
        time.sleep(interval)
    # Reaching here means every allowed attempt failed. The failure check
    # follows ``retry_cnt`` rather than a fixed literal, so custom retry
    # counts raise exactly when all of their attempts were used up.
    raise InternalError("restart client failed")