def execute_with_retry(ops, retry_cnt=6):
    """Run ``execute(ops)``, restarting the client and retrying on ``PsError``.

    Args:
        ops: A single operation, or a list/tuple of operations. A lone
            operation is wrapped in a one-element list before being handed
            to ``execute``.
        retry_cnt: Maximum number of attempts. At least one attempt is
            always made, even if this is zero or negative.

    Returns:
        Whatever ``execute`` returns on the first successful attempt.

    Raises:
        PsError: The error from the final attempt, once ``retry_cnt``
            attempts have failed.
    """
    ops = list(ops) if isinstance(ops, (list, tuple)) else [ops]
    failures = 0
    while True:
        try:
            return execute(ops)
        except PsError:
            failures += 1
            if failures >= retry_cnt:
                # Bare ``raise`` keeps the original traceback intact
                # (``raise e`` would reset it under Python 2).
                raise
            print('execute fail retry cnt[%d]' % failures)
            # Back off, then re-establish the client before the next attempt.
            time.sleep(30)
            restart_client()
Example #2
0
 def start_model_server(self, model_server):
     """Bring up the model server for this task and keep it serving.

     Returns immediately unless ``model_server.name()`` equals
     ``self._task_name``. Otherwise the server is looked up again by task
     name, attached to a freshly initialised ``ModelServerAdapter`` and
     started. The method then loops forever, restarting the client and
     calling ``execute_loop_wait()``, so it does not return on that path.

     Args:
         model_server: Candidate server. Only its ``name()`` is consulted;
             the instance actually used is the one returned by
             ``get_model_server_by_name(self._task_name)``.
     """
     if model_server.name() != self._task_name:
         return
     model_server = get_model_server_by_name(self._task_name)
     # NOTE(review): the index is taken from ``self._model_server`` rather
     # than the local ``model_server`` -- confirm that attribute is intended.
     model_server_adapter = ModelServerAdapter(
         self._zk_addr,
         self._model_server.index(self._task_name) + 1, self._task_index,
         model_server.forward_cache, model_server.backward_cache,
         model_server.dtype())
     model_server_adapter.init()
     model_server.init_server(model_server_adapter)
     with model_scope("ams_gear_forward"):
         # Keep trying until the client connects and (when there are
         # variable registers) the register/initializer ops have executed.
         while True:
             try:
                 connect_to_client(self._zk_addr, '')
                 if variable_registers() is not None:
                     execute(variable_registers())
                     execute(global_initializers())
                 break
             except PsError:
                 traceback.print_exc()
                 time.sleep(10)
     model_server.run_server()
     # Serve loop: never exits normally. Each pass restarts the client
     # (polling every 10s until it succeeds) and then waits in
     # execute_loop_wait(); a PsError is logged and the pass is retried.
     while True:
         try:
             while True:
                 # Call form prints the same text on Python 2 and is valid
                 # syntax on Python 3.
                 print("RESTARTING CLIENT")
                 if restart_client():
                     break
                 time.sleep(10)
             execute_loop_wait()
         except PsError:
             traceback.print_exc()
             time.sleep(10)
 def _restart_client(self, retry_cnt=3, interval=10):
     """Call ``restart_client()`` until it succeeds or attempts run out.

     Args:
         retry_cnt: Maximum number of restart attempts. At least one
             attempt is always made.
         interval: Seconds to sleep between consecutive failed attempts.

     Raises:
         InternalError: If ``restart_client()`` returned a falsy value on
             every one of the ``retry_cnt`` attempts.
     """
     failures = 0
     while not restart_client():
         failures += 1
         # Honour the caller's retry budget (previously hard-coded to 3) and
         # fail right away instead of sleeping once more before raising.
         if failures >= retry_cnt:
             raise InternalError("restart client failed")
         time.sleep(interval)