def start_heartbeat(self):
    """Run the high-availability heartbeat loop while ``self.running`` is truthy.

    Every round performs, in order:

    1. ``self.storage.clear_dead_members(self.ttl_ms)``
    2. ``self.storage.update_member(self.server_uri, self.uuid)``
    3. ``self.living_members = self.storage.list_living_members(self.ttl_ms)``
    4. while ``self.notified_others_after_start`` is still falsy: open an
       insecure gRPC channel to every other living member, remember the stub
       in ``self.member_connections`` and send it a ``notifyNewMember``
       request describing this server.

    A failed notification (``grpc.RpcError``) is logged and does not stop the
    remaining notifications. Any other exception raised in a round is logged
    and swallowed so the loop keeps going; in that case the notification flag
    is not set and the notification pass is attempted again next round.

    Between rounds the method waits through ``sleep_and_detecting_running``
    with ``self.ttl_ms / 2`` and a callback that reports ``self.running``.

    :return: None, once ``self.running`` becomes falsy.
    """
    while self.running:
        try:
            # do heartbeat
            self.storage.clear_dead_members(self.ttl_ms)
            self.storage.update_member(self.server_uri, self.uuid)
            self.living_members = self.storage.list_living_members(self.ttl_ms)
            if not self.notified_others_after_start:
                for member in self.living_members:
                    if member.server_uri == self.server_uri:
                        # Skip our own record; only peers are notified.
                        continue
                    channel = grpc.insecure_channel(member.server_uri)
                    stub = HighAvailabilityManagerStub(channel)
                    self.member_connections[member.server_uri] = stub
                    # Integer floor division keeps the millisecond timestamp
                    # exact instead of routing it through a float.
                    now_ms = time.time_ns() // 1000000
                    try:
                        # NOTE(review): the leading 1 is the first Member
                        # field (presumably a version) -- confirm.
                        stub.notifyNewMember(
                            NotifyNewMemberRequest(member=member_to_proto(
                                Member(1, self.server_uri, now_ms)
                            )))
                    except grpc.RpcError:
                        # Best effort: an unreachable peer must not prevent
                        # the remaining peers from being notified.
                        logging.error("Notify new member to '%s' failed.",
                                      member.server_uri, exc_info=True)
                # Set even if individual notifications failed above.
                self.notified_others_after_start = True
        except Exception:
            # Top-level boundary of the heartbeat loop: log and keep running.
            logging.error("Exception thrown when send heartbeat to the HA storage.",
                          exc_info=True)
        sleep_and_detecting_running(self.ttl_ms / 2, lambda: self.running)
Ejemplo n.º 2
0
    def _replace_aiflow_stubs(self, server_uri):
        """Rebuild every AIFlow service stub of this client against ``server_uri``.

        For each service a new insecure gRPC channel to ``server_uri`` is
        opened, the service stub created on that channel is passed through
        ``self._wrap_aiflow_rpcs`` (together with the uri and the attribute
        name), and the wrapped stub is stored on ``self`` under that name.

        :param server_uri: address handed to ``grpc.insecure_channel`` for
                           every stub.
        """
        # (attribute name on self, stub class), in the order they are rebuilt.
        stub_specs = (
            ("high_availability_stub", HighAvailabilityManagerStub),
            ("metadata_store_stub", MetadataServiceStub),
            ("model_center_stub", ModelCenterServiceStub),
            ("deploy_stub", DeployServiceStub),
            ("metric_stub", MetricServiceStub),
        )
        for attr_name, stub_cls in stub_specs:
            # Each stub gets its own channel.
            raw_stub = stub_cls(grpc.insecure_channel(server_uri))
            wrapped_stub = self._wrap_aiflow_rpcs(raw_stub, server_uri, attr_name)
            setattr(self, attr_name, wrapped_stub)
Ejemplo n.º 3
0
 def __init__(self,
              server_uri=_SERVER_URI,
              notification_service_uri=None,
              project_config: ProjectConfig = None):
     """Initialise the metadata, model-center, deploy, metric and notification
     clients, optionally in high-availability (HA) mode.

     :param server_uri: address of the AIFlow server. When HA is enabled it
                        is split on ``","`` and each part is probed in order.
                        If ``None`` and ``project_config`` is given, the value
                        of ``project_config.get_master_uri()`` is used.
     :param notification_service_uri: address passed to
                        ``NotificationClient.__init__``. If ``None`` it falls
                        back to ``project_config.get_notification_service_uri()``
                        (when a config is given) and then to ``server_uri``.
     :param project_config: optional project configuration. When given it
                        overrides the defaults of ``enable_ha`` (False),
                        ``list_member_interval_ms`` (5000),
                        ``retry_interval_ms`` (1000) and
                        ``retry_timeout_ms`` (10000).
     :raises Exception: in HA mode, when no candidate uri answers
                        ``listMembers`` (chained from the last
                        ``grpc.RpcError``), or when a candidate answers with a
                        return code other than ``ReturnStatus.CALL_SUCCESS``
                        (the server's ``return_msg`` is used as message).
     """
     # Resolve the master uri before the base clients are constructed;
     # otherwise they would be initialised with None even though the project
     # config supplies a usable address.
     if server_uri is None and project_config is not None:
         server_uri = project_config.get_master_uri()
     MetadataClient.__init__(self, server_uri)
     ModelCenterClient.__init__(self, server_uri)
     DeployClient.__init__(self, server_uri)
     MetricClient.__init__(self, server_uri)
     self.enable_ha = False
     self.list_member_interval_ms = 5000
     self.retry_interval_ms = 1000
     self.retry_timeout_ms = 10000
     if project_config is not None:
         if notification_service_uri is None:
             notification_service_uri = \
                 project_config.get_notification_service_uri()
         self.enable_ha = project_config.get_enable_ha()
         self.list_member_interval_ms = \
             project_config.get_list_member_interval_ms()
         self.retry_interval_ms = project_config.get_retry_interval_ms()
         self.retry_timeout_ms = project_config.get_retry_timeout_ms()
     # Without a dedicated notification uri, the AIFlow server uri is used.
     if notification_service_uri is None:
         notification_uri = server_uri
     else:
         notification_uri = notification_service_uri
     NotificationClient.__init__(
         self,
         notification_uri,
         enable_ha=self.enable_ha,
         list_member_interval_ms=self.list_member_interval_ms,
         retry_interval_ms=self.retry_interval_ms,
         retry_timeout_ms=self.retry_timeout_ms)
     if self.enable_ha:
         self.living_aiflow_members = []
         self.current_aiflow_uri = None
         last_error = None
         # Probe the candidates in order and keep the first one that answers.
         for candidate_uri in server_uri.split(","):
             channel = grpc.insecure_channel(candidate_uri)
             high_availability_stub = HighAvailabilityManagerStub(channel)
             request = ListMembersRequest(timeout_seconds=0)
             try:
                 response = high_availability_stub.listMembers(request)
             except grpc.RpcError as e:
                 # Remember the failure and try the next candidate.
                 last_error = e
                 continue
             if response.return_code != ReturnStatus.CALL_SUCCESS:
                 # Not an RpcError, so this aborts construction instead of
                 # moving on to the next candidate.
                 raise Exception(response.return_msg)
             self.living_aiflow_members = [
                 proto_to_member(proto).server_uri
                 for proto in response.members
             ]
             self.current_aiflow_uri = candidate_uri
             self.high_availability_stub = high_availability_stub
             break
         if self.current_aiflow_uri is None:
             raise Exception(
                 "No available aiflow server uri!") from last_error
         self.aiflow_ha_change_lock = threading.Lock()
         self.aiflow_ha_running = True
         # Rebuild all service stubs against the server that answered.
         self._replace_aiflow_stubs(self.current_aiflow_uri)
         # Daemon thread, so it does not keep the interpreter alive on exit.
         self.list_aiflow_member_thread = threading.Thread(
             target=self._list_aiflow_members, daemon=True)
         self.list_aiflow_member_thread.start()